summaryrefslogtreecommitdiff
path: root/regex.c
diff options
context:
space:
mode:
authorWormHeamer2025-12-31 05:01:16 -0500
committerWormHeamer2025-12-31 05:01:16 -0500
commitd723f8a5d54f098f0cf378e5dcf3a7d4ec049822 (patch)
tree0fe9aa493307d04df9d431536b7b3e90d6eb0814 /regex.c
parent026da321ebf2f09d17e3978aacf10b513d46460c (diff)
add regex search (only forwards for now)
Diffstat (limited to 'regex.c')
-rw-r--r--regex.c912
1 files changed, 912 insertions, 0 deletions
diff --git a/regex.c b/regex.c
new file mode 100644
index 0000000..710593f
--- /dev/null
+++ b/regex.c
@@ -0,0 +1,912 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "wrmr.h"
+#include "dynarr.h"
+#include "str.h"
+#include "regex.h"
+
+/* bitsets */
+
+#define RE_BITSET_BYTES sizeof(uintmax_t)
+#define RE_BITSET_BITS (8 * RE_BITSET_BYTES)
+#define RE_BITSET_CAP(n) (((n)/64)+1)
+#define RE_BITSET_HAS(s, i) (1 & ((s)[(i)/RE_BITSET_BITS] >> ((i)&(RE_BITSET_BITS-1))))
+#define RE_BITSET_MASK(i) ((uintmax_t)1 << ((i)&(RE_BITSET_BITS-1)))
+#define RE_BITSET_SET(s, i) ((s)[(i)/RE_BITSET_BITS] |= RE_BITSET_MASK(i))
+#define RE_BITSET_CLR(s, i) ((s)[(i)/RE_BITSET_BITS] &= ~RE_BITSET_MASK(i))
+#define RE_BITSET_ALLOC(a, n) new_arr(a, uintmax_t, RE_BITSET_CAP(n))
+#define RE_BITSET_SET_ALL(s, n) memset(s, 0xff, RE_BITSET_CAP(n) * RE_BITSET_BYTES)
+#define RE_BITSET_CLR_ALL(s, n) memset(s, 0x00, RE_BITSET_CAP(n) * RE_BITSET_BYTES)
+
+/* regex parsing */
+
+typedef struct {
+ u32 dest;
+} Label;
+
+typedef struct {
+ RegEx *re;
+ Arena *perm, *scratch;
+ DYNARR(u32) lbl;
+ ReCompFlags flags;
+ Str s;
+ int i;
+} ReParser;
+
+typedef enum {
+ N_CHAR,
+ N_CHAR_SET,
+ N_CONCAT,
+ N_GROUP,
+ N_ALTERNATE,
+ N_ONE_OR_MORE,
+ N_ZERO_OR_MORE,
+ N_ZERO_OR_ONE,
+} ReNodeType;
+
+/* possible peepholes:
+ *
+ * (a . b) | (a . c) -> a . (b | c)
+ * a . b -> "ab"
+ * a | b | c -> [abc]
+ *
+ */
+
+typedef struct ReNode {
+ ReNodeType t;
+ u32 c;
+ struct ReNode *a, *b;
+} ReNode;
+
+ReNode *re_new_node(ReParser *s, ReNodeType t, struct ReNode *a, struct ReNode *b) {
+ ReNode *n = new(s->scratch, ReNode);
+ n->t = t;
+ n->a = a;
+ n->b = b;
+ return n;
+}
+
+ReNode *re_new_node_c(ReParser *s, ReNodeType t, u32 c) {
+ ReNode *n = new(s->scratch, ReNode);
+ n->t = t;
+ n->c = c;
+ return n;
+}
+
+ReNode *re_new_node_char(ReParser *s, u32 c) {
+ return re_new_node_c(s, N_CHAR, c);
+}
+
+#define ch(s) ((s)->i < (s)->s.n ? (s)->s.s[(s)->i] : 0)
+#define skip(s,n) ((s)->i += (n))
+
+ReNode *re_parse_expr(ReParser *s);
+
+ReNode *re_parse_group(ReParser *s) {
+ /* TODO */
+ skip(s, 1);
+ ReNode *n = re_new_node(s, N_GROUP, re_parse_expr(s), NULL);
+ if (ch(s) != ')') FAIL_WITH_MSG("expected )");
+ skip(s, 1);
+ return n;
+}
+
+static int cmp_ch_range(const void *pa, const void *pb) {
+ ReChRange a = *(const ReChRange *)pa;
+ ReChRange b = *(const ReChRange *)pb;
+ return (a.min > b.min) - (a.min < b.min);
+}
+
+ReNode *re_parse_char_set(ReParser *s) {
+ ReChSet cset = { 0 };
+ skip(s, 1);
+ if (ch(s) == '^') {
+ cset.invert = 1;
+ skip(s, 1);
+ }
+
+ /* parse list of ranges */
+ DYNARR(ReChRange) l = { 0 };
+ do {
+ u32 c = ch(s);
+ skip(s, 1);
+ if (ch(s) == '-') {
+ skip(s, 1);
+ ASSERT(s->i < s->s.n);
+ u32 n = ch(s);
+ skip(s, 1);
+ if (c > n) {
+ u32 t = c;
+ c = n;
+ n = t;
+ }
+ DA_APUSH(&l, s->scratch, (ReChRange) { c, n });
+ } else {
+ DA_APUSH(&l, s->scratch, (ReChRange) { c, c });
+ }
+ } while (s->i < s->s.n && ch(s) != ']');
+ skip(s, 1);
+
+ /* combine those that overlap */
+ qsort(l.v, l.n, sizeof(ReChRange), cmp_ch_range);
+ for (u32 i = 0; i + 1 < l.n; i++) {
+ if (l.v[i].max >= l.v[i+1].min || l.v[i+1].min == l.v[i].max + 1) {
+ if (l.v[i+1].max > l.v[i].max) {
+ l.v[i].max = l.v[i+1].max;
+ }
+ for (u32 j = i + 2; j < l.n; j++) l.v[j-1] = l.v[j];
+ l.n--;
+ i--;
+ }
+ }
+
+ /* copy to permanent memory */
+ cset.n = l.n;
+ cset.v = new_arr(s->perm, ReChRange, cset.n);
+ memcpy(cset.v, l.v, l.n * sizeof(ReChRange));
+ u32 cseti = s->re->cset.n;
+ DA_APUSH(&s->re->cset, s->perm, cset);
+ return re_new_node_c(s, N_CHAR_SET, cseti);
+}
+
+ReNode *re_parse_atom(ReParser *s) {
+ switch (ch(s)) {
+ case '(':
+ return re_parse_group(s);
+ case '[':
+ return re_parse_char_set(s);
+ case '.':
+ skip(s, 1);
+ return re_new_node_char(s, C_ANY);
+ case '^':
+ skip(s, 1);
+ return re_new_node_char(s, C_LINE_START);
+ case '$':
+ skip(s, 1);
+ return re_new_node_char(s, C_LINE_END);
+ case '\\': {
+ /* TODO: throw error at end-of-string */
+ skip(s, 1);
+ ReNode *n = re_new_node_char(s, ch(s));
+ skip(s, 1);
+ return n;
+ } break;
+ case 0:
+ /* should never happen? */
+ FAIL_WITH_MSG("atom parsed at end");
+ return NULL;
+ default: {
+ ReNode *n = re_new_node_char(s, ch(s));
+ skip(s, 1);
+ return n;
+ } break;
+ }
+ return NULL;
+}
+
+/* needed to prevent repetitions of group start and end */
+ReNode *re_wrap_node(ReParser *s, ReNode *n, ReNodeType t) {
+ if (n && n->t == N_GROUP) {
+ n->a = re_wrap_node(s, n->a, t);
+ return n;
+ } else {
+ return re_new_node(s, t, n, NULL);
+ }
+}
+
+ReNode *re_parse_expr(ReParser *s) {
+ ReNode *p = NULL;
+ ReNode *alt = NULL;
+ while (s->i < s->s.n) {
+ if (ch(s) == '|') {
+ if (alt) {
+ alt = re_new_node(s, N_ALTERNATE, alt, p);
+ } else {
+ alt = p;
+ }
+ p = NULL;
+ skip(s, 1);
+ }
+ if (ch(s) == ')') break;
+ ReNode *n = re_parse_atom(s);
+ switch (ch(s)) {
+ case '?': n = re_wrap_node(s, n, N_ZERO_OR_ONE); skip(s, 1); break;
+ case '*': n = re_wrap_node(s, n, N_ZERO_OR_MORE); skip(s, 1); break;
+ case '+': n = re_wrap_node(s, n, N_ONE_OR_MORE); skip(s, 1); break;
+ }
+ if (p) {
+ p = re_new_node(s, N_CONCAT, p, n);
+ } else {
+ p = n;
+ }
+ }
+ if (alt) p = re_new_node(s, N_ALTERNATE, alt, p);
+ return p;
+}
+
+#undef ch
+#undef skip
+
+/* regex compilation */
+
+u32 re_emit0(ReParser *s, ReOpType op) {
+ u32 i = s->re->op.n;
+ DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op });
+ return i;
+}
+
+u32 re_emit1(ReParser *s, ReOpType op, u32 c) {
+ u32 i = s->re->op.n;
+ DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op, .c = c });
+ return i;
+}
+
+u32 re_emit2(ReParser *s, ReOpType op, u16 a, u16 b) {
+ u32 i = s->re->op.n;
+ DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op, .a = a, .b = b });
+ return i;
+}
+
+u32 re_lbl_reserve(ReParser *s) {
+ u32 l = s->lbl.n;
+ DA_APUSH(&s->lbl, s->scratch, 0);
+ return l;
+}
+
+u32 re_lbl_emit(ReParser *s, u32 l) {
+ s->lbl.v[l] = s->re->op.n;
+ return re_emit1(s, RE_LABEL, l);
+}
+
+u32 re_lbl_push(ReParser *s) {
+ u32 l = re_lbl_reserve(s);
+ re_lbl_emit(s, l);
+ return l;
+}
+
+void re_lbl_resolve(ReParser *s) {
+ RegEx *r = s->re;
+ /* resolve label addresses */
+ for (u32 i = 0; i < r->op.n; i++) {
+ ReOp *o = &r->op.v[i];
+ if (o->op == RE_LABEL) {
+ s->lbl.v[o->c] = i;
+ if (i + 1 < r->op.n) {
+ memmove(&r->op.v[i], &r->op.v[i+1], (r->op.n - (i+1)) * sizeof(ReOp));
+ }
+ r->op.n--;
+ i--;
+ }
+ }
+ /* patch them in */
+ for (u32 i = 0; i < r->op.n; i++) {
+ ReOp *o = &r->op.v[i];
+ switch (o->op) {
+ case RE_JUMP:
+ o->c = s->lbl.v[o->c];
+ break;
+ case RE_SPLIT:
+ o->a = s->lbl.v[o->a];
+ o->b = s->lbl.v[o->b];
+ break;
+ default:
+ break;
+ }
+ }
+ s->lbl.n = 0;
+}
+
+int re_comp_node(ReParser *s, ReNode *n) {
+ if (!n) return 0;
+ switch (n->t) {
+ case N_CHAR:
+ switch (n->c) {
+ case C_ANY:
+ re_emit0(s, RE_CHAR_ANY);
+ break;
+ case C_LINE_START:
+ re_emit0(s, RE_LINE_START);
+ break;
+ case C_LINE_END:
+ re_emit0(s, RE_LINE_END);
+ break;
+ default:
+ re_emit1(s, RE_CHAR, n->c);
+ break;
+ }
+ break;
+ case N_CHAR_SET: {
+ ReChSet *cset = &s->re->cset.v[n->c];
+ if (cset->n == 1 && cset->v[0].max < 65536) {
+ if (cset->v[0].min == cset->v[0].max) {
+ re_emit1(s, cset->invert ? RE_CHAR_NOT : RE_CHAR, cset->v[0].min);
+ } else {
+ re_emit2(s, cset->invert ? RE_CHAR_RANGE_NOT : RE_CHAR_RANGE, cset->v[0].min, cset->v[0].max);
+ }
+ } else {
+ u32 chars = 0;
+ int all_bytes = 1;
+ for (u32 i = 0; i < cset->n; i++) {
+ if (cset->v[i].max > 255) all_bytes = 0;
+ chars += (cset->v[i].max + 1) - cset->v[i].min;
+ }
+ if (chars <= 7 && all_bytes) {
+ u32 op = re_emit0(s, cset->invert ? RE_CHAR_SET_PACKED_NOT : RE_CHAR_SET_PACKED);
+ ReOp *o = &s->re->op.v[op];
+ u8 *c = (u8*)o + 1; /* right past opcode */
+ memset(c, 0, 7);
+ for (u32 i = 0; i < cset->n; i++) {
+ for (u32 j = cset->v[i].min; j <= cset->v[i].max; j++) {
+ *c++ = j;
+ }
+ }
+ } else {
+ re_emit1(s, RE_CHAR_SET, n->c);
+ }
+ }
+ } break;
+ case N_CONCAT:
+ re_comp_node(s, n->a);
+ re_comp_node(s, n->b);
+ break;
+ case N_ALTERNATE: {
+ u32 l1 = re_lbl_reserve(s);
+ u32 l2 = re_lbl_reserve(s);
+ u32 l3 = re_lbl_reserve(s);
+ re_emit2(s, RE_SPLIT, l1, l2);
+ re_lbl_emit(s, l1);
+ re_comp_node(s, n->a);
+ re_emit1(s, RE_JUMP, l3);
+ re_lbl_emit(s, l2);
+ re_comp_node(s, n->b);
+ re_lbl_emit(s, l3);
+ } break;
+ case N_ZERO_OR_ONE: {
+ u32 l1 = re_lbl_reserve(s);
+ u32 l2 = re_lbl_reserve(s);
+ re_emit2(s, RE_SPLIT, l1, l2);
+ re_lbl_emit(s, l1);
+ re_comp_node(s, n->a);
+ re_lbl_emit(s, l2);
+ } break;
+ case N_ONE_OR_MORE: {
+ u32 l1 = re_lbl_reserve(s);
+ u32 l2 = re_lbl_reserve(s);
+ re_lbl_emit(s, l1);
+ re_comp_node(s, n->a);
+ re_emit2(s, RE_SPLIT, l1, l2);
+ re_lbl_emit(s, l2);
+ } break;
+ case N_ZERO_OR_MORE: {
+ u32 l1 = re_lbl_reserve(s);
+ u32 l2 = re_lbl_reserve(s);
+ u32 l3 = re_lbl_reserve(s);
+ re_lbl_emit(s, l1);
+ re_emit2(s, RE_SPLIT, l2, l3);
+ re_lbl_emit(s, l2);
+ re_comp_node(s, n->a);
+ re_emit1(s, RE_JUMP, l1);
+ re_lbl_emit(s, l3);
+ } break;
+ case N_GROUP:
+ if (s->flags & RE_COMP_NO_GROUPS) {
+ re_comp_node(s, n->a);
+ } else {
+ u32 g = s->re->groups++;
+ re_emit1(s, RE_GROUP_START, g);
+ re_comp_node(s, n->a);
+ re_emit1(s, RE_GROUP_END, g);
+ }
+ break;
+ default:
+ FAIL_WITH_MSG("unimplemented");
+ break;
+ }
+ return 0;
+}
+
+void re_op_del(ReParser *s, u32 i, u32 n) {
+ RegEx *r = s->re;
+ if (i + n > r->op.n) n = r->op.n - i;
+ for (u32 j = 0; j < s->lbl.n; j++) {
+ if (s->lbl.v[j] >= i) s->lbl.v[j] -= n;
+ }
+ memmove(&r->op.v[i], &r->op.v[i+1], (r->op.n - (i + n)) * sizeof(ReOp));
+ r->op.n -= n;
+}
+
+void re_op_ins(ReParser *s, u32 i, ReOp op) {
+ RegEx *r = s->re;
+ for (u32 j = 0; j < s->lbl.n; j++) {
+ if (s->lbl.v[j] >= i) s->lbl.v[j] += 1;
+ }
+ DA_AFIT(&r->op, s->perm, r->op.n + 1);
+ if (i + 1 < r->op.n) {
+ memmove(&r->op.v[i+1], &r->op.v[i], (r->op.n - i) * sizeof(ReOp));
+ }
+ r->op.v[i] = op;
+ r->op.n++;
+}
+
+static inline int re_op_eql(RegEx *re, u32 a, u32 b) {
+ ReOp *oa = &re->op.v[a];
+ ReOp *ob = &re->op.v[b];
+ return !memcmp(oa, ob, sizeof(ReOp));
+}
+
+static inline u32 re_first_non_label(ReParser *s, u32 i) {
+ while (s->re->op.v[i].op == RE_LABEL) i++;
+ ASSERT(i < s->re->op.n);
+ return i;
+}
+
+void re_comp_find_unreachable_from(ReParser *s, uintmax_t *unreachable, u32 i) {
+ RegEx *re = s->re;
+ if (!RE_BITSET_HAS(unreachable, i)) return;
+ while (i < re->op.n) {
+ RE_BITSET_CLR(unreachable, i);
+ ReOp *o = &re->op.v[i];
+ switch (o->op) {
+ case RE_SPLIT:
+ re_comp_find_unreachable_from(s, unreachable, s->lbl.v[o->a]);
+ i = s->lbl.v[o->b];
+ break;
+ case RE_JUMP:
+ i = s->lbl.v[o->c];
+ break;
+ case RE_MATCH:
+ case RE_FAIL:
+ return;
+ default:
+ i++;
+ break;
+ }
+ }
+}
+
+void re_comp_find_unreachable(ReParser *s, uintmax_t *unreachable) {
+ RE_BITSET_SET_ALL(unreachable, s->re->op.n);
+ re_comp_find_unreachable_from(s, unreachable, 0);
+}
+
+void re_comp_dcelim(ReParser *s) {
+ RegEx *re = s->re;
+ uintmax_t *unreachable = RE_BITSET_ALLOC(s->scratch, s->re->op.n);
+
+ for (;;) {
+ re_comp_find_unreachable(s, unreachable);
+ int jmp_removed = 0;
+ /* TODO: rewrite this for fast bitset iteration with stdc_count_trailing_zeros() */
+ /* must go backwards to not invalidate the indices */
+ for (i32 i = re->op.n - 1; i >= 0; i--) {
+ ReOp *o = &re->op.v[i];
+ if (RE_BITSET_HAS(unreachable, i)) {
+ if (o->op == RE_JUMP || o->op == RE_SPLIT) jmp_removed = 1;
+ re_op_del(s, i, 1);
+ }
+ }
+ if (!jmp_removed) break;
+ }
+}
+
+void re_comp_opt(ReParser *s) {
+ RegEx *re = s->re;
+
+ /* replace jumps with their destinations, if appropriate*/
+
+#if 1
+ for (u32 i = 0; i < re->op.n; i++) {
+ ReOp *o = &re->op.v[i];
+again:
+ if (o->op == RE_JUMP) {
+ u32 dest = re_first_non_label(s, s->lbl.v[o->c]);
+ if (re->op.v[dest].op == RE_MATCH
+ || re->op.v[dest].op == RE_FAIL
+ || re->op.v[dest].op == RE_SPLIT
+ || re->op.v[dest].op == RE_JUMP) {
+ *o = re->op.v[dest];
+ goto again;
+ }
+ }
+ if (o->op == RE_SPLIT) {
+ /* TODO: will this cause bugs of trampling over the labels intended
+ * for other jumps, other splits? */
+ if (o->a == o->b) {
+ o->c = o->a;
+ o->op = RE_JUMP;
+ goto again;
+ } else {
+ u32 a = re_first_non_label(s, s->lbl.v[o->a]);
+ u32 b = re_first_non_label(s, s->lbl.v[o->b]);
+ if (re_op_eql(re, a, b)) {
+ ReOp op = re->op.v[a];
+ re_op_del(s, a, 1);
+ if (b >= a) b--;
+ re_op_del(s, b, 1);
+ re_op_ins(s, i, op);
+ } else {
+ int redo = 0;
+ while (re->op.v[a].op == RE_JUMP) {
+ o->a = re->op.v[a].c;
+ a = re_first_non_label(s, s->lbl.v[o->a]);
+ redo = 1;
+ }
+ while (re->op.v[b].op == RE_JUMP) {
+ o->b = re->op.v[b].c;
+ b = re_first_non_label(s, s->lbl.v[o->b]);
+ redo = 1;
+ }
+ if (redo) goto again;
+ }
+ }
+ }
+ }
+#endif
+
+ re_comp_dcelim(s);
+}
+
+u8 re_comp_first_byte(ReParser *s) {
+ RegEx *re = s->re;
+ for (u32 i = 0; i < re->op.n; ) {
+ ReOp *o = &re->op.v[i];
+ switch (o->op) {
+ case RE_CHAR:
+ /* TODO: utf-8 encode, then pick first byte */
+ return o->c;
+ case RE_JUMP:
+ i = o->c;
+ break;
+ case RE_GROUP_START:
+ case RE_GROUP_END:
+ i++;
+ break;
+ default:
+ return 0;
+ }
+ }
+ return 0;
+}
+
+int re_comp_fin(ReParser *s) {
+ re_emit0(s, RE_MATCH);
+ re_comp_opt(s);
+ re_lbl_resolve(s);
+ s->re->first_byte = re_comp_first_byte(s);
+ return 0;
+}
+
+int re_comp_ex(RegEx *re, Str src, Arena *perm, Arena *scratch, ReCompFlags flags) {
+ ReParser s = {
+ .re = re,
+ .perm = perm,
+ .scratch = scratch,
+ .s = src,
+ .flags = flags
+ };
+ ReNode *n = re_parse_expr(&s);
+
+ if (re_comp_node(&s, n)) return -1;
+ if (re_comp_fin(&s)) return -1;
+
+ return 0;
+}
+
+int re_comp(RegEx *re, Str src, Arena *perm, Arena *scratch) {
+ return re_comp_ex(re, src, perm, scratch, 0);
+}
+
+/* regex execution */
+
+void re_threadlist_clr(ReThreadList *l) {
+ memset(l->set, 0, 8 * ((l->n / 64) + 1));
+ l->n = 0;
+}
+
+/* TODO: find a way to not copy capture groups */
+void re_threadlist_add(ReThreadList *l, u32 inst, ReSpan *grp, u32 ngrp, Arena *a) {
+ if (!RE_BITSET_HAS(l->set, inst)) {
+ ReSpan *g = new_arr(a, ReSpan, ngrp);
+ if (grp) memcpy(g, grp, ngrp * sizeof(ReSpan));
+ l->v[l->n++] = (ReThread) { inst, g };
+ RE_BITSET_SET(l->set, inst);
+ }
+}
+
+void re_threadlist_alloc(ReThreadList *l, u32 n, Arena *a) {
+ l->set = RE_BITSET_ALLOC(a, n);
+ l->v = new_arr(a, ReThread, n);
+}
+
+static inline int re_char_set_has(RegEx *re, u32 cset, u32 c) {
+ ReChSet *p = &re->cset.v[cset];
+ u32 l = 0, r = p->n;
+ while (l < r) {
+ u32 m = (l + r) / 2;
+ ReChRange x = p->v[m];
+ if (c < x.min) r = m;
+ else if (c > x.max) l = m + 1;
+ else return !p->invert;
+ }
+ return p->invert;
+}
+
+static inline int re_char_set_packed_has(ReOp o, u32 c) {
+ /* will count (c = 0) as true for non-full-length packed char sets.
+ * is that a problem?
+ */
+ u64 b = o.align;
+ int r = 0;
+ for (u32 i = 0; i < 7; i++) {
+ r |= (b & 0xff) == c;
+ b >>= 8;
+ }
+ return r;
+}
+
+typedef enum {
+ RE_THREAD_AT_START = 1,
+ RE_THREAD_AT_END = 2
+} ReThreadStepFlags;
+
+/* 1 for match found, 0 for normal exit, -1 for reexecute in place */
+static inline int re_thread_step(ReSearch *s, ReThread *t, u32 i, ReThreadStepFlags f) {
+ Arena *a = s->a;
+ RegEx *re = s->re;
+ ReOp o = re->op.v[t->i];
+ switch (o.op) {
+ case RE_CHAR:
+ if (s->c != o.c) return 0;
+ break;
+ case RE_CHAR_NOT:
+ if (s->c == o.c) return 0;
+ break;
+ case RE_CHAR_ANY:
+ if (s->c == '\n') return 0;
+ break;
+ case RE_CHAR_SET:
+ if (!re_char_set_has(re, o.c, s->c)) return 0;
+ break;
+ case RE_CHAR_RANGE:
+ if (s->c < o.a || s->c > o.b) return 0;
+ break;
+ case RE_CHAR_RANGE_NOT:
+ if (s->c >= o.a && s->c <= o.b) return 0;
+ break;
+ case RE_CHAR_SET_PACKED:
+ if (!re_char_set_packed_has(o, s->c)) return 0;
+ break;
+ case RE_CHAR_SET_PACKED_NOT:
+ if (re_char_set_packed_has(o, s->c)) return 0;
+ break;
+ case RE_LINE_START:
+ if (s->c_prev != '\n' && !(f & RE_THREAD_AT_START)) return 0;
+ t->i++;
+ return -1;
+ case RE_LINE_END:
+ if (s->c != '\n' && !(f & RE_THREAD_AT_END)) return 0;
+ t->i++;
+ return -1;
+ case RE_MATCH:
+ return 1;
+ case RE_FAIL:
+ return 0;
+ case RE_JUMP:
+ t->i = o.c;
+ return -1;
+ case RE_SPLIT:
+ re_threadlist_add(&s->tcur, o.b, t->grp, re->groups, a);
+ t->i = o.a;
+ return -1;
+ case RE_GROUP_START:
+ t->grp[o.c].start = s->total_idx + i;
+ t->i++;
+ return -1;
+ case RE_GROUP_END:
+ t->grp[o.c].len = s->total_idx + i - t->grp[o.c].start;
+ t->i++;
+ return -1;
+ default:
+ UNREACHABLE;
+ break;
+ }
+ re_threadlist_add(&s->tnext, t->i + 1, t->grp, re->groups, a);
+ return 0;
+}
+
+static inline int re_thread_final(ReSearch *s, ReThread *t, u32 i, ReThreadStepFlags f) {
+ /* halt at any opcodes that involve character matching */
+ /* we could check for C_EOF in all of them, but it's easier this way */
+ switch (s->re->op.v[t->i].op) {
+ case RE_LINE_END:
+ case RE_MATCH:
+ case RE_FAIL:
+ case RE_JUMP:
+ case RE_SPLIT:
+ case RE_GROUP_START:
+ case RE_GROUP_END:
+ return re_thread_step(s, t, i, f);
+ default:
+ return 0;
+ }
+}
+
+static inline int re_threadlist_step(ReSearch *s, ReThreadList *tl, u32 i, ReThreadStepFlags f) {
+ int match = 0;
+ for (u32 j = 0; j < tl->n; j++) {
+ ReThread *t = &tl->v[j];
+ ASSERT(t->i < s->re->op.n);
+ int r = re_thread_step(s, t, i, f);
+ if (r < 0) {
+ j--;
+ } else if (r > 0) {
+ match = 1;
+ s->match_end = s->total_idx + i;
+ memcpy(s->grp, t->grp, sizeof(ReSpan) * s->re->groups);
+ }
+ }
+ return match;
+}
+
+static inline int re_step(ReSearch *s, u32 i, ReThreadStepFlags f) {
+ int r = re_threadlist_step(s, &s->tcur, i, f);
+ f = 0;
+ s->c_prev = s->c;
+ ReThreadList tmp = s->tcur;
+ s->tcur = s->tnext;
+ s->tnext = tmp;
+ re_threadlist_clr(&s->tnext);
+ return r;
+}
+
+static inline int re_step_final(ReSearch *s, u32 i, ReThreadStepFlags f) {
+ int found = 0;
+ for (u32 j = 0; j < s->tcur.n; j++) {
+ int r = re_thread_final(s, &s->tcur.v[j], i, f);
+ if (r < 0) {
+ j--;
+ } else if (r > 0) {
+ found = 1;
+ }
+ }
+ re_threadlist_clr(&s->tcur);
+ re_threadlist_clr(&s->tnext);
+ return found;
+}
+
+static inline isize re_search_try_match(ReSearch *s, size_t i, size_t n) {
+ RegEx *re = s->re;
+ size_t start = s->total_idx;
+ if (~s->flags & RE_SEARCH_MID_MATCH) {
+ re_threadlist_clr(&s->tcur);
+ re_threadlist_clr(&s->tnext);
+ re_threadlist_add(&s->tcur, 0, s->grp, re->groups, s->a);
+ s->flags |= RE_SEARCH_MID_MATCH;
+ s->match_start = start + i;
+ s->match_end = 0;
+ }
+ isize found_i = -1;
+ ReThreadStepFlags f = 0;
+ if (i + start == 0) f = RE_THREAD_AT_START;
+
+ for (; i < n && s->tcur.n > 0; i++) {
+ s->c = (unsigned char)s->buf[i];
+ if (re_step(s, i, f)) {
+ found_i = i;
+ s->match_end = start + i;
+ }
+ }
+ if (s->flags & RE_SEARCH_LAST_CHUNK) {
+ f |= RE_THREAD_AT_END;
+ s->c = C_EOF;
+ if (re_step_final(s, i, f)) {
+ found_i = i;
+ s->match_end = start + i;
+ }
+ }
+
+ if (s->tcur.n > 0) return i;
+ s->flags &= ~RE_SEARCH_MID_MATCH;
+ return found_i;
+}
+
+/* public facing API */
+
+void re_search_start(ReSearch *s, RegEx *re, Arena *a) {
+ *s = (ReSearch) {
+ .a = a,
+ .re = re,
+ .grp = new_arr(a, ReSpan, re->groups),
+ .flags = RE_SEARCH_FIRST_CHUNK
+ };
+ re_threadlist_alloc(&s->tcur, re->op.n, a);
+ re_threadlist_alloc(&s->tnext, re->op.n, a);
+}
+
+void re_search_chunk(ReSearch *s, const char *buf, size_t n) {
+ if (s->buf_len) s->flags &= ~RE_SEARCH_FIRST_CHUNK;
+ s->total_idx += s->buf_len;
+ s->buf = buf;
+ s->buf_len = n;
+ s->buf_idx = 0;
+}
+
+void re_search_last_chunk(ReSearch *s) {
+ s->flags |= RE_SEARCH_LAST_CHUNK;
+}
+
+
+/* searching */
+
+/* returns final index undescriminately.
+ * check s->flags to find if the match is done yet
+ */
+
+int re_search_match(ReSearch *s, ReMatch *m) {
+ size_t i = s->buf_idx;
+ size_t n = s->buf_len;
+ while (i < n) {
+ if (s->re->first_byte && (~s->flags & RE_SEARCH_MID_MATCH)) {
+ const char *p = memchr(s->buf + i, s->re->first_byte, n - i);
+ if (!p) break;
+ i = p - s->buf;
+ }
+ isize r = re_search_try_match(s, i, n);
+ if (r < 0) {
+ i++;
+ } else {
+ if (i == (usize)r) r++;
+ if (s->flags & RE_SEARCH_MID_MATCH) {
+ i = r;
+ } else {
+ s->buf_idx = r;
+ m->grp = s->grp;
+ ASSERT(s->match_start <= s->total_idx + i);
+ ASSERT(s->match_end >= s->match_start);
+ m->extent.start = s->match_start;
+ m->extent.len = s->match_end - s->match_start;
+ return 1;
+ }
+ }
+ }
+ s->buf_idx = s->buf_len;
+ return 0;
+}
+
+/* convenience wrappers */
+
+ReMatchList re_match_all(RegEx *re, Str s, Arena *a) {
+ ReSearch sr = { 0 };
+ ReMatchList ml = { 0 };
+ ReMatch m;
+ re_search_start(&sr, re, a);
+ re_search_chunk(&sr, s.s, s.n);
+ re_search_last_chunk(&sr);
+ while (re_search_match(&sr, &m)) {
+ ReSpan *grp = new_arr(a, ReSpan, re->groups);
+ memcpy(grp, m.grp, sizeof(ReSpan) * re->groups);
+ DA_APUSH(&ml, a, (ReMatch) { .extent = m.extent, .grp = grp });
+ }
+ return ml;
+}
+
+int re_match_full(RegEx *re, Str s, Arena *a) {
+ ReSearch sr = { 0 };
+ ReMatch m;
+ re_search_start(&sr, re, a);
+ re_search_chunk(&sr, s.s, s.n);
+ re_search_last_chunk(&sr);
+ return re_search_match(&sr, &m) && m.extent.start == 0 && m.extent.len == s.n;
+}
+
+int re_match(RegEx *re, Str s, ReMatch *out, Arena *a) {
+ ReSearch sr = { 0 };
+ re_search_start(&sr, re, a);
+ re_search_chunk(&sr, s.s, s.n);
+ re_search_last_chunk(&sr);
+ return re_search_match(&sr, out);
+}