From d723f8a5d54f098f0cf378e5dcf3a7d4ec049822 Mon Sep 17 00:00:00 2001 From: WormHeamer Date: Wed, 31 Dec 2025 05:01:16 -0500 Subject: add regex search (only forwards for now) --- regex.c | 912 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 912 insertions(+) create mode 100644 regex.c (limited to 'regex.c') diff --git a/regex.c b/regex.c new file mode 100644 index 0000000..710593f --- /dev/null +++ b/regex.c @@ -0,0 +1,912 @@ +#include +#include +#include +#include +#include + +#include "wrmr.h" +#include "dynarr.h" +#include "str.h" +#include "regex.h" + +/* bitsets */ + +#define RE_BITSET_BYTES sizeof(uintmax_t) +#define RE_BITSET_BITS (8 * RE_BITSET_BYTES) +#define RE_BITSET_CAP(n) (((n)/64)+1) +#define RE_BITSET_HAS(s, i) (1 & ((s)[(i)/RE_BITSET_BITS] >> ((i)&(RE_BITSET_BITS-1)))) +#define RE_BITSET_MASK(i) ((uintmax_t)1 << ((i)&(RE_BITSET_BITS-1))) +#define RE_BITSET_SET(s, i) ((s)[(i)/RE_BITSET_BITS] |= RE_BITSET_MASK(i)) +#define RE_BITSET_CLR(s, i) ((s)[(i)/RE_BITSET_BITS] &= ~RE_BITSET_MASK(i)) +#define RE_BITSET_ALLOC(a, n) new_arr(a, uintmax_t, RE_BITSET_CAP(n)) +#define RE_BITSET_SET_ALL(s, n) memset(s, 0xff, RE_BITSET_CAP(n) * RE_BITSET_BYTES) +#define RE_BITSET_CLR_ALL(s, n) memset(s, 0x00, RE_BITSET_CAP(n) * RE_BITSET_BYTES) + +/* regex parsing */ + +typedef struct { + u32 dest; +} Label; + +typedef struct { + RegEx *re; + Arena *perm, *scratch; + DYNARR(u32) lbl; + ReCompFlags flags; + Str s; + int i; +} ReParser; + +typedef enum { + N_CHAR, + N_CHAR_SET, + N_CONCAT, + N_GROUP, + N_ALTERNATE, + N_ONE_OR_MORE, + N_ZERO_OR_MORE, + N_ZERO_OR_ONE, +} ReNodeType; + +/* possible peepholes: + * + * (a . b) | (a . c) -> a . (b | c) + * a . b -> "ab" + * a | b | c -> [abc] + * + */ + +typedef struct ReNode { + ReNodeType t; + u32 c; + struct ReNode *a, *b; +} ReNode; + +ReNode *re_new_node(ReParser *s, ReNodeType t, struct ReNode *a, struct ReNode *b) { + ReNode *n = new(s->scratch, ReNode); + n->t = t; + n->a = a; + n->b = b; + return n; +} + +ReNode *re_new_node_c(ReParser *s, ReNodeType t, u32 c) { + ReNode *n = new(s->scratch, ReNode); + n->t = t; + n->c = c; + return n; +} + +ReNode *re_new_node_char(ReParser *s, u32 c) { + return re_new_node_c(s, N_CHAR, c); +} + +#define ch(s) ((s)->i < (s)->s.n ? (s)->s.s[(s)->i] : 0) +#define skip(s,n) ((s)->i += (n)) + +ReNode *re_parse_expr(ReParser *s); + +ReNode *re_parse_group(ReParser *s) { + /* TODO */ + skip(s, 1); + ReNode *n = re_new_node(s, N_GROUP, re_parse_expr(s), NULL); + if (ch(s) != ')') FAIL_WITH_MSG("expected )"); + skip(s, 1); + return n; +} + +static int cmp_ch_range(const void *pa, const void *pb) { + ReChRange a = *(const ReChRange *)pa; + ReChRange b = *(const ReChRange *)pb; + return (a.min > b.min) - (a.min < b.min); +} + +ReNode *re_parse_char_set(ReParser *s) { + ReChSet cset = { 0 }; + skip(s, 1); + if (ch(s) == '^') { + cset.invert = 1; + skip(s, 1); + } + + /* parse list of ranges */ + DYNARR(ReChRange) l = { 0 }; + do { + u32 c = ch(s); + skip(s, 1); + if (ch(s) == '-') { + skip(s, 1); + ASSERT(s->i < s->s.n); + u32 n = ch(s); + skip(s, 1); + if (c > n) { + u32 t = c; + c = n; + n = t; + } + DA_APUSH(&l, s->scratch, (ReChRange) { c, n }); + } else { + DA_APUSH(&l, s->scratch, (ReChRange) { c, c }); + } + } while (s->i < s->s.n && ch(s) != ']'); + skip(s, 1); + + /* combine those that overlap */ + qsort(l.v, l.n, sizeof(ReChRange), cmp_ch_range); + for (u32 i = 0; i + 1 < l.n; i++) { + if (l.v[i].max >= l.v[i+1].min || l.v[i+1].min == l.v[i].max + 1) { + if (l.v[i+1].max > l.v[i].max) { + l.v[i].max = l.v[i+1].max; + } + for (u32 j = i + 2; j < l.n; j++) l.v[j-1] = l.v[j]; + l.n--; + i--; + } + } + + /* copy to permanent memory */ + cset.n = l.n; + cset.v = new_arr(s->perm, ReChRange, cset.n); + memcpy(cset.v, l.v, l.n * sizeof(ReChRange)); + u32 cseti = s->re->cset.n; + DA_APUSH(&s->re->cset, s->perm, cset); + return re_new_node_c(s, N_CHAR_SET, cseti); +} + +ReNode *re_parse_atom(ReParser *s) { + switch (ch(s)) { + case '(': + return re_parse_group(s); + case '[': + return re_parse_char_set(s); + case '.': + skip(s, 1); + return re_new_node_char(s, C_ANY); + case '^': + skip(s, 1); + return re_new_node_char(s, C_LINE_START); + case '$': + skip(s, 1); + return re_new_node_char(s, C_LINE_END); + case '\\': { + /* TODO: throw error at end-of-string */ + skip(s, 1); + ReNode *n = re_new_node_char(s, ch(s)); + skip(s, 1); + return n; + } break; + case 0: + /* should never happen? */ + FAIL_WITH_MSG("atom parsed at end"); + return NULL; + default: { + ReNode *n = re_new_node_char(s, ch(s)); + skip(s, 1); + return n; + } break; + } + return NULL; +} + +/* needed to prevent repetitions of group start and end */ +ReNode *re_wrap_node(ReParser *s, ReNode *n, ReNodeType t) { + if (n && n->t == N_GROUP) { + n->a = re_wrap_node(s, n->a, t); + return n; + } else { + return re_new_node(s, t, n, NULL); + } +} + +ReNode *re_parse_expr(ReParser *s) { + ReNode *p = NULL; + ReNode *alt = NULL; + while (s->i < s->s.n) { + if (ch(s) == '|') { + if (alt) { + alt = re_new_node(s, N_ALTERNATE, alt, p); + } else { + alt = p; + } + p = NULL; + skip(s, 1); + } + if (ch(s) == ')') break; + ReNode *n = re_parse_atom(s); + switch (ch(s)) { + case '?': n = re_wrap_node(s, n, N_ZERO_OR_ONE); skip(s, 1); break; + case '*': n = re_wrap_node(s, n, N_ZERO_OR_MORE); skip(s, 1); break; + case '+': n = re_wrap_node(s, n, N_ONE_OR_MORE); skip(s, 1); break; + } + if (p) { + p = re_new_node(s, N_CONCAT, p, n); + } else { + p = n; + } + } + if (alt) p = re_new_node(s, N_ALTERNATE, alt, p); + return p; +} + +#undef ch +#undef skip + +/* regex compilation */ + +u32 re_emit0(ReParser *s, ReOpType op) { + u32 i = s->re->op.n; + DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op }); + return i; +} + +u32 re_emit1(ReParser *s, ReOpType op, u32 c) { + u32 i = s->re->op.n; + DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op, .c = c }); + return i; +} + +u32 re_emit2(ReParser *s, ReOpType op, u16 a, u16 b) { + u32 i = s->re->op.n; + DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op, .a = a, .b = b }); + return i; +} + +u32 re_lbl_reserve(ReParser *s) { + u32 l = s->lbl.n; + DA_APUSH(&s->lbl, s->scratch, 0); + return l; +} + +u32 re_lbl_emit(ReParser *s, u32 l) { + s->lbl.v[l] = s->re->op.n; + return re_emit1(s, RE_LABEL, l); +} + +u32 re_lbl_push(ReParser *s) { + u32 l = re_lbl_reserve(s); + re_lbl_emit(s, l); + return l; +} + +void re_lbl_resolve(ReParser *s) { + RegEx *r = s->re; + /* resolve label addresses */ + for (u32 i = 0; i < r->op.n; i++) { + ReOp *o = &r->op.v[i]; + if (o->op == RE_LABEL) { + s->lbl.v[o->c] = i; + if (i + 1 < r->op.n) { + memmove(&r->op.v[i], &r->op.v[i+1], (r->op.n - (i+1)) * sizeof(ReOp)); + } + r->op.n--; + i--; + } + } + /* patch them in */ + for (u32 i = 0; i < r->op.n; i++) { + ReOp *o = &r->op.v[i]; + switch (o->op) { + case RE_JUMP: + o->c = s->lbl.v[o->c]; + break; + case RE_SPLIT: + o->a = s->lbl.v[o->a]; + o->b = s->lbl.v[o->b]; + break; + default: + break; + } + } + s->lbl.n = 0; +} + +int re_comp_node(ReParser *s, ReNode *n) { + if (!n) return 0; + switch (n->t) { + case N_CHAR: + switch (n->c) { + case C_ANY: + re_emit0(s, RE_CHAR_ANY); + break; + case C_LINE_START: + re_emit0(s, RE_LINE_START); + break; + case C_LINE_END: + re_emit0(s, RE_LINE_END); + break; + default: + re_emit1(s, RE_CHAR, n->c); + break; + } + break; + case N_CHAR_SET: { + ReChSet *cset = &s->re->cset.v[n->c]; + if (cset->n == 1 && cset->v[0].max < 65536) { + if (cset->v[0].min == cset->v[0].max) { + re_emit1(s, cset->invert ? RE_CHAR_NOT : RE_CHAR, cset->v[0].min); + } else { + re_emit2(s, cset->invert ? RE_CHAR_RANGE_NOT : RE_CHAR_RANGE, cset->v[0].min, cset->v[0].max); + } + } else { + u32 chars = 0; + int all_bytes = 1; + for (u32 i = 0; i < cset->n; i++) { + if (cset->v[i].max > 255) all_bytes = 0; + chars += (cset->v[i].max + 1) - cset->v[i].min; + } + if (chars <= 7 && all_bytes) { + u32 op = re_emit0(s, cset->invert ? RE_CHAR_SET_PACKED_NOT : RE_CHAR_SET_PACKED); + ReOp *o = &s->re->op.v[op]; + u8 *c = (u8*)o + 1; /* right past opcode */ + memset(c, 0, 7); + for (u32 i = 0; i < cset->n; i++) { + for (u32 j = cset->v[i].min; j <= cset->v[i].max; j++) { + *c++ = j; + } + } + } else { + re_emit1(s, RE_CHAR_SET, n->c); + } + } + } break; + case N_CONCAT: + re_comp_node(s, n->a); + re_comp_node(s, n->b); + break; + case N_ALTERNATE: { + u32 l1 = re_lbl_reserve(s); + u32 l2 = re_lbl_reserve(s); + u32 l3 = re_lbl_reserve(s); + re_emit2(s, RE_SPLIT, l1, l2); + re_lbl_emit(s, l1); + re_comp_node(s, n->a); + re_emit1(s, RE_JUMP, l3); + re_lbl_emit(s, l2); + re_comp_node(s, n->b); + re_lbl_emit(s, l3); + } break; + case N_ZERO_OR_ONE: { + u32 l1 = re_lbl_reserve(s); + u32 l2 = re_lbl_reserve(s); + re_emit2(s, RE_SPLIT, l1, l2); + re_lbl_emit(s, l1); + re_comp_node(s, n->a); + re_lbl_emit(s, l2); + } break; + case N_ONE_OR_MORE: { + u32 l1 = re_lbl_reserve(s); + u32 l2 = re_lbl_reserve(s); + re_lbl_emit(s, l1); + re_comp_node(s, n->a); + re_emit2(s, RE_SPLIT, l1, l2); + re_lbl_emit(s, l2); + } break; + case N_ZERO_OR_MORE: { + u32 l1 = re_lbl_reserve(s); + u32 l2 = re_lbl_reserve(s); + u32 l3 = re_lbl_reserve(s); + re_lbl_emit(s, l1); + re_emit2(s, RE_SPLIT, l2, l3); + re_lbl_emit(s, l2); + re_comp_node(s, n->a); + re_emit1(s, RE_JUMP, l1); + re_lbl_emit(s, l3); + } break; + case N_GROUP: + if (s->flags & RE_COMP_NO_GROUPS) { + re_comp_node(s, n->a); + } else { + u32 g = s->re->groups++; + re_emit1(s, RE_GROUP_START, g); + re_comp_node(s, n->a); + re_emit1(s, RE_GROUP_END, g); + } + break; + default: + FAIL_WITH_MSG("unimplemented"); + break; + } + return 0; +} + +void re_op_del(ReParser *s, u32 i, u32 n) { + RegEx *r = s->re; + if (i + n > r->op.n) n = r->op.n - i; + for (u32 j = 0; j < s->lbl.n; j++) { + if (s->lbl.v[j] >= i) s->lbl.v[j] -= n; + } + memmove(&r->op.v[i], &r->op.v[i+1], (r->op.n - (i + n)) * sizeof(ReOp)); + r->op.n -= n; +} + +void re_op_ins(ReParser *s, u32 i, ReOp op) { + RegEx *r = s->re; + for (u32 j = 0; j < s->lbl.n; j++) { + if (s->lbl.v[j] >= i) s->lbl.v[j] += 1; + } + DA_AFIT(&r->op, s->perm, r->op.n + 1); + if (i + 1 < r->op.n) { + memmove(&r->op.v[i+1], &r->op.v[i], (r->op.n - i) * sizeof(ReOp)); + } + r->op.v[i] = op; + r->op.n++; +} + +static inline int re_op_eql(RegEx *re, u32 a, u32 b) { + ReOp *oa = &re->op.v[a]; + ReOp *ob = &re->op.v[b]; + return !memcmp(oa, ob, sizeof(ReOp)); +} + +static inline u32 re_first_non_label(ReParser *s, u32 i) { + while (s->re->op.v[i].op == RE_LABEL) i++; + ASSERT(i < s->re->op.n); + return i; +} + +void re_comp_find_unreachable_from(ReParser *s, uintmax_t *unreachable, u32 i) { + RegEx *re = s->re; + if (!RE_BITSET_HAS(unreachable, i)) return; + while (i < re->op.n) { + RE_BITSET_CLR(unreachable, i); + ReOp *o = &re->op.v[i]; + switch (o->op) { + case RE_SPLIT: + re_comp_find_unreachable_from(s, unreachable, s->lbl.v[o->a]); + i = s->lbl.v[o->b]; + break; + case RE_JUMP: + i = s->lbl.v[o->c]; + break; + case RE_MATCH: + case RE_FAIL: + return; + default: + i++; + break; + } + } +} + +void re_comp_find_unreachable(ReParser *s, uintmax_t *unreachable) { + RE_BITSET_SET_ALL(unreachable, s->re->op.n); + re_comp_find_unreachable_from(s, unreachable, 0); +} + +void re_comp_dcelim(ReParser *s) { + RegEx *re = s->re; + uintmax_t *unreachable = RE_BITSET_ALLOC(s->scratch, s->re->op.n); + + for (;;) { + re_comp_find_unreachable(s, unreachable); + int jmp_removed = 0; + /* TODO: rewrite this for fast bitset iteration with stdc_count_trailing_zeros() */ + /* must go backwards to not invalidate the indices */ + for (i32 i = re->op.n - 1; i >= 0; i--) { + ReOp *o = &re->op.v[i]; + if (RE_BITSET_HAS(unreachable, i)) { + if (o->op == RE_JUMP || o->op == RE_SPLIT) jmp_removed = 1; + re_op_del(s, i, 1); + } + } + if (!jmp_removed) break; + } +} + +void re_comp_opt(ReParser *s) { + RegEx *re = s->re; + + /* replace jumps with their destinations, if appropriate*/ + +#if 1 + for (u32 i = 0; i < re->op.n; i++) { + ReOp *o = &re->op.v[i]; +again: + if (o->op == RE_JUMP) { + u32 dest = re_first_non_label(s, s->lbl.v[o->c]); + if (re->op.v[dest].op == RE_MATCH + || re->op.v[dest].op == RE_FAIL + || re->op.v[dest].op == RE_SPLIT + || re->op.v[dest].op == RE_JUMP) { + *o = re->op.v[dest]; + goto again; + } + } + if (o->op == RE_SPLIT) { + /* TODO: will this cause bugs of trampling over the labels intended + * for other jumps, other splits? */ + if (o->a == o->b) { + o->c = o->a; + o->op = RE_JUMP; + goto again; + } else { + u32 a = re_first_non_label(s, s->lbl.v[o->a]); + u32 b = re_first_non_label(s, s->lbl.v[o->b]); + if (re_op_eql(re, a, b)) { + ReOp op = re->op.v[a]; + re_op_del(s, a, 1); + if (b >= a) b--; + re_op_del(s, b, 1); + re_op_ins(s, i, op); + } else { + int redo = 0; + while (re->op.v[a].op == RE_JUMP) { + o->a = re->op.v[a].c; + a = re_first_non_label(s, s->lbl.v[o->a]); + redo = 1; + } + while (re->op.v[b].op == RE_JUMP) { + o->b = re->op.v[b].c; + b = re_first_non_label(s, s->lbl.v[o->b]); + redo = 1; + } + if (redo) goto again; + } + } + } + } +#endif + + re_comp_dcelim(s); +} + +u8 re_comp_first_byte(ReParser *s) { + RegEx *re = s->re; + for (u32 i = 0; i < re->op.n; ) { + ReOp *o = &re->op.v[i]; + switch (o->op) { + case RE_CHAR: + /* TODO: utf-8 encode, then pick first byte */ + return o->c; + case RE_JUMP: + i = o->c; + break; + case RE_GROUP_START: + case RE_GROUP_END: + i++; + break; + default: + return 0; + } + } + return 0; +} + +int re_comp_fin(ReParser *s) { + re_emit0(s, RE_MATCH); + re_comp_opt(s); + re_lbl_resolve(s); + s->re->first_byte = re_comp_first_byte(s); + return 0; +} + +int re_comp_ex(RegEx *re, Str src, Arena *perm, Arena *scratch, ReCompFlags flags) { + ReParser s = { + .re = re, + .perm = perm, + .scratch = scratch, + .s = src, + .flags = flags + }; + ReNode *n = re_parse_expr(&s); + + if (re_comp_node(&s, n)) return -1; + if (re_comp_fin(&s)) return -1; + + return 0; +} + +int re_comp(RegEx *re, Str src, Arena *perm, Arena *scratch) { + return re_comp_ex(re, src, perm, scratch, 0); +} + +/* regex execution */ + +void re_threadlist_clr(ReThreadList *l) { + memset(l->set, 0, 8 * ((l->n / 64) + 1)); + l->n = 0; +} + +/* TODO: find a way to not copy capture groups */ +void re_threadlist_add(ReThreadList *l, u32 inst, ReSpan *grp, u32 ngrp, Arena *a) { + if (!RE_BITSET_HAS(l->set, inst)) { + ReSpan *g = new_arr(a, ReSpan, ngrp); + if (grp) memcpy(g, grp, ngrp * sizeof(ReSpan)); + l->v[l->n++] = (ReThread) { inst, g }; + RE_BITSET_SET(l->set, inst); + } +} + +void re_threadlist_alloc(ReThreadList *l, u32 n, Arena *a) { + l->set = RE_BITSET_ALLOC(a, n); + l->v = new_arr(a, ReThread, n); +} + +static inline int re_char_set_has(RegEx *re, u32 cset, u32 c) { + ReChSet *p = &re->cset.v[cset]; + u32 l = 0, r = p->n; + while (l < r) { + u32 m = (l + r) / 2; + ReChRange x = p->v[m]; + if (c < x.min) r = m; + else if (c > x.max) l = m + 1; + else return !p->invert; + } + return p->invert; +} + +static inline int re_char_set_packed_has(ReOp o, u32 c) { + /* will count (c = 0) as true for non-full-length packed char sets. + * is that a problem? + */ + u64 b = o.align; + int r = 0; + for (u32 i = 0; i < 7; i++) { + r |= (b & 0xff) == c; + b >>= 8; + } + return r; +} + +typedef enum { + RE_THREAD_AT_START = 1, + RE_THREAD_AT_END = 2 +} ReThreadStepFlags; + +/* 1 for match found, 0 for normal exit, -1 for reexecute in place */ +static inline int re_thread_step(ReSearch *s, ReThread *t, u32 i, ReThreadStepFlags f) { + Arena *a = s->a; + RegEx *re = s->re; + ReOp o = re->op.v[t->i]; + switch (o.op) { + case RE_CHAR: + if (s->c != o.c) return 0; + break; + case RE_CHAR_NOT: + if (s->c == o.c) return 0; + break; + case RE_CHAR_ANY: + if (s->c == '\n') return 0; + break; + case RE_CHAR_SET: + if (!re_char_set_has(re, o.c, s->c)) return 0; + break; + case RE_CHAR_RANGE: + if (s->c < o.a || s->c > o.b) return 0; + break; + case RE_CHAR_RANGE_NOT: + if (s->c >= o.a && s->c <= o.b) return 0; + break; + case RE_CHAR_SET_PACKED: + if (!re_char_set_packed_has(o, s->c)) return 0; + break; + case RE_CHAR_SET_PACKED_NOT: + if (re_char_set_packed_has(o, s->c)) return 0; + break; + case RE_LINE_START: + if (s->c_prev != '\n' && !(f & RE_THREAD_AT_START)) return 0; + t->i++; + return -1; + case RE_LINE_END: + if (s->c != '\n' && !(f & RE_THREAD_AT_END)) return 0; + t->i++; + return -1; + case RE_MATCH: + return 1; + case RE_FAIL: + return 0; + case RE_JUMP: + t->i = o.c; + return -1; + case RE_SPLIT: + re_threadlist_add(&s->tcur, o.b, t->grp, re->groups, a); + t->i = o.a; + return -1; + case RE_GROUP_START: + t->grp[o.c].start = s->total_idx + i; + t->i++; + return -1; + case RE_GROUP_END: + t->grp[o.c].len = s->total_idx + i - t->grp[o.c].start; + t->i++; + return -1; + default: + UNREACHABLE; + break; + } + re_threadlist_add(&s->tnext, t->i + 1, t->grp, re->groups, a); + return 0; +} + +static inline int re_thread_final(ReSearch *s, ReThread *t, u32 i, ReThreadStepFlags f) { + /* halt at any opcodes that involve character matching */ + /* we could check for C_EOF in all of them, but it's easier this way */ + switch (s->re->op.v[t->i].op) { + case RE_LINE_END: + case RE_MATCH: + case RE_FAIL: + case RE_JUMP: + case RE_SPLIT: + case RE_GROUP_START: + case RE_GROUP_END: + return re_thread_step(s, t, i, f); + default: + return 0; + } +} + +static inline int re_threadlist_step(ReSearch *s, ReThreadList *tl, u32 i, ReThreadStepFlags f) { + int match = 0; + for (u32 j = 0; j < tl->n; j++) { + ReThread *t = &tl->v[j]; + ASSERT(t->i < s->re->op.n); + int r = re_thread_step(s, t, i, f); + if (r < 0) { + j--; + } else if (r > 0) { + match = 1; + s->match_end = s->total_idx + i; + memcpy(s->grp, t->grp, sizeof(ReSpan) * s->re->groups); + } + } + return match; +} + +static inline int re_step(ReSearch *s, u32 i, ReThreadStepFlags f) { + int r = re_threadlist_step(s, &s->tcur, i, f); + f = 0; + s->c_prev = s->c; + ReThreadList tmp = s->tcur; + s->tcur = s->tnext; + s->tnext = tmp; + re_threadlist_clr(&s->tnext); + return r; +} + +static inline int re_step_final(ReSearch *s, u32 i, ReThreadStepFlags f) { + int found = 0; + for (u32 j = 0; j < s->tcur.n; j++) { + int r = re_thread_final(s, &s->tcur.v[j], i, f); + if (r < 0) { + j--; + } else if (r > 0) { + found = 1; + } + } + re_threadlist_clr(&s->tcur); + re_threadlist_clr(&s->tnext); + return found; +} + +static inline isize re_search_try_match(ReSearch *s, size_t i, size_t n) { + RegEx *re = s->re; + size_t start = s->total_idx; + if (~s->flags & RE_SEARCH_MID_MATCH) { + re_threadlist_clr(&s->tcur); + re_threadlist_clr(&s->tnext); + re_threadlist_add(&s->tcur, 0, s->grp, re->groups, s->a); + s->flags |= RE_SEARCH_MID_MATCH; + s->match_start = start + i; + s->match_end = 0; + } + isize found_i = -1; + ReThreadStepFlags f = 0; + if (i + start == 0) f = RE_THREAD_AT_START; + + for (; i < n && s->tcur.n > 0; i++) { + s->c = (unsigned char)s->buf[i]; + if (re_step(s, i, f)) { + found_i = i; + s->match_end = start + i; + } + } + if (s->flags & RE_SEARCH_LAST_CHUNK) { + f |= RE_THREAD_AT_END; + s->c = C_EOF; + if (re_step_final(s, i, f)) { + found_i = i; + s->match_end = start + i; + } + } + + if (s->tcur.n > 0) return i; + s->flags &= ~RE_SEARCH_MID_MATCH; + return found_i; +} + +/* public facing API */ + +void re_search_start(ReSearch *s, RegEx *re, Arena *a) { + *s = (ReSearch) { + .a = a, + .re = re, + .grp = new_arr(a, ReSpan, re->groups), + .flags = RE_SEARCH_FIRST_CHUNK + }; + re_threadlist_alloc(&s->tcur, re->op.n, a); + re_threadlist_alloc(&s->tnext, re->op.n, a); +} + +void re_search_chunk(ReSearch *s, const char *buf, size_t n) { + if (s->buf_len) s->flags &= ~RE_SEARCH_FIRST_CHUNK; + s->total_idx += s->buf_len; + s->buf = buf; + s->buf_len = n; + s->buf_idx = 0; +} + +void re_search_last_chunk(ReSearch *s) { + s->flags |= RE_SEARCH_LAST_CHUNK; +} + + +/* searching */ + +/* returns final index undescriminately. + * check s->flags to find if the match is done yet + */ + +int re_search_match(ReSearch *s, ReMatch *m) { + size_t i = s->buf_idx; + size_t n = s->buf_len; + while (i < n) { + if (s->re->first_byte && (~s->flags & RE_SEARCH_MID_MATCH)) { + const char *p = memchr(s->buf + i, s->re->first_byte, n - i); + if (!p) break; + i = p - s->buf; + } + isize r = re_search_try_match(s, i, n); + if (r < 0) { + i++; + } else { + if (i == (usize)r) r++; + if (s->flags & RE_SEARCH_MID_MATCH) { + i = r; + } else { + s->buf_idx = r; + m->grp = s->grp; + ASSERT(s->match_start <= s->total_idx + i); + ASSERT(s->match_end >= s->match_start); + m->extent.start = s->match_start; + m->extent.len = s->match_end - s->match_start; + return 1; + } + } + } + s->buf_idx = s->buf_len; + return 0; +} + +/* convenience wrappers */ + +ReMatchList re_match_all(RegEx *re, Str s, Arena *a) { + ReSearch sr = { 0 }; + ReMatchList ml = { 0 }; + ReMatch m; + re_search_start(&sr, re, a); + re_search_chunk(&sr, s.s, s.n); + re_search_last_chunk(&sr); + while (re_search_match(&sr, &m)) { + ReSpan *grp = new_arr(a, ReSpan, re->groups); + memcpy(grp, m.grp, sizeof(ReSpan) * re->groups); + DA_APUSH(&ml, a, (ReMatch) { .extent = m.extent, .grp = grp }); + } + return ml; +} + +int re_match_full(RegEx *re, Str s, Arena *a) { + ReSearch sr = { 0 }; + ReMatch m; + re_search_start(&sr, re, a); + re_search_chunk(&sr, s.s, s.n); + re_search_last_chunk(&sr); + return re_search_match(&sr, &m) && m.extent.start == 0 && m.extent.len == s.n; +} + +int re_match(RegEx *re, Str s, ReMatch *out, Arena *a) { + ReSearch sr = { 0 }; + re_search_start(&sr, re, a); + re_search_chunk(&sr, s.s, s.n); + re_search_last_chunk(&sr); + return re_search_match(&sr, out); +} -- cgit v1.2.3