#include #include #include #include #include #include "wrmr.h" #include "dynarr.h" #include "str.h" #include "regex.h" #include "utf8.h" /* bitsets */ #define RE_BITSET_BYTES sizeof(uintmax_t) #define RE_BITSET_BITS (8 * RE_BITSET_BYTES) #define RE_BITSET_CAP(n) (((n)/64)+1) #define RE_BITSET_HAS(s, i) (1 & ((s)[(i)/RE_BITSET_BITS] >> ((i)&(RE_BITSET_BITS-1)))) #define RE_BITSET_MASK(i) ((uintmax_t)1 << ((i)&(RE_BITSET_BITS-1))) #define RE_BITSET_SET(s, i) ((s)[(i)/RE_BITSET_BITS] |= RE_BITSET_MASK(i)) #define RE_BITSET_CLR(s, i) ((s)[(i)/RE_BITSET_BITS] &= ~RE_BITSET_MASK(i)) #define RE_BITSET_ALLOC(a, n) new_arr(a, uintmax_t, RE_BITSET_CAP(n)) #define RE_BITSET_SET_ALL(s, n) memset(s, 0xff, RE_BITSET_CAP(n) * RE_BITSET_BYTES) #define RE_BITSET_CLR_ALL(s, n) memset(s, 0x00, RE_BITSET_CAP(n) * RE_BITSET_BYTES) /* regex parsing */ typedef struct { u32 dest; } Label; typedef struct { u32 *s; isize n; } ReStr32; typedef struct { RegEx *re; Arena *perm, *scratch; DYNARR(u32) lbl; ReCompFlags flags; ReCompErr err; ReStr32 s; int i; } ReParser; typedef enum { N_CHAR, N_CHAR_SET, N_CONCAT, N_GROUP, N_ALTERNATE, N_ONE_OR_MORE, N_ZERO_OR_MORE, N_ZERO_OR_ONE, } ReNodeType; /* possible peepholes: * * (a . b) | (a . c) -> a . (b | c) * a . b -> "ab" * a | b | c -> [abc] * */ typedef struct ReNode { ReNodeType t; u32 c; struct ReNode *a, *b; } ReNode; ReNode *re_new_node(ReParser *s, ReNodeType t, struct ReNode *a, struct ReNode *b) { ReNode *n = new(s->scratch, ReNode); n->t = t; n->a = a; n->b = b; return n; } ReNode *re_new_node_c(ReParser *s, ReNodeType t, u32 c) { ReNode *n = new(s->scratch, ReNode); n->t = t; n->c = c; return n; } ReNode *re_new_node_char(ReParser *s, u32 c) { return re_new_node_c(s, N_CHAR, c); } #define ch(s) ((s)->i < (s)->s.n ? (s)->s.s[(s)->i] : 0) #define skip(s,n) ((s)->i += (n)) ReNode *re_parse_expr(ReParser *s); ReNode *re_parse_group(ReParser *s) { /* TODO */ skip(s, 1); ReNode *n = re_new_node(s, N_GROUP, re_parse_expr(s), NULL); if (ch(s) != ')') { s->err = RE_COMP_ENORPAREN; return NULL; } skip(s, 1); return n; } static int cmp_ch_range(const void *pa, const void *pb) { ReChRange a = *(const ReChRange *)pa; ReChRange b = *(const ReChRange *)pb; return (a.min > b.min) - (a.min < b.min); } ReNode *re_parse_char_set(ReParser *s) { ReChSet cset = { 0 }; skip(s, 1); if (ch(s) == '^') { cset.invert = 1; skip(s, 1); } /* parse list of ranges */ DYNARR(ReChRange) l = { 0 }; do { u32 c = ch(s); skip(s, 1); if (ch(s) == '-') { skip(s, 1); ASSERT(s->i < s->s.n); u32 n = ch(s); skip(s, 1); if (c > n) { u32 t = c; c = n; n = t; } DA_APUSH(&l, s->scratch, (ReChRange) { c, n }); } else { DA_APUSH(&l, s->scratch, (ReChRange) { c, c }); } } while (s->i < s->s.n && ch(s) != ']'); skip(s, 1); /* combine those that overlap */ qsort(l.v, l.n, sizeof(ReChRange), cmp_ch_range); for (u32 i = 0; i + 1 < l.n; i++) { if (l.v[i].max >= l.v[i+1].min || l.v[i+1].min == l.v[i].max + 1) { if (l.v[i+1].max > l.v[i].max) { l.v[i].max = l.v[i+1].max; } for (u32 j = i + 2; j < l.n; j++) l.v[j-1] = l.v[j]; l.n--; i--; } } /* copy to permanent memory */ cset.n = l.n; cset.v = new_arr(s->perm, ReChRange, cset.n); memcpy(cset.v, l.v, l.n * sizeof(ReChRange)); u32 cseti = s->re->cset.n; DA_APUSH(&s->re->cset, s->perm, cset); return re_new_node_c(s, N_CHAR_SET, cseti); } u32 esc_char(u32 c) { switch (c) { case 'n': return '\n'; case 'r': return '\r'; case 't': return '\t'; case 'v': return '\v'; default: return c; } } ReNode *re_parse_atom(ReParser *s) { switch (ch(s)) { case '(': return re_parse_group(s); case '[': return re_parse_char_set(s); case '.': skip(s, 1); return re_new_node_char(s, C_ANY); case '^': skip(s, 1); return re_new_node_char(s, C_LINE_START); case '$': skip(s, 1); return re_new_node_char(s, C_LINE_END); case '\\': { /* TODO: throw error at end-of-string */ skip(s, 1); ReNode *n = re_new_node_char(s, esc_char(ch(s))); skip(s, 1); return n; } break; case 0: /* should never happen? */ s->err = RE_COMP_EEOF; return NULL; default: { ReNode *n = re_new_node_char(s, ch(s)); skip(s, 1); return n; } break; } return NULL; } /* needed to prevent repetitions of group start and end */ ReNode *re_wrap_node(ReParser *s, ReNode *n, ReNodeType t) { if (n && n->t == N_GROUP) { n->a = re_wrap_node(s, n->a, t); return n; } else { return re_new_node(s, t, n, NULL); } } ReNode *re_parse_expr(ReParser *s) { ReNode *p = NULL; ReNode *alt = NULL; while (s->i < s->s.n) { if (ch(s) == '|') { if (alt) { alt = re_new_node(s, N_ALTERNATE, alt, p); } else { alt = p; } p = NULL; skip(s, 1); } if (ch(s) == ')') break; ReNode *n = re_parse_atom(s); switch (ch(s)) { case '?': n = re_wrap_node(s, n, N_ZERO_OR_ONE); skip(s, 1); break; case '*': n = re_wrap_node(s, n, N_ZERO_OR_MORE); skip(s, 1); break; case '+': n = re_wrap_node(s, n, N_ONE_OR_MORE); skip(s, 1); break; } if (p) { p = re_new_node(s, N_CONCAT, p, n); } else { p = n; } } if (alt) p = re_new_node(s, N_ALTERNATE, alt, p); return p; } #undef ch #undef skip /* regex compilation */ u32 re_emit0(ReParser *s, ReOpType op) { u32 i = s->re->op.n; DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op }); return i; } u32 re_emit1(ReParser *s, ReOpType op, u32 c) { u32 i = s->re->op.n; DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op, .c = c }); return i; } u32 re_emit2(ReParser *s, ReOpType op, u16 a, u16 b) { u32 i = s->re->op.n; DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op, .a = a, .b = b }); return i; } u32 re_lbl_reserve(ReParser *s) { u32 l = s->lbl.n; DA_APUSH(&s->lbl, s->scratch, 0); return l; } u32 re_lbl_emit(ReParser *s, u32 l) { s->lbl.v[l] = s->re->op.n; return re_emit1(s, RE_LABEL, l); } u32 re_lbl_push(ReParser *s) { u32 l = re_lbl_reserve(s); re_lbl_emit(s, l); return l; } void re_lbl_resolve(ReParser *s) { RegEx *r = s->re; /* resolve label addresses */ for (u32 i = 0; i < r->op.n; i++) { ReOp *o = &r->op.v[i]; if (o->op == RE_LABEL) { s->lbl.v[o->c] = i; if (i + 1 < r->op.n) { memmove(&r->op.v[i], &r->op.v[i+1], (r->op.n - (i+1)) * sizeof(ReOp)); } r->op.n--; i--; } } /* patch them in */ for (u32 i = 0; i < r->op.n; i++) { ReOp *o = &r->op.v[i]; switch (o->op) { case RE_JUMP: o->c = s->lbl.v[o->c]; break; case RE_SPLIT: o->a = s->lbl.v[o->a]; o->b = s->lbl.v[o->b]; break; default: break; } } s->lbl.n = 0; } int re_comp_node(ReParser *s, ReNode *n) { if (!n) return 0; switch (n->t) { case N_CHAR: switch (n->c) { case C_ANY: re_emit0(s, RE_CHAR_ANY); break; case C_LINE_START: re_emit0(s, RE_LINE_START); break; case C_LINE_END: re_emit0(s, RE_LINE_END); break; default: re_emit1(s, RE_CHAR, n->c); break; } break; case N_CHAR_SET: { ReChSet *cset = &s->re->cset.v[n->c]; if (cset->n == 1 && cset->v[0].max < 65536) { if (cset->v[0].min == cset->v[0].max) { re_emit1(s, cset->invert ? RE_CHAR_NOT : RE_CHAR, cset->v[0].min); } else { re_emit2(s, cset->invert ? RE_CHAR_RANGE_NOT : RE_CHAR_RANGE, cset->v[0].min, cset->v[0].max); } } else { u32 chars = 0; int all_bytes = 1; for (u32 i = 0; i < cset->n; i++) { if (cset->v[i].max > 255) all_bytes = 0; chars += (cset->v[i].max + 1) - cset->v[i].min; } if (chars <= 7 && all_bytes) { u32 op = re_emit0(s, cset->invert ? RE_CHAR_SET_PACKED_NOT : RE_CHAR_SET_PACKED); ReOp *o = &s->re->op.v[op]; u8 *c = (u8*)o + 1; /* right past opcode */ memset(c, 0, 7); for (u32 i = 0; i < cset->n; i++) { for (u32 j = cset->v[i].min; j <= cset->v[i].max; j++) { *c++ = j; } } } else { re_emit1(s, RE_CHAR_SET, n->c); } } } break; case N_CONCAT: re_comp_node(s, n->a); re_comp_node(s, n->b); break; case N_ALTERNATE: { u32 l1 = re_lbl_reserve(s); u32 l2 = re_lbl_reserve(s); u32 l3 = re_lbl_reserve(s); re_emit2(s, RE_SPLIT, l1, l2); re_lbl_emit(s, l1); re_comp_node(s, n->a); re_emit1(s, RE_JUMP, l3); re_lbl_emit(s, l2); re_comp_node(s, n->b); re_lbl_emit(s, l3); } break; case N_ZERO_OR_ONE: { u32 l1 = re_lbl_reserve(s); u32 l2 = re_lbl_reserve(s); re_emit2(s, RE_SPLIT, l1, l2); re_lbl_emit(s, l1); re_comp_node(s, n->a); re_lbl_emit(s, l2); } break; case N_ONE_OR_MORE: { u32 l1 = re_lbl_reserve(s); u32 l2 = re_lbl_reserve(s); re_lbl_emit(s, l1); re_comp_node(s, n->a); re_emit2(s, RE_SPLIT, l1, l2); re_lbl_emit(s, l2); } break; case N_ZERO_OR_MORE: { u32 l1 = re_lbl_reserve(s); u32 l2 = re_lbl_reserve(s); u32 l3 = re_lbl_reserve(s); re_lbl_emit(s, l1); re_emit2(s, RE_SPLIT, l2, l3); re_lbl_emit(s, l2); re_comp_node(s, n->a); re_emit1(s, RE_JUMP, l1); re_lbl_emit(s, l3); } break; case N_GROUP: if (s->flags & RE_COMP_NO_GROUPS) { re_comp_node(s, n->a); } else { u32 g = s->re->groups++; re_emit1(s, RE_GROUP_START, g); re_comp_node(s, n->a); re_emit1(s, RE_GROUP_END, g); } break; default: FAIL_WITH_MSG("unimplemented"); break; } return 0; } void re_op_del(ReParser *s, u32 i, u32 n) { RegEx *r = s->re; if (i + n > r->op.n) n = r->op.n - i; for (u32 j = 0; j < s->lbl.n; j++) { if (s->lbl.v[j] >= i) s->lbl.v[j] -= n; } memmove(&r->op.v[i], &r->op.v[i+1], (r->op.n - (i + n)) * sizeof(ReOp)); r->op.n -= n; } void re_op_ins(ReParser *s, u32 i, ReOp op) { RegEx *r = s->re; for (u32 j = 0; j < s->lbl.n; j++) { if (s->lbl.v[j] >= i) s->lbl.v[j] += 1; } DA_AFIT(&r->op, s->perm, r->op.n + 1); if (i + 1 < r->op.n) { memmove(&r->op.v[i+1], &r->op.v[i], (r->op.n - i) * sizeof(ReOp)); } r->op.v[i] = op; r->op.n++; } static inline int re_op_eql(RegEx *re, u32 a, u32 b) { ReOp *oa = &re->op.v[a]; ReOp *ob = &re->op.v[b]; return !memcmp(oa, ob, sizeof(ReOp)); } static inline u32 re_first_non_label(ReParser *s, u32 i) { while (s->re->op.v[i].op == RE_LABEL) i++; ASSERT(i < s->re->op.n); return i; } void re_comp_find_unreachable_from(ReParser *s, uintmax_t *unreachable, u32 i) { RegEx *re = s->re; if (!RE_BITSET_HAS(unreachable, i)) return; while (i < re->op.n) { RE_BITSET_CLR(unreachable, i); ReOp *o = &re->op.v[i]; switch (o->op) { case RE_SPLIT: re_comp_find_unreachable_from(s, unreachable, s->lbl.v[o->a]); i = s->lbl.v[o->b]; break; case RE_JUMP: i = s->lbl.v[o->c]; break; case RE_MATCH: case RE_FAIL: return; default: i++; break; } } } void re_comp_find_unreachable(ReParser *s, uintmax_t *unreachable) { RE_BITSET_SET_ALL(unreachable, s->re->op.n); re_comp_find_unreachable_from(s, unreachable, 0); } void re_comp_dcelim(ReParser *s) { RegEx *re = s->re; uintmax_t *unreachable = RE_BITSET_ALLOC(s->scratch, s->re->op.n); for (;;) { re_comp_find_unreachable(s, unreachable); int jmp_removed = 0; /* TODO: rewrite this for fast bitset iteration with stdc_count_trailing_zeros() */ /* must go backwards to not invalidate the indices */ for (i32 i = re->op.n - 1; i >= 0; i--) { ReOp *o = &re->op.v[i]; if (RE_BITSET_HAS(unreachable, i)) { if (o->op == RE_JUMP || o->op == RE_SPLIT) jmp_removed = 1; re_op_del(s, i, 1); } } if (!jmp_removed) break; } } void re_comp_opt(ReParser *s) { RegEx *re = s->re; /* replace jumps with their destinations, if appropriate*/ #if 1 for (u32 i = 0; i < re->op.n; i++) { ReOp *o = &re->op.v[i]; again: if (o->op == RE_JUMP) { u32 dest = re_first_non_label(s, s->lbl.v[o->c]); if (re->op.v[dest].op == RE_MATCH || re->op.v[dest].op == RE_FAIL || re->op.v[dest].op == RE_SPLIT || re->op.v[dest].op == RE_JUMP) { *o = re->op.v[dest]; goto again; } } if (o->op == RE_SPLIT) { /* TODO: will this cause bugs of trampling over the labels intended * for other jumps, other splits? */ if (o->a == o->b) { o->c = o->a; o->op = RE_JUMP; goto again; } else { u32 a = re_first_non_label(s, s->lbl.v[o->a]); u32 b = re_first_non_label(s, s->lbl.v[o->b]); if (re_op_eql(re, a, b)) { ReOp op = re->op.v[a]; re_op_del(s, a, 1); if (b >= a) b--; re_op_del(s, b, 1); re_op_ins(s, i, op); } else { int redo = 0; while (re->op.v[a].op == RE_JUMP) { o->a = re->op.v[a].c; a = re_first_non_label(s, s->lbl.v[o->a]); redo = 1; } while (re->op.v[b].op == RE_JUMP) { o->b = re->op.v[b].c; b = re_first_non_label(s, s->lbl.v[o->b]); redo = 1; } if (redo) goto again; } } } } #endif re_comp_dcelim(s); } u32 re_comp_first_char32(ReParser *s) { RegEx *re = s->re; for (u32 i = 0; i < re->op.n; ) { ReOp *o = &re->op.v[i]; switch (o->op) { case RE_CHAR: /* TODO: utf-8 encode, then pick first byte */ return o->c; case RE_JUMP: i = o->c; break; case RE_GROUP_START: case RE_GROUP_END: i++; break; default: return 0; } } return 0; } u8 re_comp_first_byte(ReParser *s) { u32 c = re_comp_first_char32(s); if (c == 0) return 0; char buf[4]; utf8_encode(buf, &c, 1); return buf[0]; } int re_comp_fin(ReParser *s) { re_emit0(s, RE_MATCH); re_comp_opt(s); re_lbl_resolve(s); s->re->first_byte = re_comp_first_byte(s); return 0; } static inline ReStr32 re_str_to_str32(Str s, Arena *a) { u32 n = utf8_decode_len(s.s, s.n); u32 *v = new_arr(a, u32, n); utf8_decode(v, s.s, n); return (ReStr32) { v, n }; } int re_comp_ex(RegEx *re, Str src, Arena *perm, Arena *scratch, ReCompFlags flags) { ReParser s = { .re = re, .perm = perm, .scratch = scratch, .s = re_str_to_str32(src, scratch), .flags = flags, .err = RE_COMP_ENONE }; ReNode *n = re_parse_expr(&s); if (!s.err) { if (s.s.n > 0 && s.s.s[s.s.n - 1] == ')') { s.err = RE_COMP_ENOLPAREN; } } if (re_comp_node(&s, n)) return -1; if (re_comp_fin(&s)) return -1; return s.err; } const char *re_comp_strerror(ReCompErr err) { switch (err) { case RE_COMP_ENONE: return "success"; case RE_COMP_ENORPAREN: return "no closing )"; case RE_COMP_ENOLPAREN: return "no opening ("; case RE_COMP_EEOF: return "unexpected end of pattern"; } return "unknown regex error"; } int re_comp(RegEx *re, Str src, Arena *perm, Arena *scratch) { return re_comp_ex(re, src, perm, scratch, 0); } /* regex execution */ void re_threadlist_clr(ReThreadList *l) { memset(l->set, 0, 8 * ((l->n / 64) + 1)); l->n = 0; } /* TODO: find a way to not copy capture groups */ void re_threadlist_add(ReThreadList *l, u32 inst, ReSpan *grp, u32 ngrp, Arena *a) { if (!RE_BITSET_HAS(l->set, inst)) { ReSpan *g = new_arr(a, ReSpan, ngrp); if (grp) memcpy(g, grp, ngrp * sizeof(ReSpan)); l->v[l->n++] = (ReThread) { inst, g }; RE_BITSET_SET(l->set, inst); } } void re_threadlist_alloc(ReThreadList *l, u32 n, Arena *a) { l->set = RE_BITSET_ALLOC(a, n); l->v = new_arr(a, ReThread, n); } static inline int re_char_set_has(RegEx *re, u32 cset, u32 c) { ReChSet *p = &re->cset.v[cset]; u32 l = 0, r = p->n; while (l < r) { u32 m = (l + r) / 2; ReChRange x = p->v[m]; if (c < x.min) r = m; else if (c > x.max) l = m + 1; else return !p->invert; } return p->invert; } static inline int re_char_set_packed_has(ReOp o, u32 c) { /* will count (c = 0) as true for non-full-length packed char sets. * is that a problem? */ u64 b = o.align; int r = 0; for (u32 i = 0; i < 7; i++) { r |= (b & 0xff) == c; b >>= 8; } return r; } typedef enum { RE_THREAD_AT_START = 1, RE_THREAD_AT_END = 2 } ReThreadStepFlags; /* 1 for match found, 0 for normal exit, -1 for reexecute in place */ static inline int re_thread_step(ReSearch *s, ReThread *t, u32 i, ReThreadStepFlags f) { Arena *a = s->a; RegEx *re = s->re; ReOp o = re->op.v[t->i]; switch (o.op) { case RE_CHAR: if (s->c != o.c) return 0; break; case RE_CHAR_NOT: if (s->c == o.c) return 0; break; case RE_CHAR_ANY: if (s->c == '\n') return 0; break; case RE_CHAR_SET: if (!re_char_set_has(re, o.c, s->c)) return 0; break; case RE_CHAR_RANGE: if (s->c < o.a || s->c > o.b) return 0; break; case RE_CHAR_RANGE_NOT: if (s->c >= o.a && s->c <= o.b) return 0; break; case RE_CHAR_SET_PACKED: if (!re_char_set_packed_has(o, s->c)) return 0; break; case RE_CHAR_SET_PACKED_NOT: if (re_char_set_packed_has(o, s->c)) return 0; break; case RE_LINE_START: if (s->c_prev != '\n' && !(f & RE_THREAD_AT_START)) return 0; t->i++; return -1; case RE_LINE_END: if (s->c != '\n' && !(f & RE_THREAD_AT_END)) return 0; t->i++; return -1; case RE_MATCH: return 1; case RE_FAIL: return 0; case RE_JUMP: t->i = o.c; return -1; case RE_SPLIT: re_threadlist_add(&s->tcur, o.b, t->grp, re->groups, a); t->i = o.a; return -1; case RE_GROUP_START: t->grp[o.c].start = s->total_idx + i; t->i++; return -1; case RE_GROUP_END: t->grp[o.c].len = s->total_idx + i - t->grp[o.c].start; t->i++; return -1; default: UNREACHABLE; break; } re_threadlist_add(&s->tnext, t->i + 1, t->grp, re->groups, a); return 0; } static inline int re_thread_final(ReSearch *s, ReThread *t, u32 i, ReThreadStepFlags f) { /* halt at any opcodes that involve character matching */ /* we could check for C_EOF in all of them, but it's easier this way */ switch (s->re->op.v[t->i].op) { case RE_LINE_END: case RE_MATCH: case RE_FAIL: case RE_JUMP: case RE_SPLIT: case RE_GROUP_START: case RE_GROUP_END: return re_thread_step(s, t, i, f); default: return 0; } } static inline int re_threadlist_step(ReSearch *s, ReThreadList *tl, u32 i, ReThreadStepFlags f) { int match = 0; for (u32 j = 0; j < tl->n; j++) { ReThread *t = &tl->v[j]; ASSERT(t->i < s->re->op.n); int r = re_thread_step(s, t, i, f); if (r < 0) { j--; } else if (r > 0) { match = 1; s->match_end = s->total_idx + i; memcpy(s->grp, t->grp, sizeof(ReSpan) * s->re->groups); } } return match; } static inline int re_step(ReSearch *s, u32 i, ReThreadStepFlags f) { int r = re_threadlist_step(s, &s->tcur, i, f); f = 0; s->c_prev = s->c; ReThreadList tmp = s->tcur; s->tcur = s->tnext; s->tnext = tmp; re_threadlist_clr(&s->tnext); return r; } static inline int re_step_final(ReSearch *s, u32 i, ReThreadStepFlags f) { int found = 0; for (u32 j = 0; j < s->tcur.n; j++) { int r = re_thread_final(s, &s->tcur.v[j], i, f); if (r < 0) { j--; } else if (r > 0) { found = 1; } } re_threadlist_clr(&s->tcur); re_threadlist_clr(&s->tnext); return found; } static inline isize re_search_try_match(ReSearch *s, size_t i, size_t n) { RegEx *re = s->re; size_t start = s->total_idx; if (~s->flags & RE_SEARCH_MID_MATCH) { re_threadlist_clr(&s->tcur); re_threadlist_clr(&s->tnext); re_threadlist_add(&s->tcur, 0, s->grp, re->groups, s->a); s->flags |= RE_SEARCH_MID_MATCH; s->match_start = start + i; s->match_end = 0; } isize found_i = -1; ReThreadStepFlags f = 0; if (i == 0 && (s->flags & (RE_SEARCH_FIRST_CHUNK | RE_SEARCH_WAS_NEWLINE))) f = RE_THREAD_AT_START; for (; i < n && s->tcur.n > 0; ) { i += utf8_to_char32(s->buf, i, n, &s->c); if (re_step(s, i, f)) { found_i = i; s->match_end = start + i; } } if (s->flags & RE_SEARCH_LAST_CHUNK) { f |= RE_THREAD_AT_END; if (re_step_final(s, i, f)) { found_i = i; s->match_end = start + i; } } if (s->tcur.n > 0) return i; s->flags &= ~RE_SEARCH_MID_MATCH; return found_i; } /* public facing API */ void re_search_start(ReSearch *s, RegEx *re, Arena *a) { *s = (ReSearch) { .a = a, .re = re, .grp = new_arr(a, ReSpan, re->groups), .flags = 0 }; re_threadlist_alloc(&s->tcur, re->op.n, a); re_threadlist_alloc(&s->tnext, re->op.n, a); } void re_search_chunk(ReSearch *s, const char *buf, size_t n) { if (s->buf_len) s->flags &= ~RE_SEARCH_FIRST_CHUNK; s->total_idx += s->buf_len; s->buf = buf; s->buf_len = n; s->buf_idx = 0; } void re_search_last_chunk(ReSearch *s) { s->flags |= RE_SEARCH_LAST_CHUNK; } void re_search_first_chunk(ReSearch *s) { s->flags |= RE_SEARCH_FIRST_CHUNK; } /* searching */ /* returns final index undescriminately. * check s->flags to find if the match is done yet */ static inline void re_search_chunk_fin(ReSearch *s) { u32 n = s->buf_len; s->buf_idx = n; if (n > 0) { if (s->buf[n-1] == '\n') { s->flags |= RE_SEARCH_WAS_NEWLINE; } else { s->flags &= ~RE_SEARCH_WAS_NEWLINE; } } } int re_search_match_at_start(ReSearch *s, ReMatch *m) { size_t i = s->buf_idx; size_t n = s->buf_len; for (;;) { while (i < n && (s->buf[i] & 0xc0) == 0x80) i++; if (i == n) break; isize r = re_search_try_match(s, i, n); if (r < 0) { return 0; } else { if (i == (usize)r) r++; if (s->flags & RE_SEARCH_MID_MATCH) { i = r; } else { s->buf_idx = r; m->grp = s->grp; ASSERT(s->match_start <= s->total_idx + i); ASSERT(s->match_end >= s->match_start); m->extent.start = s->match_start; m->extent.len = s->match_end - s->match_start; return 1; } } } re_search_chunk_fin(s); return 0; } int re_search_match(ReSearch *s, ReMatch *m) { u32 i = s->buf_idx; u32 n = s->buf_len; for (;;) { while (i < n && (s->buf[i] & 0xc0) == 0x80) i++; if (i == n) break; if (s->re->first_byte && (~s->flags & RE_SEARCH_MID_MATCH)) { const char *p = memchr(s->buf + i, s->re->first_byte, n - i); if (!p) break; i = p - s->buf; } s->buf_idx = i++; if (re_search_match_at_start(s, m)) return 1; } re_search_chunk_fin(s); return 0; } /* convenience wrappers */ ReMatchList re_match_all(RegEx *re, Str s, Arena *a) { ReSearch sr = { 0 }; ReMatchList ml = { 0 }; ReMatch m; re_search_start(&sr, re, a); re_search_chunk(&sr, s.s, s.n); re_search_first_chunk(&sr); re_search_last_chunk(&sr); while (re_search_match(&sr, &m)) { ReSpan *grp = new_arr(a, ReSpan, re->groups); memcpy(grp, m.grp, sizeof(ReSpan) * re->groups); DA_APUSH(&ml, a, (ReMatch) { .extent = m.extent, .grp = grp }); } return ml; } int re_match_full(RegEx *re, Str s, Arena *a) { ReSearch sr = { 0 }; ReMatch m; re_search_start(&sr, re, a); re_search_chunk(&sr, s.s, s.n); re_search_first_chunk(&sr); re_search_last_chunk(&sr); return re_search_match(&sr, &m) && m.extent.start == 0 && m.extent.len == s.n; } int re_match(RegEx *re, Str s, ReMatch *out, Arena *a) { ReSearch sr = { 0 }; re_search_start(&sr, re, a); re_search_chunk(&sr, s.s, s.n); re_search_first_chunk(&sr); re_search_last_chunk(&sr); return re_search_match(&sr, out); }