summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--main.c133
-rw-r--r--regex.c912
-rw-r--r--regex.h126
3 files changed, 1156 insertions, 15 deletions
diff --git a/main.c b/main.c
index db6d641..70a25a6 100644
--- a/main.c
+++ b/main.c
@@ -22,6 +22,7 @@
#include "utf8.h"
#include "str.h"
#include "freelist.h"
+#include "regex.h"
#define ED_BUF_MAX 1024
@@ -52,6 +53,8 @@ typedef struct {
Txt *txt_free;
EditMode mode;
u32 count;
+ Str input_line, input_prompt;
+ Str last_search;
} Editor;
Editor e = { 0 };
@@ -433,13 +436,16 @@ void draw(void *ctx) {
ASSERT(start.i <= eb->txt->ptbl.v[start.p].n);
if (!cur_found) vui_curs_pos(x, y);
- /*
- u32 c = txt_chr(l);
- vui_printf(-1, -4, "%u", e.count);
- vui_aprintf(-1, -3, e.mode ? FG_BCYAN : FG_CYAN, "%s", e.mode ? "INSERT" : "NORMAL");
- vui_printf(-1, -1, "%u - %u.%u - %02x (%c)", txt_ofs(eb->cur), l.p, l.i, c, (c < 0x20 || c > 0x7e) ? ' ' : c);
- u32 used = e.scratch.beg - e.scratch.start, max = e.scratch.end - e.scratch.start;
- vui_printf(-1, -2, "e.scratch %.02f/%.02fk", used/1024.0, max/1024.0);*/
+
+ if (e.input_line.n > 0 || e.input_prompt.n > 0) {
+ u32 x = 0;
+ x += vui_putsn(0, 0, e.input_prompt.s, e.input_prompt.n);
+ x += vui_putsn(x, 0, e.input_line.s, e.input_line.n);
+ vui_curs_pos(x, 0);
+ vui_curs_shape(VUI_CURS_BAR);
+ u32 c = COLS;
+ while (x < c) vui_chr(x++, 0, ' ');
+ }
}
TxtLoc logical_line_start(TxtLoc l) {
@@ -448,7 +454,77 @@ TxtLoc logical_line_start(TxtLoc l) {
return l;
}
+Str get_input_line(Str prompt) {
+ e.input_prompt = prompt;
+ DYNARR(char) s = { 0 };
+ for (;;) {
+ draw(NULL);
+ vui_blit();
+ u32 c = vui_key();
+ switch (c) {
+ case '\r':
+ goto done;
+ case KEY_ESC:
+ e.input_line = (Str) { 0, 0 };
+ e.input_prompt = (Str) { 0, 0 };
+ return (Str) { 0, 0 };
+ case KEY_BKSP:
+ if (s.n > 0) s.n--;
+ break;
+ case 0x17 /* ^W */:
+ while (s.n > 0 && is_space(s.v[s.n-1])) s.n--;
+ while (s.n > 0 && !is_space(s.v[s.n-1])) s.n--;
+ break;
+ default:
+ if (c > 0 && c <= KEY_UTF8_MAX && c >= ' ') {
+ u32 n = utf8_encode_len(&c, 1);
+ DA_AFIT(&s, &e.scratch, s.n + n);
+ utf8_encode(&s.v[s.n], &c, n);
+ s.n += n;
+ }
+ break;
+ }
+ e.input_line = (Str) { s.v, s.n };
+ }
+done:;
+ e.input_line = (Str) { 0, 0 };
+ e.input_prompt = (Str) { 0, 0 };
+ return (Str) { s.v, s.n };
+}
+
+int search_next_regex(TxtLoc l, Str src, TxtLoc *out) {
+ RegEx re = { 0 };
+ ReSearch s = { 0 };
+ if (re_comp_ex(&re, src, &e.scratch, &e.scratch, RE_COMP_NO_GROUPS)) {
+ /* TODO: report parse error */
+ return 0;
+ }
+ TxtLoc t = l;
+ int match_found = 0;
+search_from_start:
+ re_search_start(&s, &re, &e.scratch);
+ while (!at_end(t)) {
+ TxtLoc p = t;
+ Str chnk = txt_next_chunk(&t);
+ re_search_chunk(&s, chnk.s, chnk.n);
+ if (at_end(t)) re_search_last_chunk(&s);
+ ReMatch m;
+ if (re_search_match(&s, &m)) {
+ *out = (TxtLoc) { p.t, p.p, p.i + m.extent.start };
+ return 1;
+ }
+ }
+ if (match_found == 0) {
+ match_found = -1;
+ t = txt_start(t.t);
+ goto search_from_start;
+ }
+ return 0;
+}
+
int motion(TxtLoc *lp, u32 c) {
+ static Str last_search = { 0 };
+
TxtLoc l = *lp;
TxtLoc last_loc = l;
for (;;) {
@@ -472,6 +548,7 @@ int motion(TxtLoc *lp, u32 c) {
loop:
switch (c) {
+
case KEY_LEFT:
case 'h':
l = cprev(l);
@@ -480,6 +557,15 @@ loop:
case 'l':
l = cnext(l);
break;
+ case KEY_DOWN:
+ case 'j':
+ l = next_line(l);
+ break;
+ case KEY_UP:
+ case 'k':
+ l = prev_line(l);
+ break;
+
case KEY_LEFT | KEY_CTRL_BIT:
case 'b':
l = prev_word(l);
@@ -488,14 +574,7 @@ loop:
case 'w':
l = next_word(l);
break;
- case KEY_UP:
- case 'k':
- l = prev_line(l);
- break;
- case KEY_DOWN:
- case 'j':
- l = next_line(l);
- break;
+
case KEY_UP | KEY_CTRL_BIT:
case '{':
l = prev_par(l);
@@ -513,6 +592,7 @@ loop:
case '%':
if (!match_bracket(l, &l)) return 0;
break;
+
case KEY_PGUP:
for (u32 i = 0; i < LINES; i += 3) l = prev_line(l);
break;
@@ -530,6 +610,7 @@ loop:
case '$':
l = end_of_line(l);
break;
+
case KEY_HOME | KEY_CTRL_BIT:
case 'g':
if (e.count) {
@@ -548,6 +629,7 @@ loop:
l = txt_end(l.t);
}
break;
+
case 'f': {
u32 k = vui_key();
TxtLoc t = cnext(l);
@@ -574,6 +656,27 @@ loop:
t = bprev(t);
}
} break;
+
+ case '/': {
+ if (last_search.s) free(last_search.s);
+ last_search = (Str) { 0, 0 };
+ Str src = get_input_line(S("Search: "));
+ if (!src.n) return 0;
+ last_search.s = malloc(src.n);
+ if (!last_search.s) FAIL_WITH_MSG("failed to allocate search");
+ memcpy(last_search.s, src.s, src.n);
+ last_search.n = src.n;
+ TxtLoc r;
+ if (search_next_regex(l, src, &r)) {
+ l = r;
+ }
+ } break;
+
+ case 'n': {
+ TxtLoc r;
+ if (last_search.n > 0 && search_next_regex(cnext(l), last_search, &r)) l = r;
+ } break;
+
default:
return 0;
}
diff --git a/regex.c b/regex.c
new file mode 100644
index 0000000..710593f
--- /dev/null
+++ b/regex.c
@@ -0,0 +1,912 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "wrmr.h"
+#include "dynarr.h"
+#include "str.h"
+#include "regex.h"
+
+/* bitsets */
+
+#define RE_BITSET_BYTES sizeof(uintmax_t)
+#define RE_BITSET_BITS (8 * RE_BITSET_BYTES)
+#define RE_BITSET_CAP(n) (((n)/64)+1)
+#define RE_BITSET_HAS(s, i) (1 & ((s)[(i)/RE_BITSET_BITS] >> ((i)&(RE_BITSET_BITS-1))))
+#define RE_BITSET_MASK(i) ((uintmax_t)1 << ((i)&(RE_BITSET_BITS-1)))
+#define RE_BITSET_SET(s, i) ((s)[(i)/RE_BITSET_BITS] |= RE_BITSET_MASK(i))
+#define RE_BITSET_CLR(s, i) ((s)[(i)/RE_BITSET_BITS] &= ~RE_BITSET_MASK(i))
+#define RE_BITSET_ALLOC(a, n) new_arr(a, uintmax_t, RE_BITSET_CAP(n))
+#define RE_BITSET_SET_ALL(s, n) memset(s, 0xff, RE_BITSET_CAP(n) * RE_BITSET_BYTES)
+#define RE_BITSET_CLR_ALL(s, n) memset(s, 0x00, RE_BITSET_CAP(n) * RE_BITSET_BYTES)
+
+/* regex parsing */
+
+typedef struct {
+ u32 dest;
+} Label;
+
+typedef struct {
+ RegEx *re;
+ Arena *perm, *scratch;
+ DYNARR(u32) lbl;
+ ReCompFlags flags;
+ Str s;
+ int i;
+} ReParser;
+
+typedef enum {
+ N_CHAR,
+ N_CHAR_SET,
+ N_CONCAT,
+ N_GROUP,
+ N_ALTERNATE,
+ N_ONE_OR_MORE,
+ N_ZERO_OR_MORE,
+ N_ZERO_OR_ONE,
+} ReNodeType;
+
+/* possible peepholes:
+ *
+ * (a . b) | (a . c) -> a . (b | c)
+ * a . b -> "ab"
+ * a | b | c -> [abc]
+ *
+ */
+
+typedef struct ReNode {
+ ReNodeType t;
+ u32 c;
+ struct ReNode *a, *b;
+} ReNode;
+
+ReNode *re_new_node(ReParser *s, ReNodeType t, struct ReNode *a, struct ReNode *b) {
+ ReNode *n = new(s->scratch, ReNode);
+ n->t = t;
+ n->a = a;
+ n->b = b;
+ return n;
+}
+
+ReNode *re_new_node_c(ReParser *s, ReNodeType t, u32 c) {
+ ReNode *n = new(s->scratch, ReNode);
+ n->t = t;
+ n->c = c;
+ return n;
+}
+
+ReNode *re_new_node_char(ReParser *s, u32 c) {
+ return re_new_node_c(s, N_CHAR, c);
+}
+
+#define ch(s) ((s)->i < (s)->s.n ? (s)->s.s[(s)->i] : 0)
+#define skip(s,n) ((s)->i += (n))
+
+ReNode *re_parse_expr(ReParser *s);
+
+ReNode *re_parse_group(ReParser *s) {
+ /* TODO */
+ skip(s, 1);
+ ReNode *n = re_new_node(s, N_GROUP, re_parse_expr(s), NULL);
+ if (ch(s) != ')') FAIL_WITH_MSG("expected )");
+ skip(s, 1);
+ return n;
+}
+
+static int cmp_ch_range(const void *pa, const void *pb) {
+ ReChRange a = *(const ReChRange *)pa;
+ ReChRange b = *(const ReChRange *)pb;
+ return (a.min > b.min) - (a.min < b.min);
+}
+
+ReNode *re_parse_char_set(ReParser *s) {
+ ReChSet cset = { 0 };
+ skip(s, 1);
+ if (ch(s) == '^') {
+ cset.invert = 1;
+ skip(s, 1);
+ }
+
+ /* parse list of ranges */
+ DYNARR(ReChRange) l = { 0 };
+ do {
+ u32 c = ch(s);
+ skip(s, 1);
+ if (ch(s) == '-') {
+ skip(s, 1);
+ ASSERT(s->i < s->s.n);
+ u32 n = ch(s);
+ skip(s, 1);
+ if (c > n) {
+ u32 t = c;
+ c = n;
+ n = t;
+ }
+ DA_APUSH(&l, s->scratch, (ReChRange) { c, n });
+ } else {
+ DA_APUSH(&l, s->scratch, (ReChRange) { c, c });
+ }
+ } while (s->i < s->s.n && ch(s) != ']');
+ skip(s, 1);
+
+ /* combine those that overlap */
+ qsort(l.v, l.n, sizeof(ReChRange), cmp_ch_range);
+ for (u32 i = 0; i + 1 < l.n; i++) {
+ if (l.v[i].max >= l.v[i+1].min || l.v[i+1].min == l.v[i].max + 1) {
+ if (l.v[i+1].max > l.v[i].max) {
+ l.v[i].max = l.v[i+1].max;
+ }
+ for (u32 j = i + 2; j < l.n; j++) l.v[j-1] = l.v[j];
+ l.n--;
+ i--;
+ }
+ }
+
+ /* copy to permanent memory */
+ cset.n = l.n;
+ cset.v = new_arr(s->perm, ReChRange, cset.n);
+ memcpy(cset.v, l.v, l.n * sizeof(ReChRange));
+ u32 cseti = s->re->cset.n;
+ DA_APUSH(&s->re->cset, s->perm, cset);
+ return re_new_node_c(s, N_CHAR_SET, cseti);
+}
+
+ReNode *re_parse_atom(ReParser *s) {
+ switch (ch(s)) {
+ case '(':
+ return re_parse_group(s);
+ case '[':
+ return re_parse_char_set(s);
+ case '.':
+ skip(s, 1);
+ return re_new_node_char(s, C_ANY);
+ case '^':
+ skip(s, 1);
+ return re_new_node_char(s, C_LINE_START);
+ case '$':
+ skip(s, 1);
+ return re_new_node_char(s, C_LINE_END);
+ case '\\': {
+ /* TODO: throw error at end-of-string */
+ skip(s, 1);
+ ReNode *n = re_new_node_char(s, ch(s));
+ skip(s, 1);
+ return n;
+ } break;
+ case 0:
+ /* should never happen? */
+ FAIL_WITH_MSG("atom parsed at end");
+ return NULL;
+ default: {
+ ReNode *n = re_new_node_char(s, ch(s));
+ skip(s, 1);
+ return n;
+ } break;
+ }
+ return NULL;
+}
+
+/* needed to prevent repetitions of group start and end */
+ReNode *re_wrap_node(ReParser *s, ReNode *n, ReNodeType t) {
+ if (n && n->t == N_GROUP) {
+ n->a = re_wrap_node(s, n->a, t);
+ return n;
+ } else {
+ return re_new_node(s, t, n, NULL);
+ }
+}
+
+ReNode *re_parse_expr(ReParser *s) {
+ ReNode *p = NULL;
+ ReNode *alt = NULL;
+ while (s->i < s->s.n) {
+ if (ch(s) == '|') {
+ if (alt) {
+ alt = re_new_node(s, N_ALTERNATE, alt, p);
+ } else {
+ alt = p;
+ }
+ p = NULL;
+ skip(s, 1);
+ }
+ if (ch(s) == ')') break;
+ ReNode *n = re_parse_atom(s);
+ switch (ch(s)) {
+ case '?': n = re_wrap_node(s, n, N_ZERO_OR_ONE); skip(s, 1); break;
+ case '*': n = re_wrap_node(s, n, N_ZERO_OR_MORE); skip(s, 1); break;
+ case '+': n = re_wrap_node(s, n, N_ONE_OR_MORE); skip(s, 1); break;
+ }
+ if (p) {
+ p = re_new_node(s, N_CONCAT, p, n);
+ } else {
+ p = n;
+ }
+ }
+ if (alt) p = re_new_node(s, N_ALTERNATE, alt, p);
+ return p;
+}
+
+#undef ch
+#undef skip
+
+/* regex compilation */
+
+u32 re_emit0(ReParser *s, ReOpType op) {
+ u32 i = s->re->op.n;
+ DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op });
+ return i;
+}
+
+u32 re_emit1(ReParser *s, ReOpType op, u32 c) {
+ u32 i = s->re->op.n;
+ DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op, .c = c });
+ return i;
+}
+
+u32 re_emit2(ReParser *s, ReOpType op, u16 a, u16 b) {
+ u32 i = s->re->op.n;
+ DA_APUSH(&s->re->op, s->perm, (ReOp) { .op = op, .a = a, .b = b });
+ return i;
+}
+
+u32 re_lbl_reserve(ReParser *s) {
+ u32 l = s->lbl.n;
+ DA_APUSH(&s->lbl, s->scratch, 0);
+ return l;
+}
+
+u32 re_lbl_emit(ReParser *s, u32 l) {
+ s->lbl.v[l] = s->re->op.n;
+ return re_emit1(s, RE_LABEL, l);
+}
+
+u32 re_lbl_push(ReParser *s) {
+ u32 l = re_lbl_reserve(s);
+ re_lbl_emit(s, l);
+ return l;
+}
+
+void re_lbl_resolve(ReParser *s) {
+ RegEx *r = s->re;
+ /* resolve label addresses */
+ for (u32 i = 0; i < r->op.n; i++) {
+ ReOp *o = &r->op.v[i];
+ if (o->op == RE_LABEL) {
+ s->lbl.v[o->c] = i;
+ if (i + 1 < r->op.n) {
+ memmove(&r->op.v[i], &r->op.v[i+1], (r->op.n - (i+1)) * sizeof(ReOp));
+ }
+ r->op.n--;
+ i--;
+ }
+ }
+ /* patch them in */
+ for (u32 i = 0; i < r->op.n; i++) {
+ ReOp *o = &r->op.v[i];
+ switch (o->op) {
+ case RE_JUMP:
+ o->c = s->lbl.v[o->c];
+ break;
+ case RE_SPLIT:
+ o->a = s->lbl.v[o->a];
+ o->b = s->lbl.v[o->b];
+ break;
+ default:
+ break;
+ }
+ }
+ s->lbl.n = 0;
+}
+
+int re_comp_node(ReParser *s, ReNode *n) {
+ if (!n) return 0;
+ switch (n->t) {
+ case N_CHAR:
+ switch (n->c) {
+ case C_ANY:
+ re_emit0(s, RE_CHAR_ANY);
+ break;
+ case C_LINE_START:
+ re_emit0(s, RE_LINE_START);
+ break;
+ case C_LINE_END:
+ re_emit0(s, RE_LINE_END);
+ break;
+ default:
+ re_emit1(s, RE_CHAR, n->c);
+ break;
+ }
+ break;
+ case N_CHAR_SET: {
+ ReChSet *cset = &s->re->cset.v[n->c];
+ if (cset->n == 1 && cset->v[0].max < 65536) {
+ if (cset->v[0].min == cset->v[0].max) {
+ re_emit1(s, cset->invert ? RE_CHAR_NOT : RE_CHAR, cset->v[0].min);
+ } else {
+ re_emit2(s, cset->invert ? RE_CHAR_RANGE_NOT : RE_CHAR_RANGE, cset->v[0].min, cset->v[0].max);
+ }
+ } else {
+ u32 chars = 0;
+ int all_bytes = 1;
+ for (u32 i = 0; i < cset->n; i++) {
+ if (cset->v[i].max > 255) all_bytes = 0;
+ chars += (cset->v[i].max + 1) - cset->v[i].min;
+ }
+ if (chars <= 7 && all_bytes) {
+ u32 op = re_emit0(s, cset->invert ? RE_CHAR_SET_PACKED_NOT : RE_CHAR_SET_PACKED);
+ ReOp *o = &s->re->op.v[op];
+ u8 *c = (u8*)o + 1; /* right past opcode */
+ memset(c, 0, 7);
+ for (u32 i = 0; i < cset->n; i++) {
+ for (u32 j = cset->v[i].min; j <= cset->v[i].max; j++) {
+ *c++ = j;
+ }
+ }
+ } else {
+ re_emit1(s, RE_CHAR_SET, n->c);
+ }
+ }
+ } break;
+ case N_CONCAT:
+ re_comp_node(s, n->a);
+ re_comp_node(s, n->b);
+ break;
+ case N_ALTERNATE: {
+ u32 l1 = re_lbl_reserve(s);
+ u32 l2 = re_lbl_reserve(s);
+ u32 l3 = re_lbl_reserve(s);
+ re_emit2(s, RE_SPLIT, l1, l2);
+ re_lbl_emit(s, l1);
+ re_comp_node(s, n->a);
+ re_emit1(s, RE_JUMP, l3);
+ re_lbl_emit(s, l2);
+ re_comp_node(s, n->b);
+ re_lbl_emit(s, l3);
+ } break;
+ case N_ZERO_OR_ONE: {
+ u32 l1 = re_lbl_reserve(s);
+ u32 l2 = re_lbl_reserve(s);
+ re_emit2(s, RE_SPLIT, l1, l2);
+ re_lbl_emit(s, l1);
+ re_comp_node(s, n->a);
+ re_lbl_emit(s, l2);
+ } break;
+ case N_ONE_OR_MORE: {
+ u32 l1 = re_lbl_reserve(s);
+ u32 l2 = re_lbl_reserve(s);
+ re_lbl_emit(s, l1);
+ re_comp_node(s, n->a);
+ re_emit2(s, RE_SPLIT, l1, l2);
+ re_lbl_emit(s, l2);
+ } break;
+ case N_ZERO_OR_MORE: {
+ u32 l1 = re_lbl_reserve(s);
+ u32 l2 = re_lbl_reserve(s);
+ u32 l3 = re_lbl_reserve(s);
+ re_lbl_emit(s, l1);
+ re_emit2(s, RE_SPLIT, l2, l3);
+ re_lbl_emit(s, l2);
+ re_comp_node(s, n->a);
+ re_emit1(s, RE_JUMP, l1);
+ re_lbl_emit(s, l3);
+ } break;
+ case N_GROUP:
+ if (s->flags & RE_COMP_NO_GROUPS) {
+ re_comp_node(s, n->a);
+ } else {
+ u32 g = s->re->groups++;
+ re_emit1(s, RE_GROUP_START, g);
+ re_comp_node(s, n->a);
+ re_emit1(s, RE_GROUP_END, g);
+ }
+ break;
+ default:
+ FAIL_WITH_MSG("unimplemented");
+ break;
+ }
+ return 0;
+}
+
+void re_op_del(ReParser *s, u32 i, u32 n) {
+ RegEx *r = s->re;
+ if (i + n > r->op.n) n = r->op.n - i;
+ for (u32 j = 0; j < s->lbl.n; j++) {
+ if (s->lbl.v[j] >= i) s->lbl.v[j] -= n;
+ }
+ memmove(&r->op.v[i], &r->op.v[i+1], (r->op.n - (i + n)) * sizeof(ReOp));
+ r->op.n -= n;
+}
+
+void re_op_ins(ReParser *s, u32 i, ReOp op) {
+ RegEx *r = s->re;
+ for (u32 j = 0; j < s->lbl.n; j++) {
+ if (s->lbl.v[j] >= i) s->lbl.v[j] += 1;
+ }
+ DA_AFIT(&r->op, s->perm, r->op.n + 1);
+ if (i + 1 < r->op.n) {
+ memmove(&r->op.v[i+1], &r->op.v[i], (r->op.n - i) * sizeof(ReOp));
+ }
+ r->op.v[i] = op;
+ r->op.n++;
+}
+
+static inline int re_op_eql(RegEx *re, u32 a, u32 b) {
+ ReOp *oa = &re->op.v[a];
+ ReOp *ob = &re->op.v[b];
+ return !memcmp(oa, ob, sizeof(ReOp));
+}
+
+static inline u32 re_first_non_label(ReParser *s, u32 i) {
+ while (s->re->op.v[i].op == RE_LABEL) i++;
+ ASSERT(i < s->re->op.n);
+ return i;
+}
+
+void re_comp_find_unreachable_from(ReParser *s, uintmax_t *unreachable, u32 i) {
+ RegEx *re = s->re;
+ if (!RE_BITSET_HAS(unreachable, i)) return;
+ while (i < re->op.n) {
+ RE_BITSET_CLR(unreachable, i);
+ ReOp *o = &re->op.v[i];
+ switch (o->op) {
+ case RE_SPLIT:
+ re_comp_find_unreachable_from(s, unreachable, s->lbl.v[o->a]);
+ i = s->lbl.v[o->b];
+ break;
+ case RE_JUMP:
+ i = s->lbl.v[o->c];
+ break;
+ case RE_MATCH:
+ case RE_FAIL:
+ return;
+ default:
+ i++;
+ break;
+ }
+ }
+}
+
+void re_comp_find_unreachable(ReParser *s, uintmax_t *unreachable) {
+ RE_BITSET_SET_ALL(unreachable, s->re->op.n);
+ re_comp_find_unreachable_from(s, unreachable, 0);
+}
+
+void re_comp_dcelim(ReParser *s) {
+ RegEx *re = s->re;
+ uintmax_t *unreachable = RE_BITSET_ALLOC(s->scratch, s->re->op.n);
+
+ for (;;) {
+ re_comp_find_unreachable(s, unreachable);
+ int jmp_removed = 0;
+ /* TODO: rewrite this for fast bitset iteration with stdc_count_trailing_zeros() */
+ /* must go backwards to not invalidate the indices */
+ for (i32 i = re->op.n - 1; i >= 0; i--) {
+ ReOp *o = &re->op.v[i];
+ if (RE_BITSET_HAS(unreachable, i)) {
+ if (o->op == RE_JUMP || o->op == RE_SPLIT) jmp_removed = 1;
+ re_op_del(s, i, 1);
+ }
+ }
+ if (!jmp_removed) break;
+ }
+}
+
+void re_comp_opt(ReParser *s) {
+ RegEx *re = s->re;
+
+ /* replace jumps with their destinations, if appropriate*/
+
+#if 1
+ for (u32 i = 0; i < re->op.n; i++) {
+ ReOp *o = &re->op.v[i];
+again:
+ if (o->op == RE_JUMP) {
+ u32 dest = re_first_non_label(s, s->lbl.v[o->c]);
+ if (re->op.v[dest].op == RE_MATCH
+ || re->op.v[dest].op == RE_FAIL
+ || re->op.v[dest].op == RE_SPLIT
+ || re->op.v[dest].op == RE_JUMP) {
+ *o = re->op.v[dest];
+ goto again;
+ }
+ }
+ if (o->op == RE_SPLIT) {
+ /* TODO: will this cause bugs of trampling over the labels intended
+ * for other jumps, other splits? */
+ if (o->a == o->b) {
+ o->c = o->a;
+ o->op = RE_JUMP;
+ goto again;
+ } else {
+ u32 a = re_first_non_label(s, s->lbl.v[o->a]);
+ u32 b = re_first_non_label(s, s->lbl.v[o->b]);
+ if (re_op_eql(re, a, b)) {
+ ReOp op = re->op.v[a];
+ re_op_del(s, a, 1);
+ if (b >= a) b--;
+ re_op_del(s, b, 1);
+ re_op_ins(s, i, op);
+ } else {
+ int redo = 0;
+ while (re->op.v[a].op == RE_JUMP) {
+ o->a = re->op.v[a].c;
+ a = re_first_non_label(s, s->lbl.v[o->a]);
+ redo = 1;
+ }
+ while (re->op.v[b].op == RE_JUMP) {
+ o->b = re->op.v[b].c;
+ b = re_first_non_label(s, s->lbl.v[o->b]);
+ redo = 1;
+ }
+ if (redo) goto again;
+ }
+ }
+ }
+ }
+#endif
+
+ re_comp_dcelim(s);
+}
+
+u8 re_comp_first_byte(ReParser *s) {
+ RegEx *re = s->re;
+ for (u32 i = 0; i < re->op.n; ) {
+ ReOp *o = &re->op.v[i];
+ switch (o->op) {
+ case RE_CHAR:
+ /* TODO: utf-8 encode, then pick first byte */
+ return o->c;
+ case RE_JUMP:
+ i = o->c;
+ break;
+ case RE_GROUP_START:
+ case RE_GROUP_END:
+ i++;
+ break;
+ default:
+ return 0;
+ }
+ }
+ return 0;
+}
+
+int re_comp_fin(ReParser *s) {
+ re_emit0(s, RE_MATCH);
+ re_comp_opt(s);
+ re_lbl_resolve(s);
+ s->re->first_byte = re_comp_first_byte(s);
+ return 0;
+}
+
+int re_comp_ex(RegEx *re, Str src, Arena *perm, Arena *scratch, ReCompFlags flags) {
+ ReParser s = {
+ .re = re,
+ .perm = perm,
+ .scratch = scratch,
+ .s = src,
+ .flags = flags
+ };
+ ReNode *n = re_parse_expr(&s);
+
+ if (re_comp_node(&s, n)) return -1;
+ if (re_comp_fin(&s)) return -1;
+
+ return 0;
+}
+
+int re_comp(RegEx *re, Str src, Arena *perm, Arena *scratch) {
+ return re_comp_ex(re, src, perm, scratch, 0);
+}
+
+/* regex execution */
+
+void re_threadlist_clr(ReThreadList *l) {
+ memset(l->set, 0, 8 * ((l->n / 64) + 1));
+ l->n = 0;
+}
+
+/* TODO: find a way to not copy capture groups */
+void re_threadlist_add(ReThreadList *l, u32 inst, ReSpan *grp, u32 ngrp, Arena *a) {
+ if (!RE_BITSET_HAS(l->set, inst)) {
+ ReSpan *g = new_arr(a, ReSpan, ngrp);
+ if (grp) memcpy(g, grp, ngrp * sizeof(ReSpan));
+ l->v[l->n++] = (ReThread) { inst, g };
+ RE_BITSET_SET(l->set, inst);
+ }
+}
+
+void re_threadlist_alloc(ReThreadList *l, u32 n, Arena *a) {
+ l->set = RE_BITSET_ALLOC(a, n);
+ l->v = new_arr(a, ReThread, n);
+}
+
+static inline int re_char_set_has(RegEx *re, u32 cset, u32 c) {
+ ReChSet *p = &re->cset.v[cset];
+ u32 l = 0, r = p->n;
+ while (l < r) {
+ u32 m = (l + r) / 2;
+ ReChRange x = p->v[m];
+ if (c < x.min) r = m;
+ else if (c > x.max) l = m + 1;
+ else return !p->invert;
+ }
+ return p->invert;
+}
+
+static inline int re_char_set_packed_has(ReOp o, u32 c) {
+ /* will count (c = 0) as true for non-full-length packed char sets.
+ * is that a problem?
+ */
+ u64 b = o.align;
+ int r = 0;
+ for (u32 i = 0; i < 7; i++) {
+ r |= (b & 0xff) == c;
+ b >>= 8;
+ }
+ return r;
+}
+
+typedef enum {
+ RE_THREAD_AT_START = 1,
+ RE_THREAD_AT_END = 2
+} ReThreadStepFlags;
+
+/* 1 for match found, 0 for normal exit, -1 for reexecute in place */
+static inline int re_thread_step(ReSearch *s, ReThread *t, u32 i, ReThreadStepFlags f) {
+ Arena *a = s->a;
+ RegEx *re = s->re;
+ ReOp o = re->op.v[t->i];
+ switch (o.op) {
+ case RE_CHAR:
+ if (s->c != o.c) return 0;
+ break;
+ case RE_CHAR_NOT:
+ if (s->c == o.c) return 0;
+ break;
+ case RE_CHAR_ANY:
+ if (s->c == '\n') return 0;
+ break;
+ case RE_CHAR_SET:
+ if (!re_char_set_has(re, o.c, s->c)) return 0;
+ break;
+ case RE_CHAR_RANGE:
+ if (s->c < o.a || s->c > o.b) return 0;
+ break;
+ case RE_CHAR_RANGE_NOT:
+ if (s->c >= o.a && s->c <= o.b) return 0;
+ break;
+ case RE_CHAR_SET_PACKED:
+ if (!re_char_set_packed_has(o, s->c)) return 0;
+ break;
+ case RE_CHAR_SET_PACKED_NOT:
+ if (re_char_set_packed_has(o, s->c)) return 0;
+ break;
+ case RE_LINE_START:
+ if (s->c_prev != '\n' && !(f & RE_THREAD_AT_START)) return 0;
+ t->i++;
+ return -1;
+ case RE_LINE_END:
+ if (s->c != '\n' && !(f & RE_THREAD_AT_END)) return 0;
+ t->i++;
+ return -1;
+ case RE_MATCH:
+ return 1;
+ case RE_FAIL:
+ return 0;
+ case RE_JUMP:
+ t->i = o.c;
+ return -1;
+ case RE_SPLIT:
+ re_threadlist_add(&s->tcur, o.b, t->grp, re->groups, a);
+ t->i = o.a;
+ return -1;
+ case RE_GROUP_START:
+ t->grp[o.c].start = s->total_idx + i;
+ t->i++;
+ return -1;
+ case RE_GROUP_END:
+ t->grp[o.c].len = s->total_idx + i - t->grp[o.c].start;
+ t->i++;
+ return -1;
+ default:
+ UNREACHABLE;
+ break;
+ }
+ re_threadlist_add(&s->tnext, t->i + 1, t->grp, re->groups, a);
+ return 0;
+}
+
+static inline int re_thread_final(ReSearch *s, ReThread *t, u32 i, ReThreadStepFlags f) {
+ /* halt at any opcodes that involve character matching */
+ /* we could check for C_EOF in all of them, but it's easier this way */
+ switch (s->re->op.v[t->i].op) {
+ case RE_LINE_END:
+ case RE_MATCH:
+ case RE_FAIL:
+ case RE_JUMP:
+ case RE_SPLIT:
+ case RE_GROUP_START:
+ case RE_GROUP_END:
+ return re_thread_step(s, t, i, f);
+ default:
+ return 0;
+ }
+}
+
+static inline int re_threadlist_step(ReSearch *s, ReThreadList *tl, u32 i, ReThreadStepFlags f) {
+ int match = 0;
+ for (u32 j = 0; j < tl->n; j++) {
+ ReThread *t = &tl->v[j];
+ ASSERT(t->i < s->re->op.n);
+ int r = re_thread_step(s, t, i, f);
+ if (r < 0) {
+ j--;
+ } else if (r > 0) {
+ match = 1;
+ s->match_end = s->total_idx + i;
+ memcpy(s->grp, t->grp, sizeof(ReSpan) * s->re->groups);
+ }
+ }
+ return match;
+}
+
+static inline int re_step(ReSearch *s, u32 i, ReThreadStepFlags f) {
+ int r = re_threadlist_step(s, &s->tcur, i, f);
+ f = 0;
+ s->c_prev = s->c;
+ ReThreadList tmp = s->tcur;
+ s->tcur = s->tnext;
+ s->tnext = tmp;
+ re_threadlist_clr(&s->tnext);
+ return r;
+}
+
+static inline int re_step_final(ReSearch *s, u32 i, ReThreadStepFlags f) {
+ int found = 0;
+ for (u32 j = 0; j < s->tcur.n; j++) {
+ int r = re_thread_final(s, &s->tcur.v[j], i, f);
+ if (r < 0) {
+ j--;
+ } else if (r > 0) {
+ found = 1;
+ }
+ }
+ re_threadlist_clr(&s->tcur);
+ re_threadlist_clr(&s->tnext);
+ return found;
+}
+
+static inline isize re_search_try_match(ReSearch *s, size_t i, size_t n) {
+ RegEx *re = s->re;
+ size_t start = s->total_idx;
+ if (~s->flags & RE_SEARCH_MID_MATCH) {
+ re_threadlist_clr(&s->tcur);
+ re_threadlist_clr(&s->tnext);
+ re_threadlist_add(&s->tcur, 0, s->grp, re->groups, s->a);
+ s->flags |= RE_SEARCH_MID_MATCH;
+ s->match_start = start + i;
+ s->match_end = 0;
+ }
+ isize found_i = -1;
+ ReThreadStepFlags f = 0;
+ if (i + start == 0) f = RE_THREAD_AT_START;
+
+ for (; i < n && s->tcur.n > 0; i++) {
+ s->c = (unsigned char)s->buf[i];
+ if (re_step(s, i, f)) {
+ found_i = i;
+ s->match_end = start + i;
+ }
+ }
+ if (s->flags & RE_SEARCH_LAST_CHUNK) {
+ f |= RE_THREAD_AT_END;
+ s->c = C_EOF;
+ if (re_step_final(s, i, f)) {
+ found_i = i;
+ s->match_end = start + i;
+ }
+ }
+
+ if (s->tcur.n > 0) return i;
+ s->flags &= ~RE_SEARCH_MID_MATCH;
+ return found_i;
+}
+
+/* public facing API */
+
+void re_search_start(ReSearch *s, RegEx *re, Arena *a) {
+ *s = (ReSearch) {
+ .a = a,
+ .re = re,
+ .grp = new_arr(a, ReSpan, re->groups),
+ .flags = RE_SEARCH_FIRST_CHUNK
+ };
+ re_threadlist_alloc(&s->tcur, re->op.n, a);
+ re_threadlist_alloc(&s->tnext, re->op.n, a);
+}
+
+void re_search_chunk(ReSearch *s, const char *buf, size_t n) {
+ if (s->buf_len) s->flags &= ~RE_SEARCH_FIRST_CHUNK;
+ s->total_idx += s->buf_len;
+ s->buf = buf;
+ s->buf_len = n;
+ s->buf_idx = 0;
+}
+
+void re_search_last_chunk(ReSearch *s) {
+ s->flags |= RE_SEARCH_LAST_CHUNK;
+}
+
+
+/* searching */
+
+/* returns final index undescriminately.
+ * check s->flags to find if the match is done yet
+ */
+
+int re_search_match(ReSearch *s, ReMatch *m) {
+ size_t i = s->buf_idx;
+ size_t n = s->buf_len;
+ while (i < n) {
+ if (s->re->first_byte && (~s->flags & RE_SEARCH_MID_MATCH)) {
+ const char *p = memchr(s->buf + i, s->re->first_byte, n - i);
+ if (!p) break;
+ i = p - s->buf;
+ }
+ isize r = re_search_try_match(s, i, n);
+ if (r < 0) {
+ i++;
+ } else {
+ if (i == (usize)r) r++;
+ if (s->flags & RE_SEARCH_MID_MATCH) {
+ i = r;
+ } else {
+ s->buf_idx = r;
+ m->grp = s->grp;
+ ASSERT(s->match_start <= s->total_idx + i);
+ ASSERT(s->match_end >= s->match_start);
+ m->extent.start = s->match_start;
+ m->extent.len = s->match_end - s->match_start;
+ return 1;
+ }
+ }
+ }
+ s->buf_idx = s->buf_len;
+ return 0;
+}
+
+/* convenience wrappers */
+
+ReMatchList re_match_all(RegEx *re, Str s, Arena *a) {
+ ReSearch sr = { 0 };
+ ReMatchList ml = { 0 };
+ ReMatch m;
+ re_search_start(&sr, re, a);
+ re_search_chunk(&sr, s.s, s.n);
+ re_search_last_chunk(&sr);
+ while (re_search_match(&sr, &m)) {
+ ReSpan *grp = new_arr(a, ReSpan, re->groups);
+ memcpy(grp, m.grp, sizeof(ReSpan) * re->groups);
+ DA_APUSH(&ml, a, (ReMatch) { .extent = m.extent, .grp = grp });
+ }
+ return ml;
+}
+
+int re_match_full(RegEx *re, Str s, Arena *a) {
+ ReSearch sr = { 0 };
+ ReMatch m;
+ re_search_start(&sr, re, a);
+ re_search_chunk(&sr, s.s, s.n);
+ re_search_last_chunk(&sr);
+ return re_search_match(&sr, &m) && m.extent.start == 0 && m.extent.len == s.n;
+}
+
+int re_match(RegEx *re, Str s, ReMatch *out, Arena *a) {
+ ReSearch sr = { 0 };
+ re_search_start(&sr, re, a);
+ re_search_chunk(&sr, s.s, s.n);
+ re_search_last_chunk(&sr);
+ return re_search_match(&sr, out);
+}
diff --git a/regex.h b/regex.h
new file mode 100644
index 0000000..38089d3
--- /dev/null
+++ b/regex.h
@@ -0,0 +1,126 @@
+#ifndef REGEX_H
+#define REGEX_H
+
+#include <stdint.h>
+
+#include "arena.h"
+#include "dynarr.h"
+#include "str.h"
+
+typedef enum : u8 {
+ RE_CHAR,
+ RE_CHAR_NOT,
+ RE_CHAR_ANY,
+ RE_CHAR_SET,
+ RE_CHAR_RANGE,
+ RE_CHAR_RANGE_NOT,
+ RE_CHAR_SET_PACKED,
+ RE_CHAR_SET_PACKED_NOT,
+ RE_LINE_START,
+ RE_LINE_END,
+ RE_MATCH,
+ RE_FAIL,
+ RE_JUMP,
+ RE_SPLIT,
+ RE_GROUP_START,
+ RE_GROUP_END,
+ RE_LABEL /* only used during codegen */
+} ReOpType;
+
+typedef union {
+ struct {
+ ReOpType op;
+ union {
+ uint32_t c;
+ struct {
+ u16 a, b;
+ };
+ };
+ };
+ u64 align;
+} ReOp;
+
+typedef struct {
+ uint32_t min, max;
+} ReChRange;
+
+typedef struct {
+ /* sorted list of non-overlapping character ranges */
+ ReChRange *v;
+ uint32_t n;
+ int invert;
+} ReChSet;
+
+/* for RE_CHAR */
+typedef enum {
+ C_ANY = 0x80000000,
+ C_LINE_START,
+ C_LINE_END,
+ C_EOF = 0xffffffff
+} ReChSpecial;
+
+typedef struct {
+ DYNARR(ReOp) op;
+ DYNARR(ReChSet) cset;
+ uint32_t groups;
+ u8 first_byte; /* 0 if unknown */
+} RegEx;
+
+typedef struct {
+ uint32_t start, len;
+} ReSpan;
+
+typedef struct {
+ ReSpan extent;
+ ReSpan *grp;
+} ReMatch;
+
+typedef DYNARR(ReMatch) ReMatchList;
+
+typedef struct {
+ u32 i;
+ ReSpan *grp;
+} ReThread;
+
+typedef struct {
+ ReThread *v;
+ uintmax_t *set;
+ size_t n;
+} ReThreadList;
+
+typedef enum {
+ RE_SEARCH_FIRST_CHUNK = 1,
+ RE_SEARCH_LAST_CHUNK = 2,
+ RE_SEARCH_MID_MATCH = 4,
+} ReSearchFlags;
+
+typedef struct {
+ Arena *a;
+ RegEx *re;
+ ReSpan *grp;
+ const char *buf;
+ size_t buf_len, buf_idx;
+ size_t total_idx;
+ size_t match_start, match_end;
+ ReThreadList tcur, tnext;
+ ReSearchFlags flags;
+ uint32_t c, c_prev;
+} ReSearch;
+
+typedef enum {
+ RE_COMP_NO_GROUPS = 1
+} ReCompFlags;
+
+int re_comp(RegEx *re, Str src, Arena *perm, Arena *scratch);
+int re_comp_ex(RegEx *re, Str src, Arena *perm, Arena *scratch, ReCompFlags flags);
+
+void re_search_start(ReSearch *s, RegEx *re, Arena *a);
+void re_search_chunk(ReSearch *s, const char *buf, size_t n);
+void re_search_last_chunk(ReSearch *s);
+int re_search_match(ReSearch *s, ReMatch *m);
+
+ReMatchList re_match_all(RegEx *re, Str s, Arena *a);
+int re_match_full(RegEx *re, Str s, Arena *a);
+int re_match(RegEx *re, Str s, ReMatch *out, Arena *a);
+
+#endif