unicode regexes

author: WormHeamer 2026-01-03 02:10:52 -0500
committer: WormHeamer 2026-01-03 02:10:52 -0500
commit: 02af004a83120468f69786c2d9ec6f44d08d1c1f (patch)
tree: 6e4a35ed5b441cd91156b6882b4495942e12617a
parent: b47bf27f4b3205ec92bd08a918143d1fc467f32a (diff)
2 files changed, 52 insertions, 7 deletions
diff --git a/regex.c b/regex.c
index b73cd42..222aa9d 100644
--- a/regex.c
+++ b/regex.c
@@ -8,6 +8,7 @@
 #include "dynarr.h"
 #include "str.h"
 #include "regex.h"
+#include "utf8.h"
 
 /* bitsets */
 
@@ -29,12 +30,17 @@ typedef struct {
 } Label;
 
 typedef struct {
+	u32 *s; /* decoded code points (filled by utf8_decode in re_str_to_str32) */
+	isize n; /* number of code points stored in s */
+} ReStr32;
+
+typedef struct {
 	RegEx *re;
 	Arena *perm, *scratch;
 	DYNARR(u32) lbl;
 	ReCompFlags flags;
 	ReCompErr err;
-	Str s;
+	ReStr32 s; /* pattern source as code points; re_comp_ex fills it via re_str_to_str32 */
 	int i;
 } ReParser;
 
@@ -564,7 +570,7 @@ again:
 	re_comp_dcelim(s);
 }
 
-u8 re_comp_first_byte(ReParser *s) {
+u32 re_comp_first_char32(ReParser *s) {
 	RegEx *re = s->re;
 	for (u32 i = 0; i < re->op.n; ) {
 		ReOp *o = &re->op.v[i];
@@ -586,6 +592,14 @@ u8 re_comp_first_byte(ReParser *s) {
 	return 0;
 }
 
+u8 re_comp_first_byte(ReParser *s) { /* first byte of the UTF-8 encoding of re_comp_first_char32(s); 0 when that returns 0 */
+	char enc[4];
+	u32 cp = re_comp_first_char32(s);
+	if (!cp) return 0;
+	utf8_encode(enc, &cp, 1);
+	return enc[0];
+}
+
 int re_comp_fin(ReParser *s) {
 	re_emit0(s, RE_MATCH);
 	re_comp_opt(s);
@@ -594,12 +608,19 @@ int re_comp_fin(ReParser *s) {
 	return 0;
 }
 
+static inline ReStr32 re_str_to_str32(Str s, Arena *a) { /* decode UTF-8 s into a u32 array obtained from a via new_arr */
+	u32 count = utf8_decode_len(s.s, s.n);
+	ReStr32 out = { .s = new_arr(a, u32, count), .n = count };
+	utf8_decode(out.s, s.s, count);
+	return out;
+}
+
 int re_comp_ex(RegEx *re, Str src, Arena *perm, Arena *scratch, ReCompFlags flags) {
 	ReParser s = {
 		.re = re,
 		.perm = perm,
 		.scratch = scratch,
-		.s = src,
+		.s = re_str_to_str32(src, scratch),
 		.flags = flags,
 		.err = RE_COMP_ENONE
 	};
@@ -827,8 +848,8 @@ static inline isize re_search_try_match(ReSearch *s, size_t i, size_t n) {
 	ReThreadStepFlags f = 0;
 	if (i == 0 && (s->flags & (RE_SEARCH_FIRST_CHUNK | RE_SEARCH_WAS_NEWLINE))) f = RE_THREAD_AT_START;
 
-	for (; i < n && s->tcur.n > 0; i++) {
-		s->c = (unsigned char)s->buf[i];
+	for (; i < n && s->tcur.n > 0; ) {
+		i += utf8_to_char32(s->buf, i, n, &s->c);
 		if (re_step(s, i, f)) {
 			found_i = i;
 			s->match_end = start + i;
@@ -897,7 +918,9 @@ static inline void re_search_chunk_fin(ReSearch *s) {
 int re_search_match_at_start(ReSearch *s, ReMatch *m) {
 	size_t i = s->buf_idx;
 	size_t n = s->buf_len;
-	while (i < n) {
+	for (;;) {
+		while (i < n && (s->buf[i] & 0xc0) == 0x80) i++;
+		if (i == n) break;
 		isize r = re_search_try_match(s, i, n);
 		if (r < 0) {
 			return 0;
@@ -923,7 +946,9 @@ int re_search_match_at_start(ReSearch *s, ReMatch *m) {
 int re_search_match(ReSearch *s, ReMatch *m) {
 	u32 i = s->buf_idx;
 	u32 n = s->buf_len;
-	while (i < n) {
+	for (;;) {
+		while (i < n && (s->buf[i] & 0xc0) == 0x80) i++;
+		if (i == n) break;
 		if (s->re->first_byte && (~s->flags & RE_SEARCH_MID_MATCH)) {
 			const char *p = memchr(s->buf + i, s->re->first_byte, n - i);
 			if (!p) break;
diff --git a/utf8.h b/utf8.h
index db98e18..0f7b197 100644
--- a/utf8.h
+++ b/utf8.h
@@ -15,6 +15,7 @@ u32 utf8_encode_len(const u32 *src, u32 cp_count);
 void utf8_decode(u32 *dst, const char *src, u32 cp_count);
 void utf8_encode(char *dst, const u32 *src, u32 cp_count);
 u32 utf8_decode_at(const char *s, u32 i, u32 n);
+int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout); /* decodes the sequence at s[i] into *cout; returns bytes consumed (0 if i >= n) */
 int utf8_validate(const char *src, u32 n);
 
 #ifdef UTF8_IMPL
@@ -86,6 +87,25 @@ u32 utf8_decode_at(const char *s, u32 i, u32 n) {
 	return cp;
 }
 
+int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout) { /* decode the sequence at s[i] (n = buffer length) into *cout; returns bytes consumed, 0 if i >= n */
+	if (i >= n) return 0;
+	u8 b = s[i];
+	if (~b & 0x80) { /* ASCII: single byte */
+		*cout = b;
+		return 1;
+	}
+	u32 bits = stdc_leading_ones(b); /* sequence length announced by the lead byte */
+	*cout = UTF8_INVALID; /* reported for every malformed case below, each consuming 1 byte */
+	if (bits < 2 || bits > 4 || bits > n - i) return 1; /* stray continuation, bad lead, or truncated */
+	u32 cp = b & (0xff >> bits);
+	for (u32 j = 1; j < bits; j++) {
+		if ((s[i + j] & 0xc0) != 0x80) return 1; /* not a continuation byte */
+		cp = (cp << 6) | (s[i + j] & 0x3f);
+	}
+	*cout = cp;
+	return bits;
+}
+
 int utf8_validate(const char *src, u32 n) {
 	/* TODO: rewrite this to be faster */
 	for (u32 i = 0; i < n; i++) {
author	WormHeamer	2026-01-03 02:10:52 -0500
committer	WormHeamer	2026-01-03 02:10:52 -0500
commit	02af004a83120468f69786c2d9ec6f44d08d1c1f (patch)
tree	6e4a35ed5b441cd91156b6882b4495942e12617a
parent	b47bf27f4b3205ec92bd08a918143d1fc467f32a (diff)