diff options
| field | value | date |
|---|---|---|
| author | WormHeamer | 2026-01-03 02:10:52 -0500 |
| committer | WormHeamer | 2026-01-03 02:10:52 -0500 |
| commit | 02af004a83120468f69786c2d9ec6f44d08d1c1f (patch) | |
| tree | 6e4a35ed5b441cd91156b6882b4495942e12617a | |
| parent | b47bf27f4b3205ec92bd08a918143d1fc467f32a (diff) | |
unicode regexes

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | regex.c | 39 |
| -rw-r--r-- | utf8.h | 20 |

2 files changed, 52 insertions, 7 deletions
```diff
--- a/regex.c
+++ b/regex.c
@@ -8,6 +8,7 @@
 #include "dynarr.h"
 #include "str.h"
 #include "regex.h"
+#include "utf8.h"
 
 /* bitsets */
 
@@ -29,12 +30,17 @@ typedef struct {
 } Label;
 
 typedef struct {
+    u32 *s;
+    isize n;
+} ReStr32;
+
+typedef struct {
     RegEx *re;
     Arena *perm, *scratch;
     DYNARR(u32) lbl;
     ReCompFlags flags;
     ReCompErr err;
-    Str s;
+    ReStr32 s;
     int i;
 } ReParser;
 
@@ -564,7 +570,7 @@ again:
     re_comp_dcelim(s);
 }
 
-u8 re_comp_first_byte(ReParser *s) {
+u32 re_comp_first_char32(ReParser *s) {
     RegEx *re = s->re;
     for (u32 i = 0; i < re->op.n; ) {
         ReOp *o = &re->op.v[i];
@@ -586,6 +592,14 @@ u8 re_comp_first_byte(ReParser *s) {
     return 0;
 }
 
+u8 re_comp_first_byte(ReParser *s) {
+    u32 c = re_comp_first_char32(s);
+    if (c == 0) return 0;
+    char buf[4];
+    utf8_encode(buf, &c, 1);
+    return buf[0];
+}
+
 int re_comp_fin(ReParser *s) {
     re_emit0(s, RE_MATCH);
     re_comp_opt(s);
@@ -594,12 +608,19 @@ int re_comp_fin(ReParser *s) {
     return 0;
 }
 
+static inline ReStr32 re_str_to_str32(Str s, Arena *a) {
+    u32 n = utf8_decode_len(s.s, s.n);
+    u32 *v = new_arr(a, u32, n);
+    utf8_decode(v, s.s, n);
+    return (ReStr32) { v, n };
+}
+
 int re_comp_ex(RegEx *re, Str src, Arena *perm, Arena *scratch, ReCompFlags flags) {
     ReParser s = {
         .re = re,
         .perm = perm,
         .scratch = scratch,
-        .s = src,
+        .s = re_str_to_str32(src, scratch),
         .flags = flags,
         .err = RE_COMP_ENONE
     };
@@ -827,8 +848,8 @@ static inline isize re_search_try_match(ReSearch *s, size_t i, size_t n) {
     ReThreadStepFlags f = 0;
     if (i == 0 && (s->flags & (RE_SEARCH_FIRST_CHUNK | RE_SEARCH_WAS_NEWLINE)))
         f = RE_THREAD_AT_START;
-    for (; i < n && s->tcur.n > 0; i++) {
-        s->c = (unsigned char)s->buf[i];
+    for (; i < n && s->tcur.n > 0; ) {
+        i += utf8_to_char32(s->buf, i, n, &s->c);
         if (re_step(s, i, f)) {
             found_i = i;
             s->match_end = start + i;
@@ -897,7 +918,9 @@ static inline void re_search_chunk_fin(ReSearch *s) {
 int re_search_match_at_start(ReSearch *s, ReMatch *m) {
     size_t i = s->buf_idx;
     size_t n = s->buf_len;
-    while (i < n) {
+    for (;;) {
+        while (i < n && (s->buf[i] & 0xc0) == 0x80) i++;
+        if (i == n) break;
         isize r = re_search_try_match(s, i, n);
         if (r < 0) {
             return 0;
@@ -923,7 +946,9 @@ int re_search_match_at_start(ReSearch *s, ReMatch *m) {
 int re_search_match(ReSearch *s, ReMatch *m) {
     u32 i = s->buf_idx;
     u32 n = s->buf_len;
-    while (i < n) {
+    for (;;) {
+        while (i < n && (s->buf[i] & 0xc0) == 0x80) i++;
+        if (i == n) break;
         if (s->re->first_byte && (~s->flags & RE_SEARCH_MID_MATCH)) {
             const char *p = memchr(s->buf + i, s->re->first_byte, n - i);
             if (!p) break;
--- a/utf8.h
+++ b/utf8.h
@@ -15,6 +15,7 @@ u32 utf8_encode_len(const u32 *src, u32 cp_count);
 void utf8_decode(u32 *dst, const char *src, u32 cp_count);
 void utf8_encode(char *dst, const u32 *src, u32 cp_count);
 u32 utf8_decode_at(const char *s, u32 i, u32 n);
+int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout);
 int utf8_validate(const char *src, u32 n);
 
 #ifdef UTF8_IMPL
@@ -86,6 +87,25 @@ u32 utf8_decode_at(const char *s, u32 i, u32 n) {
     return cp;
 }
 
+int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout) {
+    if (i >= n) return 0;
+    u8 b = s[i];
+    if (~b & 0x80) {
+        *cout = b;
+        return 1;
+    }
+    u32 bits = stdc_leading_ones(b);
+    if (i + bits > n || bits > 4) {
+        *cout = UTF8_INVALID;
+        return 1;
+    }
+    u32 cp = b & (0xff >> bits);
+    for (u32 j = bits; --j;)
+        cp = (cp << 6) | (s[++i] & 0x3f);
+    *cout = cp;
+    return bits;
+}
+
 int utf8_validate(const char *src, u32 n) {
     /* TODO: rewrite this to be faster */
     for (u32 i = 0; i < n; i++) {
```
