summary | refs | log | tree | commit | diff
path: root/regex.c
diff options
context:
space:
mode:
Diffstat (limited to 'regex.c')
-rw-r--r-- regex.c 39
1 files changed, 32 insertions, 7 deletions
diff --git a/regex.c b/regex.c
index b73cd42..222aa9d 100644
--- a/regex.c
+++ b/regex.c
@@ -8,6 +8,7 @@
#include "dynarr.h"
#include "str.h"
#include "regex.h"
+#include "utf8.h"
/* bitsets */
@@ -29,12 +30,17 @@ typedef struct {
} Label;
typedef struct { /* Counted string of 32-bit units; built from a byte Str by re_str_to_str32(). */
+ u32 *s; /* pointer to the u32 elements */
+ isize n; /* number of elements stored at s */
+} ReStr32;
+
+typedef struct { /* State carried through regex compilation; initialized in re_comp_ex(). */
RegEx *re; /* regex object being compiled */
Arena *perm, *scratch; /* the two arenas handed to re_comp_ex() */
DYNARR(u32) lbl;
ReCompFlags flags; /* flags passed to re_comp_ex() */
ReCompErr err; /* starts as RE_COMP_ENONE in re_comp_ex() */
- Str s;
+ ReStr32 s; /* pattern source converted to 32-bit units (previously the raw byte Str) */
int i; /* presumably the current read position within s -- TODO confirm against the parser code */
} ReParser;
@@ -564,7 +570,7 @@ again:
re_comp_dcelim(s);
}
-u8 re_comp_first_byte(ReParser *s) {
+u32 re_comp_first_char32(ReParser *s) {
RegEx *re = s->re;
for (u32 i = 0; i < re->op.n; ) {
ReOp *o = &re->op.v[i];
@@ -586,6 +592,14 @@ u8 re_comp_first_byte(ReParser *s) {
return 0;
}
+u8 re_comp_first_byte(ReParser *s) { /* Byte-level wrapper around re_comp_first_char32(): returns buf[0] after UTF-8 encoding its result, or 0 when that result is 0. */
+ u32 c = re_comp_first_char32(s);
+ if (c == 0) return 0; /* a 0 result is passed through without encoding */
+ char buf[4]; /* NOTE(review): assumes utf8_encode writes at most 4 bytes for one code point -- confirm in utf8.h */
+ utf8_encode(buf, &c, 1); /* presumably encodes the single code point c into buf; NOTE(review): buf is uninitialized and no result is checked here -- confirm utf8_encode always writes buf[0] */
+ return buf[0]; /* only the leading byte is returned (char converted to u8) */
+}
+
int re_comp_fin(ReParser *s) {
re_emit0(s, RE_MATCH);
re_comp_opt(s);
@@ -594,12 +608,19 @@ int re_comp_fin(ReParser *s) {
return 0;
}
+static inline ReStr32 re_str_to_str32(Str s, Arena *a) { /* Converts byte string s into a ReStr32 whose u32 array is allocated from arena a. */
+ u32 n = utf8_decode_len(s.s, s.n); /* presumably the number of code points encoded in the s.n bytes of s -- confirm in utf8.h */
+ u32 *v = new_arr(a, u32, n); /* room for n u32 values, taken from a */
+ utf8_decode(v, s.s, n); /* NOTE(review): the third argument is the value from utf8_decode_len, not the byte length s.n -- confirm this matches utf8_decode's contract */
+ return (ReStr32) { v, n }; /* n (u32) is converted to the isize field ReStr32.n */
+}
+
int re_comp_ex(RegEx *re, Str src, Arena *perm, Arena *scratch, ReCompFlags flags) {
ReParser s = {
.re = re,
.perm = perm,
.scratch = scratch,
- .s = src,
+ .s = re_str_to_str32(src, scratch),
.flags = flags,
.err = RE_COMP_ENONE
};
@@ -827,8 +848,8 @@ static inline isize re_search_try_match(ReSearch *s, size_t i, size_t n) {
ReThreadStepFlags f = 0;
if (i == 0 && (s->flags & (RE_SEARCH_FIRST_CHUNK | RE_SEARCH_WAS_NEWLINE))) f = RE_THREAD_AT_START;
- for (; i < n && s->tcur.n > 0; i++) {
- s->c = (unsigned char)s->buf[i];
+ for (; i < n && s->tcur.n > 0; ) {
+ i += utf8_to_char32(s->buf, i, n, &s->c);
if (re_step(s, i, f)) {
found_i = i;
s->match_end = start + i;
@@ -897,7 +918,9 @@ static inline void re_search_chunk_fin(ReSearch *s) {
int re_search_match_at_start(ReSearch *s, ReMatch *m) {
size_t i = s->buf_idx;
size_t n = s->buf_len;
- while (i < n) {
+ for (;;) {
+ while (i < n && (s->buf[i] & 0xc0) == 0x80) i++;
+ if (i == n) break;
isize r = re_search_try_match(s, i, n);
if (r < 0) {
return 0;
@@ -923,7 +946,9 @@ int re_search_match_at_start(ReSearch *s, ReMatch *m) {
int re_search_match(ReSearch *s, ReMatch *m) {
u32 i = s->buf_idx;
u32 n = s->buf_len;
- while (i < n) {
+ for (;;) {
+ while (i < n && (s->buf[i] & 0xc0) == 0x80) i++;
+ if (i == n) break;
if (s->re->first_byte && (~s->flags & RE_SEARCH_MID_MATCH)) {
const char *p = memchr(s->buf + i, s->re->first_byte, n - i);
if (!p) break;