From d723f8a5d54f098f0cf378e5dcf3a7d4ec049822 Mon Sep 17 00:00:00 2001 From: WormHeamer Date: Wed, 31 Dec 2025 05:01:16 -0500 Subject: add regex search (only forwards for now) --- regex.h | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 regex.h (limited to 'regex.h') diff --git a/regex.h b/regex.h new file mode 100644 index 0000000..38089d3 --- /dev/null +++ b/regex.h @@ -0,0 +1,126 @@ +#ifndef REGEX_H +#define REGEX_H + +#include + +#include "arena.h" +#include "dynarr.h" +#include "str.h" + +typedef enum : u8 { + RE_CHAR, + RE_CHAR_NOT, + RE_CHAR_ANY, + RE_CHAR_SET, + RE_CHAR_RANGE, + RE_CHAR_RANGE_NOT, + RE_CHAR_SET_PACKED, + RE_CHAR_SET_PACKED_NOT, + RE_LINE_START, + RE_LINE_END, + RE_MATCH, + RE_FAIL, + RE_JUMP, + RE_SPLIT, + RE_GROUP_START, + RE_GROUP_END, + RE_LABEL /* only used during codegen */ +} ReOpType; + +typedef union { + struct { + ReOpType op; + union { + uint32_t c; + struct { + u16 a, b; + }; + }; + }; + u64 align; +} ReOp; + +typedef struct { + uint32_t min, max; +} ReChRange; + +typedef struct { + /* sorted list of non-overlapping character ranges */ + ReChRange *v; + uint32_t n; + int invert; +} ReChSet; + +/* for RE_CHAR */ +typedef enum { + C_ANY = 0x80000000, + C_LINE_START, + C_LINE_END, + C_EOF = 0xffffffff +} ReChSpecial; + +typedef struct { + DYNARR(ReOp) op; + DYNARR(ReChSet) cset; + uint32_t groups; + u8 first_byte; /* 0 if unknown */ +} RegEx; + +typedef struct { + uint32_t start, len; +} ReSpan; + +typedef struct { + ReSpan extent; + ReSpan *grp; +} ReMatch; + +typedef DYNARR(ReMatch) ReMatchList; + +typedef struct { + u32 i; + ReSpan *grp; +} ReThread; + +typedef struct { + ReThread *v; + uintmax_t *set; + size_t n; +} ReThreadList; + +typedef enum { + RE_SEARCH_FIRST_CHUNK = 1, + RE_SEARCH_LAST_CHUNK = 2, + RE_SEARCH_MID_MATCH = 4, +} ReSearchFlags; + +typedef struct { + Arena *a; + RegEx *re; + ReSpan *grp; + const char *buf; + size_t buf_len, buf_idx; + size_t total_idx; + size_t match_start, match_end; + ReThreadList tcur, tnext; + ReSearchFlags flags; + uint32_t c, c_prev; +} ReSearch; + +typedef enum { + RE_COMP_NO_GROUPS = 1 +} ReCompFlags; + +int re_comp(RegEx *re, Str src, Arena *perm, Arena *scratch); +int re_comp_ex(RegEx *re, Str src, Arena *perm, Arena *scratch, ReCompFlags flags); + +void re_search_start(ReSearch *s, RegEx *re, Arena *a); +void re_search_chunk(ReSearch *s, const char *buf, size_t n); +void re_search_last_chunk(ReSearch *s); +int re_search_match(ReSearch *s, ReMatch *m); + +ReMatchList re_match_all(RegEx *re, Str s, Arena *a); +int re_match_full(RegEx *re, Str s, Arena *a); +int re_match(RegEx *re, Str s, ReMatch *out, Arena *a); + +#endif -- cgit v1.2.3