summaryrefslogtreecommitdiff
path: root/regex.h
blob: d6dbbbcbe5b87809d05f0672c0be161a2cc1cda0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#ifndef REGEX_H
#define REGEX_H

#include <stdint.h>

#include "arena.h"
#include "dynarr.h"
#include "str.h"

/*
 * Opcode tag for one compiled instruction; this is the type of ReOp.op.
 * The enumerators rely on implicit numbering starting at 0 (RE_CHAR).
 * The `: u8` fixed underlying type is C23 syntax; u8 is presumably an
 * 8-bit unsigned typedef from one of the project headers -- TODO confirm.
 * NOTE(review): the meaning of each opcode is defined by the compiler and
 * matcher, which are not visible from this header.
 */
typedef enum : u8 {
	RE_CHAR,
	RE_CHAR_NOT,
	RE_CHAR_ANY,
	RE_CHAR_SET,
	RE_CHAR_RANGE,
	RE_CHAR_RANGE_NOT,
	RE_CHAR_SET_PACKED,
	RE_CHAR_SET_PACKED_NOT,
	RE_LINE_START,
	RE_LINE_END,
	RE_MATCH,
	RE_FAIL,
	RE_JUMP,
	RE_SPLIT,
	RE_GROUP_START,
	RE_GROUP_END,
	RE_LABEL /* only used during codegen */
} ReOpType;

/*
 * One compiled instruction: an opcode tag (op) followed by an operand that
 * can be read either as a single 32-bit value (c) or as two 16-bit values
 * (a, b). The inner struct and union are anonymous, so the members are
 * reached directly as x.op, x.c, x.a and x.b.
 * The `align` member gives the union at least the size and alignment of a
 * u64.
 * NOTE(review): which opcodes use c and which use a/b is decided by the
 * implementation -- not visible from this header.
 */
typedef union {
	struct {
		ReOpType op;
		union {
			uint32_t c;
			struct {
				u16 a, b;
			};
		};
	};
	u64 align;
} ReOp;

/*
 * A range of character values delimited by min and max.
 * NOTE(review): presumably inclusive at both ends -- confirm against the
 * matcher.
 */
typedef struct {
	uint32_t min;
	uint32_t max;
} ReChRange;

/*
 * A character set stored as an array of n ranges pointed to by v.
 * NOTE(review): `invert` presumably marks a negated set (match anything
 * NOT covered by the ranges) -- confirm in the implementation.
 */
typedef struct {
	/* sorted list of non-overlapping character ranges */
	ReChRange *v;
	uint32_t n;
	int invert;
} ReChSet;

/*
 * Special character codes, for RE_CHAR.
 * All three values sit at 0x40000000 and above, which is beyond the largest
 * Unicode code point (0x10FFFF).
 * NOTE(review): presumably chosen so they can share a uint32_t slot with
 * real characters without colliding -- confirm where they are produced and
 * consumed.
 */
typedef enum {
	C_ANY        = 0x40000000,
	C_LINE_START = 0x40000001,
	C_LINE_END   = 0x40000002,
} ReChSpecial;

/*
 * A compiled regular expression: an array of instructions (op), an array
 * of character sets (cset), a group count and an optional first-byte hint.
 * NOTE(review): DYNARR is a project macro (presumably from dynarr.h) that
 * declares a growable array type -- confirm its layout there.
 * NOTE(review): `groups` is presumably the number of capture groups, and
 * `first_byte` a byte every match must start with, usable as a search
 * prefilter -- confirm in the implementation.
 */
typedef struct {
	DYNARR(ReOp) op;
	DYNARR(ReChSet) cset;
	uint32_t groups;
	u8 first_byte; /* 0 if unknown */
} RegEx;

/*
 * A span described by a start position and a length.
 * NOTE(review): presumably byte offsets into the searched input -- confirm
 * against the matcher.
 */
typedef struct {
	uint32_t start;
	uint32_t len;
} ReSpan;

/*
 * One match result: the span of the whole match (extent) plus a pointer to
 * an array of group spans (grp).
 * NOTE(review): the length of grp is not stored here; presumably it equals
 * RegEx.groups, and grp may be unset when groups are disabled -- confirm in
 * the implementation.
 */
typedef struct {
	ReSpan extent;
	ReSpan *grp;
} ReMatch;

/*
 * Growable array of ReMatch, as returned by re_match_all().
 * NOTE(review): member names and layout come from the project's DYNARR
 * macro -- see its definition (presumably dynarr.h).
 */
typedef DYNARR(ReMatch) ReMatchList;

/*
 * One matcher thread: an index (i) and a pointer to group spans (grp).
 * NOTE(review): i is presumably an index into RegEx.op, and grp the capture
 * state carried by this thread -- confirm in the implementation.
 * NOTE(review): this struct spells the type `u32` while the rest of the
 * header uses `uint32_t`; presumably the same type -- confirm.
 */
typedef struct {
	u32 i;
	ReSpan *grp;
} ReThread;

/*
 * A list of matcher threads: thread storage (v), a companion array of
 * uintmax_t words (set), and a count (n).
 * NOTE(review): `set` is presumably a bitset recording which instruction
 * indices are already on the list, and n the number of live entries in v --
 * confirm in the implementation.
 */
typedef struct {
	ReThread *v;
	uintmax_t *set;
	size_t n;
} ReThreadList;

/*
 * Flag bits for ReSearch.flags. Each constant is a distinct single bit, so
 * values can be OR-ed together.
 * NOTE(review): the meaning of each flag is inferred from its name only --
 * confirm in the implementation.
 */
typedef enum {
	RE_SEARCH_FIRST_CHUNK = 1 << 0,
	RE_SEARCH_LAST_CHUNK  = 1 << 1,
	RE_SEARCH_MID_MATCH   = 1 << 2,
	RE_SEARCH_WAS_NEWLINE = 1 << 3,
} ReSearchFlags;

/*
 * State for one incremental search. Holds pointers to an arena and a
 * RegEx, a group-span pointer, the current input buffer with its length
 * and a cursor (buf, buf_len, buf_idx), two thread lists, a flag word and
 * match-position bookkeeping.
 * NOTE(review): the following are inferred from names only -- confirm in
 * the implementation:
 *   - total_idx: position counted across all chunks fed so far;
 *   - match_start / match_end: bounds of the match in progress;
 *   - tcur / tnext: thread lists for the current and the next input step;
 *   - c / c_prev: the current and previous decoded characters.
 */
typedef struct {
	Arena *a;
	RegEx *re;
	ReSpan *grp;
	const char *buf;
	size_t buf_len, buf_idx;
	size_t total_idx;
	size_t match_start, match_end;
	ReThreadList tcur, tnext;
	ReSearchFlags flags;
	uint32_t c, c_prev;
} ReSearch;

/*
 * Option bits accepted by re_comp_ex().
 * NOTE(review): RE_COMP_NO_GROUPS presumably disables capture-group
 * tracking in the compiled program -- confirm in the implementation.
 */
typedef enum {
	RE_COMP_NO_GROUPS = 1 << 0
} ReCompFlags;

/*
 * Compilation error codes, as accepted by re_comp_strerror().
 * RE_COMP_ENONE is 0; the remaining codes count up from it.
 * NOTE(review): by name, ENORPAREN / ENOLPAREN look like a missing right /
 * left parenthesis and EEOF like an unexpected end of pattern -- confirm
 * against re_comp_strerror().
 */
typedef enum {
	RE_COMP_ENONE     = 0,
	RE_COMP_ENORPAREN = 1,
	RE_COMP_ENOLPAREN = 2,
	RE_COMP_EEOF      = 3,
} ReCompErr;

/*
 * Pattern compilation.
 *
 * re_comp() and re_comp_ex() take a RegEx to operate on, the pattern text
 * (src) and two arenas; re_comp_ex() additionally takes ReCompFlags.
 * re_comp_strerror() maps a ReCompErr to a string.
 *
 * NOTE(review): presumably `perm` backs data that must outlive the call
 * (the arrays inside *re) and `scratch` backs temporaries -- confirm in the
 * implementation.
 * NOTE(review): the int return is presumably a ReCompErr value
 * (RE_COMP_ENONE on success) that can be passed to re_comp_strerror() --
 * confirm.
 */
int re_comp(RegEx *re, Str src, Arena *perm, Arena *scratch);
int re_comp_ex(RegEx *re, Str src, Arena *perm, Arena *scratch, ReCompFlags flags);
const char *re_comp_strerror(ReCompErr err);

/*
 * Incremental search API; all functions operate on a caller-provided
 * ReSearch state object.
 *
 * NOTE(review): the intended call sequence is inferred from the names and
 * the ReSearch fields only -- confirm in the implementation:
 *   1. re_search_start() binds the state to a RegEx and an arena;
 *   2. re_search_chunk() supplies the next n bytes of input, with
 *      re_search_first_chunk() / re_search_last_chunk() marking the
 *      boundaries of the stream;
 *   3. re_search_match() is called to retrieve a match into *m, with the
 *      int return presumably reporting whether one was found;
 *      re_search_match_at_start() presumably only accepts a match that
 *      begins at the current position.
 */
void re_search_start(ReSearch *s, RegEx *re, Arena *a);
void re_search_chunk(ReSearch *s, const char *buf, size_t n);
void re_search_first_chunk(ReSearch *s);
void re_search_last_chunk(ReSearch *s);
int re_search_match(ReSearch *s, ReMatch *m);
int re_search_match_at_start(ReSearch *s, ReMatch *m);

/*
 * Whole-string convenience API: each function takes a compiled RegEx, the
 * input string (s) and an arena.
 *
 * NOTE(review): the following are inferred from names and signatures only
 * -- confirm in the implementation:
 *   - re_match_all() returns every match found in s;
 *   - re_match_full() returns nonzero when the pattern matches all of s;
 *   - re_match() stores the first match in *out and returns nonzero when
 *     one exists.
 */
ReMatchList re_match_all(RegEx *re, Str s, Arena *a);
int re_match_full(RegEx *re, Str s, Arena *a);
int re_match(RegEx *re, Str s, ReMatch *out, Arena *a);

#endif