diff options
| author | WormHeamer | 2026-01-03 02:10:52 -0500 |
|---|---|---|
| committer | WormHeamer | 2026-01-03 02:10:52 -0500 |
| commit | 02af004a83120468f69786c2d9ec6f44d08d1c1f (patch) | |
| tree | 6e4a35ed5b441cd91156b6882b4495942e12617a /utf8.h | |
| parent | b47bf27f4b3205ec92bd08a918143d1fc467f32a (diff) | |
unicode regexes
Diffstat (limited to 'utf8.h')
| -rw-r--r-- | utf8.h | 20 |
1 files changed, 20 insertions, 0 deletions
/*
 * Decode the UTF-8 sequence that starts at byte index i of s (n bytes long)
 * and store the resulting code point in *cout.
 *
 * Returns the number of bytes consumed:
 *   0    - i is at or past the end of the buffer; *cout is left untouched.
 *   1    - an ASCII byte, or a malformed sequence, in which case *cout is
 *          set to UTF8_INVALID so the caller can resume at the next byte.
 *   2..4 - length of the decoded multi-byte sequence.
 *
 * Overlong encodings and surrogate code points are not rejected here.
 */
int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout) {
	if (i >= n) return 0;
	u8 b = (u8)s[i];
	if (b < 0x80) {
		*cout = b;
		return 1;
	}
	/* The number of leading 1 bits in the lead byte is the sequence length. */
	u32 bits = stdc_leading_ones(b);
	/*
	 * Exactly one leading 1 bit is a stray continuation byte and more than
	 * four is not UTF-8 at all. The length is compared against n - i (which
	 * is nonzero here) instead of testing i + bits > n, so the check cannot
	 * wrap around when i is close to the largest u32 value.
	 */
	if (bits < 2 || bits > 4 || bits > n - i) {
		*cout = UTF8_INVALID;
		return 1;
	}
	u32 cp = b & (0xffu >> bits);
	for (u32 j = 1; j < bits; j++) {
		u8 c = (u8)s[i + j];
		/*
		 * Every trailing byte must have the form 10xxxxxx; otherwise the
		 * sequence is cut short and only the lead byte is consumed.
		 */
		if ((c & 0xc0) != 0x80) {
			*cout = UTF8_INVALID;
			return 1;
		}
		cp = (cp << 6) | (c & 0x3f);
	}
	*cout = cp;
	return (int)bits;
}
