summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorWormHeamer2026-01-03 02:10:52 -0500
committerWormHeamer2026-01-03 02:10:52 -0500
commit02af004a83120468f69786c2d9ec6f44d08d1c1f (patch)
tree6e4a35ed5b441cd91156b6882b4495942e12617a /utf8.h
parentb47bf27f4b3205ec92bd08a918143d1fc467f32a (diff)
unicode regexes
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h20
1 files changed, 20 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
index db98e18..0f7b197 100644
--- a/utf8.h
+++ b/utf8.h
@@ -15,6 +15,7 @@ u32 utf8_encode_len(const u32 *src, u32 cp_count);
void utf8_decode(u32 *dst, const char *src, u32 cp_count);
void utf8_encode(char *dst, const u32 *src, u32 cp_count);
u32 utf8_decode_at(const char *s, u32 i, u32 n);
+int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout);
int utf8_validate(const char *src, u32 n);
#ifdef UTF8_IMPL
@@ -86,6 +87,25 @@ u32 utf8_decode_at(const char *s, u32 i, u32 n) {
return cp;
}
+/*
+ * Decode a single code point from the n-byte buffer s, starting at byte
+ * offset i, and store it in *cout.
+ *
+ * Returns the number of bytes consumed:
+ *   0    - i is at or past the end of the buffer; *cout is not written.
+ *   1    - an ASCII byte was decoded, or the sequence at i is malformed
+ *          (stray continuation byte, lead byte announcing more than 4
+ *          bytes, sequence truncated by the end of the buffer, or a
+ *          trailing byte that is not of the form 10xxxxxx). In the
+ *          malformed case *cout is set to UTF8_INVALID, and consuming a
+ *          single byte lets the caller resynchronise on the next one.
+ *   2..4 - a multi-byte sequence was decoded.
+ *
+ * Overlong encodings and surrogate code points are not rejected here.
+ */
+int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout) {
+	if (i >= n) return 0;
+	u8 b = s[i];
+	if (~b & 0x80) {
+		*cout = b;
+		return 1;
+	}
+	/* Number of leading 1 bits in the lead byte == total sequence length. */
+	u32 bits = stdc_leading_ones(b);
+	/*
+	 * bits == 1 means b is a continuation byte with no lead byte before it.
+	 * Compare against n - i (safe because i < n) rather than computing
+	 * i + bits, which could wrap around in u32.
+	 */
+	if (bits < 2 || bits > 4 || bits > n - i) {
+		*cout = UTF8_INVALID;
+		return 1;
+	}
+	/* Payload bits of the lead byte: everything below the length marker. */
+	u32 cp = b & (0xff >> bits);
+	for (u32 j = 1; j < bits; j++) {
+		u8 c = s[i + j];
+		/* Every trailing byte must look like 10xxxxxx. */
+		if ((c & 0xc0) != 0x80) {
+			*cout = UTF8_INVALID;
+			return 1;
+		}
+		cp = (cp << 6) | (c & 0x3f);
+	}
+	*cout = cp;
+	return bits;
+}
+
int utf8_validate(const char *src, u32 n) {
/* TODO: rewrite this to be faster */
for (u32 i = 0; i < n; i++) {