From 6d5c57abcb42e0b8d268b99bc6c3b10c661d799e Mon Sep 17 00:00:00 2001 From: WormHeamer Date: Fri, 7 Nov 2025 19:48:10 -0500 Subject: rework all unicode handling --- vui.c | 146 ++++++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 76 insertions(+), 70 deletions(-) diff --git a/vui.c b/vui.c index 0434334..258b7f1 100644 --- a/vui.c +++ b/vui.c @@ -62,8 +62,12 @@ static inline void vui_outsn(const char *s, unsigned n); static inline void vui_outs(const char *s); static inline void vui_out_flush(void); -static inline const char *utf8_next(u32 *out, const char *src); -static void utf8_decode(uint32_t *dst, const char *src, unsigned n); +/* unicode */ + +static u32 utf8_decode_len(const char *src, u32 n); +static u32 utf8_encode_len(const u32 *src, u32 n); +static void utf8_decode(u32 *dst, u32 n, const char *src); +static void utf8_encode(char *dst, const u32 *src, u32 n); /* globals */ @@ -303,24 +307,21 @@ static void vui_outf(const char *fmt, ...) { * 6 sixes, 5 fives, 5 fours, 5 threes, four twos, and eight ones. * no obvious pattern. 
* - * to avoid the subtraction we reverse: - * { 1,1,1,1,1,1,1,1,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,6, } - * - * subtract one from each: - * { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,5, } + * no unicode codepoints above 0x10FFFF, so never more than 4 bytes: + * 4444433333222211111111 * - * clamp to <=3 (no unicode codepoints are actually that long): - * { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, } + * subtract one from each, so each fits in 2 bits: + * 3333322222111100000000 * - * concatenate bits: - * 0b0000000000000001010101101010101011111111111111111111111111111111 + * pack into a u64: + * 0b11111111111010101010010101010000000000000000 * * convert to hex, and now: - * len(cp) = 1 + (0x156AAFFFFFFFF >> (2 * clz(cp))) & 3 + * len(cp) = 1 + ((0xFFEAA550000 >> (2 * (32 - clz(cp)))) & 3) */ -#define UTF8_CP_LEN_BITS ((uint64_t)0x156AAFFFFFFFF) -#define UTF8_CP_SHIFT(cp) (stdc_leading_zeros((uint32_t)cp) << 1) +#define UTF8_CP_LEN_BITS ((uint64_t)0xFFEAA550000) +#define UTF8_CP_SHIFT(cp) ((32 - stdc_leading_zeros((uint32_t)(cp))) << 1) #define UTF8_CP_LEN(cp) (1 + ((UTF8_CP_LEN_BITS >> UTF8_CP_SHIFT(cp)) & 3)) static inline void vui_outc(char c) { @@ -332,29 +333,14 @@ static inline void vui_outc(char c) { /* it doesn't make sense to do so, and assuming non-zero lets us dodge a branch * in stdc_leading_zeros() */ static inline void vui_outvc(VuiChar c) { - //ASSUME(c > 0 && c <= 0x110000); - ASSUME(c > 0); - ASSUME(c <= 0x110000); - uint8_t len = UTF8_CP_LEN(c); - vui_out_fit(vui_outn + len); - ASSUME(len > 0 && len < 5); - - if (len == 1) { - vui_out[vui_outn++] = c; - return; - } - - for (unsigned i = len; --i;) { - vui_out[vui_outn + i] = 0x80 | (c & 0x3f); - c >>= 6; - } - - vui_out[vui_outn] = (0xf0 << (4 - len)) | c; - vui_outn += len; + vui_outvcn(&c, 1); } static inline void vui_outvcn(VuiChar *c, size_t n) { - while (n--) vui_outvc(*c++); + u32 len = utf8_encode_len(c, n); + vui_out_fit(vui_outn + len); 
+ utf8_encode(&vui_out[vui_outn], c, n); + vui_outn += len; } static inline void vui_outsn(const char *s, unsigned n) { @@ -617,40 +603,6 @@ void vui_chr(int x, int y, VuiChar c) { vui_chra(x, y, c, ATTR_DEFAULT); } -/* -static inline u32 utf8_next(u32 *p, const char *s, u32 n) { - u32 i = *p; - u8 c = s[i++]; - usize bits = stdc_leading_ones(c); - ASSUME(bits < 5); - u32 cp = c & ((1 << (7-bits)) - 1); - while (bits-- > 1) { - c = s[i++]; - cp = (cp << 6) | (c & 0x3F); - } - *p = i; - return cp; -} -*/ - -static inline const char *utf8_next(u32 *out, const char *src) { - u8 c = *src++; - usize bits = stdc_leading_ones(c); - ASSUME(bits < 5); - u32 cp = c & (-1 >> bits); - while (bits-- > 1) { - c = *src++; - cp = (cp << 6) | (c & 0x3F); - } - *out = cp; - return src; -} - -static void utf8_decode(uint32_t *dst, const char *src, unsigned n) { - const char *end = src + n; - while (src < end) src = utf8_next(dst++, src); -} - static void truncate_span(int *x, unsigned *nptr) { int n = (int)*nptr; if (*x < 0) { @@ -670,10 +622,11 @@ static void truncate_span(int *x, unsigned *nptr) { *nptr = n; } -void vui_putsna(int x, int y, const char *s, unsigned n, VuiAttr a) { +void vui_putsna(int x, int y, const char *s, unsigned srcn, VuiAttr a) { + u32 n = utf8_decode_len(s, srcn); truncate_span(&x, &n); if (n < 1 || y < 0 || y >= (int)LINES) return; - utf8_decode(&CHR(x, y), s, n); + utf8_decode(&CHR(x, y), n, s); for (uint16_t *pa = &ATTR(x, y); n--;) *pa++ = a; } @@ -877,3 +830,56 @@ VuiKey vui_key(void) { return c & 0x80 ? 
getk_utf8(c) : c;
     }
 }
+
+/* utf8 */
+
+/*
+ * Count the codepoints held in the first n bytes of src.
+ *
+ * The returned count is what utf8_decode() is later asked to consume, and
+ * utf8_decode() has no byte bound of its own, so counting stops at the first
+ * sequence it could not read safely: one whose lead byte claims more than
+ * 4 bytes (utf8_decode() ASSUMEs that never happens) or one whose trailing
+ * bytes would lie beyond src + n. Continuation bytes are not validated.
+ */
+static u32 utf8_decode_len(const char *src, u32 n) {
+    u32 i = 0;
+    u32 len = 0;
+    while (i < n) {
+        /* the lead byte's leading-ones count is the sequence length; ASCII
+         * (no leading ones) still occupies one byte, as does a stray
+         * continuation byte (one leading one) */
+        u32 step = stdc_leading_ones((u8)src[i]);
+        if (step == 0) step = 1;
+        /* i < n here, so n - i cannot wrap */
+        if (step > 4 || step > n - i) break;
+        i += step;
+        len++;
+    }
+    return len;
+}
+
+/*
+ * Decode exactly n codepoints from the UTF-8 bytes at src into dst.
+ *
+ * There is no bound on src: the caller must guarantee that n whole sequences
+ * are present (e.g. by taking n from utf8_decode_len()) and that dst has
+ * room for n entries.
+ */
+static void utf8_decode(u32 *dst, u32 n, const char *src) {
+    while (n--) {
+        u8 c = *src++;
+        u32 bits = stdc_leading_ones(c);
+        ASSUME(bits < 5);
+        /* drop the length marker, keeping only the lead byte's payload bits */
+        u32 cp = c & (0xff >> bits);
+        /* each following byte contributes its low 6 bits */
+        while (bits-- > 1) {
+            c = *src++;
+            cp = (cp << 6) | (c & 0x3F);
+        }
+        *dst++ = cp;
+    }
+}
+
+/*
+ * Number of bytes utf8_encode() writes for the n codepoints at src
+ * (the sum of UTF8_CP_LEN over them).
+ */
+static u32 utf8_encode_len(const u32 *src, u32 n) {
+    u32 len = 0;
+    /* UTF8_CP_LEN expands its argument once, so --n runs once per step */
+    while (n) len += UTF8_CP_LEN(src[--n]);
+    return len;
+}
+
+/*
+ * Encode the n codepoints at src as UTF-8 into dst.
+ *
+ * dst must have room for utf8_encode_len(src, n) bytes; no terminator is
+ * written. Every codepoint must satisfy the ASSUME below.
+ */
+static void utf8_encode(char *dst, const u32 *src, u32 n) {
+    while (n--) {
+        u32 c = *src++;
+        /* NOTE(review): the largest codepoint is 0x10FFFF; 0x110000 has the
+         * same bit width so the length lookup is unaffected - confirm intent */
+        ASSUME(c > 0 && c <= 0x110000);
+
+        u32 len = UTF8_CP_LEN(c);
+        ASSUME(len > 0 && len < 5);
+
+        if (len > 1) {
+            /* continuation bytes are filled back to front, 6 bits apiece */
+            for (u32 i = len; --i;) {
+                dst[i] = 0x80 | (c & 0x3f);
+                c >>= 6;
+            }
+            /* once stored into a byte, 0xf0 << (4 - len) is the 0xC0 / 0xE0 /
+             * 0xF0 length marker; the remaining high bits of c fill the rest */
+            *dst = (0xf0 << (4 - len)) | c;
+            dst += len;
+        } else {
+            *dst++ = c;
+        }
+    }
+}
-- 
cgit v1.2.3