diff options
| author | WormHeamer | 2025-12-28 02:45:38 -0500 |
|---|---|---|
| committer | WormHeamer | 2025-12-28 02:45:38 -0500 |
| commit | 9f4310c24ca39284ad768a82d368e749b18fd76c (patch) | |
| tree | 0563d7b351de59e395315381a113a936e1382669 /utf8.h | |
| parent | 54e30f7e1a3269946bcffd0bb293c7f4f39eda7c (diff) | |
put more emphasis on TxtLoc
Diffstat (limited to 'utf8.h')
| -rw-r--r-- | utf8.h | 89 |
1 files changed, 89 insertions, 0 deletions
/* utf-8 encoding and decoding library */

#ifndef UTF8_H
#define UTF8_H

/* NOTE(review): u32, u8 and ASSUME are not declared in this file; they are
 * presumably provided by wrmr.h -- confirm. */
#include "wrmr.h"

#define UTF8_INVALID 0xFFFD /* replacement character */

/* Count the code points in the first ch_count bytes of src.
 * No validation is done: each step skips as many bytes as the current
 * byte has leading one bits (one byte for ASCII). */
u32 utf8_decode_len(const char *src, u32 ch_count);

/* Count the bytes needed to encode the cp_count code points in src. */
u32 utf8_encode_len(const u32 *src, u32 cp_count);

/* Decode exactly cp_count code points from src into dst.
 * src is not bounds-checked or validated; the caller must supply enough
 * well-formed input and enough room in dst. */
void utf8_decode(u32 *dst, const char *src, u32 cp_count);

/* Encode cp_count code points from src into dst.
 * dst must have room for utf8_encode_len(src, cp_count) bytes;
 * no terminator is written. */
void utf8_encode(char *dst, const u32 *src, u32 cp_count);

/* Decode the single code point that starts at byte index i of the
 * n-byte buffer s. Returns 0 when i is out of range or the sequence
 * would run past the end of the buffer. */
u32 utf8_decode_at(const char *s, u32 i, u32 n);

#ifdef UTF8_IMPL

#include <stdbit.h>

/* Packed array of 2-bit entries holding (encoded length - 1), indexed by
 * the bit width of the code point:
 *   width  0..7  -> 1 byte
 *   width  8..11 -> 2 bytes
 *   width 12..16 -> 3 bytes
 *   width 17..21 -> 4 bytes
 * Entries above width 21 are zero, so out-of-range values map to 1. */
#define UTF8_CP_LEN_BITS ((uint64_t)0xFFEAA550000)

/* Bit offset of the table entry for cp (twice its bit width).
 * The offset is masked to 0..63: a 32-bit-wide value would otherwise
 * produce a shift count of 64, which is undefined on a 64-bit operand.
 * Masked, it selects entry 0 and yields length 1 like every other
 * out-of-range value. */
#define UTF8_CP_SHIFT(cp) \
    (((32 - stdc_leading_zeros((uint32_t)(cp))) << 1) & 63)

/* Encoded length in bytes (1..4) of code point cp. */
#define UTF8_CP_LEN(cp) \
    (1 + ((UTF8_CP_LEN_BITS >> UTF8_CP_SHIFT(cp)) & 3))

u32 utf8_encode_len(const u32 *src, u32 cp_count) {
    u32 len = 0;
    while (cp_count) {
        len += UTF8_CP_LEN(src[--cp_count]);
    }
    return len;
}

u32 utf8_decode_len(const char *src, u32 ch_count) {
    u32 i = 0, len = 0;
    while (i < ch_count) {
        /* Lead bytes advance by their leading-one count; an ASCII byte
         * has none, so the second term adds the missing 1 when the top
         * bit is clear. */
        i += stdc_leading_ones((u8)src[i]) + ((~src[i] & 0x80) >> 7);
        len++;
    }
    return len;
}

void utf8_encode(char *dst, const u32 *src, u32 cp_count) {
    while (cp_count--) {
        u32 c = *src++;
        ASSUME(c > 0 && c < 0x110000);
        u32 len = UTF8_CP_LEN(c);
        ASSUME(len > 0 && len < 5);
        if (len > 1) {
            /* Fill continuation bytes from last to first, 6 bits each. */
            for (u32 i = len; --i;) {
                dst[i] = 0x80 | (c & 0x3f);
                c >>= 6;
            }
            /* Lead byte: len leading one bits followed by the remaining
             * payload. Bits above the low 8 of the shifted 0xf0 are
             * dropped by the conversion to char. */
            *dst = (0xf0 << (4 - len)) | c;
            dst += len;
        } else {
            *dst++ = c;
        }
    }
}

void utf8_decode(u32 *dst, const char *src, u32 cp_count) {
    while (cp_count--) {
        u8 c = *src++;
        u32 bits = stdc_leading_ones(c);
        ASSUME(bits < 5);
        /* Strip the length marker from the lead byte. */
        u32 cp = c & (0xff >> bits);
        /* Each of the remaining bits - 1 bytes contributes 6 bits. */
        while (bits-- > 1) {
            c = *src++;
            cp = (cp << 6) | (c & 0x3F);
        }
        *dst++ = cp;
    }
}

u32 utf8_decode_at(const char *s, u32 i, u32 n) {
    if (i >= n) return 0;
    u32 cp = (u8)s[i++];
    u32 b = stdc_leading_ones((u8)cp);
    if (!b) return cp; /* ASCII */
    /* b - 1 continuation bytes must fit in the n - i bytes that remain.
     * A sequence that ends exactly at the last byte of the buffer is
     * valid, so only a strictly larger requirement is rejected. Comparing
     * against n - i also avoids wrap-around in i + b - 1. */
    if (b - 1 > n - i) return 0;
    u32 end = i + (b - 1);
    cp &= 0xff >> b;
    while (i < end) {
        u8 c = s[i++];
        cp = (cp << 6) | (c & 0x3f);
    }
    return cp;
}

#endif
#endif
