diff options
| -rw-r--r-- | utf8.h | 122 |
1 files changed, 80 insertions, 42 deletions
/* utf-8 encoding and decoding library */

#ifndef UTF8_H
#define UTF8_H

#include "wrmr.h"

/*
 * Value returned by utf8_decode_ckd() for malformed input.
 * It is also a valid code point (U+FFFD), so callers that must tell the
 * two cases apart should check whether *src advanced: on error neither
 * *src nor *srcn is modified.
 */
#define UTF8_INVALID 0xFFFD /* replacement character */

/*
 * Count the code points in the first ch_count bytes of src.
 * No validation: each step skips as many bytes as the current byte has
 * leading 1 bits (or one byte for ASCII) and counts one code point.
 */
u32 utf8_decode_len(const char *src, u32 ch_count);

/* Number of bytes needed to encode the cp_count code points in src. */
u32 utf8_encode_len(const u32 *src, u32 cp_count);

/*
 * Decode exactly cp_count code points from src into dst.
 * Unchecked: src must hold well-formed UTF-8 and dst must have room for
 * cp_count elements.
 */
void utf8_decode(u32 *dst, const char *src, u32 cp_count);

/*
 * Encode cp_count code points from src into dst.
 * dst must have room for utf8_encode_len(src, cp_count) bytes; no
 * terminator is written.
 */
void utf8_encode(char *dst, const u32 *src, u32 cp_count);

/*
 * Checked decode of one code point from the *srcn bytes at *src.
 * On success returns the code point and advances *src / decrements *srcn
 * past the bytes consumed.  Returns 0 without touching *src / *srcn when
 * *srcn is 0, and UTF8_INVALID without touching them on malformed input.
 */
u32 utf8_decode_ckd(const char **src, u32 *srcn);

/*
 * NOTE(review): UTF8_IMPL is defined unconditionally right before it is
 * tested, so every translation unit that includes this header also gets
 * the (non-static) definitions below.  This looks like a leftover from
 * development; confirm against the includers before removing it.
 */
#define UTF8_IMPL
#ifdef UTF8_IMPL

#include <stdbit.h>
#include <stdint.h> /* uint32_t / uint64_t used by the macros below */

/*
 * Packed array of 2-bit entries holding (encoded length - 1), indexed by
 * the bit width of the code point (0..21):
 *   width  0..7  -> 1 byte,   width  8..11 -> 2 bytes,
 *   width 12..16 -> 3 bytes,  width 17..21 -> 4 bytes.
 * Valid for code points 0..10FFFF.
 */
#define UTF8_CP_LEN_BITS ((uint64_t)0xFFEAA550000)
/* Bit offset of the entry for cp: twice its bit width (0 for cp == 0). */
#define UTF8_CP_SHIFT(cp) ((32 - stdc_leading_zeros((uint32_t)(cp))) << 1)
/* Encoded length in bytes (1..4) of code point cp. */
#define UTF8_CP_LEN(cp) (1 + ((UTF8_CP_LEN_BITS >> UTF8_CP_SHIFT(cp)) & 3))

u32 utf8_encode_len(const u32 *src, u32 cp_count) {
	u32 len = 0;
	while (cp_count) len += UTF8_CP_LEN(src[--cp_count]);
	return len;
}

u32 utf8_decode_len(const char *src, u32 ch_count) {
	u32 i = 0, len = 0;
	while (i < ch_count) {
		/* leading ones give the sequence length for multi-byte leads;
		 * the second term adds 1 for ASCII bytes (top bit clear) */
		i += stdc_leading_ones((u8)src[i]) + ((~src[i] & 0x80) >> 7);
		len++;
	}
	return len;
}

void utf8_encode(char *dst, const u32 *src, u32 cp_count) {
	while (cp_count--) {
		u32 c = *src++;
		/* ASSUME is provided by wrmr.h (not visible here) */
		ASSUME(c > 0 && c < 0x110000);
		u32 len = UTF8_CP_LEN(c);
		ASSUME(len > 0 && len < 5);
		if (len > 1) {
			/* continuation bytes, last to first, 6 payload bits each */
			for (u32 i = len - 1; i > 0; i--) {
				dst[i] = (char)(0x80 | (c & 0x3f));
				c >>= 6;
			}
			/* lead byte: 0xC0 / 0xE0 / 0xF0 marker once narrowed to
			 * 8 bits, OR'd with the remaining high payload bits */
			*dst = (char)(u8)((0xf0 << (4 - len)) | c);
			dst += len;
		} else {
			*dst++ = (char)c;
		}
	}
}

void utf8_decode(u32 *dst, const char *src, u32 cp_count) {
	while (cp_count--) {
		u8 c = *src++;
		/* 0 for ASCII, otherwise the total sequence length */
		u32 bits = stdc_leading_ones(c);
		ASSUME(bits < 5);
		u32 cp = c & (0xff >> bits);
		for (u32 i = 1; i < bits; i++) {
			c = *src++;
			cp = (cp << 6) | (c & 0x3F);
		}
		*dst++ = cp;
	}
}

u32 utf8_decode_ckd(const char **src, u32 *srcn) {
	u32 n = *srcn;
	if (!n) return 0;
	const u8 *s = (const u8 *)*src;
	u32 cp = *s++;
	n -= 1;
	/* 0 for ASCII, otherwise the total sequence length incl. lead byte */
	u32 bits = stdc_leading_ones((u8)cp);
	if (bits) {
		/* 1 leading one: stray continuation byte; 5 or more: not UTF-8 */
		if (bits == 1 || bits >= 5) return UTF8_INVALID;
		/* the lead byte is already consumed, so only bits - 1 more bytes
		 * are required from the remaining input */
		u32 cont = bits - 1;
		if (n < cont) return UTF8_INVALID;
		cp &= 0xff >> bits;
		for (u32 i = 0; i < cont; i++) {
			u8 c = s[i];
			if ((c & 0xC0) != 0x80) return UTF8_INVALID;
			cp = (cp << 6) | (c & 0x3F);
		}
		if (cp > 0x10FFFF) return UTF8_INVALID;
		/* surrogates are not encodable in UTF-8 */
		if (cp >= 0xD800 && cp <= 0xDFFF) return UTF8_INVALID;
		/* overlong check: the shortest encoding of cp must be exactly
		 * as long as the sequence that was read (lead byte included) */
		if (UTF8_CP_LEN(cp) != bits) return UTF8_INVALID;
		s += cont;
		n -= cont;
	}
	*src = (const char *)s;
	*srcn = n;
	return cp;
}

#endif /* UTF8_IMPL */
