/* utf-8 encoding and decoding library */
#ifndef UTF8_H
#define UTF8_H

#include "wrmr.h"

#define UTF8_INVALID 0xFFFD /* replacement character */

/*
 * Count the code points in the first ch_count bytes of src.
 * Steps from lead byte to lead byte using only the lead byte's length
 * prefix; the input is not validated.
 */
u32 utf8_decode_len(const char *src, u32 ch_count);

/* Number of bytes needed to encode the cp_count code points in src. */
u32 utf8_encode_len(const u32 *src, u32 cp_count);

/*
 * Decode exactly cp_count code points from src into dst without any
 * validation or bounds checking.
 */
void utf8_decode(u32 *dst, const char *src, u32 cp_count);

/*
 * Encode cp_count code points from src into dst.  Writes
 * utf8_encode_len(src, cp_count) bytes and no terminator.
 */
void utf8_encode(char *dst, const u32 *src, u32 cp_count);

/*
 * Checked decode of a single code point from the *srcn bytes at *src.
 * See the definition for the full contract.
 */
u32 utf8_decode_ckd(const char **src, u32 *srcn);

/*
 * NOTE(review): UTF8_IMPL is always defined here, so every translation
 * unit that includes this header also gets the function definitions below.
 * The #ifndef only avoids redefining the macro when the includer (or the
 * command line) already defined it.
 */
#ifndef UTF8_IMPL
#define UTF8_IMPL
#endif

#ifdef UTF8_IMPL

#include <stdbit.h> /* stdc_leading_zeros, stdc_leading_ones */

/*
 * Packed array of 2-bit entries holding (encoded length - 1), indexed by the
 * number of significant bits in the code point:
 *    0..7  bits -> 1 byte     8..11 bits -> 2 bytes
 *   12..16 bits -> 3 bytes   17..21 bits -> 4 bytes
 * Covers code points 0..10FFFF.  Wider values read zero bits from the table
 * (length 1); a value with all 32 bits significant would shift by 64, which
 * is undefined, so callers must not pass such values.
 */
#define UTF8_CP_LEN_BITS ((uint64_t)0xFFEAA550000)
#define UTF8_CP_SHIFT(cp) ((32 - stdc_leading_zeros((uint32_t)(cp))) << 1)
#define UTF8_CP_LEN(cp) (1 + ((UTF8_CP_LEN_BITS >> UTF8_CP_SHIFT(cp)) & 3))

/*
 * Sum the encoded length of each of the cp_count code points in src.
 * Returns 0 when cp_count is 0.
 */
u32 utf8_encode_len(const u32 *src, u32 cp_count) {
    u32 len = 0;
    while (cp_count)
        len += UTF8_CP_LEN(src[--cp_count]);
    return len;
}

/*
 * Count code points in src[0 .. ch_count).
 *
 * Each step advances by the number of leading one bits of the current byte,
 * or by 1 when the top bit is clear (ASCII).  Nothing is validated: a stray
 * continuation byte counts as one code point, and a sequence cut short by
 * ch_count is still counted once.
 */
u32 utf8_decode_len(const char *src, u32 ch_count) {
    u32 i = 0, len = 0;
    while (i < ch_count) {
        /* second term is 1 only for bytes with the top bit clear */
        i += stdc_leading_ones((u8)src[i]) + ((~src[i] & 0x80) >> 7);
        len++;
    }
    return len;
}

/*
 * Encode cp_count code points from src as UTF-8 into dst.
 *
 * dst must have room for utf8_encode_len(src, cp_count) bytes; no
 * terminator is appended.  Each code point is ASSUMEd to be in
 * 1..0x10FFFF (ASSUME comes from wrmr.h; presumably an optimizer hint, so
 * values outside that range must not be passed -- TODO confirm).
 */
void utf8_encode(char *dst, const u32 *src, u32 cp_count) {
    while (cp_count--) {
        u32 c = *src++;
        ASSUME(c > 0 && c < 0x110000);
        u32 len = UTF8_CP_LEN(c);
        ASSUME(len > 0 && len < 5);
        if (len > 1) {
            /* fill continuation bytes back to front, 6 payload bits each */
            for (u32 i = len; --i;) {
                dst[i] = 0x80 | (c & 0x3f);
                c >>= 6;
            }
            /* lead byte: length prefix (0xC0 / 0xE0 / 0xF0 after
             * truncation to char) plus the remaining high bits */
            *dst = (0xf0 << (4 - len)) | c;
            dst += len;
        } else {
            *dst++ = c;
        }
    }
}

/*
 * Decode cp_count code points from src into dst, trusting the input.
 *
 * The lead byte's count of leading one bits decides how many continuation
 * bytes are consumed; continuation bytes are not checked.  The caller must
 * guarantee that src holds cp_count complete sequences and that dst has
 * room for cp_count elements.  Use utf8_decode_ckd for untrusted input.
 */
void utf8_decode(u32 *dst, const char *src, u32 cp_count) {
    while (cp_count--) {
        u8 c = *src++;
        u32 bits = stdc_leading_ones(c);
        ASSUME(bits < 5);
        u32 cp = c & (0xff >> bits); /* strip the length prefix */
        while (bits-- > 1) {
            c = *src++;
            cp = (cp << 6) | (c & 0x3F);
        }
        *dst++ = cp;
    }
}

/*
 * Checked decode of one code point.
 *
 * src   in/out: pointer to the current read position
 * srcn  in/out: number of bytes remaining at *src
 *
 * Returns:
 *   - 0 when *srcn is 0; *src and *srcn are left untouched.
 *   - the decoded code point on success; *src is advanced past the sequence
 *     and *srcn is reduced by its length.
 *   - UTF8_INVALID when the lead byte is a continuation byte or has five or
 *     more leading one bits, the sequence is truncated, a continuation byte
 *     is malformed, the value exceeds 0x10FFFF, or the encoding is overlong.
 *     In this case *src and *srcn are NOT advanced, so the caller has to
 *     skip input itself to make progress.
 *
 * A validly encoded U+FFFD returns the same value as UTF8_INVALID; the two
 * can be told apart by whether *src moved.  Surrogate code points are not
 * rejected.
 */
u32 utf8_decode_ckd(const char **src, u32 *srcn) {
    u32 n = *srcn;
    if (!n)
        return 0;

    const u8 *s = (const u8 *)*src;
    u32 cp = *s++;
    n -= 1;

    /* leading one bits of the lead byte == total sequence length */
    u32 bits = stdc_leading_ones((u8)cp);
    if (bits) {
        /* 1 leading one is a stray continuation byte, 5+ is never valid */
        if (bits < 2 || bits > 4)
            return UTF8_INVALID;

        u32 cp_len = bits - 1; /* continuation bytes still to read */
        ASSUME(cp_len > 0 && cp_len <= 4);
        if (n < cp_len)
            return UTF8_INVALID; /* truncated sequence */

        cp &= 0xff >> bits;
        for (u32 i = 0; i < cp_len; i++) {
            u8 c = s[i];
            if ((c & 0xC0) != 0x80)
                return UTF8_INVALID;
            cp = (cp << 6) | (c & 0x3F);
        }

        if (cp > 0x10FFFF)
            return UTF8_INVALID;
        /* reject overlong forms: the shortest encoding of cp must use
         * exactly as many bytes as this sequence did */
        if (UTF8_CP_LEN(cp) != bits)
            return UTF8_INVALID;

        s += cp_len;
        n -= cp_len;
    }

    *src = (const char *)s;
    *srcn = n;
    return cp;
}

#endif /* UTF8_IMPL */
#endif /* UTF8_H */