/* utf-8 encoding and decoding library */
#ifndef UTF8_H
#define UTF8_H

#include "wrmr.h"

#define UTF8_INVALID 0xFFFD /* replacement character */

/*
 * Packed array of 2-bit entries, indexed by the number of significant bits
 * in a code point (0..21 covers 0..10FFFF).  Each entry holds the encoded
 * length minus one.
 */
#define UTF8_CP_LEN_BITS ((uint64_t)0xFFEAA550000)

/* Bit offset into UTF8_CP_LEN_BITS: twice the number of significant bits of cp. */
#define UTF8_CP_SHIFT(cp) ((32 - stdc_leading_zeros((uint32_t)(cp))) << 1)

/*
 * Number of bytes (1..4) needed to encode cp.  Values above 0x10FFFF yield 1.
 * The shift is masked to 0..63 so that a value with all 32 bits significant
 * (shift of 64) cannot trigger an out-of-range shift.
 */
#define UTF8_CP_LEN(cp) (1 + ((UTF8_CP_LEN_BITS >> (UTF8_CP_SHIFT(cp) & 63)) & 3))

/* Number of code points in the first ch_count bytes of src. */
u32 utf8_decode_len(const char *src, u32 ch_count);
/* Number of bytes needed to encode the cp_count code points in src. */
u32 utf8_encode_len(const u32 *src, u32 cp_count);
/* Decode cp_count code points from src into dst (no validation). */
void utf8_decode(u32 *dst, const char *src, u32 cp_count);
/* Encode cp_count code points from src into dst (no terminator written). */
void utf8_encode(char *dst, const u32 *src, u32 cp_count);
/* Decode the sequence starting at byte i of the n-byte buffer s; 0 on failure. */
u32 utf8_decode_at(const char *s, u32 i, u32 n);
/* Decode one code point at byte i into *cout; returns the bytes consumed. */
int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout);
/* Returns 1 if the n bytes at src are accepted, 0 otherwise. */
int utf8_validate(const char *src, u32 n);

#ifdef UTF8_IMPL

#include <stdbit.h>

/*
 * Sum the encoded length of every code point in src[0..cp_count).
 * The per-code-point length comes from the UTF8_CP_LEN table lookup.
 */
u32 utf8_encode_len(const u32 *src, u32 cp_count)
{
    u32 len = 0;
    for (u32 i = 0; i < cp_count; i++) {
        len += UTF8_CP_LEN(src[i]);
    }
    return len;
}

/*
 * Count the code points in src[0..ch_count).
 *
 * No validation is performed: each step skips as many bytes as the current
 * byte has leading one bits (one byte for ASCII), so a byte that is not a
 * proper lead byte is simply counted as a code point of that length.
 */
u32 utf8_decode_len(const char *src, u32 ch_count)
{
    u32 i = 0;
    u32 len = 0;
    while (i < ch_count) {
        u32 ones = stdc_leading_ones((u8)src[i]);
        i += ones ? ones : 1; /* ASCII has no leading ones but is one byte */
        len++;
    }
    return len;
}

/*
 * Encode cp_count code points from src into dst.
 *
 * dst must have room for utf8_encode_len(src, cp_count) bytes; nothing is
 * appended after the last sequence.  Each code point is stated (via ASSUME,
 * which is not defined in this file - presumably provided by wrmr.h) to be
 * in 1..0x10FFFF.
 */
void utf8_encode(char *dst, const u32 *src, u32 cp_count)
{
    while (cp_count--) {
        u32 c = *src++;
        ASSUME(c > 0 && c < 0x110000);
        u32 len = UTF8_CP_LEN(c);
        ASSUME(len > 0 && len < 5);

        if (len == 1) {
            *dst++ = (char)c;
            continue;
        }

        /* Fill continuation bytes from last to first, 6 bits at a time. */
        for (u32 i = len - 1; i > 0; i--) {
            dst[i] = (char)(0x80 | (c & 0x3f));
            c >>= 6;
        }
        /* 0xf0 shifted left gives the lead marker 0xC0 / 0xE0 / 0xF0 once
         * truncated to a byte; the remaining high bits of c fill the rest. */
        *dst = (char)((0xf0 << (4 - len)) | c);
        dst += len;
    }
}

/*
 * Decode exactly cp_count code points from src into dst.
 *
 * The input is trusted: there is no length bound on src and continuation
 * bytes are not checked.  The only stated precondition (via ASSUME) is that
 * a lead byte has fewer than five leading one bits.
 */
void utf8_decode(u32 *dst, const char *src, u32 cp_count)
{
    while (cp_count--) {
        u8 c = (u8)*src++;
        u32 bits = stdc_leading_ones(c);
        ASSUME(bits < 5);
        u32 cp = c & (0xffu >> bits); /* strip the length marker */
        while (bits-- > 1) {
            c = (u8)*src++;
            cp = (cp << 6) | (c & 0x3F);
        }
        *dst++ = cp;
    }
}

/*
 * Decode the sequence that starts at byte i of the n-byte buffer s.
 *
 * Returns the decoded value, or 0 when i is out of range, the sequence
 * would run past the end of the buffer, or the lead byte has more than four
 * leading one bits.  A NUL byte also decodes to 0, so 0 is ambiguous.
 *
 * NOTE(review): a continuation byte at position i is not rejected; it
 * decodes to its low six bits.  Continuation bytes following a lead byte
 * are not checked either.
 */
u32 utf8_decode_at(const char *s, u32 i, u32 n)
{
    if (i >= n) {
        return 0;
    }

    u8 lead = (u8)s[i];
    u32 b = stdc_leading_ones(lead);
    if (!b) {
        return lead;
    }

    /* i < n here, so n - i cannot underflow and i + b is never formed. */
    if (b > 4 || b > n - i) {
        return 0;
    }

    u32 cp = lead & (0xffu >> b);
    for (u32 k = 1; k < b; k++) {
        cp = (cp << 6) | ((u8)s[i + k] & 0x3f);
    }
    return cp;
}
/*
 * Decode one structurally well-formed sequence starting at s[i].
 *
 * Requires i < n.  On success stores the decoded value in *cp_out and
 * returns the sequence length (1..4).  Returns 0 when the byte at i is not
 * a lead byte (stray continuation byte or more than four leading one bits),
 * when the sequence would run past n, or when a trailing byte is not of the
 * form 10xxxxxx.  Overlong forms, surrogates and out-of-range values are
 * NOT rejected here.
 */
static u32 utf8_seq_at(const char *s, u32 i, u32 n, u32 *cp_out)
{
    u8 lead = (u8)s[i];
    if (lead < 0x80) {
        *cp_out = lead;
        return 1;
    }

    u32 len = stdc_leading_ones(lead);
    /* n - i >= 1 because i < n, so this bound check cannot wrap. */
    if (len < 2 || len > 4 || len > n - i) {
        return 0;
    }

    u32 cp = lead & (0xffu >> len);
    for (u32 k = 1; k < len; k++) {
        u8 c = (u8)s[i + k];
        if ((c & 0xC0) != 0x80) {
            return 0;
        }
        cp = (cp << 6) | (c & 0x3f);
    }

    *cp_out = cp;
    return len;
}

/*
 * Decode the code point at byte i of the n-byte buffer s into *cout.
 *
 * Returns 0 (leaving *cout untouched) when i >= n.  Otherwise returns the
 * number of bytes consumed.  If the bytes at i do not form a structurally
 * valid sequence, *cout is set to UTF8_INVALID and 1 is returned so the
 * caller can resynchronise on the next byte.
 */
int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout)
{
    if (i >= n) {
        return 0;
    }

    u32 cp = 0;
    u32 len = utf8_seq_at(s, i, n, &cp);
    if (!len) {
        *cout = UTF8_INVALID;
        return 1;
    }

    *cout = cp;
    return (int)len;
}

/*
 * Check that src[0..n) is well-formed UTF-8.
 *
 * Returns 1 when every sequence is structurally valid, uses the shortest
 * encoding, is not a surrogate (D800..DFFF) and does not exceed 0x10FFFF;
 * returns 0 otherwise.  An empty buffer is valid.  NUL bytes are rejected,
 * as they were by the earlier byte-by-byte implementation.
 */
int utf8_validate(const char *src, u32 n)
{
    /* Smallest value allowed for each sequence length; index 1 also
     * excludes NUL. */
    static const u32 min_cp[5] = { 0, 0x01, 0x80, 0x800, 0x10000 };

    u32 i = 0;
    while (i < n) {
        u32 cp = 0;
        u32 len = utf8_seq_at(src, i, n, &cp);
        if (!len) {
            return 0;
        }
        if (cp < min_cp[len] || cp > 0x10FFFF) {
            return 0; /* NUL, overlong, or out of range */
        }
        if (cp >= 0xD800 && cp <= 0xDFFF) {
            return 0; /* surrogate */
        }
        i += len; /* step over the whole sequence, not one byte */
    }
    return 1;
}

#endif /* UTF8_IMPL */
#endif /* UTF8_H */