summaryrefslogtreecommitdiff
path: root/utf8.h
blob: 0f7b1977a21664d832d8ca33e87005943aeb1ebd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/* utf-8 encoding and decoding library */

#ifndef UTF8_H
#define UTF8_H

#include "wrmr.h"

#define UTF8_INVALID 0xFFFD /* replacement character */
/*
 * Packed table of 2-bit fields. Field k (bits 2k and 2k+1) holds
 * (encoded length - 1) for a codepoint that needs k significant bits:
 *   k  0..7  -> 0 (1 byte)     k  8..11 -> 1 (2 bytes)
 *   k 12..16 -> 2 (3 bytes)    k 17..21 -> 3 (4 bytes)
 * All fields above k = 21 are zero.
 */
#define UTF8_CP_LEN_BITS   ((uint64_t)0xFFEAA550000)
/*
 * Bit offset of cp's field in UTF8_CP_LEN_BITS: twice the number of
 * significant bits of cp after conversion to uint32_t.
 * stdc_leading_zeros comes from <stdbit.h>, which this header includes only
 * inside the UTF8_IMPL section; code expanding these macros elsewhere needs
 * it from another include (wrmr.h may provide it -- not visible here).
 */
#define UTF8_CP_SHIFT(cp)  ((32 - stdc_leading_zeros((uint32_t)(cp))) << 1)
/*
 * Number of UTF-8 bytes (1..4) used to encode cp; cp is evaluated once.
 * NOTE: if bit 31 of cp is set the shift count becomes 64, which is
 * undefined for the 64-bit table, so only pass values below 0x80000000.
 */
#define UTF8_CP_LEN(cp)    (1 + ((UTF8_CP_LEN_BITS >> UTF8_CP_SHIFT(cp)) & 3))

/* Count the codepoints in the first ch_count bytes of src, stepping by the
 * length each lead byte announces. Performs no validation. */
u32 utf8_decode_len(const char *src, u32 ch_count);
/* Total number of UTF-8 bytes needed to encode the cp_count codepoints in src. */
u32 utf8_encode_len(const u32 *src, u32 cp_count);
/* Decode cp_count codepoints from src into dst. No source length is taken,
 * so nothing is bounds-checked; bytes are consumed as the lead bytes dictate. */
void utf8_decode(u32 *dst, const char *src, u32 cp_count);
/* Encode cp_count codepoints from src into dst, which needs room for
 * utf8_encode_len(src, cp_count) bytes. No terminator is written. */
void utf8_encode(char *dst, const u32 *src, u32 cp_count);
/* Decode the sequence starting at byte i of the n-byte buffer s. Returns 0
 * when i >= n or the sequence would extend past n, so a decoded value of 0
 * cannot be told apart from failure. */
u32 utf8_decode_at(const char *s, u32 i, u32 n);
/* Decode the sequence starting at byte i of the n-byte buffer s into *cout
 * and return the number of bytes consumed. Returns 0 without writing *cout
 * when i >= n. Input rejected as malformed stores UTF8_INVALID and returns 1. */
int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout);
/* Returns 1 if the n bytes at src pass validation, 0 otherwise. */
int utf8_validate(const char *src, u32 n);

#ifdef UTF8_IMPL

#include <stdbit.h>

/* UTF8_CP_LEN_BITS (defined above) is a packed array of 2-bit (length - 1)
 * codes, indexed by codepoint bit width, covering codepoints 0..10FFFF */

/*
 * Sum the encoded size of every codepoint in src.
 *
 * src       array of codepoints
 * cp_count  number of entries in src
 *
 * Returns the number of UTF-8 bytes needed to encode all of them, as
 * reported by UTF8_CP_LEN for each entry.
 */
u32 utf8_encode_len(const u32 *src, u32 cp_count) {
	u32 total = 0;
	for (u32 k = 0; k < cp_count; k++) {
		total += UTF8_CP_LEN(src[k]);
	}
	return total;
}

/*
 * Count the codepoints held in the first ch_count bytes of src.
 *
 * Each step skips as many bytes as the current byte has leading one bits,
 * or a single byte when it has none (plain ASCII). The bytes skipped are
 * not inspected, so the input is not validated here.
 */
u32 utf8_decode_len(const char *src, u32 ch_count) {
	u32 count = 0;
	for (u32 pos = 0; pos < ch_count; count++) {
		u8 lead = (u8)src[pos];
		u32 step = stdc_leading_ones(lead);
		if (step == 0) {
			step = 1; /* top bit clear: one-byte sequence */
		}
		pos += step;
	}
	return count;
}

/*
 * Encode cp_count codepoints from src as UTF-8 into dst.
 *
 * dst receives UTF8_CP_LEN(cp) bytes per codepoint, back to back, with no
 * terminator. Every codepoint is stated (via ASSUME) to lie in
 * 1..0x10FFFF.
 */
void utf8_encode(char *dst, const u32 *src, u32 cp_count) {
	for (u32 n = 0; n < cp_count; n++) {
		u32 cp = src[n];
		ASSUME(cp > 0 && cp < 0x110000);
		u32 nbytes = UTF8_CP_LEN(cp);
		ASSUME(nbytes > 0 && nbytes < 5);
		if (nbytes == 1) {
			/* single byte: stored as is */
			*dst++ = cp;
			continue;
		}
		/* continuation bytes, filled from the last one backwards */
		for (u32 k = nbytes - 1; k > 0; k--) {
			dst[k] = 0x80 | (cp & 0x3f);
			cp >>= 6;
		}
		/* lead byte: length marker from the shifted 0xf0 (bits above the
		 * low byte are dropped by the store) plus the remaining payload */
		dst[0] = (0xf0 << (4 - nbytes)) | cp;
		dst += nbytes;
	}
}

/*
 * Decode cp_count codepoints from the UTF-8 bytes at src into dst.
 *
 * The number of leading one bits in each lead byte gives the sequence
 * length; ASSUME states it is below 5. No source length is passed in, so
 * the bytes read are never bounds-checked and continuation bytes are only
 * masked, not verified.
 */
void utf8_decode(u32 *dst, const char *src, u32 cp_count) {
	for (u32 n = 0; n < cp_count; n++) {
		u8 lead = (u8)*src++;
		u32 ones = stdc_leading_ones(lead);
		ASSUME(ones < 5);
		/* payload bits of the lead byte sit below its length marker */
		u32 value = lead & (0xff >> ones);
		for (u32 k = 1; k < ones; k++) {
			u8 tail = (u8)*src++;
			value = (value << 6) | (tail & 0x3F);
		}
		dst[n] = value;
	}
}

/*
 * Decode the sequence that begins at byte i of the n-byte buffer s.
 *
 * Returns the decoded value, or 0 when i is outside the buffer or the
 * sequence announced by the lead byte would extend past n. A byte with the
 * top bit clear is returned unchanged. Continuation bytes are masked to
 * their low six bits without being checked.
 */
u32 utf8_decode_at(const char *s, u32 i, u32 n) {
	if (i >= n) return 0;
	u8 lead = (u8)s[i];
	u32 ones = stdc_leading_ones(lead);
	if (ones == 0) return lead;
	/* index of the final byte of this sequence */
	u32 last = i + ones - 1;
	if (last >= n) return 0;
	u32 value = lead & (0xff >> ones);
	for (u32 k = i + 1; k <= last; k++) {
		value = (value << 6) | ((u8)s[k] & 0x3f);
	}
	return value;
}

/*
 * Decode the UTF-8 sequence that starts at byte i of the n-byte buffer s.
 *
 * Stores the decoded codepoint in *cout and returns the number of bytes
 * consumed. Returns 0, leaving *cout untouched, when i >= n.
 *
 * When the byte at i cannot start a sequence (a stray continuation byte
 * 10xxxxxx, or a lead byte announcing more than four bytes) or the
 * sequence would run past n, stores UTF8_INVALID and returns 1 so the
 * caller can resume at the next byte.
 *
 * Continuation bytes are masked to their low six bits without being
 * checked, and overlong or surrogate encodings are not rejected.
 */
int utf8_to_char32(const char *s, u32 i, u32 n, u32 *cout) {
	if (i >= n) return 0;
	u8 b = s[i];
	if (~b & 0x80) {
		/* top bit clear: plain one-byte character */
		*cout = b;
		return 1;
	}
	u32 bits = stdc_leading_ones(b);
	/*
	 * bits == 1 means a continuation byte in lead position; without this
	 * test it would be decoded as (b & 0x7f) and reported as valid.
	 * The bound is written as bits > n - i because i < n guarantees the
	 * subtraction cannot wrap, whereas i + bits could for i near the top
	 * of the u32 range.
	 */
	if (bits < 2 || bits > 4 || bits > n - i) {
		*cout = UTF8_INVALID;
		return 1;
	}
	u32 cp = b & (0xff >> bits);
	for (u32 j = bits; --j;)
		cp = (cp << 6) | (s[++i] & 0x3f);
	*cout = cp;
	return bits;
}

/*
 * Check that the n bytes at src form well-formed UTF-8.
 *
 * Returns 1 when the buffer is a concatenation of complete, correctly
 * formed sequences (an empty buffer qualifies) and 0 otherwise.
 *
 * Rejected input:
 *   - a NUL byte (this library's encoder also excludes codepoint 0)
 *   - a continuation byte where a lead byte is expected
 *   - a lead byte announcing more than four bytes
 *   - a sequence cut short by the end of the buffer
 *   - a continuation byte that is not of the form 10xxxxxx
 *   - overlong encodings, surrogates (D800..DFFF), values above 10FFFF
 *
 * The buffer is walked one whole sequence at a time. Stepping a single
 * byte and decoding from every position would treat continuation bytes as
 * lead bytes and misjudge valid multi-byte text.
 */
int utf8_validate(const char *src, u32 n) {
	const unsigned char *p = (const unsigned char *)src;
	u32 i = 0;
	while (i < n) {
		u32 lead = p[i];
		u32 len, cp, min;

		if (lead == 0) return 0;
		if (lead < 0x80) {
			i++;
			continue;
		}

		/* classify the lead byte; min is the smallest codepoint that
		 * legitimately needs a sequence of this length */
		if ((lead & 0xE0) == 0xC0) {
			len = 2; cp = lead & 0x1F; min = 0x80;
		} else if ((lead & 0xF0) == 0xE0) {
			len = 3; cp = lead & 0x0F; min = 0x800;
		} else if ((lead & 0xF8) == 0xF0) {
			len = 4; cp = lead & 0x07; min = 0x10000;
		} else {
			return 0; /* stray continuation byte or 5+ byte lead */
		}

		/* i < n here, so n - i cannot wrap */
		if (len > n - i) return 0;

		for (u32 k = 1; k < len; k++) {
			u32 c = p[i + k];
			if ((c & 0xC0) != 0x80) return 0;
			cp = (cp << 6) | (c & 0x3F);
		}

		if (cp < min) return 0;                      /* overlong */
		if (cp > 0x10FFFF) return 0;                 /* out of range */
		if (cp >= 0xD800 && cp <= 0xDFFF) return 0;  /* surrogate */

		i += len;
	}
	return 1;
}

#endif
#endif