summary refs log tree commit diff
path: root/utf8.h
blob: 10a4f2b86f4c62e5b5e2c150da3ae24a56b7f37a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#ifndef UTF8_H
#define UTF8_H

#include "str.h"

/* Sentinel returned by utf8_next() for malformed input.  The expansion is
 * fully parenthesised so it remains a single operand in every context
 * (for example `sizeof UTF8_INVALID`). */
#define UTF8_INVALID ((unsigned)-1)

/* Number of bytes (1..6) the UTF-8 encoding of cp occupies. */
int utf8_len(unsigned cp);
/* Decode one code point from the front of *s and advance past it.  Returns
 * 0 when *s is empty and UTF8_INVALID (without advancing) on malformed input. */
unsigned utf8_next(Str *s);
/* Like utf8_next() but without validating the sequence; only the empty
 * case is checked, so the input must already be known to be well formed. */
unsigned utf8_next_unchecked(Str *s);
/* Write cp into buf as an n-byte sequence; no terminator is stored.
 * n is presumably the value of utf8_len(cp) -- confirm against callers. */
void utf8_to_buf(unsigned cp, char *buf, int n);

#ifdef UTF8_IMPL

#include <stdbit.h>
#include <stdint.h>

/*
 * Decode one UTF-8 sequence from the front of *s and advance past it.
 *
 * Returns the decoded code point, or 0 when s->n < 1 (which cannot be told
 * apart from a decoded NUL), or UTF8_INVALID when the input is malformed:
 *   - the first byte is a continuation byte (10xxxxxx),
 *   - the first byte is 0xF8 or above,
 *   - fewer than the announced number of bytes remain,
 *   - a trailing byte is not of the form 10xxxxxx.
 * On UTF8_INVALID *s is left unmodified, so the caller has to skip the
 * offending byte itself in order to make progress.
 *
 * Overlong encodings, surrogates and values above U+10FFFF are not rejected.
 */
unsigned utf8_next(Str *s) {
	if (s->n < 1) return 0;
	/* Sequence length keyed by the high nibble of the first byte:
	 * 0 = ASCII, 1 = continuation byte, 2..4 = lead of a multi-byte sequence. */
	static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
	/* Payload mask of the first byte, indexed by sequence length.  Index 4
	 * (0x07) is required because 4-byte leads look the table up at len == 4. */
	static const uint8_t cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };
	int len = niblen[(uint8_t)*s->s >> 4];
	if (!len) { s->n--; return *s->s++; }
	/* A continuation byte can never start a sequence. */
	if (len == 1) return UTF8_INVALID;
	/* The bit following the run of leading 1s must be clear (rejects 0xF8..0xFF),
	 * and the whole sequence must fit in the remaining input. */
	if (s->n < len || (s->s[0] & (0x80 >> len))) return UTF8_INVALID;
	unsigned cp = (unsigned)*s->s & cpmask[len];
	for (int i = 1; i < len; i++) {
		if ((s->s[i] & 0xc0) != 0x80) return UTF8_INVALID;
		cp = (cp << 6) | (s->s[i] & 0x3f);
	}
	s->s += len, s->n -= len;
	return cp;
}

/*
 * Decode one UTF-8 sequence from the front of *s without validating it,
 * and advance past it.
 *
 * Returns 0 when s->n < 1.  Nothing else is checked: the sequence length is
 * taken from the first byte alone and that many bytes are read and consumed
 * even if fewer remain, and trailing bytes are not inspected.  The caller
 * must therefore guarantee that *s starts with a complete, well-formed
 * sequence.  A continuation byte in first position is consumed as a single
 * byte and yields its low six bits.
 */
unsigned utf8_next_unchecked(Str *s) {
	if (s->n < 1) return 0;
	/* Sequence length keyed by the high nibble of the first byte (0 = ASCII). */
	static const uint8_t niblen[16] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,4 };
	/* Payload mask of the first byte, indexed by sequence length.  Index 4
	 * (0x07) is required because 4-byte leads look the table up at len == 4. */
	static const uint8_t cpmask[5] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07 };
	int len = niblen[(uint8_t)*s->s >> 4];
	if (!len) { s->n--; return *s->s++; }
	unsigned cp = (unsigned)*s->s & cpmask[len];
	/* Each trailing byte contributes its low six bits. */
	for (int i = 1; i < len; i++) cp = (cp << 6) | (s->s[i] & 0x3f);
	s->s += len, s->n -= len;
	return cp;
}

/*
 * Return how many bytes (1..6) the UTF-8 encoding of cp occupies.
 *
 * The length depends only on the position of the highest set bit, so it is
 * looked up by the number of leading zero bits in cp.  The table has 33
 * slots (counts 0..32), i.e. it is laid out for a 32-bit unsigned.
 */
int utf8_len(unsigned cp) {
	static const uint8_t bytes_by_lz[33] = {
		/* lz  0.. 5: 27 or more significant bits */ 6,6,6,6,6,6,
		/* lz  6..10: 22..26 significant bits     */ 5,5,5,5,5,
		/* lz 11..15: 17..21 significant bits     */ 4,4,4,4,4,
		/* lz 16..20: 12..16 significant bits     */ 3,3,3,3,3,
		/* lz 21..24:  8..11 significant bits     */ 2,2,2,2,
		/* lz 25..32:  0.. 7 significant bits     */ 1,1,1,1,1,1,1,1,
	};
	unsigned lz = stdc_leading_zeros(cp);
	return bytes_by_lz[lz];
}

/*
 * Store cp in buf as an n-byte UTF-8 style sequence (buf[0] .. buf[n-1]).
 *
 * With n == 1 the value is stored as a single byte.  Otherwise the trailing
 * bytes are filled from the end, six bits each, and the first byte receives
 * the length marker combined with whatever bits are left over.  No
 * terminator is written.  n is presumably utf8_len(cp) -- confirm against
 * callers; values outside 1..6 are not handled.
 */
void utf8_to_buf(unsigned cp, char *buf, int n) {
	/* First-byte markers for sequences of 2, 3, 4, 5 and 6 bytes. */
	static const uint8_t lead_marker[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };

	if (n == 1) {
		buf[0] = cp;
		return;
	}

	int pos = n;
	while (--pos != 0) {
		buf[pos] = 0x80 | (cp & 0x3f);
		cp >>= 6;
	}
	buf[0] = lead_marker[n - 2] | cp;
}

#endif
#endif