1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
#ifndef UTF8_H_INCLUDED
#define UTF8_H_INCLUDED
#include <ctype.h>
#include "forward.h"
#include "ccommon.h"
// utf8 methods defined in src/utf8code.c:
/* Group selectors passed as the first argument of utf8_isgroup().
 * NOTE(review): the identifiers read like Unicode general-category
 * abbreviations followed by script names -- confirm against utf8code.c. */
enum {
    U8G_Cc,
    U8G_Lt,
    U8G_Nd,
    U8G_Nl,
    U8G_Pc,
    U8G_Pd,
    U8G_Pf,
    U8G_Pi,
    U8G_Sc,
    U8G_Zl,
    U8G_Zp,
    U8G_Zs,
    U8G_Arabic,
    U8G_Cyrillic,
    U8G_Devanagari,
    U8G_Greek,
    U8G_Han,
    U8G_Latin,
    U8G_SIZE    /* number of selectors above; not itself a group */
};
/* Prototypes only: the bodies are not defined in this header itself. */

/* group is a U8G_* selector; this header calls it with U8G_Nd and U8G_Zs. */
extern bool utf8_isgroup(int group, uint32_t c);
/* Used by utf8_isalnum() for code points >= 128. */
extern bool utf8_isalpha(uint32_t c);
/* NOTE(review): presumably the case-folded form of c -- confirm in utf8code.c. */
extern uint32_t utf8_casefold(uint32_t c);
/* utf8_isupper() reports true when this returns something other than c. */
extern uint32_t utf8_tolower(uint32_t c);
/* utf8_islower() reports true when this returns something other than c. */
extern uint32_t utf8_toupper(uint32_t c);
extern bool utf8_iscased(uint32_t c);
extern bool utf8_isword(uint32_t c);
/* utf8_valid() forwards here with nbytes = INTPTR_MAX. */
extern bool utf8_valid_n(const char* s, intptr_t nbytes);
/* utf8_icmp() forwards here with two csview arguments of length INTPTR_MAX. */
extern int utf8_icmp_sv(csview s1, csview s2);
/* NOTE(review): presumably writes the UTF-8 bytes of c to out and returns
 * how many were written -- confirm, including the required size of out. */
extern int utf8_encode(char *out, uint32_t c);
/* NOTE(review): unit and sign convention of offset are not visible here -- confirm. */
extern uint32_t utf8_peek_off(const char *s, int offset);
/* True when utf8_tolower() maps c to a different code point. */
STC_INLINE bool utf8_isupper(uint32_t c) {
    const uint32_t lowered = utf8_tolower(c);
    return lowered != c;
}
/* True when utf8_toupper() maps c to a different code point. */
STC_INLINE bool utf8_islower(uint32_t c) {
    const uint32_t raised = utf8_toupper(c);
    return raised != c;
}
/* Alphanumeric test for a code point.
 * Values below 128 are delegated to the C library's isalnum(); anything else
 * qualifies if utf8_isalpha() accepts it or it belongs to group U8G_Nd. */
STC_INLINE bool utf8_isalnum(uint32_t c) {
    if (c >= 128) {
        return utf8_isalpha(c) || utf8_isgroup(U8G_Nd, c);
    }
    return isalnum((int)c) != 0;
}
/* Blank test for a code point.
 * Below 128 only space and horizontal tab qualify; from 128 upward the
 * answer is membership in group U8G_Zs. */
STC_INLINE bool utf8_isblank(uint32_t c) {
    if (c >= 128) {
        return utf8_isgroup(U8G_Zs, c);
    }
    return c == ' ' || c == '\t';
}
/* Whitespace test for a code point.
 * Below 128 the C library's isspace() decides. Above that, U+2028 and U+2029
 * (decimal 8232 and 8233) are accepted directly and everything else must
 * belong to group U8G_Zs. */
STC_INLINE bool utf8_isspace(uint32_t c) {
    if (c < 128) {
        return isspace((int)c) != 0;
    }
    if (c == 0x2028 || c == 0x2029) {
        return true;
    }
    return utf8_isgroup(U8G_Zs, c);
}
/* State carried across utf8_decode() calls while the next utf8 codepoint is
 * decoded. Scheme: https://bjoern.hoehrmann.de/utf-8/decoder/dfa */
typedef struct {
    uint32_t state;   /* decoder state; 0 both before and after a complete codepoint */
    uint32_t codep;   /* codepoint bits gathered so far */
} utf8_decode_t;
/* Feed one input byte to the decoder and return the resulting state.
 *
 * d    : decoder state, updated in place (both state and codep).
 * byte : the next input byte; it indexes utf8_dtab directly, so callers are
 *        expected to pass 0..255 (utf8_peek() casts to uint8_t).
 *
 * utf8_dtab[byte] gives the byte's class. With state 0 the codepoint restarts
 * from the byte masked by (0xff >> class); otherwise the low six bits of the
 * byte are appended to the bits already gathered. The new state is read from
 * utf8_dtab[256 + old_state + class]. */
STC_INLINE uint32_t utf8_decode(utf8_decode_t* d, const uint32_t byte) {
    extern const uint8_t utf8_dtab[]; /* utf8code.c */
    const uint32_t type = utf8_dtab[byte];

    if (d->state != 0) {
        d->codep = (d->codep << 6) | (byte & 0x3fu);
    } else {
        d->codep = byte & (0xffU >> type);
    }

    d->state = utf8_dtab[256 + d->state + type];
    return d->state;
}
/* Decode and return the codepoint that starts at s.
 * Bytes are fed to utf8_decode() one at a time, always at least one, until
 * the decoder state is 0 again.
 * NOTE(review): the loop has no other exit, so whether it terminates on
 * malformed input depends on the contents of utf8_dtab -- confirm. */
STC_INLINE uint32_t utf8_peek(const char* s) {
    utf8_decode_t dec = {.state=0};
    for (;;) {
        const uint8_t byte = (uint8_t)*s++;
        if (utf8_decode(&dec, byte) == 0) {
            break;
        }
    }
    return dec.codep;
}
/* case-insensitive utf8 string comparison */
STC_INLINE int utf8_icmp(const char* s1, const char* s2) {
return utf8_icmp_sv(c_sv(s1, INTPTR_MAX), c_sv(s2, INTPTR_MAX));
}
/* Validate a utf8 C string by calling utf8_valid_n() with a byte limit of
 * INTPTR_MAX, i.e. no practical length bound from this side. */
STC_INLINE bool utf8_valid(const char* s) {
    const intptr_t no_limit = INTPTR_MAX;
    return utf8_valid_n(s, no_limit);
}
/* following functions are independent but assume valid utf8 strings: */
/* Number of bytes in the utf8 codepoint starting at s, judged from its first
 * byte only: 1 below 0x80, 2 below 0xE0, 3 below 0xF0, otherwise 4.
 * No validation is done: first bytes in 0x80..0xC1 report 2 and first bytes
 * from 0xF5 upward report 4, although neither can begin a valid sequence. */
STC_INLINE int utf8_chr_size(const char *s) {
    const unsigned lead = (uint8_t)*s;
    /* each threshold the lead byte reaches adds one byte to the length */
    return 1 + (lead >= 0x80) + (lead >= 0xE0) + (lead >= 0xF0);
}
/* Number of codepoints in the NUL-terminated utf8 string s.
 * A codepoint is counted at its last byte: for every non-NUL byte the
 * following byte is inspected, and the count goes up when that byte is not a
 * continuation byte (10xxxxxx). The terminating NUL closes the final one. */
STC_INLINE intptr_t utf8_size(const char *s) {
    intptr_t count = 0;
    for (intptr_t i = 0; s[i] != 0; ++i) {
        const unsigned char following = (unsigned char)s[i + 1];
        if ((following & 0xC0) != 0x80) {
            ++count;
        }
    }
    return count;
}
/* Number of codepoints in the first nbytes bytes of the utf8 string s.
 *
 * s      : valid utf8; need not be NUL-terminated when nbytes bounds it.
 * nbytes : byte budget. Scanning also stops early at a NUL byte. A negative
 *          budget never reaches 0, so only a NUL ends the scan in that case.
 * return : count of non-continuation bytes found in the scanned range.
 *
 * A codepoint is counted at its first byte, so nothing outside
 * [s, s + nbytes) is ever read and the result cannot depend on whatever
 * byte happens to follow the range. The budget is tested before the byte
 * (short-circuit &&) for the same reason. */
STC_INLINE intptr_t utf8_size_n(const char *s, intptr_t nbytes) {
    intptr_t size = 0;
    for (; nbytes != 0 && *s != 0; --nbytes, ++s) {
        size += (*s & 0xC0) != 0x80;
    }
    return size;
}
/* Pointer to the codepoint number index (0-based) of the utf8 string s.
 * Walks forward byte by byte, consuming one unit of index each time the byte
 * just stepped onto is not a continuation byte (10xxxxxx). Stops at the
 * terminating NUL if the string has fewer codepoints; returns s unchanged
 * when index is zero or negative. */
STC_INLINE const char* utf8_at(const char *s, intptr_t index) {
    const char *p = s;
    while (index > 0 && *p != 0) {
        ++p;
        if ((*p & 0xC0) != 0x80) {
            --index;
        }
    }
    return p;
}
/* Byte offset from s of the position utf8_at(s, index) returns. */
STC_INLINE intptr_t utf8_pos(const char* s, intptr_t index) {
    const char *where = utf8_at(s, index);
    return (intptr_t)(where - s);
}
#endif // UTF8_H_INCLUDED
#if defined(i_extern)
# include "../../src/utf8code.c"
# undef i_extern
#endif
|