diff options
| author | Tyge Løvset <[email protected]> | 2022-05-31 15:04:52 +0200 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-05-31 15:04:52 +0200 |
| commit | eb9821bec4a292458499042392924595b3338085 (patch) | |
| tree | 90b6286619deefc2eaf72fce4fc67d5959c84557 /include/stc/utf8.h | |
| parent | 0a92ec2235b5f42e93012be14938bb11e3f3650a (diff) | |
| download | STC-modified-eb9821bec4a292458499042392924595b3338085.tar.gz STC-modified-eb9821bec4a292458499042392924595b3338085.zip | |
1) REMOVED files/modules not relevant: makes lib more focused:
- threads.h/threads.c (external lib)
- coptions.h - will be kept as a gist.
- more will follow, (examples, some benchmarks, etc).
2) Replaced UTF8 decoder with Björn Höhrmann's DFA decoder.
Diffstat (limited to 'include/stc/utf8.h')
| -rw-r--r-- | include/stc/utf8.h | 17 |
1 files changed, 10 insertions, 7 deletions
diff --git a/include/stc/utf8.h b/include/stc/utf8.h
index e5fa5894..630a7a7c 100644
--- a/include/stc/utf8.h
+++ b/include/stc/utf8.h
@@ -35,19 +35,22 @@ bool utf8_isalpha(uint32_t c);
bool utf8_isalnum(uint32_t c);
uint32_t utf8_tolower(uint32_t c);
uint32_t utf8_toupper(uint32_t c);
-
bool utf8_valid(const char* s);
bool utf8_valid_n(const char* s, size_t n);
-
int utf8_icmp_n(size_t u8max, const char* s1, size_t n1,
const char* s2, size_t n2);
+unsigned utf8_encode(char *out, uint32_t c);
+
/* encode/decode next utf8 codepoint. */
-enum { UTF8_OK = 0, UTF8_ERROR = 4 };
-typedef struct { uint32_t state, codep, size; } utf8_decode_t;
+typedef struct { uint32_t state, codep; } utf8_decode_t;
-void utf8_peek(utf8_decode_t* d, const char *s);
-unsigned utf8_encode(char *out, uint32_t c);
-void utf8_decode(utf8_decode_t *d, const uint8_t b);
+STC_INLINE uint32_t utf8_decode(utf8_decode_t* d, const uint32_t byte) {
+ extern const uint8_t utf8_dtab[];
+ const uint32_t type = utf8_dtab[byte];
+ d->codep = d->state ? (byte & 0x3fu) | (d->codep << 6)
+ : (0xff >> type) & byte;
+ return d->state = utf8_dtab[256 + d->state + type];
+}
/* case-insensitive utf8 string comparison */
STC_INLINE int utf8_icmp(const char* s1, const char* s2) {
|
