summaryrefslogtreecommitdiffhomepage
path: root/include/stc/utf8.h
diff options
context:
space:
mode:
authorTyge Løvset <[email protected]>2022-05-31 15:04:52 +0200
committerTyge Løvset <[email protected]>2022-05-31 15:04:52 +0200
commiteb9821bec4a292458499042392924595b3338085 (patch)
tree90b6286619deefc2eaf72fce4fc67d5959c84557 /include/stc/utf8.h
parent0a92ec2235b5f42e93012be14938bb11e3f3650a (diff)
downloadSTC-modified-eb9821bec4a292458499042392924595b3338085.tar.gz
STC-modified-eb9821bec4a292458499042392924595b3338085.zip
1) Removed files/modules that are not relevant, to make the library more focused:
- threads.h/threads.c (external lib) - coptions.h (will be kept as a gist) - more will follow (examples, some benchmarks, etc.). 2) Replaced the UTF-8 decoder with Björn Höhrmann's DFA decoder.
Diffstat (limited to 'include/stc/utf8.h')
-rw-r--r--include/stc/utf8.h17
1 files changed, 10 insertions, 7 deletions
diff --git a/include/stc/utf8.h b/include/stc/utf8.h
index e5fa5894..630a7a7c 100644
--- a/include/stc/utf8.h
+++ b/include/stc/utf8.h
@@ -35,19 +35,22 @@ bool utf8_isalpha(uint32_t c);
bool utf8_isalnum(uint32_t c);
uint32_t utf8_tolower(uint32_t c);
uint32_t utf8_toupper(uint32_t c);
-
bool utf8_valid(const char* s);
bool utf8_valid_n(const char* s, size_t n);
-
int utf8_icmp_n(size_t u8max, const char* s1, size_t n1,
const char* s2, size_t n2);
+unsigned utf8_encode(char *out, uint32_t c);
+
/* encode/decode next utf8 codepoint. */
-enum { UTF8_OK = 0, UTF8_ERROR = 4 };
-typedef struct { uint32_t state, codep, size; } utf8_decode_t;
+typedef struct { uint32_t state, codep; } utf8_decode_t; /* decoder context used by utf8_decode(): `state` holds the current table state, `codep` the code point bits accumulated so far */
-void utf8_peek(utf8_decode_t* d, const char *s);
-unsigned utf8_encode(char *out, uint32_t c);
-void utf8_decode(utf8_decode_t *d, const uint8_t b);
+STC_INLINE uint32_t utf8_decode(utf8_decode_t* d, const uint32_t byte) { /* Feed one input byte to the table-driven UTF-8 decoder: updates d->codep and d->state, and returns the new d->state. */
+ extern const uint8_t utf8_dtab[]; /* lookup table defined elsewhere; indexed below by `byte` and by 256 + state + type */
+ const uint32_t type = utf8_dtab[byte]; /* type value looked up for this byte. NOTE(review): `byte` indexes the table directly, so it is presumably expected to be 0..255 -- confirm callers cast plain char to uint8_t */
+ d->codep = d->state ? (byte & 0x3fu) | (d->codep << 6) /* state != 0: shift the accumulated code point left by 6 and append the low 6 bits of `byte` */
+ : (0xff >> type) & byte; /* state == 0: start a new code point, keeping only the bits of `byte` that survive the type-dependent mask */
+ return d->state = utf8_dtab[256 + d->state + type]; /* next state is read from the table region starting at offset 256; presumably 0 means a complete code point -- TODO confirm against utf8_dtab */
+}
/* case-insensitive utf8 string comparison */
STC_INLINE int utf8_icmp(const char* s1, const char* s2) {