Moved utf8 from cregex.h to separate file. Splitted csview.h into another file strings.h.

author: Tyge Løvset <[email protected]> 2022-01-08 23:13:32 +0100
committer: Tyge Løvset <[email protected]> 2022-01-08 23:13:32 +0100
commit: 63562f5135243ac2a2553b4e7360c59e86686d6f (patch)
tree: 439a3853a6bfded32a3031b577457bfba470f1a7 /include/stc/utf8.h
parent: 0a9910eee6582e6ee414071a0d5e7062448989cf (diff)
download: STC-modified-63562f5135243ac2a2553b4e7360c59e86686d6f.tar.gz
STC-modified-63562f5135243ac2a2553b4e7360c59e86686d6f.zip
1 files changed, 99 insertions, 0 deletions
diff --git a/include/stc/utf8.h b/include/stc/utf8.h
new file mode 100644
index 00000000..77b86a18
--- /dev/null
+++ b/include/stc/utf8.h
@@ -0,0 +1,99 @@
+#ifndef STC_UTF8_INCLUDED
+#define STC_UTF8_INCLUDED
+
+#include "ccommon.h"
+#include <ctype.h>
+
+enum utf8_state {
+    utf8_ACCEPT = 0,
+    utf8_REJECT = 12
+};
+
+STC_API uint32_t utf8_decode(uint32_t *state, uint32_t *codep, const uint32_t byte);
+STC_API bool utf8_valid_codepoints(const uint8_t *s, size_t *count);
+
+STC_INLINE bool utf8_is_valid(const char *s)
+{
+    size_t count;
+    return utf8_valid_codepoints((const uint8_t *)s, &count);
+}
+
+STC_INLINE uint32_t utf8_peek(const char *s)
+{
+    uint32_t state = utf8_ACCEPT, codepoint;
+    utf8_decode(&state, &codepoint, (uint8_t)s[0]);
+    return codepoint;
+}
+
+STC_INLINE int utf8_codepoint_width(uint8_t c)
+{
+    int ret = (c & 0xF0) == 0xE0;
+    ret += ret << 1;                       // 3
+    ret |= c < 0x80;                       // 1
+    ret |= ((0xC1 < c) & (c < 0xE0)) << 1; // 2
+    ret |= ((0xEF < c) & (c < 0xF5)) << 2; // 4
+    return ret;
+}
+
+STC_INLINE const char *utf8_next(const char *s)
+{
+    const char* t = s + utf8_codepoint_width((uint8_t)s[0]);
+    
+    uintptr_t p = (uintptr_t)t;
+    p &= (uintptr_t) -(*s != 0);
+    return (const char *)p;
+}
+
+// assumes input is valid utf8! Use utf8_valid_codepoints() if unsure.
+STC_INLINE size_t utf8_size(const char *s)
+{
+    size_t count = 0;
+    while (*s)
+        s += utf8_codepoint_width((uint8_t)*s), ++count;
+    return count;
+}
+
+
+// --------------------------- IMPLEMENTATION ---------------------------------
+#ifdef _i_implement
+
+static const uint8_t utf8_table[] = {
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+    12,36,12,12,12,12,12,12,12,12,12,12, 
+};
+
+STC_DEF uint32_t utf8_decode(uint32_t *state, uint32_t *codep,
+                                const uint32_t byte)
+{
+    const uint32_t type = utf8_table[byte];
+    const uint32_t x = (uint32_t) -(*state != utf8_ACCEPT);
+
+    *codep = (x & ((byte & 0x3fu) | (*codep << 6)))  
+           | (~x & ((0xff >> type) & (byte)));
+
+    return *state = utf8_table[256 + *state + type];
+}
+
+STC_DEF bool utf8_valid_codepoints(const uint8_t *s, size_t *count)
+{
+    uint32_t state = utf8_ACCEPT, codepoint;
+    
+    for (*count = 0; *s; ++s)
+        *count += utf8_decode(&state, &codepoint, *s) == utf8_ACCEPT;
+    return state == utf8_ACCEPT;
+}
+
+#endif
+#endif
+#undef i_opt
author	Tyge Løvset <[email protected]>	2022-01-08 23:13:32 +0100
committer	Tyge Løvset <[email protected]>	2022-01-08 23:13:32 +0100
commit	63562f5135243ac2a2553b4e7360c59e86686d6f (patch)
tree	439a3853a6bfded32a3031b577457bfba470f1a7 /include/stc/utf8.h
parent	0a9910eee6582e6ee414071a0d5e7062448989cf (diff)
download	STC-modified-63562f5135243ac2a2553b4e7360c59e86686d6f.tar.gz STC-modified-63562f5135243ac2a2553b4e7360c59e86686d6f.zip