Updated benchmarks and renamed utf8utils.c to utf8code.c

author: Tyge Løvset <[email protected]> 2022-05-30 17:08:09 +0200
committer: Tyge Løvset <[email protected]> 2022-05-30 17:08:09 +0200
commit: 6a5ec101dc606a9bb13ef5a5ee64ab001decc666 (patch)
tree: 7b368fdcc1e7055bd1d327f29c4c108eb604c751 /src/utf8code.c
parent: 58bb58e7980d1eae175ed66dbe873893a05ab81e (diff)
download: STC-modified-6a5ec101dc606a9bb13ef5a5ee64ab001decc666.tar.gz
STC-modified-6a5ec101dc606a9bb13ef5a5ee64ab001decc666.zip
1 file changed, 188 insertions, 0 deletions
diff --git a/src/utf8code.c b/src/utf8code.c
new file mode 100644
index 00000000..c58f78b6
--- /dev/null
+++ b/src/utf8code.c
@@ -0,0 +1,188 @@
+#include <ctype.h>
+#define i_header
+#include <stc/cstr.h>
+#include "utf8tabs.inc"
+
+// https://news.ycombinator.com/item?id=15423674
+// https://gist.github.com/s4y/344a355f8c1f99c6a4cb2347ec4323cc
+
+// Feed one byte `b` into the incremental UTF-8 decoder `d`.
+//
+// d->state: UTF8_OK between code points, 1..3 while that many continuation
+//           bytes are still expected, UTF8_ERROR after a malformed byte.
+//           Once in the error state, later bytes fall into `default` and
+//           are ignored until the caller resets the state.
+// d->size:  set by the lead byte to the sequence length (1..4); 0 on error.
+// d->codep: accumulated code point, complete when state is UTF8_OK again.
+//
+// Besides malformed lead/continuation bytes, a completed sequence is also
+// rejected when it is an overlong 3- or 4-byte form, a UTF-16 surrogate
+// (U+D800..U+DFFF) or a value above U+10FFFF.
+void utf8_decode(utf8_decode_t *d, const uint8_t b)
+{
+    switch (d->state) {
+    case UTF8_OK:
+        if      (b < 0x80) d->codep = b, d->size = 1;
+        // 0x80..0xBF: stray continuation byte; 0xC0/0xC1: overlong 2-byte lead
+        else if (b < 0xC2) d->state = UTF8_ERROR, d->size = 0;
+        else if (b < 0xE0) d->state = 1, d->codep = b & 0x1F, d->size = 2;
+        else if (b < 0xF0) d->state = 2, d->codep = b & 0x0F, d->size = 3;
+        else if (b < 0xF5) d->state = 3, d->codep = b & 0x07, d->size = 4;
+        // 0xF5..0xFF can never start a sequence
+        else d->state = UTF8_ERROR, d->size = 0;
+        break;
+    case 1: case 2: case 3:
+        if ((b & 0xC0) != 0x80) {
+            d->state = UTF8_ERROR, d->size = 0;
+            break;
+        }
+        d->state -= 1;
+        d->codep = (d->codep << 6) | (b & 0x3F);
+        if (d->state != UTF8_OK)
+            break; // more continuation bytes to come
+        // Sequence complete: the lead-byte checks cannot catch these, so
+        // validate the decoded value against the length that encoded it.
+        if ((d->size == 3 && (d->codep < 0x0800 ||
+                              (d->codep >= 0xD800 && d->codep <= 0xDFFF))) ||
+            (d->size == 4 && (d->codep < 0x010000 || d->codep > 0x10FFFF)))
+            d->state = UTF8_ERROR, d->size = 0;
+        break;
+    default:
+        // error state: stay there, ignore input
+        break;
+    }
+}
+
+// Encode code point `c` as UTF-8 into `out` (no terminator is written).
+// Returns the number of bytes stored (1..4), or 0 without touching `out`
+// when `c` is a surrogate (U+D800..U+DFFF) or lies above U+10FFFF.
+// `out` must have room for up to 4 bytes.
+unsigned utf8_encode(char *out, uint32_t c)
+{
+    // lead-byte marker bits, indexed by sequence length
+    static const unsigned char lead_bits[] = {0x00, 0x00, 0xC0, 0xE0, 0xF0};
+
+    if (c >= 0x110000U || (c >= 0xD800U && c < 0xE000U))
+        return 0;
+
+    const unsigned len = c < 0x80U ? 1 : c < 0x0800U ? 2 : c < 0x010000U ? 3 : 4;
+
+    // Fill the continuation bytes back to front, six payload bits each;
+    // whatever is left of `c` afterwards fits in the lead byte.
+    for (unsigned pos = len - 1; pos > 0; --pos) {
+        out[pos] = (char) ((c & 0x3F) | 0x80);
+        c >>= 6;
+    }
+    out[0] = (char) (c | lead_bits[len]);
+    return len;
+}
+
+// Decode the single code point that starts at `s` into `d`.
+// On success d->codep holds the code point and d->size its length in
+// bytes; on malformed input utf8_decode() leaves d->size == 0.
+void utf8_peek(utf8_decode_t* d, const char *s) {
+    // Always start from a clean state: an error or half-read sequence left
+    // in `d` by an earlier call would otherwise turn this call into a no-op.
+    d->state = UTF8_OK;
+    utf8_decode(d, (uint8_t)*s++);
+    // The lead byte fixed d->size; feed the remaining size-1 bytes.
+    // utf8_decode() zeroes d->size on a bad byte, which ends the loop there
+    // instead of reading on past it (the bad byte may be the terminator).
+    for (unsigned left = d->size; left > 1 && d->size != 0; --left)
+        utf8_decode(d, (uint8_t)*s++);
+}
+
+// Return true when the NUL-terminated string `s` runs through utf8_decode()
+// without error and does not end in the middle of a multi-byte sequence.
+bool utf8_valid(const char* s) {
+    utf8_decode_t dec = {UTF8_OK};
+    for (const char *p = s; *p; ++p)
+        utf8_decode(&dec, (uint8_t)*p);
+    return dec.state == UTF8_OK;
+}
+
+// Like utf8_valid(), but looks at no more than `n` bytes of `s`; scanning
+// also stops early at a NUL byte. Returns true when the bytes seen decode
+// without error and end on a code-point boundary.
+bool utf8_valid_n(const char* s, size_t n) {
+    utf8_decode_t d = {UTF8_OK};
+    // Short-circuit && (not bitwise &): s[i] must not be read once i == n,
+    // since the buffer may be exactly n bytes long with no terminator.
+    for (size_t i = 0; i < n && s[i] != 0; ++i)
+        utf8_decode(&d, (uint8_t)s[i]);
+    return d.state == UTF8_OK;
+}
+
+// Map code point `c` to lower case using the `casefold` table; code points
+// without an entry are returned unchanged.
+// The scan takes the first entry whose c1 is >= c, so it presumably relies
+// on the table being ordered by c1 (defined in utf8tabs.inc; verify there).
+uint32_t utf8_tolower(uint32_t c) {
+    const size_t count = sizeof casefold / sizeof casefold[0];
+    size_t k = 0;
+
+    while (k < count && c > casefold[k].c1)
+        ++k;
+    // past the last range, or in the gap before range k
+    if (k == count || c < casefold[k].c0)
+        return c;
+
+    const int delta = casefold[k].m1 - casefold[k].c1;
+    if (delta != 1)
+        return c + delta;
+    // delta == 1: only code points with the same parity as c1 are moved up
+    // by one, the others are returned as they are
+    return c + ((casefold[k].c1 & 1) == (c & 1));
+}
+
+// Map code point `c` to upper case; code points without a mapping are
+// returned unchanged.
+// Walks `casefold` in the order given by the `cfold_low` index array and
+// takes the first entry whose m1 is >= c (presumably cfold_low orders the
+// entries by m1; both tables come from utf8tabs.inc, verify there).
+uint32_t utf8_toupper(uint32_t c) {
+    const size_t count = sizeof cfold_low / sizeof cfold_low[0];
+
+    for (size_t k = 0; k < count; ++k) {
+        const struct CaseFold *e = &casefold[cfold_low[k]];
+        if (c > e->m1)
+            continue;
+
+        const int delta = e->m1 - e->c1;
+        // c0 shifted by the same distance is the start of the mapped range
+        const uint32_t mapped_first = (uint32_t)(e->c0 + delta);
+        if (c < mapped_first)
+            return c;
+        if (delta == 1)
+            // only code points with the same parity as m1 are moved down by one
+            return c - ((e->m1 & 1) == (c & 1));
+        return c - delta;
+    }
+    return c;
+}
+
+// True when utf8_tolower() maps `c` to a different code point.
+bool utf8_isupper(uint32_t c) {
+    const uint32_t lowered = utf8_tolower(c);
+    return lowered != c;
+}
+
+// True when utf8_toupper() maps `c` to a different code point.
+bool utf8_islower(uint32_t c) {
+    const uint32_t raised = utf8_toupper(c);
+    return raised != c;
+}
+
+// True when `c` is one of the whitespace code points listed below:
+// TAB..CR, SPACE, U+0085, U+00A0, U+1680, U+2000..U+200A, U+2028, U+2029,
+// U+202F, U+205F or U+3000.
+bool utf8_isspace(uint32_t c) {
+    if (c >= 0x2000 && c <= 0x200A)
+        return true;
+    switch (c) {
+    case 0x09: case 0x0A: case 0x0B: case 0x0C: case 0x0D:
+    case 0x20: case 0x85: case 0xA0:
+    case 0x1680:
+    case 0x2028: case 0x2029: case 0x202F: case 0x205F:
+    case 0x3000:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// True for ASCII '0'..'9' and for the fullwidth digits U+FF10..U+FF19.
+bool utf8_isdigit(uint32_t c) {
+    if (c >= '0' && c <= '9')
+        return true;
+    return c >= 0xFF10 && c <= 0xFF19;
+}
+
+// True for the hexadecimal digits 0-9, A-F, a-f in their ASCII form and in
+// their fullwidth form (U+FF10..U+FF19, U+FF21..U+FF26, U+FF41..U+FF46).
+bool utf8_isxdigit(uint32_t c) {
+    const bool ascii_hex = (c >= 0x30 && c <= 0x39) ||
+                           (c >= 0x41 && c <= 0x46) ||
+                           (c >= 0x61 && c <= 0x66);
+    const bool wide_hex  = (c >= 0xFF10 && c <= 0xFF19) ||
+                           (c >= 0xFF21 && c <= 0xFF26) ||
+                           (c >= 0xFF41 && c <= 0xFF46);
+    return ascii_hex || wide_hex;
+}
+
+// True when `c` is alphanumeric: below 128 the answer comes from isalnum();
+// above that, fullwidth digits U+FF10..U+FF19 and every code point that
+// has a case mapping (utf8_islower / utf8_isupper) qualify.
+bool utf8_isalnum(uint32_t c) {
+    if (c < 128)
+        return isalnum(c) != 0;
+    const bool wide_digit = c >= 0xFF10 && c <= 0xFF19;
+    return wide_digit || utf8_islower(c) || utf8_isupper(c);
+}
+
+// True when `c` is alphabetic: below 128 the answer comes from isalpha();
+// above that, a code point counts as a letter only if it has a case
+// mapping (utf8_islower / utf8_isupper).
+bool utf8_isalpha(uint32_t c) {
+    return c < 128 ? isalpha(c) != 0
+                   : (utf8_islower(c) || utf8_isupper(c));
+}
+
+// Pair of per-character case converters handed to cstr_casefold():
+// conv_asc is applied to single-byte characters, conv_u8 to the decoded
+// code point of multi-byte sequences.
+struct fnfold {
+    int (*conv_asc)(int);
+    uint32_t (*conv_u8)(uint32_t);
+};
+
+// Converter tables for lower- and upper-casing. They are never modified,
+// so they are const; cstr_casefold() receives them by value.
+static const struct fnfold fn_tolower = {tolower, utf8_tolower};
+static const struct fnfold fn_toupper = {toupper, utf8_toupper};
+
+
+// Build and return a new cstr holding the contents of `self` with every
+// character converted through `fold`: single-byte characters go through
+// fold.conv_asc, multi-byte sequences are decoded, mapped with
+// fold.conv_u8 and re-encoded. Processing stops at the first NUL byte.
+// Bytes that do not form valid UTF-8 are copied through unchanged.
+static cstr cstr_casefold(const cstr* self, struct fnfold fold) {
+    csview sv = cstr_sv(self);
+    cstr out = cstr_null;
+    // 1.5x the input size; presumably covers the worst case of a 2-byte
+    // sequence mapping to a 3-byte one (TODO confirm against the tables).
+    char *buf = cstr_reserve(&out, sv.size*3/2);
+    size_t sz = 0;
+    utf8_decode_t d = {UTF8_OK};
+
+    for (; *sv.str; sv.str += d.size) {
+        utf8_peek(&d, sv.str);
+        switch (d.size) {
+        case 0:
+            // Malformed input: the decoder reports size 0 and stays in its
+            // error state, which would stall this loop forever. Emit the
+            // offending byte as is, reset the decoder and step over it.
+            buf[sz++] = *sv.str;
+            d.state = UTF8_OK;
+            d.size = 1;
+            break;
+        case 1:
+            buf[sz++] = (char)fold.conv_asc(*sv.str);
+            break;
+        default:
+            // utf8_encode() returns 0 for code points it cannot encode,
+            // in which case nothing is appended
+            sz += utf8_encode(buf + sz, fold.conv_u8(d.codep));
+        }
+    }
+    _cstr_set_size(&out, sz);
+    cstr_shrink_to_fit(&out);
+    return out;
+}
+
+// Return a newly built lower-cased copy of `self`; `self` is left untouched.
+cstr cstr_tolower(const cstr* self) {
+    cstr lowered = cstr_casefold(self, fn_tolower);
+    return lowered;
+}
+
+// Return a newly built upper-cased copy of `self`; `self` is left untouched.
+cstr cstr_toupper(const cstr* self) {
+    cstr raised = cstr_casefold(self, fn_toupper);
+    return raised;
+}
+
+// Lower-case `self` in place: build the converted string first, then hand
+// it to `self` with cstr_take().
+void cstr_lowercase(cstr* self) {
+    cstr lowered = cstr_casefold(self, fn_tolower);
+    cstr_take(self, lowered);
+}
+
+// Upper-case `self` in place: build the converted string first, then hand
+// it to `self` with cstr_take().
+void cstr_uppercase(cstr* self) {
+    cstr raised = cstr_casefold(self, fn_toupper);
+    cstr_take(self, raised);
+}
author	Tyge Løvset <[email protected]>	2022-05-30 17:08:09 +0200
committer	Tyge Løvset <[email protected]>	2022-05-30 17:08:09 +0200
commit	6a5ec101dc606a9bb13ef5a5ee64ab001decc666 (patch)
tree	7b368fdcc1e7055bd1d327f29c4c108eb604c751 /src/utf8code.c
parent	58bb58e7980d1eae175ed66dbe873893a05ab81e (diff)
download	STC-modified-6a5ec101dc606a9bb13ef5a5ee64ab001decc666.tar.gz STC-modified-6a5ec101dc606a9bb13ef5a5ee64ab001decc666.zip