From b28d3fa7c3b9233ca485014744bf84e6c4f5a1d3 Mon Sep 17 00:00:00 2001
From: Tyge Lovset <tylovset@gmail.com>
Date: Mon, 30 May 2022 10:17:07 +0200
Subject: Large refactoring on strings / utf8 and some file structure.

---
 src/casefold.c  | 207 -------------------------------------------------
 src/casefold.py | 235 --------------------------------------------------------
 src/cregex.c    |   3 +-
 src/utf8tabs.c  |  59 ++++++++++++++
 src/utf8tabs.h  |  10 +++
 src/utf8tabs.py | 144 ++++++++++++++++++++++++++++++++++
 src/utf8utils.c | 190 +++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 404 insertions(+), 444 deletions(-)
 delete mode 100644 src/casefold.c
 delete mode 100644 src/casefold.py
 create mode 100644 src/utf8tabs.c
 create mode 100644 src/utf8tabs.h
 create mode 100644 src/utf8tabs.py
 create mode 100644 src/utf8utils.c

(limited to 'src')

diff --git a/src/casefold.c b/src/casefold.c
deleted file mode 100644
index 1b0a9463..00000000
--- a/src/casefold.c
+++ /dev/null
@@ -1,207 +0,0 @@
-#include <ctype.h>
-#define i_header
-#include <stc/cstr.h>
-
-static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = {
-    {65, 90, 122}, {181, 181, 956}, {192, 214, 246}, {216, 222, 254},
-    {256, 302, 303}, {306, 310, 311}, {313, 327, 328}, {330, 374, 375}, {376, 376, 255},
-    {377, 381, 382}, {383, 383, 115}, {385, 385, 595}, {386, 388, 389}, {390, 390, 596},
-    {391, 391, 392}, {393, 394, 599}, {395, 395, 396}, {398, 398, 477}, {399, 399, 601},
-    {400, 400, 603}, {401, 401, 402}, {403, 403, 608}, {404, 404, 611}, {406, 406, 617},
-    {407, 407, 616}, {408, 408, 409}, {412, 412, 623}, {413, 413, 626}, {415, 415, 629},
-    {416, 420, 421}, {422, 422, 640}, {423, 423, 424}, {425, 425, 643}, {428, 428, 429},
-    {430, 430, 648}, {431, 431, 432}, {433, 434, 651}, {435, 437, 438}, {439, 439, 658},
-    {440, 442, 443}, {452, 452, 454}, {453, 453, 454}, {455, 455, 457}, {456, 456, 457},
-    {458, 458, 460}, {459, 475, 476}, {478, 494, 495}, {497, 497, 499}, {498, 500, 501},
-    {502, 502, 405}, {503, 503, 447}, {504, 542, 543}, {544, 544, 414}, {546, 562, 563},
-    {570, 570, 11365}, {571, 571, 572}, {573, 573, 410}, {574, 574, 11366}, {577, 577, 578},
-    {579, 579, 384}, {580, 580, 649}, {581, 581, 652}, {582, 590, 591}, {837, 837, 953},
-    {880, 882, 883}, {886, 886, 887}, {895, 895, 1011}, {902, 902, 940}, {904, 906, 943},
-    {908, 908, 972}, {910, 911, 974}, {913, 929, 961}, {931, 939, 971}, {962, 962, 963},
-    {975, 975, 983}, {976, 976, 946}, {977, 977, 952}, {981, 981, 966}, {982, 982, 960},
-    {984, 1006, 1007}, {1008, 1008, 954}, {1009, 1009, 961}, {1012, 1012, 952}, {1013, 1013, 949},
-    {1015, 1015, 1016}, {1017, 1017, 1010}, {1018, 1018, 1019}, {1021, 1023, 893},
-    {1024, 1039, 1119}, {1040, 1071, 1103}, {1120, 1152, 1153}, {1162, 1214, 1215},
-    {1216, 1216, 1231}, {1217, 1229, 1230}, {1232, 1326, 1327}, {1329, 1366, 1414},
-    {4256, 4293, 11557}, {4295, 4296, 11560}, {5112, 5117, 5109}, {7296, 7296, 1074},
-    {7297, 7297, 1076}, {7298, 7298, 1086}, {7299, 7300, 1090}, {7301, 7301, 1090},
-    {7302, 7302, 1098}, {7303, 7303, 1123}, {7304, 7304, 42571}, {7312, 7354, 4346},
-    {7357, 7359, 4351}, {7680, 7828, 7829}, {7835, 7835, 7777}, {7838, 7838, 223},
-    {7840, 7934, 7935}, {7944, 7951, 7943}, {7960, 7965, 7957}, {7976, 7983, 7975},
-    {7992, 7999, 7991}, {8008, 8013, 8005}, {8025, 8028, 8020}, {8040, 8047, 8039},
-    {8072, 8079, 8071}, {8088, 8095, 8087}, {8104, 8111, 8103}, {8120, 8121, 8113},
-    {8122, 8123, 8049}, {8124, 8124, 8115}, {8126, 8126, 953}, {8136, 8139, 8053},
-    {8140, 8140, 8131}, {8152, 8153, 8145}, {8154, 8155, 8055}, {8168, 8169, 8161},
-    {8170, 8171, 8059}, {8172, 8172, 8165}, {8184, 8185, 8057}, {8186, 8187, 8061},
-    {8188, 8188, 8179}, {8486, 8486, 969}, {8490, 8490, 107}, {8491, 8491, 229},
-    {8498, 8498, 8526}, {8544, 8559, 8575}, {8579, 8579, 8580}, {9398, 9423, 9449},
-    {11264, 11311, 11359}, {11360, 11360, 11361}, {11362, 11362, 619}, {11363, 11363, 7549},
-    {11364, 11364, 637}, {11367, 11371, 11372}, {11373, 11373, 593}, {11374, 11374, 625},
-    {11375, 11375, 592}, {11376, 11376, 594}, {11378, 11380, 11381}, {11390, 11391, 576},
-    {11392, 11490, 11491}, {11499, 11501, 11502}, {11506, 11508, 11509}, {42562, 42604, 42605},
-    {42624, 42650, 42651}, {42786, 42798, 42799}, {42802, 42862, 42863}, {42873, 42875, 42876},
-    {42877, 42877, 7545}, {42878, 42886, 42887}, {42891, 42891, 42892}, {42893, 42893, 613},
-    {42896, 42898, 42899}, {42902, 42920, 42921}, {42922, 42922, 614}, {42923, 42923, 604},
-    {42924, 42924, 609}, {42925, 42925, 620}, {42926, 42926, 618}, {42928, 42928, 670},
-    {42929, 42929, 647}, {42930, 42930, 669}, {42931, 42931, 43859}, {42932, 42946, 42947},
-    {42948, 42948, 42900}, {42949, 42949, 642}, {42950, 42950, 7566}, {42951, 42953, 42954},
-    {42960, 42962, 42963}, {42968, 42970, 42971}, {43888, 43913, 5049}, {65313, 65338, 65370},
-}; // 188
-static uint8_t cfold_low[] = {
-    0, 138, 10, 111, 2, 139, 3, 8, 4, 5, 6, 7, 9, 59, 12, 14, 16, 20, 49, 25,
-    56, 52, 29, 31, 33, 35, 37, 39, 50, 40, 41, 42, 43, 44, 45, 17, 46, 47, 48, 51,
-    53, 55, 155, 58, 62, 152, 150, 153, 11, 13, 15, 18, 19, 171, 21, 172, 22, 167, 170, 24,
-    23, 174, 146, 173, 26, 151, 27, 28, 148, 30, 181, 32, 176, 34, 60, 36, 61, 38, 177, 175,
-    64, 65, 87, 67, 68, 71, 75, 83, 76, 82, 63, 126, 80, 1, 78, 81, 72, 73, 77, 137,
-    69, 70, 74, 79, 85, 66, 84, 86, 89, 99, 100, 101, 102, 103, 104, 88, 90, 105, 91, 93,
-    92, 94, 95, 107, 108, 186, 98, 164, 147, 182, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119,
-    124, 127, 130, 134, 132, 135, 120, 121, 122, 123, 125, 128, 129, 131, 133, 136, 140, 141, 142, 143,
-    144, 145, 54, 57, 149, 154, 156, 157, 158, 96, 97, 159, 106, 160, 161, 162, 163, 165, 166, 168,
-    180, 169, 179, 183, 184, 185, 178, 187,
-};
-
-uint32_t utf8_tolower(uint32_t c) {
-    for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) {
-        if (c <= casefold[i].c1) {
-            if (c < casefold[i].c0) return c;
-            int d = casefold[i].m1 - casefold[i].c1;
-            if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1));
-            return c + d;
-        }
-    }
-    return c;
-}
-
-uint32_t utf8_toupper(uint32_t c) {
-    for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) {
-        struct CaseFold cfold = casefold[cfold_low[i]];
-        if (c <= cfold.m1) {
-            int d = cfold.m1 - cfold.c1;
-            if (c < (uint32_t)(cfold.c0 + d)) return c;
-            if (d == 1) return c - ((cfold.m1 & 1) == (c & 1));
-            return c - d;
-        }
-    }
-    return c;
-}
-
-bool utf8_isupper(uint32_t c) {
-    return utf8_tolower(c) != c;
-}
-
-bool utf8_islower(uint32_t c) {
-    return utf8_toupper(c) != c;
-}
-
-bool utf8_isspace(uint32_t c) {
-    static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0,
-                           0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000};
-    for (size_t i=0; i<sizeof t/sizeof *t; ++i)
-        if (c == t[i]) return true;
-    return (c >= 0x2000) & (c <= 0x200A);
-}
-
-bool utf8_isdigit(uint32_t c) {
-    return ((c >= '0') & (c <= '9')) || 
-           ((c >= 0xFF10) & (c <= 0xFF19));
-}
-
-bool utf8_isxdigit(uint32_t c) {
-    static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10, 
-                           0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46};
-    for (size_t i=1; i<sizeof t/sizeof *t; i += 2)
-        if (c <= t[i]) return c >= t[i - 1];
-    return false;
-}
-
-bool utf8_isalnum(uint32_t c) {
-    if (c < 128) return isalnum(c) != 0;
-    if ((c >= 0xFF10) & (c <= 0xFF19)) return true;
-    return utf8_islower(c) || utf8_isupper(c);
-}
-
-bool utf8_isalpha(uint32_t c) {
-    if (c < 128) return isalpha(c) != 0;
-    return utf8_islower(c) || utf8_isupper(c);
-}
-
-static struct fnfold {
-    int (*conv_asc)(int);
-    uint32_t (*conv_u8)(uint32_t);
-}
-fn_tolower = {tolower, utf8_tolower},
-fn_toupper = {toupper, utf8_toupper};
-
-
-static cstr cstr_casefold(const cstr* self, struct fnfold fold) {
-    csview sv = cstr_sv(self);
-    cstr out = cstr_null;
-    char *buf = cstr_reserve(&out, sv.size*3/2);
-    uint32_t cp; size_t sz = 0;
-    utf8_decode_t d = {UTF8_OK};
-
-    for (; *sv.str; sv.str += d.size) {
-        utf8_peek(sv.str, &d);
-        switch (d.size) {
-        case 1:
-            buf[sz++] = (char)fold.conv_asc(*sv.str);
-            break;
-        default: 
-            cp = fold.conv_u8(d.codep);
-            sz += utf8_encode(buf + sz, cp);
-        }
-    }
-    _cstr_set_size(&out, sz);
-    cstr_shrink_to_fit(&out);
-    return out;
-}
-
-cstr cstr_tolower(const cstr* self) {
-    return cstr_casefold(self, fn_tolower);
-}
-
-cstr cstr_toupper(const cstr* self) {
-    return cstr_casefold(self, fn_toupper);
-}
-
-void cstr_lowercase(cstr* self) {
-    cstr_take(self, cstr_casefold(self, fn_tolower));
-}
-
-void cstr_uppercase(cstr* self) {
-    cstr_take(self, cstr_casefold(self, fn_toupper));
-}
-
-#ifdef TEST
-int main()
-{
-    for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i)
-    {
-        char x[3][5]={0};
-        unsigned s0, s1, s2;
-        uint32_t a = casefold[i].c0;
-        uint32_t b = utf8_tolower(a);
-        uint32_t c = utf8_toupper(b);
-
-        s0 = utf8_encode(x[0], a);
-        s1 = utf8_encode(x[1], b);
-        s2 = utf8_encode(x[2], c);
-        printf("%s %s %s - %u %u %u (%u %u %u)\n", x[0], x[1], x[2], a, b, c, s0, s1, s2);
-    }
-    c_auto (cstr, t1)
-    {
-        t1 = cstr_new("Die preußischen Köstlichkeiten.");
-
-        cstr_buf b = cstr_buffer(&t1);
-        printf("%s, %llu %llu\n", b.data, b.size, b.cap);
-        cstr_lowercase(&t1);
-        b = cstr_buffer(&t1);
-        printf("%s, %llu %llu\n", b.data, b.size, b.cap);
-
-        cstr_uppercase(&t1);
-        b = cstr_buffer(&t1);
-        printf("%s, %llu %llu\n", b.data, b.size, b.cap);
-    }
-}
-#endif
-
diff --git a/src/casefold.py b/src/casefold.py
deleted file mode 100644
index 951f3bf6..00000000
--- a/src/casefold.py
+++ /dev/null
@@ -1,235 +0,0 @@
-#!python
-import pandas as pd
-import numpy as np
-
-def read_unidata(catfilter='Lu', casefilter='lowcase', big=False):
-    ud = pd.read_csv("ucd/UnicodeData.txt", sep=';', converters={0: lambda x: int(x, base=16)},
-                      names=['code', 'name', 'category', 'canclass', 'bidircat', 'chrdecomp',
-                             'decdig', 'digval', 'numval', 'mirrored', 'uc1name', 'comment',
-                             'upcase', 'lowcase', 'titlecase'],
-                      usecols=['code', 'name', 'category', 'bidircat', 'upcase', 'lowcase', 'titlecase'])
-    if big: 
-        ud = ud[ud['code'] >= (1<<16)]
-    else:
-        ud = ud[ud['code'] < (1<<16)]
-    ud = ud[ud['category'] ==  catfilter]
-    ud = ud.replace(np.nan, '0')
-    for k in ['upcase', 'lowcase', 'titlecase']:
-        ud[k] = ud[k].apply(int, base=16)
-    if casefilter: # 'lowcase', 'upcase', 'titlecase'
-        ud = ud[ud[casefilter] != 0]
-    return ud
-
-
-def read_casefold(big=False):
-    cf = pd.read_csv("ucd/CaseFolding.txt", engine='python', sep='; #? ?', comment='#',
-                     converters={0: lambda x: int(x, base=16)},
-                     names=['code', 'status', 'lowcase', 'name'])
-    if big:
-        cf = cf[cf['code'] >= (1<<16)]
-    else:
-        cf = cf[cf['code'] < (1<<16)]
-    cf = cf[cf.status.isin(['S', 'C'])]
-    cf['lowcase'] = cf['lowcase'].apply(int, base=16)
-    #print(cf['name'].values)
-    #print(cf)
-    return cf
-
-
-def make_caselist(df):
-    letters=[]
-    for idx, row in df.iterrows():
-        #print(idx+1, ':', row['code'], row['lowcase'] - row['code'], ',', chr(row['code']), chr(row['lowcase']), ',', row['name'])
-        letters.append([idx+1, row['code'], row['lowcase'], row['name']])
-    return letters
-
-
-def make_casefold(letters):
-    prevoffset = 0
-    diffoffset = 0
-    prev = [-1, 0, 0]
-    diff = [-1, 0, 0]
-
-    out = []
-    n = 1
-    for x in letters:
-        offset = x[2] - x[1]
-        diffoffset = prevoffset - offset
-        if (diff[1] and x[1] - prev[1] != diff[1]) or (diff[2] and x[2] - prev[2] != diff[2]) or prevoffset != offset:
-            out.append([x[1], x[2], n, x[3]]) # , ';', chr(x[1]), chr(x[2]), ';', x[1], offset, "CHANGE")
-            #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset, "CHANGE", n+1)
-            diff[1] = 0
-            diff[2] = 0
-            n = 1
-        else:
-            n += 1
-            if diff[1] == 0:
-                diff[1] = x[1] - prev[1]
-                diff[2] = x[2] - prev[2]
-            #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset)
-            diff[1] = x[1] - prev[1]
-            diff[2] = x[2] - prev[2]
-        prev[2] = x[2]
-        prev[1] = x[1]
-        prevoffset = offset
-
-    out.append(out[-1])
-    out[-1][2] = 26
-    cfold = []
-    for i in range(0, len(out)-1):
-        d = out[i][1] - out[i][0]
-        cfold.append([out[i][0], out[i+1][2], d, out[i][3]])
-    return cfold
-
-
-def print_casefold(cfold):
-    print('''
-static struct CaseFold { uint16_t c0, c1, m1; } casefold[] = {''')
-    n = 1
-    s = 5
-    count = 0
-    table = []
-    print('   ', end='')
-    for x in cfold:
-        d = 2 if abs(x[2]) == 1 else 1
-        a = x[0]
-        b = x[0] + (x[1] - 1)*d
-        c = b + x[2]
-        if b >= 1<<16 or c >= 1<<16: # only to make sure...
-            break
-        #print(' {%d, %d, %d}, // %s %s, %s\n   ' % (a, b, c, chr(a), chr(a + x[2]), x[3]), end='')
-        if True: # compact
-            if n == s: 
-                n = 0
-                if a > 1000:
-                    s = 4
-                print('\n   ', end='')
-            print(' {%d, %d, %d},' % (a, b, c), end='')
-            table.append((a, b, c))
-        else:
-            print(' {%d, %d, %d}, // ' % (a, b, c), end='')
-            for y in range(x[0], x[0] + x[1], d):
-                print('%s %s, ' % (chr(y), chr(y + x[2])), end='')
-            print('')
-        count += 1
-        n += 1
-    print('\n}; // %d' % (count))
-    return table
-
-
-def make_casetable():
-    df = read_casefold()
-    #df = read_unidata()
-    letters = make_caselist(df)
-    cfold = make_casefold(letters)
-    return cfold
-
-
-def print_casefold_low(table):
-    cfold_low = [i for i in range(len(table))]
-    cfold_low.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0]))
-
-    print('static uint8_t cfold_low[] = {\n   ', end='')
-    for i in range(len(cfold_low)):
-        print(" %d," % (cfold_low[i]), end='\n   ' if (i+1) % 20 == 0 else '')
-    print('\n};')
-
-
-########### main:
-
-if __name__ == "__main__":
-    print('''#include <stdint.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <stc/utf8.h>
-#include <stdbool.h>''')
-
-    cfold = make_casetable()
-    table = print_casefold(cfold)
-    print_casefold_low(table)
-
-    print(r'''
-uint32_t utf8_tolower(uint32_t c) {
-    for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) {
-        if (c <= casefold[i].c1) {
-            if (c < casefold[i].c0) return c;
-            int d = casefold[i].m1 - casefold[i].c1;
-            if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1));
-            return c + d;
-        }
-    }
-    return c;
-}
-
-uint32_t utf8_toupper(uint32_t c) {
-    for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) {
-        struct CaseFold cfold = casefold[cfold_low[i]];
-        if (c <= cfold.m1) {
-            int d = cfold.m1 - cfold.c1;
-            if (c < (uint32_t)(cfold.c0 + d)) return c;
-            if (d == 1) return c - ((cfold.m1 & 1) == (c & 1));
-            return c - d;
-        }
-    }
-    return c;
-}
-
-bool utf8_isupper(uint32_t c) {
-    return utf8_tolower(c) != c;
-}
-
-bool utf8_islower(uint32_t c) {
-    return utf8_toupper(c) != c;
-}
-
-bool utf8_isspace(uint32_t c) {
-    static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0,
-                           0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000};
-    for (size_t i=0; i<sizeof t/sizeof *t; ++i)
-        if (c == t[i]) return true;
-    return (c >= 0x2000) & (c <= 0x200A);
-}
-
-bool utf8_isdigit(uint32_t c) {
-    return ((c >= '0') & (c <= '9')) || 
-           ((c >= 0xFF10) & (c <= 0xFF19));
-}
-
-bool utf8_isxdigit(uint32_t c) {
-    static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10, 
-                           0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46};
-    for (size_t i=1; i<sizeof t/sizeof *t; i += 2)
-        if (c <= t[i]) return c >= t[i - 1];
-    return false;
-}
-
-bool utf8_isalnum(uint32_t c) {
-    if (c < 128) return isalnum(c) != 0;
-    if ((c >= 0xFF10) & (c <= 0xFF19)) return true;
-    return utf8_islower(c) || utf8_isupper(c);
-}
-
-bool utf8_isalpha(uint32_t c) {
-    if (c < 128) return isalpha(c) != 0;
-    return utf8_islower(c) || utf8_isupper(c);
-}
-
-
-#ifdef TEST
-int main()
-{
-    for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i)
-    {
-        char x[3][5]={0};
-        uint32_t a = casefold[i].c0;
-        uint32_t b = utf8_tolower(a);
-        uint32_t c = utf8_toupper(b);
-
-        utf8_encode(x[0], a);
-        utf8_encode(x[1], b);
-        utf8_encode(x[2], c);
-        printf("%s %s %s - %u %u %u\n", x[0], x[1], x[2], a, b, c);
-    }
-}
-#endif
-''')
diff --git a/src/cregex.c b/src/cregex.c
index 0f585f5d..34c78090 100644
--- a/src/cregex.c
+++ b/src/cregex.c
@@ -32,7 +32,6 @@ THE SOFTWARE.
 #include <string.h>
 #include <ctype.h>
 #include <stc/cregex.h>
-#include "cregex_utf8.c"
 
 typedef uint32_t Rune; /* Utf8 code point */
 typedef int32_t Token;
@@ -594,7 +593,7 @@ nextc(Parser *par, Rune *rp)
                 return 2;
             case 'p': case 'P': { /* https://www.regular-expressions.info/unicode.html */
                 static struct { const char* c; int n, r; } cls[] = {
-                    {"{Alpha}", 7, U8_LC}, {"{LC}", 4, U8_LC}, 
+                    {"{Alpha}", 7, U8_LC}, {"{LC}", 4, U8_LC},
                     {"{Alnum}", 7, U8_Xan},
                     {"{Digit}", 7, U8_Nd}, {"{Nd}", 4, U8_Nd},
                     {"{Lower}", 7, U8_Ll}, {"{Ll}", 4, U8_Ll},
diff --git a/src/utf8tabs.c b/src/utf8tabs.c
new file mode 100644
index 00000000..8168f78f
--- /dev/null
+++ b/src/utf8tabs.c
@@ -0,0 +1,59 @@
+#include "utf8tabs.h"
+
+struct CaseFold casefold[] = {
+    {65, 90, 122}, {181, 181, 956}, {192, 214, 246}, {216, 222, 254},
+    {256, 302, 303}, {306, 310, 311}, {313, 327, 328}, {330, 374, 375}, {376, 376, 255},
+    {377, 381, 382}, {383, 383, 115}, {385, 385, 595}, {386, 388, 389}, {390, 390, 596},
+    {391, 391, 392}, {393, 394, 599}, {395, 395, 396}, {398, 398, 477}, {399, 399, 601},
+    {400, 400, 603}, {401, 401, 402}, {403, 403, 608}, {404, 404, 611}, {406, 406, 617},
+    {407, 407, 616}, {408, 408, 409}, {412, 412, 623}, {413, 413, 626}, {415, 415, 629},
+    {416, 420, 421}, {422, 422, 640}, {423, 423, 424}, {425, 425, 643}, {428, 428, 429},
+    {430, 430, 648}, {431, 431, 432}, {433, 434, 651}, {435, 437, 438}, {439, 439, 658},
+    {440, 442, 443}, {452, 452, 454}, {453, 453, 454}, {455, 455, 457}, {456, 456, 457},
+    {458, 458, 460}, {459, 475, 476}, {478, 494, 495}, {497, 497, 499}, {498, 500, 501},
+    {502, 502, 405}, {503, 503, 447}, {504, 542, 543}, {544, 544, 414}, {546, 562, 563},
+    {570, 570, 11365}, {571, 571, 572}, {573, 573, 410}, {574, 574, 11366}, {577, 577, 578},
+    {579, 579, 384}, {580, 580, 649}, {581, 581, 652}, {582, 590, 591}, {837, 837, 953},
+    {880, 882, 883}, {886, 886, 887}, {895, 895, 1011}, {902, 902, 940}, {904, 906, 943},
+    {908, 908, 972}, {910, 911, 974}, {913, 929, 961}, {931, 939, 971}, {962, 962, 963},
+    {975, 975, 983}, {976, 976, 946}, {977, 977, 952}, {981, 981, 966}, {982, 982, 960},
+    {984, 1006, 1007}, {1008, 1008, 954}, {1009, 1009, 961}, {1012, 1012, 952}, {1013, 1013, 949},
+    {1015, 1015, 1016}, {1017, 1017, 1010}, {1018, 1018, 1019}, {1021, 1023, 893},
+    {1024, 1039, 1119}, {1040, 1071, 1103}, {1120, 1152, 1153}, {1162, 1214, 1215},
+    {1216, 1216, 1231}, {1217, 1229, 1230}, {1232, 1326, 1327}, {1329, 1366, 1414},
+    {4256, 4293, 11557}, {4295, 4296, 11560}, {5112, 5117, 5109}, {7296, 7296, 1074},
+    {7297, 7297, 1076}, {7298, 7298, 1086}, {7299, 7300, 1090}, {7301, 7301, 1090},
+    {7302, 7302, 1098}, {7303, 7303, 1123}, {7304, 7304, 42571}, {7312, 7354, 4346},
+    {7357, 7359, 4351}, {7680, 7828, 7829}, {7835, 7835, 7777}, {7838, 7838, 223},
+    {7840, 7934, 7935}, {7944, 7951, 7943}, {7960, 7965, 7957}, {7976, 7983, 7975},
+    {7992, 7999, 7991}, {8008, 8013, 8005}, {8025, 8028, 8020}, {8040, 8047, 8039},
+    {8072, 8079, 8071}, {8088, 8095, 8087}, {8104, 8111, 8103}, {8120, 8121, 8113},
+    {8122, 8123, 8049}, {8124, 8124, 8115}, {8126, 8126, 953}, {8136, 8139, 8053},
+    {8140, 8140, 8131}, {8152, 8153, 8145}, {8154, 8155, 8055}, {8168, 8169, 8161},
+    {8170, 8171, 8059}, {8172, 8172, 8165}, {8184, 8185, 8057}, {8186, 8187, 8061},
+    {8188, 8188, 8179}, {8486, 8486, 969}, {8490, 8490, 107}, {8491, 8491, 229},
+    {8498, 8498, 8526}, {8544, 8559, 8575}, {8579, 8579, 8580}, {9398, 9423, 9449},
+    {11264, 11311, 11359}, {11360, 11360, 11361}, {11362, 11362, 619}, {11363, 11363, 7549},
+    {11364, 11364, 637}, {11367, 11371, 11372}, {11373, 11373, 593}, {11374, 11374, 625},
+    {11375, 11375, 592}, {11376, 11376, 594}, {11378, 11380, 11381}, {11390, 11391, 576},
+    {11392, 11490, 11491}, {11499, 11501, 11502}, {11506, 11508, 11509}, {42562, 42604, 42605},
+    {42624, 42650, 42651}, {42786, 42798, 42799}, {42802, 42862, 42863}, {42873, 42875, 42876},
+    {42877, 42877, 7545}, {42878, 42886, 42887}, {42891, 42891, 42892}, {42893, 42893, 613},
+    {42896, 42898, 42899}, {42902, 42920, 42921}, {42922, 42922, 614}, {42923, 42923, 604},
+    {42924, 42924, 609}, {42925, 42925, 620}, {42926, 42926, 618}, {42928, 42928, 670},
+    {42929, 42929, 647}, {42930, 42930, 669}, {42931, 42931, 43859}, {42932, 42946, 42947},
+    {42948, 42948, 42900}, {42949, 42949, 642}, {42950, 42950, 7566}, {42951, 42953, 42954},
+    {42960, 42962, 42963}, {42968, 42970, 42971}, {43888, 43913, 5049}, {65313, 65338, 65370},
+}; // 188
+uint8_t cfold_low[] = {
+    0, 138, 10, 111, 2, 139, 3, 8, 4, 5, 6, 7, 9, 59, 12, 14, 16, 20, 49, 25,
+    56, 52, 29, 31, 33, 35, 37, 39, 50, 40, 41, 42, 43, 44, 45, 17, 46, 47, 48, 51,
+    53, 55, 155, 58, 62, 152, 150, 153, 11, 13, 15, 18, 19, 171, 21, 172, 22, 167, 170, 24,
+    23, 174, 146, 173, 26, 151, 27, 28, 148, 30, 181, 32, 176, 34, 60, 36, 61, 38, 177, 175,
+    64, 65, 87, 67, 68, 71, 75, 83, 76, 82, 63, 126, 80, 1, 78, 81, 72, 73, 77, 137,
+    69, 70, 74, 79, 85, 66, 84, 86, 89, 99, 100, 101, 102, 103, 104, 88, 90, 105, 91, 93,
+    92, 94, 95, 107, 108, 186, 98, 164, 147, 182, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119,
+    124, 127, 130, 134, 132, 135, 120, 121, 122, 123, 125, 128, 129, 131, 133, 136, 140, 141, 142, 143,
+    144, 145, 54, 57, 149, 154, 156, 157, 158, 96, 97, 159, 106, 160, 161, 162, 163, 165, 166, 168,
+    180, 169, 179, 183, 184, 185, 178, 187,
+};
diff --git a/src/utf8tabs.h b/src/utf8tabs.h
new file mode 100644
index 00000000..95251f75
--- /dev/null
+++ b/src/utf8tabs.h
@@ -0,0 +1,10 @@
+#ifndef utf8tabs_included
+#define utf8tabs_included
+
+#include <stdint.h>
+struct CaseFold { uint16_t c0, c1, m1; } ;
+
+extern struct CaseFold casefold[188];
+extern uint8_t cfold_low[188];
+
+#endif
diff --git a/src/utf8tabs.py b/src/utf8tabs.py
new file mode 100644
index 00000000..563180e3
--- /dev/null
+++ b/src/utf8tabs.py
@@ -0,0 +1,144 @@
+#!python
+import pandas as pd
+import numpy as np
+
+def read_unidata(catfilter='Lu', casefilter='lowcase', big=False):
+    ud = pd.read_csv("ucd/UnicodeData.txt", sep=';', converters={0: lambda x: int(x, base=16)},
+                      names=['code', 'name', 'category', 'canclass', 'bidircat', 'chrdecomp',
+                             'decdig', 'digval', 'numval', 'mirrored', 'uc1name', 'comment',
+                             'upcase', 'lowcase', 'titlecase'],
+                      usecols=['code', 'name', 'category', 'bidircat', 'upcase', 'lowcase', 'titlecase'])
+    if big:
+        ud = ud[ud['code'] >= (1<<16)]
+    else:
+        ud = ud[ud['code'] < (1<<16)]
+    ud = ud[ud['category'] ==  catfilter]
+    ud = ud.replace(np.nan, '0')
+    for k in ['upcase', 'lowcase', 'titlecase']:
+        ud[k] = ud[k].apply(int, base=16)
+    if casefilter: # 'lowcase', 'upcase', 'titlecase'
+        ud = ud[ud[casefilter] != 0]
+    return ud
+
+
+def read_casefold(big=False):
+    cf = pd.read_csv("ucd/CaseFolding.txt", engine='python', sep='; #? ?', comment='#',
+                     converters={0: lambda x: int(x, base=16)},
+                     names=['code', 'status', 'lowcase', 'name'])
+    if big:
+        cf = cf[cf['code'] >= (1<<16)]
+    else:
+        cf = cf[cf['code'] < (1<<16)]
+    cf = cf[cf.status.isin(['S', 'C'])]
+    cf['lowcase'] = cf['lowcase'].apply(int, base=16)
+    #print(cf['name'].values)
+    #print(cf)
+    return cf
+
+
+def make_caselist(df):
+    letters=[]
+    for idx, row in df.iterrows():
+        #print(idx+1, ':', row['code'], row['lowcase'] - row['code'], ',', chr(row['code']), chr(row['lowcase']), ',', row['name'])
+        letters.append([idx+1, row['code'], row['lowcase'], row['name']])
+    return letters
+
+
+def make_casefold(letters):
+    prevoffset = 0
+    diffoffset = 0
+    prev = [-1, 0, 0]
+    diff = [-1, 0, 0]
+
+    out = []
+    n = 1
+    for x in letters:
+        offset = x[2] - x[1]
+        diffoffset = prevoffset - offset
+        if (diff[1] and x[1] - prev[1] != diff[1]) or (diff[2] and x[2] - prev[2] != diff[2]) or prevoffset != offset:
+            out.append([x[1], x[2], n, x[3]]) # , ';', chr(x[1]), chr(x[2]), ';', x[1], offset, "CHANGE")
+            #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset, "CHANGE", n+1)
+            diff[1] = 0
+            diff[2] = 0
+            n = 1
+        else:
+            n += 1
+            if diff[1] == 0:
+                diff[1] = x[1] - prev[1]
+                diff[2] = x[2] - prev[2]
+            #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset)
+            diff[1] = x[1] - prev[1]
+            diff[2] = x[2] - prev[2]
+        prev[2] = x[2]
+        prev[1] = x[1]
+        prevoffset = offset
+
+    out.append(out[-1])
+    out[-1][2] = 26
+    cfold = []
+    for i in range(0, len(out)-1):
+        d = out[i][1] - out[i][0]
+        cfold.append([out[i][0], out[i+1][2], d, out[i][3]])
+    return cfold
+
+
+def print_casefold(cfold):
+    print('''
+struct CaseFold casefold[] = {''')
+    n = 1
+    s = 5
+    count = 0
+    table = []
+    print('   ', end='')
+    for x in cfold:
+        d = 2 if abs(x[2]) == 1 else 1
+        a = x[0]
+        b = x[0] + (x[1] - 1)*d
+        c = b + x[2]
+        if b >= 1<<16 or c >= 1<<16: # only to make sure...
+            break
+        #print(' {%d, %d, %d}, // %s %s, %s\n   ' % (a, b, c, chr(a), chr(a + x[2]), x[3]), end='')
+        if True: # compact
+            if n == s:
+                n = 0
+                if a > 1000:
+                    s = 4
+                print('\n   ', end='')
+            print(' {%d, %d, %d},' % (a, b, c), end='')
+            table.append((a, b, c))
+        else:
+            print(' {%d, %d, %d}, // ' % (a, b, c), end='')
+            for y in range(x[0], x[0] + x[1], d):
+                print('%s %s, ' % (chr(y), chr(y + x[2])), end='')
+            print('')
+        count += 1
+        n += 1
+    print('\n}; // %d' % (count))
+    return table
+
+
+def make_casetable():
+    df = read_casefold()
+    #df = read_unidata()
+    letters = make_caselist(df)
+    cfold = make_casefold(letters)
+    return cfold
+
+
+def print_casefold_low(table):
+    cfold_low = [i for i in range(len(table))]
+    cfold_low.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0]))
+
+    print('uint8_t cfold_low[] = {\n   ', end='')
+    for i in range(len(cfold_low)):
+        print(" %d," % (cfold_low[i]), end='\n   ' if (i+1) % 20 == 0 else '')
+    print('\n};')
+
+
+########### main:
+
+if __name__ == "__main__":
+    print('#include "utf8tabs.h"')
+    cfold = make_casetable()
+    table = print_casefold(cfold)
+    print_casefold_low(table)
diff --git a/src/utf8utils.c b/src/utf8utils.c
new file mode 100644
index 00000000..3b01ae39
--- /dev/null
+++ b/src/utf8utils.c
@@ -0,0 +1,190 @@
+#include <ctype.h>
+#define i_header
+#include <stc/cstr.h>
+
+#include "utf8tabs.h"
+#include "utf8tabs.c"
+
+// https://news.ycombinator.com/item?id=15423674
+// https://gist.github.com/s4y/344a355f8c1f99c6a4cb2347ec4323cc
+
+void utf8_decode(utf8_decode_t *d, const uint8_t b)
+{
+    switch (d->state) {
+    case UTF8_OK:
+        if      (b < 0x80) d->codep = b, d->size = 1;
+        else if (b < 0xC2) d->state = UTF8_ERROR, d->size = 0;
+        else if (b < 0xE0) d->state = 1, d->codep = b & 0x1F, d->size = 2;
+        else if (b < 0xF0) d->state = 2, d->codep = b & 0x0F, d->size = 3;
+        else if (b < 0xF5) d->state = 3, d->codep = b & 0x07, d->size = 4;
+        else d->state = UTF8_ERROR, d->size = 0;
+        break;
+    case 1: case 2: case 3:
+        if ((b & 0xC0) == 0x80) {
+            d->state -= 1;
+            d->codep = (d->codep << 6) | (b & 0x3F);
+        } else
+            d->state = UTF8_ERROR, d->size = 0;
+    }
+}
+
+unsigned utf8_encode(char *out, uint32_t c)
+{
+    if (c < 0x80U) {
+        out[0] = (char) c;
+        return 1;
+    } else if (c < 0x0800U) {
+        out[0] = (char) ((c>>6  & 0x1F) | 0xC0);
+        out[1] = (char) ((c     & 0x3F) | 0x80);
+        return 2;
+    } else if (c < 0x010000U) {
+        if ((c < 0xD800U) | (c >= 0xE000U)) {
+            out[0] = (char) ((c>>12 & 0x0F) | 0xE0);
+            out[1] = (char) ((c>>6  & 0x3F) | 0x80);
+            out[2] = (char) ((c     & 0x3F) | 0x80);
+            return 3;
+        }
+    } else if (c < 0x110000U) {
+        out[0] = (char) ((c>>18 & 0x07) | 0xF0);
+        out[1] = (char) ((c>>12 & 0x3F) | 0x80);
+        out[2] = (char) ((c>>6  & 0x3F) | 0x80);
+        out[3] = (char) ((c     & 0x3F) | 0x80);
+        return 4;
+    }
+    return 0;
+}
+
+void utf8_peek(utf8_decode_t* d, const char *s) {
+    utf8_decode(d, (uint8_t)*s++);
+    switch (d->size) {
+        case 4: utf8_decode(d, (uint8_t)*s++);
+        case 3: utf8_decode(d, (uint8_t)*s++);
+        case 2: utf8_decode(d, (uint8_t)*s++);
+    }
+}
+
+bool utf8_valid(const char* s) {
+    utf8_decode_t d = {UTF8_OK};
+    while (*s)
+        utf8_decode(&d, (uint8_t)*s++);
+    return d.state == UTF8_OK;
+}
+
+bool utf8_valid_n(const char* s, size_t n) {
+    utf8_decode_t d = {UTF8_OK};
+    while ((n-- != 0) & (*s != 0))
+        utf8_decode(&d, (uint8_t)*s++);
+    return d.state == UTF8_OK;
+}
+
+uint32_t utf8_tolower(uint32_t c) {
+    for (size_t i=0; i < sizeof casefold/sizeof *casefold; ++i) {
+        if (c <= casefold[i].c1) {
+            if (c < casefold[i].c0) return c;
+            int d = casefold[i].m1 - casefold[i].c1;
+            if (d == 1) return c + ((casefold[i].c1 & 1) == (c & 1));
+            return c + d;
+        }
+    }
+    return c;
+}
+
+uint32_t utf8_toupper(uint32_t c) {
+    for (size_t i=0; i < sizeof cfold_low/sizeof *cfold_low; ++i) {
+        struct CaseFold cfold = casefold[cfold_low[i]];
+        if (c <= cfold.m1) {
+            int d = cfold.m1 - cfold.c1;
+            if (c < (uint32_t)(cfold.c0 + d)) return c;
+            if (d == 1) return c - ((cfold.m1 & 1) == (c & 1));
+            return c - d;
+        }
+    }
+    return c;
+}
+
+bool utf8_isupper(uint32_t c) {
+    return utf8_tolower(c) != c;
+}
+
+bool utf8_islower(uint32_t c) {
+    return utf8_toupper(c) != c;
+}
+
+bool utf8_isspace(uint32_t c) {
+    static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0,
+                           0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000};
+    for (size_t i=0; i<sizeof t/sizeof *t; ++i)
+        if (c == t[i]) return true;
+    return (c >= 0x2000) & (c <= 0x200A);
+}
+
+bool utf8_isdigit(uint32_t c) {
+    return ((c >= '0') & (c <= '9')) ||
+           ((c >= 0xFF10) & (c <= 0xFF19));
+}
+
+bool utf8_isxdigit(uint32_t c) {
+    static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66, 0xFF10,
+                           0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46};
+    for (size_t i=1; i<sizeof t/sizeof *t; i += 2)
+        if (c <= t[i]) return c >= t[i - 1];
+    return false;
+}
+
+bool utf8_isalnum(uint32_t c) {
+    if (c < 128) return isalnum(c) != 0;
+    if ((c >= 0xFF10) & (c <= 0xFF19)) return true;
+    return utf8_islower(c) || utf8_isupper(c);
+}
+
+bool utf8_isalpha(uint32_t c) {
+    if (c < 128) return isalpha(c) != 0;
+    return utf8_islower(c) || utf8_isupper(c);
+}
+
+static struct fnfold {
+    int (*conv_asc)(int);
+    uint32_t (*conv_u8)(uint32_t);
+}
+fn_tolower = {tolower, utf8_tolower},
+fn_toupper = {toupper, utf8_toupper};
+
+
+static cstr cstr_casefold(const cstr* self, struct fnfold fold) {
+    csview sv = cstr_sv(self);
+    cstr out = cstr_null;
+    char *buf = cstr_reserve(&out, sv.size*3/2);
+    uint32_t cp; size_t sz = 0;
+    utf8_decode_t d = {UTF8_OK};
+
+    for (; *sv.str; sv.str += d.size) {
+        utf8_peek(&d, sv.str);
+        switch (d.size) {
+        case 1:
+            buf[sz++] = (char)fold.conv_asc(*sv.str);
+            break;
+        default:
+            cp = fold.conv_u8(d.codep);
+            sz += utf8_encode(buf + sz, cp);
+        }
+    }
+    _cstr_set_size(&out, sz);
+    cstr_shrink_to_fit(&out);
+    return out;
+}
+
+cstr cstr_tolower(const cstr* self) {
+    return cstr_casefold(self, fn_tolower);
+}
+
+cstr cstr_toupper(const cstr* self) {
+    return cstr_casefold(self, fn_toupper);
+}
+
+void cstr_lowercase(cstr* self) {
+    cstr_take(self, cstr_casefold(self, fn_tolower));
+}
+
+void cstr_uppercase(cstr* self) {
+    cstr_take(self, cstr_casefold(self, fn_toupper));
+}
-- 
cgit v1.2.3