diff options
| author | Tyge Løvset <[email protected]> | 2022-06-07 16:39:30 +0200 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-06-07 16:39:30 +0200 |
| commit | 48b3336d8f10d8097d7626732ede5896ec353407 (patch) | |
| tree | e83f87925eb84fbbaefeb3fe4cee39edacf232e0 /src | |
| parent | b65f70fdc80b19d869adabb5cb270807c96d152b (diff) | |
| download | STC-modified-48b3336d8f10d8097d7626732ede5896ec353407.tar.gz STC-modified-48b3336d8f10d8097d7626732ede5896ec353407.zip | |
Some improvements in utf8tabs.py
Diffstat (limited to 'src')
| -rw-r--r-- | src/utf8code.c | 2 | ||||
| -rw-r--r-- | src/utf8tabs.py | 25 |
2 files changed, 13 insertions, 14 deletions
diff --git a/src/utf8code.c b/src/utf8code.c index 4b55d2be..0cdcd8b6 100644 --- a/src/utf8code.c +++ b/src/utf8code.c @@ -105,7 +105,7 @@ int utf8_icmp_n(size_t u8max, const char* s1, const size_t n1, do { utf8_decode(&d2, (uint8_t)s2[j2++]); } while (d2.state); int c = utf8_tolower(d1.codep) - utf8_tolower(d2.codep); if (c || !s2[j2]) - return c; + return c; } return (j2 < n2) - (j1 < n1); } diff --git a/src/utf8tabs.py b/src/utf8tabs.py index 55be3f99..241f44af 100644 --- a/src/utf8tabs.py +++ b/src/utf8tabs.py @@ -60,7 +60,7 @@ def make_table(caselist): offset = b - a if abs(diff_a) > 2 or a - prev_a != diff_a or b - prev_b != diff_b or prev_offs != offset: - if j > 0 and start_a not in [0xAB70, 0x13F8]: + if j > 0 and start_a not in [0xAB70, 0x13F8]: # BUG in CaseFolding.txt V14 table.append([start_a, prev_a, prev_b, start_name]) if j < n_1: diff_a = caselist[j+1][0] - a @@ -103,7 +103,7 @@ def print_index_table(name, indtab): def make_inverse_ind(table): inv = [] for i in range(len(table)): - if table[i][2] not in [ord('i'), ord('s')]: # remove 'i'. 's' upcase mappings. + if table[i][2] not in [ord('i'), ord('s')]: # ignore 'i'. 's' upcase mappings. inv.append(i) # sort by mapped value table[:][2] (= inv) of the first element in each range entry inv.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0])) @@ -124,25 +124,24 @@ def main(): print('#include <stdint.h>\n') print('struct CaseMapping { uint16_t c0, c1, m1; };\n') - casemappings = compile_table('lowcase') # casefold - lowcase = compile_table('lowcase', 'Lu') # unicode lowercase + casemappings = compile_table('lowcase') # Casefolding.txt + upcase = compile_table('lowcase', 'Lu') # UnicodeData.txt uppercase - lowcase_ind = [] - for v in lowcase: - try: - lowcase_ind.append(casemappings.index(v)) - except: - lowcase_ind.append(len(casemappings)) + # merge in additional Lu => Ll mappings from UnicodeData.txt + for v in upcase: + if v not in casemappings: casemappings.append(v) - casefold_len = len(casemappings) + # sort casemappings by uppercase values: casemappings.sort(key=lambda x: x[0]) - print_table('casemappings', casemappings, style=1) + + # index list sorted by mapped lowercase values: upcase_ind = make_inverse_ind(casemappings) - print('enum { casefold_len = %d };' % casefold_len) + print('enum { casefold_len = %d };' % len(casemappings)) print_index_table('upcase_ind', upcase_ind) + ########### main: if __name__ == "__main__": |
