diff options
| author | Tyge Løvset <[email protected]> | 2022-06-03 17:29:24 +0200 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-06-03 17:29:24 +0200 |
| commit | bfcd2e3a6912ac392694c238fce8fc613d2f1525 (patch) | |
| tree | 3917609c6cec567db1345da7da79339c753e6a22 /src | |
| parent | dd5ff55b2ec373370c894ad7e3aec884ce640a43 (diff) | |
| download | STC-modified-bfcd2e3a6912ac392694c238fce8fc613d2f1525.tar.gz STC-modified-bfcd2e3a6912ac392694c238fce8fc613d2f1525.zip | |
refactored utf8tabs.py, still has a bug though
Diffstat (limited to 'src')
| -rw-r--r-- | src/utf8tabs.py | 135 |
1 files changed, 71 insertions, 64 deletions
diff --git a/src/utf8tabs.py b/src/utf8tabs.py index 1b6ad8d7..0bb8cf64 100644 --- a/src/utf8tabs.py +++ b/src/utf8tabs.py @@ -4,7 +4,7 @@ import numpy as np _UNICODE_DIR = "https://www.unicode.org/Public/14.0.0/ucd" -def read_unidata(category='Lu', casetype='lowcase', big=False): +def read_unidata(casetype='lowcase', category='Lu', big=False): df = pd.read_csv(_UNICODE_DIR+'/UnicodeData.txt', sep=';', converters={0: lambda x: int(x, base=16)}, names=['code', 'name', 'category', 'canclass', 'bidircat', 'chrdecomp', 'decdig', 'digval', 'numval', 'mirrored', 'uc1name', 'comment', @@ -40,28 +40,24 @@ def read_casefold(big=False): return df -def make_caselist(df): - letters=[] +def make_caselist(df, casetype): + caselist=[] for idx, row in df.iterrows(): - #print(idx+1, ':', row['code'], row['lowcase'] - row['code'], ',', chr(row['code']), chr(row['lowcase']), ',', row['name']) - letters.append([idx+1, row['code'], row['lowcase'], row['name']]) - return letters + caselist.append([idx+1, row['code'], row[casetype], row['name']]) + return caselist -def make_casefold(letters): +def make_table(caselist): prevoffset = 0 - diffoffset = 0 prev = [-1, 0, 0] diff = [-1, 0, 0] - - out = [] n = 1 - for x in letters: + + ranges = [] + for x in caselist: offset = x[2] - x[1] - diffoffset = prevoffset - offset if (diff[1] and x[1] - prev[1] != diff[1]) or (diff[2] and x[2] - prev[2] != diff[2]) or prevoffset != offset: - out.append([x[1], x[2], n, x[3]]) # , ';', chr(x[1]), chr(x[2]), ';', x[1], offset, "CHANGE") - #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset, "CHANGE", n+1) + ranges.append([x[1], x[2], n, x[3]]) diff[1] = 0 diff[2] = 0 n = 1 @@ -70,78 +66,89 @@ def make_casefold(letters): if diff[1] == 0: diff[1] = x[1] - prev[1] diff[2] = x[2] - prev[2] - #print(x[1], x[2], ';', chr(x[1]), chr(x[2]), ';', offset) diff[1] = x[1] - prev[1] diff[2] = x[2] - prev[2] prev[2] = x[2] prev[1] = x[1] prevoffset = offset - out.append(out[-1]) - out[-1][2] = 26 - cfold = [] - for i in range(0, len(out)-1): - d = out[i][1] - out[i][0] - cfold.append([out[i][0], out[i+1][2], d, out[i][3]]) - return cfold + ranges.append(ranges[-1]) + ranges[-1][2] = 26 + #for r in ranges: + # print("%04X, %04X, %d" % (r[0], r[1], r[2])) + #exit() + + # next two for loops could be combined to eliminate rangelist + rangelist = [] + for i in range(0, len(ranges)-1): + d = ranges[i][1] - ranges[i][0] + rangelist.append([ranges[i][0], ranges[i+1][2], d, ranges[i][3]]) -def print_casefold(cfold): - print('#include <stdint.h>\n') - print('struct CaseFold { uint16_t c0, c1, m1; };\n') - print('static struct CaseFold casefold[] = {\n ', end='') - n = 1 - s = 5 - count = 0 table = [] - for x in cfold: - d = 2 if abs(x[2]) == 1 else 1 + for x in rangelist: + if x[2] == -1: + print("ohoh:", x[0]) + delta = 2 if x[2] == 1 else 1 a = x[0] - b = x[0] + (x[1] - 1)*d + b = x[0] + (x[1] - 1)*delta c = b + x[2] if b >= 1<<16 or c >= 1<<16: # only to make sure... break - #print(' {%d, %d, %d}, // %s %s, %s\n ' % (a, b, c, chr(a), chr(a + x[2]), x[3]), end='') - if True: # compact - if n == s: - n = 0 - if a > 1000: - s = 4 - print('\n ', end='') - print(' {%d, %d, %d},' % (a, b, c), end='') - table.append((a, b, c)) - else: - print(' {%d, %d, %d}, // ' % (a, b, c), end='') - for y in range(x[0], x[0] + x[1], d): - print('%s %s, ' % (chr(y), chr(y + x[2])), end='') - print('') - count += 1 - n += 1 - print('\n}; // %d\n' % (count)) + table.append((a, b, c, x[3])) return table -def make_casetable(): - df = read_casefold() - #df = read_unidata() - letters = make_caselist(df) - cfold = make_casefold(letters) - return cfold +def print_table(name, table, style=1): + print('static struct CaseFold %s[] = {' % (name)) + for a,b,c,t in table: + if style == 1: # first char with name + print(' {%3d, %3d, %3d}, \t// %s %s, %s' % (a, b, c, chr(a), chr(a + c - b), t)) + elif style == 2: # all chars + #print(' {%3d, %3d, %3d}, \t// ' % (a, b, c), end='') + print(' {0x%04X, 0x%04X, 0x%04X},\t// ' % (a, b, c), end='') + n = 0 + for k in range(a, b+1, 2 if c - b == 1 else 1): + n += 1 + if n % 17 == 0: + print('\n \t// ', end='') + print('%s %s, ' % (chr(k), chr(k + c - b)), end='') + print('') + print('}; // %d\n' % (len(table))) -def print_casefold_inverse(table): - cfold_low = [i for i in range(len(table))] - cfold_low.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0])) +def print_table_inv(name, table): + lowcase = [i for i in range(len(table))] + # sort by mapped value table[:][2] (= lowcase) of the first element in each range entry + lowcase.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0])) - print('static uint8_t cfold_low[] = {\n ', end='') - for i in range(len(cfold_low)): - print(" %d," % (cfold_low[i]), end='\n ' if (i+1) % 20 == 0 else '') + print('static uint8_t %s[%d] = {\n ' % (name, len(table)), end='') + for i in range(len(lowcase)): + print(" %d," % (lowcase[i]), end='\n ' if (i+1) % 20 == 0 else '') print('\n};') +def compile_table(name, casetype='lowcase', category=None): + if category: + df = read_unidata(casetype, category) + else: + df = read_casefold() + caselist = make_caselist(df, casetype) + table = make_table(caselist) + print_table(name, table, 2) + return table + + +def main(): + print('#include <stdint.h>\n') + print('struct CaseFold { uint16_t c0, c1, m1; };\n') + + table = compile_table('casefold') # casefold + #table = compile_table('tolower_tab', 'lowcase', 'Lu') # unicode + print_table_inv('cfold_low', table) + + ########### main: if __name__ == "__main__": - cfold = make_casetable() - table = print_casefold(cfold) - print_casefold_inverse(table) + main() |
