summaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
author Tyge Løvset <[email protected]> 2022-06-07 16:39:30 +0200
committer Tyge Løvset <[email protected]> 2022-06-07 16:39:30 +0200
commit 48b3336d8f10d8097d7626732ede5896ec353407 (patch)
tree e83f87925eb84fbbaefeb3fe4cee39edacf232e0 /src
parent b65f70fdc80b19d869adabb5cb270807c96d152b (diff)
download STC-modified-48b3336d8f10d8097d7626732ede5896ec353407.tar.gz
STC-modified-48b3336d8f10d8097d7626732ede5896ec353407.zip
Some improvements in utf8tabs.py
Diffstat (limited to 'src')
-rw-r--r-- src/utf8code.c 2
-rw-r--r-- src/utf8tabs.py 25
2 files changed, 13 insertions, 14 deletions
diff --git a/src/utf8code.c b/src/utf8code.c
index 4b55d2be..0cdcd8b6 100644
--- a/src/utf8code.c
+++ b/src/utf8code.c
@@ -105,7 +105,7 @@ int utf8_icmp_n(size_t u8max, const char* s1, const size_t n1,
do { utf8_decode(&d2, (uint8_t)s2[j2++]); } while (d2.state);
int c = utf8_tolower(d1.codep) - utf8_tolower(d2.codep);
if (c || !s2[j2])
- return c;
+ return c;
}
return (j2 < n2) - (j1 < n1);
}
diff --git a/src/utf8tabs.py b/src/utf8tabs.py
index 55be3f99..241f44af 100644
--- a/src/utf8tabs.py
+++ b/src/utf8tabs.py
@@ -60,7 +60,7 @@ def make_table(caselist):
offset = b - a
if abs(diff_a) > 2 or a - prev_a != diff_a or b - prev_b != diff_b or prev_offs != offset:
- if j > 0 and start_a not in [0xAB70, 0x13F8]:
+ if j > 0 and start_a not in [0xAB70, 0x13F8]: # BUG in CaseFolding.txt V14
table.append([start_a, prev_a, prev_b, start_name])
if j < n_1:
diff_a = caselist[j+1][0] - a
@@ -103,7 +103,7 @@ def print_index_table(name, indtab):
def make_inverse_ind(table):
inv = []
for i in range(len(table)):
- if table[i][2] not in [ord('i'), ord('s')]: # remove 'i'. 's' upcase mappings.
+ if table[i][2] not in [ord('i'), ord('s')]: # ignore 'i', 's' upcase mappings.
inv.append(i)
# sort by mapped value table[:][2] (= inv) of the first element in each range entry
inv.sort(key=lambda i: table[i][2] - (table[i][1] - table[i][0]))
@@ -124,25 +124,24 @@ def main():
print('#include <stdint.h>\n')
print('struct CaseMapping { uint16_t c0, c1, m1; };\n')
- casemappings = compile_table('lowcase') # casefold
- lowcase = compile_table('lowcase', 'Lu') # unicode lowercase
+ casemappings = compile_table('lowcase') # CaseFolding.txt
+ upcase = compile_table('lowcase', 'Lu') # UnicodeData.txt uppercase
- lowcase_ind = []
- for v in lowcase:
- try:
- lowcase_ind.append(casemappings.index(v))
- except:
- lowcase_ind.append(len(casemappings))
+ # merge in additional Lu => Ll mappings from UnicodeData.txt
+ for v in upcase:
+ if v not in casemappings:
casemappings.append(v)
- casefold_len = len(casemappings)
+ # sort casemappings by uppercase values:
casemappings.sort(key=lambda x: x[0])
-
print_table('casemappings', casemappings, style=1)
+
+ # index list sorted by mapped lowercase values:
upcase_ind = make_inverse_ind(casemappings)
- print('enum { casefold_len = %d };' % casefold_len)
+ print('enum { casefold_len = %d };' % len(casemappings))
print_index_table('upcase_ind', upcase_ind)
+
########### main:
if __name__ == "__main__":