summary | refs | log | tree | commit | diff | homepage
path: root/src
diff options
context:
space:
mode:
author: Tyge Løvset <[email protected]> 2022-06-09 15:41:18 +0200
committer: Tyge Løvset <[email protected]> 2022-06-09 15:41:18 +0200
commit: 3ee36759b8567a72a8349c312fc7dbe975de9e02 (patch)
tree: 131444f3c7a679fb32415aa957b94181980fbf17 /src
parent: 7bbc74bb5c0778504f098974ba79de122cee7211 (diff)
download: STC-modified-3ee36759b8567a72a8349c312fc7dbe975de9e02.tar.gz
          STC-modified-3ee36759b8567a72a8349c312fc7dbe975de9e02.zip
Stopped adding circled letters and Roman numerals to the toupper/tolower tables
Diffstat (limited to 'src')
-rw-r--r-- src/utf8code.c   30
-rw-r--r-- src/utf8tabs.inc 20
-rw-r--r-- src/utf8tabs.py  11
3 files changed, 29 insertions, 32 deletions
diff --git a/src/utf8code.c b/src/utf8code.c
index 3f99c017..9613ba95 100644
--- a/src/utf8code.c
+++ b/src/utf8code.c
@@ -62,10 +62,10 @@ bool utf8_valid_n(const char* s, size_t n) {
uint32_t utf8_casefold(uint32_t c) {
for (size_t i=0; i < casefold_len; ++i) {
const struct CaseMapping entry = casemappings[i];
- if (c <= entry.c1) {
- if (c < entry.c0) return c;
- int d = entry.m1 - entry.c1;
- if (d == 1) return c + ((entry.c1 & 1) == (c & 1));
+ if (c <= entry.c2) {
+ if (c < entry.c1) return c;
+ int d = entry.m2 - entry.c2;
+ if (d == 1) return c + ((entry.c2 & 1) == (c & 1));
return c + d;
}
}
@@ -75,10 +75,10 @@ uint32_t utf8_casefold(uint32_t c) {
uint32_t utf8_tolower(uint32_t c) {
for (size_t i=0; i < sizeof upcase_ind/sizeof *upcase_ind; ++i) {
const struct CaseMapping entry = casemappings[upcase_ind[i]];
- if (c <= entry.c1) {
- if (c < entry.c0) return c;
- int d = entry.m1 - entry.c1;
- if (d == 1) return c + ((entry.c1 & 1) == (c & 1));
+ if (c <= entry.c2) {
+ if (c < entry.c1) return c;
+ int d = entry.m2 - entry.c2;
+ if (d == 1) return c + ((entry.c2 & 1) == (c & 1));
return c + d;
}
}
@@ -88,10 +88,10 @@ uint32_t utf8_tolower(uint32_t c) {
uint32_t utf8_toupper(uint32_t c) {
for (size_t i=0; i < sizeof lowcase_ind/sizeof *lowcase_ind; ++i) {
const struct CaseMapping entry = casemappings[lowcase_ind[i]];
- if (c <= entry.m1) {
- int d = entry.m1 - entry.c1;
- if (c < (uint32_t)(entry.c0 + d)) return c;
- if (d == 1) return c - ((entry.m1 & 1) == (c & 1));
+ if (c <= entry.m2) {
+ int d = entry.m2 - entry.c2;
+ if (c < (uint32_t)(entry.c1 + d)) return c;
+ if (d == 1) return c - ((entry.m2 & 1) == (c & 1));
return c - d;
}
}
@@ -164,8 +164,8 @@ bool utf8_isalpha(uint32_t c) {
}
static struct fncase {
- int (*conv_asc)(int);
- uint32_t (*conv_u8)(uint32_t);
+ int (*conv_asc)(int);
+ uint32_t (*conv_utf)(uint32_t);
}
fn_tofold = {tolower, utf8_casefold},
fn_tolower = {tolower, utf8_tolower},
@@ -184,7 +184,7 @@ static cstr cstr_tocase(const cstr* self, struct fncase fn) {
if (d.codep < 128)
buf[sz++] = (char)fn.conv_asc(d.codep);
else {
- cp = fn.conv_u8(d.codep);
+ cp = fn.conv_utf(d.codep);
sz += utf8_encode(buf + sz, cp);
}
}
diff --git a/src/utf8tabs.inc b/src/utf8tabs.inc
index 28f30978..3ece077e 100644
--- a/src/utf8tabs.inc
+++ b/src/utf8tabs.inc
@@ -1,6 +1,6 @@
#include <stdint.h>
-struct CaseMapping { uint16_t c0, c1, m1; };
+struct CaseMapping { uint16_t c1, c2, m2; };
static struct CaseMapping casemappings[] = {
{0x0041, 0x005A, 0x007A}, // A a (26) LATIN CAPITAL LETTER A
@@ -225,19 +225,19 @@ static struct CaseMapping casemappings[] = {
enum { casefold_len = 192 };
-static uint8_t upcase_ind[164] = {
+static uint8_t upcase_ind[162] = {
0, 2, 3, 4, 192, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
41, 43, 45, 193, 47, 48, 194, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 75, 80, 83, 85, 86, 87, 88, 89, 90, 91,
92, 93, 94, 95, 96, 97, 98, 99, 195, 196, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119,
- 120, 121, 125, 126, 129, 131, 132, 133, 134, 135, 136, 137, 139, 140, 141, 142, 143, 144, 145, 146,
- 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166,
- 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
- 187, 188, 189, 191,
+ 120, 121, 125, 126, 129, 131, 132, 133, 134, 135, 136, 137, 139, 140, 141, 142, 144, 146, 147, 148,
+ 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188,
+ 189, 191,
};
-static uint8_t lowcase_ind[186] = {
+static uint8_t lowcase_ind[184] = {
0, 197, 113, 2, 3, 8, 4, 198, 5, 6, 7, 9, 199, 60, 12, 14, 16, 20, 50, 25,
57, 53, 29, 31, 33, 35, 37, 39, 40, 51, 41, 43, 45, 193, 17, 47, 48, 194, 52, 54,
56, 158, 59, 63, 154, 152, 155, 11, 13, 15, 18, 19, 174, 21, 175, 22, 170, 173, 24, 23,
@@ -245,7 +245,7 @@ static uint8_t lowcase_ind[186] = {
66, 88, 68, 69, 72, 200, 73, 70, 71, 201, 202, 203, 204, 75, 80, 205, 206, 86, 67, 207,
85, 87, 90, 89, 91, 92, 94, 93, 95, 96, 109, 110, 196, 208, 209, 210, 211, 212, 213, 214,
215, 167, 149, 185, 111, 216, 114, 115, 116, 117, 118, 119, 120, 121, 126, 129, 132, 136, 134, 137,
- 122, 123, 124, 125, 127, 217, 130, 131, 133, 135, 138, 142, 143, 144, 145, 146, 147, 55, 58, 151,
- 156, 157, 159, 160, 161, 97, 98, 99, 162, 163, 164, 165, 166, 168, 169, 171, 183, 172, 182, 186,
- 187, 188, 189, 181, 195, 191,
+ 122, 123, 124, 125, 127, 217, 130, 131, 133, 135, 138, 142, 144, 146, 147, 55, 58, 151, 156, 157,
+ 159, 160, 161, 97, 98, 99, 162, 163, 164, 165, 166, 168, 169, 171, 183, 172, 182, 186, 187, 188,
+ 189, 181, 195, 191,
};
diff --git a/src/utf8tabs.py b/src/utf8tabs.py
index dfaec055..4a5781f1 100644
--- a/src/utf8tabs.py
+++ b/src/utf8tabs.py
@@ -112,7 +112,7 @@ def compile_table(casetype='lowcase', category=None):
def main():
print('#include <stdint.h>\n')
- print('struct CaseMapping { uint16_t c0, c1, m1; };\n')
+ print('struct CaseMapping { uint16_t c1, c2, m2; };\n')
casemappings = compile_table('lowcase') # CaseFolding.txt
upcase = compile_table('lowcase', 'Lu') # UnicodeData.txt uppercase
@@ -145,15 +145,12 @@ def main():
print_table('casemappings', casemappings, style=1)
print('enum { casefold_len = %d };' % casefolding_len)
- # add "missing" mappings:
- for c in ('Ⅰ', 'Ⓐ'):
- upcase_ind.append(next(i for i,x in enumerate(casemappings) if x[0]==ord(c)))
- for c in ('ẞ', 'Ⅰ', 'Ⓐ'):
- lowcase_ind.append(next(i for i,x in enumerate(casemappings) if x[0]==ord(c)))
-
+ # upcase => low
upcase_ind.sort(key=lambda i: casemappings[i][0])
print_index_table('upcase_ind', upcase_ind)
+ # lowcase => up. add "missing" SHARP S caused by https://www.unicode.org/policies/stability_policy.html#Case_Pair
+ lowcase_ind.append(next(i for i,x in enumerate(casemappings) if x[0]==ord('ẞ')))
lowcase_ind.sort(key=lambda i: casemappings[i][2] - (casemappings[i][1] - casemappings[i][0]))
print_index_table('lowcase_ind', lowcase_ind)