diff options
| author | Tyge Lovset <[email protected]> | 2023-01-02 09:00:47 +0100 |
|---|---|---|
| committer | Tyge Lovset <[email protected]> | 2023-01-02 09:00:47 +0100 |
| commit | 8c6ba8a3444e4b8640e7fe04f565cef57c850432 (patch) | |
| tree | 10119ea3bbef5ac2818874849231bad19ad7aa5a /src | |
| parent | 91e79fc60713c1f09e940a7ee83ff2f8aa4f9d69 (diff) | |
| download | STC-modified-8c6ba8a3444e4b8640e7fe04f565cef57c850432.tar.gz STC-modified-8c6ba8a3444e4b8640e7fe04f565cef57c850432.zip | |
Added selected unicode character classes.
Diffstat (limited to 'src')
| -rw-r--r-- | src/cregex.c | 137 | ||||
| -rw-r--r-- | src/utf8code.c | 220 |
2 files changed, 281 insertions, 76 deletions
diff --git a/src/cregex.c b/src/cregex.c index e59e21a4..7b528550 100644 --- a/src/cregex.c +++ b/src/cregex.c @@ -130,14 +130,25 @@ enum { ASC_lo , ASC_LO, /* lower */ ASC_up , ASC_UP, /* upper */ ASC_xd , ASC_XD, /* hex */ - UTF_d , UTF_D, /* utf dec digit, non-digit */ - UTF_s , UTF_S, /* utf8 white space */ - UTF_w , UTF_W, /* utf8 word */ - UTF_al , UTF_AL, /* utf8 letter cased */ - UTF_lo , UTF_LO, /* utf8 letter lower */ - UTF_up , UTF_UP, /* utf8 letter upper */ - UTF_xd , UTF_XD, /* utf8 hex digit */ UTF_an , UTF_AN, /* utf8 alphanumeric */ + UTF_wr , UTF_WR, /* utf8 word */ + UTF_xd , UTF_XD, /* utf8 hex digit */ + U8G_tmp , U8G = U8G_tmp + (U8G_tmp & 1), /* force even */ + UTF_cc = U8G, UTF_CC, /* utf8 control char */ + UTF_lc , UTF_LC, /* utf8 letter cased */ + UTF_ll , UTF_LL, /* utf8 letter lowercase */ + UTF_lt , UTF_LT, /* utf8 letter titlecase */ + UTF_lu , UTF_LU, /* utf8 letter uppercase */ + UTF_nd , UTF_ND, /* utf8 number decimal */ + UTF_nl , UTF_NL, /* utf8 number letter */ + UTF_pc , UTF_PC, /* utf8 punct connector */ + UTF_pd , UTF_PD, /* utf8 punct dash */ + UTF_pf , UTF_PF, /* utf8 punct final */ + UTF_pi , UTF_PI, /* utf8 punct initial */ + UTF_sc , UTF_SC, /* utf8 symbol currency */ + UTF_zl , UTF_ZL, /* utf8 separator line */ + UTF_zp , UTF_ZP, /* utf8 separator paragraph */ + UTF_zs , UTF_ZS, /* utf8 separator space */ TOK_ANY = 0x8200000, /* Any character except newline, . */ TOK_ANYNL , /* Any character including newline, . */ TOK_NOP , /* No operation, internal use only */ @@ -625,19 +636,24 @@ static void _lexutfclass(_Parser *par, _Rune *rp) { static struct { const char* c; int n, r; } cls[] = { - {"{Space}", 7, UTF_s}, {"{Zs}", 4, UTF_s}, - {"{Digit}", 7, UTF_d}, {"{Nd}", 4, UTF_d}, - {"{Alpha}", 7, UTF_al}, {"{LC}", 4, UTF_al}, - {"{Lower}", 7, UTF_lo}, {"{Ll}", 4, UTF_lo}, - {"{Upper}", 7, UTF_up}, {"{Lu}", 4, UTF_up}, + {"{Alpha}", 7, UTF_lc}, {"{L&}", 4, UTF_lc}, + {"{Digit}", 7, UTF_nd}, {"{Nd}", 4, UTF_nd}, + {"{Lower}", 7, UTF_ll}, {"{Ll}", 4, UTF_ll}, + {"{Upper}", 7, UTF_lu}, {"{Lu}", 4, UTF_lu}, + {"{Space}", 7, UTF_zs}, {"{Zs}", 4, UTF_zs}, {"{Alnum}", 7, UTF_an}, {"{XDigit}", 8, UTF_xd}, + {"{Cc}", 4, UTF_cc}, {"{Sc}", 4, UTF_sc}, + {"{Lt}", 4, UTF_lt}, {"{Nl}", 4, UTF_nl}, + {"{Pc}", 4, UTF_pc}, {"{Pd}", 4, UTF_pd}, + {"{Pf}", 4, UTF_pf}, {"{Pi}", 4, UTF_pi}, + {"{Zl}", 4, UTF_zl}, {"{Zp}", 4, UTF_zp}, }; int inv = (*rp == 'P'); for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i) { if (!strncmp(par->exprp, cls[i].c, (size_t)cls[i].n)) { - if (par->rune_type == TOK_IRUNE && (cls[i].r == UTF_lo || cls[i].r == UTF_up)) - *rp = (_Rune)(UTF_al + inv); + if (par->rune_type == TOK_IRUNE && (cls[i].r == UTF_ll || cls[i].r == UTF_lu)) + *rp = (_Rune)(UTF_lc + inv); else *rp = (_Rune)(cls[i].r + inv); par->exprp += cls[i].n; @@ -653,12 +669,12 @@ _lexutfclass(_Parser *par, _Rune *rp) case 'v': rune = '\v'; break; \ case 'f': rune = '\f'; break; \ case 'a': rune = '\a'; break; \ - case 'd': rune = UTF_d; break; \ - case 'D': rune = UTF_D; break; \ - case 's': rune = UTF_s; break; \ - case 'S': rune = UTF_S; break; \ - case 'w': rune = UTF_w; break; \ - case 'W': rune = UTF_W; break + case 'd': rune = UTF_nd; break; \ + case 'D': rune = UTF_ND; break; \ + case 's': rune = UTF_zs; break; \ + case 'S': rune = UTF_ZS; break; \ + case 'w': rune = UTF_wr; break; \ + case 'W': rune = UTF_WR; break static _Token @@ -880,52 +896,43 @@ out: static int _runematch(_Rune s, _Rune r) { - int inv = 0; + int inv = 0, n; switch (s) { - case ASC_D: inv = 1; /* fallthrough */ - case ASC_d: return inv ^ (isdigit(r) != 0); - case ASC_S: inv = 1; - case ASC_s: return inv ^ (isspace(r) != 0); - case ASC_W: inv = 1; - case ASC_w: return inv ^ ((isalnum(r) != 0) | (r == '_')); - case ASC_AL: inv = 1; - case ASC_al: return inv ^ (isalpha(r) != 0); - case ASC_AN: inv = 1; - case ASC_an: return inv ^ (isalnum(r) != 0); - case ASC_AS: return (r >= 128); - case ASC_as: return (r < 128); - case ASC_BL: inv = 1; - case ASC_bl: return inv ^ ((r == ' ') | (r == '\t')); - case ASC_CT: inv = 1; - case ASC_ct: return inv ^ (iscntrl(r) != 0); - case ASC_GR: inv = 1; - case ASC_gr: return inv ^ (isgraph(r) != 0); - case ASC_PR: inv = 1; - case ASC_pr: return inv ^ (isprint(r) != 0); - case ASC_PU: inv = 1; - case ASC_pu: return inv ^ (ispunct(r) != 0); - case ASC_LO: inv = 1; - case ASC_lo: return inv ^ (islower(r) != 0); - case ASC_UP: inv = 1; - case ASC_up: return inv ^ (isupper(r) != 0); - case ASC_XD: inv = 1; - case ASC_xd: return inv ^ (isxdigit(r) != 0); - case UTF_D: inv = 1; - case UTF_d: return inv ^ (utf8_isdigit(r)); - case UTF_S: inv = 1; - case UTF_s: return inv ^ utf8_isspace(r); - case UTF_W: inv = 1; - case UTF_w: return inv ^ (utf8_isalnum(r) | (r == '_')); - case UTF_AL: inv = 1; - case UTF_al: return inv ^ utf8_isalpha(r); - case UTF_AN: inv = 1; - case UTF_an: return inv ^ utf8_isalnum(r); - case UTF_LO: inv = 1; - case UTF_lo: return inv ^ utf8_islower(r); - case UTF_UP: inv = 1; - case UTF_up: return inv ^ utf8_isupper(r); - case UTF_XD: inv = 1; - case UTF_xd: return inv ^ utf8_isxdigit(r); + case ASC_D: inv = 1; case ASC_d: return inv ^ (isdigit(r) != 0); + case ASC_S: inv = 1; case ASC_s: return inv ^ (isspace(r) != 0); + case ASC_W: inv = 1; case ASC_w: return inv ^ ((isalnum(r) != 0) | (r == '_')); + case ASC_AL: inv = 1; case ASC_al: return inv ^ (isalpha(r) != 0); + case ASC_AN: inv = 1; case ASC_an: return inv ^ (isalnum(r) != 0); + case ASC_AS: return (r >= 128); case ASC_as: return (r < 128); + case ASC_BL: inv = 1; case ASC_bl: return inv ^ ((r == ' ') | (r == '\t')); + case ASC_CT: inv = 1; case ASC_ct: return inv ^ (iscntrl(r) != 0); + case ASC_GR: inv = 1; case ASC_gr: return inv ^ (isgraph(r) != 0); + case ASC_PR: inv = 1; case ASC_pr: return inv ^ (isprint(r) != 0); + case ASC_PU: inv = 1; case ASC_pu: return inv ^ (ispunct(r) != 0); + case ASC_LO: inv = 1; case ASC_lo: return inv ^ (islower(r) != 0); + case ASC_UP: inv = 1; case ASC_up: return inv ^ (isupper(r) != 0); + case ASC_XD: inv = 1; case ASC_xd: return inv ^ (isxdigit(r) != 0); + case UTF_AN: inv = 1; case UTF_an: return inv ^ utf8_isalnum(r); + case UTF_WR: inv = 1; case UTF_wr: return inv ^ (utf8_isalnum(r) | (r == '_')); + case UTF_XD: inv = 1; case UTF_xd: return inv ^ utf8_isxdigit(r); + case UTF_LC: inv = 1; case UTF_lc: return inv ^ utf8_isalpha(r); + case UTF_CC: case UTF_cc: + case UTF_LL: case UTF_ll: + case UTF_LT: case UTF_lt: + case UTF_LU: case UTF_lu: + case UTF_ND: case UTF_nd: + case UTF_NL: case UTF_nl: + case UTF_PC: case UTF_pc: + case UTF_PD: case UTF_pd: + case UTF_PF: case UTF_pf: + case UTF_PI: case UTF_pi: + case UTF_SC: case UTF_sc: + case UTF_ZL: case UTF_zl: + case UTF_ZP: case UTF_zp: + case UTF_ZS: case UTF_zs: + n = s - U8G; + inv = n & 1; + return inv ^ utf8_isgroup(n / 2, r); } return s == r; } diff --git a/src/utf8code.c b/src/utf8code.c index 5dfb7d30..6fe8515e 100644 --- a/src/utf8code.c +++ b/src/utf8code.c @@ -112,17 +112,29 @@ int utf8_icmp_sv(const csview s1, const csview s2) { return (int)(s1.size - s2.size); } -bool utf8_isspace(uint32_t c) { - static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0, - 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000}; - for (size_t i=0; i<sizeof t/sizeof *t; ++i) - if (c == t[i]) return true; - return (c >= 0x2000) & (c <= 0x200A); -} +typedef struct { + uint16_t lo; + uint16_t hi; +} URange16; + +typedef struct { + const char *name; + const URange16 *r16; + int nr16; +} UGroup; + +static const UGroup unicode_groups[]; +static const int num_unicode_groups; +static const int Lt_group; -bool utf8_isdigit(uint32_t c) { - return ((c >= '0') & (c <= '9')) || - ((c >= 0xFF10) & (c <= 0xFF19)); +bool utf8_isgroup(int group, uint32_t c) { + for (int j=0; j<unicode_groups[group].nr16; ++j) { + if (c < unicode_groups[group].r16[j].lo) + return false; + if (c <= unicode_groups[group].r16[j].hi) + return true; + } + return false; } bool utf8_isxdigit(uint32_t c) { @@ -141,6 +153,192 @@ bool utf8_isalnum(uint32_t c) { bool utf8_isalpha(uint32_t c) { if (c < 128) return isalpha(c) != 0; - return utf8_islower(c) || utf8_isupper(c); + return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(Lt_group, c); } + +static const URange16 Cc_range16[] = { // Control + { 0, 31 }, + { 127, 159 }, +}; + +static const URange16 Lt_range16[] = { // Title case + { 453, 453 }, + { 456, 456 }, + { 459, 459 }, + { 498, 498 }, + { 8072, 8079 }, + { 8088, 8095 }, + { 8104, 8111 }, + { 8124, 8124 }, + { 8140, 8140 }, + { 8188, 8188 }, +}; + +static const URange16 Nd_range16[] = { // Decimal number + { 48, 57 }, + { 1632, 1641 }, + { 1776, 1785 }, + { 1984, 1993 }, + { 2406, 2415 }, + { 2534, 2543 }, + { 2662, 2671 }, + { 2790, 2799 }, + { 2918, 2927 }, + { 3046, 3055 }, + { 3174, 3183 }, + { 3302, 3311 }, + { 3430, 3439 }, + { 3558, 3567 }, + { 3664, 3673 }, + { 3792, 3801 }, + { 3872, 3881 }, + { 4160, 4169 }, + { 4240, 4249 }, + { 6112, 6121 }, + { 6160, 6169 }, + { 6470, 6479 }, + { 6608, 6617 }, + { 6784, 6793 }, + { 6800, 6809 }, + { 6992, 7001 }, + { 7088, 7097 }, + { 7232, 7241 }, + { 7248, 7257 }, + { 42528, 42537 }, + { 43216, 43225 }, + { 43264, 43273 }, + { 43472, 43481 }, + { 43504, 43513 }, + { 43600, 43609 }, + { 44016, 44025 }, + { 65296, 65305 }, +}; + +static const URange16 Nl_range16[] = { // Number letter + { 5870, 5872 }, + { 8544, 8578 }, + { 8581, 8584 }, + { 12295, 12295 }, + { 12321, 12329 }, + { 12344, 12346 }, + { 42726, 42735 }, +}; + +static const URange16 Pc_range16[] = { // Connector punctuation + { 95, 95 }, + { 8255, 8256 }, + { 8276, 8276 }, + { 65075, 65076 }, + { 65101, 65103 }, + { 65343, 65343 }, +}; + +static const URange16 Pd_range16[] = { // Dash punctuation + { 45, 45 }, + { 1418, 1418 }, + { 1470, 1470 }, + { 5120, 5120 }, + { 6150, 6150 }, + { 8208, 8213 }, + { 11799, 11799 }, + { 11802, 11802 }, + { 11834, 11835 }, + { 11840, 11840 }, + { 11869, 11869 }, + { 12316, 12316 }, + { 12336, 12336 }, + { 12448, 12448 }, + { 65073, 65074 }, + { 65112, 65112 }, + { 65123, 65123 }, + { 65293, 65293 }, +}; + +static const URange16 Pf_range16[] = { // Final punctuation + { 187, 187 }, + { 8217, 8217 }, + { 8221, 8221 }, + { 8250, 8250 }, + { 11779, 11779 }, + { 11781, 11781 }, + { 11786, 11786 }, + { 11789, 11789 }, + { 11805, 11805 }, + { 11809, 11809 }, +}; + +static const URange16 Pi_range16[] = { // Initial punctuation + { 171, 171 }, + { 8216, 8216 }, + { 8219, 8220 }, + { 8223, 8223 }, + { 8249, 8249 }, + { 11778, 11778 }, + { 11780, 11780 }, + { 11785, 11785 }, + { 11788, 11788 }, + { 11804, 11804 }, + { 11808, 11808 }, +}; + +static const URange16 Sc_range16[] = { // Currency symbol + { 36, 36 }, + { 162, 165 }, + { 1423, 1423 }, + { 1547, 1547 }, + { 2046, 2047 }, + { 2546, 2547 }, + { 2555, 2555 }, + { 2801, 2801 }, + { 3065, 3065 }, + { 3647, 3647 }, + { 6107, 6107 }, + { 8352, 8384 }, + { 43064, 43064 }, + { 65020, 65020 }, + { 65129, 65129 }, + { 65284, 65284 }, + { 65504, 65505 }, + { 65509, 65510 }, +}; + +static const URange16 Zl_range16[] = { // Line separator + { 8232, 8232 }, +}; + +static const URange16 Zp_range16[] = { // Paragraph separator + { 8233, 8233 }, +}; + +static const URange16 Zs_range16[] = { // Space separator + { 32, 32 }, + { 160, 160 }, + { 5760, 5760 }, + { 8192, 8202 }, + { 8239, 8239 }, + { 8287, 8287 }, + { 12288, 12288 }, +}; + +#define UNI_ENTRY(Code) \ + { #Code, Code##_range16, sizeof(Code##_range16)/2 } + +static const UGroup unicode_groups[] = { + UNI_ENTRY(Cc), + UNI_ENTRY(Lt), + UNI_ENTRY(Nd), + UNI_ENTRY(Nl), + UNI_ENTRY(Pc), + UNI_ENTRY(Pd), + UNI_ENTRY(Pf), + UNI_ENTRY(Pi), + UNI_ENTRY(Sc), + UNI_ENTRY(Zl), + UNI_ENTRY(Zp), + UNI_ENTRY(Zs), +}; + +static const int Lt_group = 1; +static const int num_unicode_groups = sizeof unicode_groups / sizeof unicode_groups[0]; + #endif |
