Added selected unicode character classes.

author: Tyge Lovset <[email protected]> 2023-01-02 09:00:47 +0100
committer: Tyge Lovset <[email protected]> 2023-01-02 09:00:47 +0100
commit: 8c6ba8a3444e4b8640e7fe04f565cef57c850432 (patch)
tree: 10119ea3bbef5ac2818874849231bad19ad7aa5a /src
parent: 91e79fc60713c1f09e940a7ee83ff2f8aa4f9d69 (diff)
download: STC-modified-8c6ba8a3444e4b8640e7fe04f565cef57c850432.tar.gz
STC-modified-8c6ba8a3444e4b8640e7fe04f565cef57c850432.zip
2 files changed, 281 insertions, 76 deletions
diff --git a/src/cregex.c b/src/cregex.c
index e59e21a4..7b528550 100644
--- a/src/cregex.c
+++ b/src/cregex.c
@@ -130,14 +130,25 @@ enum {
     ASC_lo      , ASC_LO,       /* lower */
     ASC_up      , ASC_UP,       /* upper */
     ASC_xd      , ASC_XD,       /* hex */
-    UTF_d       , UTF_D,        /* utf dec digit, non-digit */
-    UTF_s       , UTF_S,        /* utf8 white space */
-    UTF_w       , UTF_W,        /* utf8 word */
-    UTF_al      , UTF_AL,       /* utf8 letter cased */
-    UTF_lo      , UTF_LO,       /* utf8 letter lower */
-    UTF_up      , UTF_UP,       /* utf8 letter upper */
-    UTF_xd      , UTF_XD,       /* utf8 hex digit */
     UTF_an      , UTF_AN,       /* utf8 alphanumeric */
+    UTF_wr      , UTF_WR,       /* utf8 word */
+    UTF_xd      , UTF_XD,       /* utf8 hex digit */
+    U8G_tmp     , U8G = U8G_tmp + (U8G_tmp & 1), /* force even */
+    UTF_cc = U8G, UTF_CC,       /* utf8 control char */
+    UTF_lc      , UTF_LC,       /* utf8 letter cased */
+    UTF_ll      , UTF_LL,       /* utf8 letter lowercase */
+    UTF_lt      , UTF_LT,       /* utf8 letter titlecase */
+    UTF_lu      , UTF_LU,       /* utf8 letter uppercase */
+    UTF_nd      , UTF_ND,       /* utf8 number decimal */
+    UTF_nl      , UTF_NL,       /* utf8 number letter */
+    UTF_pc      , UTF_PC,       /* utf8 punct connector */
+    UTF_pd      , UTF_PD,       /* utf8 punct dash */
+    UTF_pf      , UTF_PF,       /* utf8 punct final */
+    UTF_pi      , UTF_PI,       /* utf8 punct initial */
+    UTF_sc      , UTF_SC,       /* utf8 symbol currency */
+    UTF_zl      , UTF_ZL,       /* utf8 separator line */
+    UTF_zp      , UTF_ZP,       /* utf8 separator paragraph */
+    UTF_zs      , UTF_ZS,       /* utf8 separator space */
     TOK_ANY     = 0x8200000,    /* Any character except newline, . */
     TOK_ANYNL   ,               /* Any character including newline, . */
     TOK_NOP     ,               /* No operation, internal use only */
@@ -625,19 +636,24 @@ static void
 _lexutfclass(_Parser *par, _Rune *rp)
 {
     static struct { const char* c; int n, r; } cls[] = {
-        {"{Space}", 7, UTF_s}, {"{Zs}", 4, UTF_s},
-        {"{Digit}", 7, UTF_d}, {"{Nd}", 4, UTF_d},
-        {"{Alpha}", 7, UTF_al}, {"{LC}", 4, UTF_al},
-        {"{Lower}", 7, UTF_lo}, {"{Ll}", 4, UTF_lo},
-        {"{Upper}", 7, UTF_up}, {"{Lu}", 4, UTF_up},
+        {"{Alpha}", 7, UTF_lc}, {"{L&}", 4, UTF_lc},
+        {"{Digit}", 7, UTF_nd}, {"{Nd}", 4, UTF_nd},
+        {"{Lower}", 7, UTF_ll}, {"{Ll}", 4, UTF_ll},
+        {"{Upper}", 7, UTF_lu}, {"{Lu}", 4, UTF_lu},
+        {"{Space}", 7, UTF_zs}, {"{Zs}", 4, UTF_zs},
         {"{Alnum}", 7, UTF_an},
         {"{XDigit}", 8, UTF_xd},
+        {"{Cc}", 4, UTF_cc}, {"{Sc}", 4, UTF_sc},
+        {"{Lt}", 4, UTF_lt}, {"{Nl}", 4, UTF_nl},
+        {"{Pc}", 4, UTF_pc}, {"{Pd}", 4, UTF_pd},
+        {"{Pf}", 4, UTF_pf}, {"{Pi}", 4, UTF_pi},
+        {"{Zl}", 4, UTF_zl}, {"{Zp}", 4, UTF_zp},
     };
     int inv = (*rp == 'P');
     for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i) {
         if (!strncmp(par->exprp, cls[i].c, (size_t)cls[i].n)) {
-            if (par->rune_type == TOK_IRUNE && (cls[i].r == UTF_lo || cls[i].r == UTF_up))
-                *rp = (_Rune)(UTF_al + inv);
+            if (par->rune_type == TOK_IRUNE && (cls[i].r == UTF_ll || cls[i].r == UTF_lu))
+                *rp = (_Rune)(UTF_lc + inv);
             else
                 *rp = (_Rune)(cls[i].r + inv);
             par->exprp += cls[i].n;
@@ -653,12 +669,12 @@ _lexutfclass(_Parser *par, _Rune *rp)
     case 'v': rune = '\v'; break; \
     case 'f': rune = '\f'; break; \
     case 'a': rune = '\a'; break; \
-    case 'd': rune = UTF_d; break; \
-    case 'D': rune = UTF_D; break; \
-    case 's': rune = UTF_s; break; \
-    case 'S': rune = UTF_S; break; \
-    case 'w': rune = UTF_w; break; \
-    case 'W': rune = UTF_W; break
+    case 'd': rune = UTF_nd; break; \
+    case 'D': rune = UTF_ND; break; \
+    case 's': rune = UTF_zs; break; \
+    case 'S': rune = UTF_ZS; break; \
+    case 'w': rune = UTF_wr; break; \
+    case 'W': rune = UTF_WR; break
 
 
 static _Token
@@ -880,52 +896,43 @@ out:
 static int
 _runematch(_Rune s, _Rune r)
 {
-    int inv = 0;
+    int inv = 0, n; /* inv: 1 when s is a negated class; n: offset of s from U8G, then unicode group index */
     switch (s) {
-    case ASC_D: inv = 1; /* fallthrough */
-    case ASC_d: return inv ^ (isdigit(r) != 0);
-    case ASC_S: inv = 1;
-    case ASC_s: return inv ^ (isspace(r) != 0);
-    case ASC_W: inv = 1;
-    case ASC_w: return inv ^ ((isalnum(r) != 0) | (r == '_'));
-    case ASC_AL: inv = 1;
-    case ASC_al: return inv ^ (isalpha(r) != 0);
-    case ASC_AN: inv = 1;
-    case ASC_an: return inv ^ (isalnum(r) != 0);
-    case ASC_AS: return (r >= 128);
-    case ASC_as: return (r < 128);
-    case ASC_BL: inv = 1;
-    case ASC_bl: return inv ^ ((r == ' ') | (r == '\t'));
-    case ASC_CT: inv = 1;
-    case ASC_ct: return inv ^ (iscntrl(r) != 0);
-    case ASC_GR: inv = 1;
-    case ASC_gr: return inv ^ (isgraph(r) != 0);
-    case ASC_PR: inv = 1;
-    case ASC_pr: return inv ^ (isprint(r) != 0);
-    case ASC_PU: inv = 1;
-    case ASC_pu: return inv ^ (ispunct(r) != 0);
-    case ASC_LO: inv = 1;
-    case ASC_lo: return inv ^ (islower(r) != 0);
-    case ASC_UP: inv = 1;
-    case ASC_up: return inv ^ (isupper(r) != 0);
-    case ASC_XD: inv = 1;
-    case ASC_xd: return inv ^ (isxdigit(r) != 0);
-    case UTF_D: inv = 1;
-    case UTF_d: return inv ^ (utf8_isdigit(r));
-    case UTF_S: inv = 1;
-    case UTF_s: return inv ^ utf8_isspace(r);
-    case UTF_W: inv = 1;
-    case UTF_w: return inv ^ (utf8_isalnum(r) | (r == '_'));
-    case UTF_AL: inv = 1;
-    case UTF_al: return inv ^ utf8_isalpha(r);
-    case UTF_AN: inv = 1;
-    case UTF_an: return inv ^ utf8_isalnum(r);
-    case UTF_LO: inv = 1;
-    case UTF_lo: return inv ^ utf8_islower(r);
-    case UTF_UP: inv = 1;
-    case UTF_up: return inv ^ utf8_isupper(r);
-    case UTF_XD: inv = 1;
-    case UTF_xd: return inv ^ utf8_isxdigit(r);
+    case ASC_D: inv = 1; case ASC_d: return inv ^ (isdigit(r) != 0);
+    case ASC_S: inv = 1; case ASC_s: return inv ^ (isspace(r) != 0);
+    case ASC_W: inv = 1; case ASC_w: return inv ^ ((isalnum(r) != 0) | (r == '_'));
+    case ASC_AL: inv = 1; case ASC_al: return inv ^ (isalpha(r) != 0);
+    case ASC_AN: inv = 1; case ASC_an: return inv ^ (isalnum(r) != 0);
+    case ASC_AS: return (r >= 128); case ASC_as: return (r < 128);
+    case ASC_BL: inv = 1; case ASC_bl: return inv ^ ((r == ' ') | (r == '\t'));
+    case ASC_CT: inv = 1; case ASC_ct: return inv ^ (iscntrl(r) != 0);
+    case ASC_GR: inv = 1; case ASC_gr: return inv ^ (isgraph(r) != 0);
+    case ASC_PR: inv = 1; case ASC_pr: return inv ^ (isprint(r) != 0);
+    case ASC_PU: inv = 1; case ASC_pu: return inv ^ (ispunct(r) != 0);
+    case ASC_LO: inv = 1; case ASC_lo: return inv ^ (islower(r) != 0);
+    case ASC_UP: inv = 1; case ASC_up: return inv ^ (isupper(r) != 0);
+    case ASC_XD: inv = 1; case ASC_xd: return inv ^ (isxdigit(r) != 0);
+    case UTF_AN: inv = 1; case UTF_an: return inv ^ utf8_isalnum(r);
+    case UTF_WR: inv = 1; case UTF_wr: return inv ^ (utf8_isalnum(r) | (r == '_'));
+    case UTF_XD: inv = 1; case UTF_xd: return inv ^ utf8_isxdigit(r);
+    case UTF_LC: inv = 1; case UTF_lc: return inv ^ utf8_isalpha(r);
+    case UTF_LL: inv = 1; case UTF_ll: return inv ^ utf8_islower(r); /* no Ll table in unicode_groups */
+    case UTF_LU: inv = 1; case UTF_lu: return inv ^ utf8_isupper(r); /* no Lu table in unicode_groups */
+    case UTF_CC: case UTF_cc:
+    case UTF_LT: case UTF_lt:
+    case UTF_ND: case UTF_nd:
+    case UTF_NL: case UTF_nl:
+    case UTF_PC: case UTF_pc:
+    case UTF_PD: case UTF_pd:
+    case UTF_PF: case UTF_pf:
+    case UTF_PI: case UTF_pi:
+    case UTF_SC: case UTF_sc:
+    case UTF_ZL: case UTF_zl:
+    case UTF_ZP: case UTF_zp:
+    case UTF_ZS: case UTF_zs:
+        n = s - U8G; inv = n & 1; n /= 2; /* odd offset = negated class; n is now the enum pair index (cc=0 .. zs=14) */
+        n -= (n > 3) ? 3 : (n > 0) ? 2 : 0; /* enum has lc/ll/lu pairs the group table lacks: cc 0->0, lt 3->1, nd..zs 5..14->2..11 */
+        return inv ^ utf8_isgroup(n, r);
     }
     return s == r;
 }
diff --git a/src/utf8code.c b/src/utf8code.c
index 5dfb7d30..6fe8515e 100644
--- a/src/utf8code.c
+++ b/src/utf8code.c
@@ -112,17 +112,29 @@ int utf8_icmp_sv(const csview s1, const csview s2) {
     return (int)(s1.size - s2.size);
 }
 
-bool utf8_isspace(uint32_t c) {
-    static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0,
-                           0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000};
-    for (size_t i=0; i<sizeof t/sizeof *t; ++i)
-        if (c == t[i]) return true;
-    return (c >= 0x2000) & (c <= 0x200A);
-}
+typedef struct { /* one closed code point interval [lo, hi] */
+  uint16_t lo; /* first code point of the range (inclusive) */
+  uint16_t hi; /* last code point of the range (inclusive); 16-bit, so never above 0xFFFF */
+} URange16;
+
+typedef struct { /* a named character group and its range table */
+  const char *name; /* group name string (UNI_ENTRY stringizes the group code) */
+  const URange16 *r16; /* range table; utf8_isgroup() stops at the first range whose lo exceeds c */
+  int nr16; /* number of r16 entries utf8_isgroup() scans */
+} UGroup;
+
+static const UGroup unicode_groups[]; /* forward declaration; initialized after the range tables below. NOTE(review): incomplete-array tentative definition with internal linkage is not strictly conforming C (C11 6.9.2p3) */
+static const int num_unicode_groups; /* forward declaration; defined after unicode_groups below */
+static const int Lt_group; /* forward declaration; unicode_groups index passed by utf8_isalpha() */
 
-bool utf8_isdigit(uint32_t c) {
-    return ((c >= '0') & (c <= '9')) ||
-           ((c >= 0xFF10) & (c <= 0xFF19));
+bool utf8_isgroup(int group, uint32_t c) { /* true iff code point c lies inside a range of unicode_groups[group] */
+    if (group < 0 || group >= num_unicode_groups) return false; /* unknown group index matches nothing instead of reading past the table */
+    for (int j=0; j<unicode_groups[group].nr16; ++j) {
+        if (c < unicode_groups[group].r16[j].lo) return false; /* ranges are stored ascending, so c falls in a gap */
+        if (c <= unicode_groups[group].r16[j].hi)
+            return true;
+    }
+    return false;
 }
 
 bool utf8_isxdigit(uint32_t c) {
@@ -141,6 +153,192 @@ bool utf8_isalnum(uint32_t c) {
 
 bool utf8_isalpha(uint32_t c) {
     if (c < 128) return isalpha(c) != 0;
-    return utf8_islower(c) || utf8_isupper(c);
+    return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(Lt_group, c); /* c >= 128: lowercase, uppercase or titlecase (Lt group) */
 }
+
+static const URange16 Cc_range16[] = { // Cc: control characters; ascending inclusive {lo, hi} ranges
+    { 0, 31 }, // U+0000..U+001F
+    { 127, 159 }, // U+007F..U+009F
+};
+
+static const URange16 Lt_range16[] = { // Lt: titlecase letters; ascending inclusive {lo, hi} ranges
+    { 453, 453 },
+    { 456, 456 },
+    { 459, 459 },
+    { 498, 498 },
+    { 8072, 8079 },
+    { 8088, 8095 },
+    { 8104, 8111 },
+    { 8124, 8124 },
+    { 8140, 8140 },
+    { 8188, 8188 },
+};
+
+static const URange16 Nd_range16[] = { // Nd: decimal digits; ascending inclusive ranges, each spanning exactly ten code points
+    { 48, 57 }, // '0'..'9'
+    { 1632, 1641 },
+    { 1776, 1785 },
+    { 1984, 1993 },
+    { 2406, 2415 },
+    { 2534, 2543 },
+    { 2662, 2671 },
+    { 2790, 2799 },
+    { 2918, 2927 },
+    { 3046, 3055 },
+    { 3174, 3183 },
+    { 3302, 3311 },
+    { 3430, 3439 },
+    { 3558, 3567 },
+    { 3664, 3673 },
+    { 3792, 3801 },
+    { 3872, 3881 },
+    { 4160, 4169 },
+    { 4240, 4249 },
+    { 6112, 6121 },
+    { 6160, 6169 },
+    { 6470, 6479 },
+    { 6608, 6617 },
+    { 6784, 6793 },
+    { 6800, 6809 },
+    { 6992, 7001 },
+    { 7088, 7097 },
+    { 7232, 7241 },
+    { 7248, 7257 },
+    { 42528, 42537 },
+    { 43216, 43225 },
+    { 43264, 43273 },
+    { 43472, 43481 },
+    { 43504, 43513 },
+    { 43600, 43609 },
+    { 44016, 44025 },
+    { 65296, 65305 }, // U+FF10..U+FF19, the range the removed utf8_isdigit() also accepted
+};
+
+static const URange16 Nl_range16[] = { // Nl: letter-like numbers; ascending inclusive {lo, hi} ranges
+    { 5870, 5872 },
+    { 8544, 8578 },
+    { 8581, 8584 },
+    { 12295, 12295 },
+    { 12321, 12329 },
+    { 12344, 12346 },
+    { 42726, 42735 },
+};
+
+static const URange16 Pc_range16[] = { // Pc: connector punctuation; ascending inclusive {lo, hi} ranges
+    { 95, 95 }, // '_'
+    { 8255, 8256 },
+    { 8276, 8276 },
+    { 65075, 65076 },
+    { 65101, 65103 },
+    { 65343, 65343 },
+};
+
+static const URange16 Pd_range16[] = { // Pd: dash punctuation; ascending inclusive {lo, hi} ranges
+    { 45, 45 }, // '-'
+    { 1418, 1418 },
+    { 1470, 1470 },
+    { 5120, 5120 },
+    { 6150, 6150 },
+    { 8208, 8213 },
+    { 11799, 11799 },
+    { 11802, 11802 },
+    { 11834, 11835 },
+    { 11840, 11840 },
+    { 11869, 11869 },
+    { 12316, 12316 },
+    { 12336, 12336 },
+    { 12448, 12448 },
+    { 65073, 65074 },
+    { 65112, 65112 },
+    { 65123, 65123 },
+    { 65293, 65293 },
+};
+
+static const URange16 Pf_range16[] = { // Pf: final (closing) quote punctuation; ascending inclusive {lo, hi} ranges
+    { 187, 187 }, // U+00BB
+    { 8217, 8217 },
+    { 8221, 8221 },
+    { 8250, 8250 },
+    { 11779, 11779 },
+    { 11781, 11781 },
+    { 11786, 11786 },
+    { 11789, 11789 },
+    { 11805, 11805 },
+    { 11809, 11809 },
+};
+
+static const URange16 Pi_range16[] = { // Pi: initial (opening) quote punctuation; ascending inclusive {lo, hi} ranges
+    { 171, 171 }, // U+00AB
+    { 8216, 8216 },
+    { 8219, 8220 },
+    { 8223, 8223 },
+    { 8249, 8249 },
+    { 11778, 11778 },
+    { 11780, 11780 },
+    { 11785, 11785 },
+    { 11788, 11788 },
+    { 11804, 11804 },
+    { 11808, 11808 },
+};
+
+static const URange16 Sc_range16[] = { // Sc: currency symbols; ascending inclusive {lo, hi} ranges
+    { 36, 36 }, // '$'
+    { 162, 165 },
+    { 1423, 1423 },
+    { 1547, 1547 },
+    { 2046, 2047 },
+    { 2546, 2547 },
+    { 2555, 2555 },
+    { 2801, 2801 },
+    { 3065, 3065 },
+    { 3647, 3647 },
+    { 6107, 6107 },
+    { 8352, 8384 },
+    { 43064, 43064 },
+    { 65020, 65020 },
+    { 65129, 65129 },
+    { 65284, 65284 },
+    { 65504, 65505 },
+    { 65509, 65510 },
+};
+
+static const URange16 Zl_range16[] = { // Zl: line separator; a single code point
+    { 8232, 8232 }, // U+2028
+};
+
+static const URange16 Zp_range16[] = { // Zp: paragraph separator; a single code point
+    { 8233, 8233 }, // U+2029
+};
+
+static const URange16 Zs_range16[] = { // Zs: space separators; ascending inclusive {lo, hi} ranges. NOTE(review): no tab/LF/CR here, unlike the removed utf8_isspace(), yet \s now maps to this group - confirm intended
+    { 32, 32 }, // ' '
+    { 160, 160 }, // U+00A0
+    { 5760, 5760 },
+    { 8192, 8202 }, // U+2000..U+200A
+    { 8239, 8239 },
+    { 8287, 8287 },
+    { 12288, 12288 }, // U+3000
+};
+
+#define UNI_ENTRY(Code) \
+    { #Code, Code##_range16, (int)(sizeof(Code##_range16)/sizeof(Code##_range16[0])) } /* UGroup initializer: {name, range table, number of ranges}; divide by the element size (4 bytes), not by 2 */
+
+static const UGroup unicode_groups[] = { /* array position is the group index taken by utf8_isgroup() */
+    UNI_ENTRY(Cc), /* 0 */
+    UNI_ENTRY(Lt), /* 1, must stay equal to Lt_group */
+    UNI_ENTRY(Nd), /* 2 */
+    UNI_ENTRY(Nl), /* 3 */
+    UNI_ENTRY(Pc), /* 4 */
+    UNI_ENTRY(Pd), /* 5 */
+    UNI_ENTRY(Pf), /* 6 */
+    UNI_ENTRY(Pi), /* 7 */
+    UNI_ENTRY(Sc), /* 8 */
+    UNI_ENTRY(Zl), /* 9 */
+    UNI_ENTRY(Zp), /* 10 */
+    UNI_ENTRY(Zs), /* 11 */
+};
+
+static const int Lt_group = 1; /* position of UNI_ENTRY(Lt) in unicode_groups; used by utf8_isalpha() */
+static const int num_unicode_groups = sizeof unicode_groups / sizeof unicode_groups[0]; /* number of entries in unicode_groups */
+
 #endif
author	Tyge Lovset <[email protected]>	2023-01-02 09:00:47 +0100
committer	Tyge Lovset <[email protected]>	2023-01-02 09:00:47 +0100
commit	8c6ba8a3444e4b8640e7fe04f565cef57c850432 (patch)
tree	10119ea3bbef5ac2818874849231bad19ad7aa5a /src
parent	91e79fc60713c1f09e940a7ee83ff2f8aa4f9d69 (diff)
download	STC-modified-8c6ba8a3444e4b8640e7fe04f565cef57c850432.tar.gz STC-modified-8c6ba8a3444e4b8640e7fe04f565cef57c850432.zip