summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--docs/cregex_api.md21
-rw-r--r--include/stc/utf8.h1
-rw-r--r--misc/examples/new_queue.c3
-rw-r--r--src/cregex.c137
-rw-r--r--src/utf8code.c220
5 files changed, 300 insertions, 82 deletions
diff --git a/docs/cregex_api.md b/docs/cregex_api.md
index 3197a59e..689fd33d 100644
--- a/docs/cregex_api.md
+++ b/docs/cregex_api.md
@@ -184,11 +184,22 @@ For reference, **cregex** uses the following files:
| \D \S \W | Do not match the groups described above | |
| \p{Alnum} | Match UTF8 alpha numeric | * |
| \p{XDigit} | Match UTF8 hex number | * |
-| \p{Alpha} or \p{LC} | Match UTF8 cased letter | * |
-| \p{Digit} or \p{Nd} | Match UTF8 numeric | * |
-| \p{Lower} or \p{Ll} | Match UTF8 lower case | * |
-| \p{Upper} or \p{Lu} | Match UTF8 upper case | * |
-| \p{Space} or \p{Sz} | Match UTF8 whitespace | * |
+| \p{Nd} or \p{Digit} | Match UTF8 decimal number | * |
+| \p{Nl} | Match UTF8 numeric letter | * |
+| \p{Ll} or \p{Lower} | Match UTF8 lower case letter | * |
+| \p{Lu} or \p{Upper} | Match UTF8 upper case letter | * |
+| \p{Lt} | Match UTF8 titlecase letter | * |
+| \p{L&} or \p{Alpha} | Match UTF8 cased letter | * |
+| \p{Cc} | Match UTF8 control char | * |
+| \p{Pc} | Match UTF8 connector punctuation | * |
+| \p{Pd} | Match UTF8 dash punctuation | * |
+| \p{Pf} | Match UTF8 final punctuation | * |
+| \p{Pi} | Match UTF8 initial punctuation | * |
+| \p{Sc} | Match UTF8 currency symbol | * |
+| \p{Sk} | Match UTF8 modifier symbol | * |
+| \p{Zl} | Match UTF8 line separator | * |
+| \p{Zp} | Match UTF8 paragraph separator | * |
+| \p{Zs} or \p{Space} | Match UTF8 whitespace separator | * |
| \P{***Class***} | Do not match the classes described above | * |
| [:alnum:] [:alpha:] [:ascii:] | Match ASCII character class. NB: only to be used inside [] brackets | * |
| [:blank:] [:cntrl:] [:digit:] | " | * |
diff --git a/include/stc/utf8.h b/include/stc/utf8.h
index db98acd9..b30b0061 100644
--- a/include/stc/utf8.h
+++ b/include/stc/utf8.h
@@ -11,6 +11,7 @@ extern bool utf8_isdigit(uint32_t c);
extern bool utf8_isxdigit(uint32_t c);
extern bool utf8_isalpha(uint32_t c);
extern bool utf8_isalnum(uint32_t c);
+extern bool utf8_isgroup(int group, uint32_t c);
extern uint32_t utf8_casefold(uint32_t c);
extern uint32_t utf8_tolower(uint32_t c);
extern uint32_t utf8_toupper(uint32_t c);
diff --git a/misc/examples/new_queue.c b/misc/examples/new_queue.c
index c72c94a0..828387b5 100644
--- a/misc/examples/new_queue.c
+++ b/misc/examples/new_queue.c
@@ -27,12 +27,13 @@ int main() {
c_AUTO (IQ, Q)
{
- // Push eight million random numbers onto the queue.
+ // Push 50'000'000 random numbers onto the queue.
c_FORRANGE (n)
IQ_push(&Q, stc64_uniform(&rng, &dist));
// Push or pop on the queue 50 million times
printf("befor: size %" c_ZU ", capacity %" c_ZU "\n", IQ_size(&Q), IQ_capacity(&Q));
+
c_FORRANGE (n) {
int r = stc64_uniform(&rng, &dist);
if (r & 3)
diff --git a/src/cregex.c b/src/cregex.c
index e59e21a4..7b528550 100644
--- a/src/cregex.c
+++ b/src/cregex.c
@@ -130,14 +130,25 @@ enum {
ASC_lo , ASC_LO, /* lower */
ASC_up , ASC_UP, /* upper */
ASC_xd , ASC_XD, /* hex */
- UTF_d , UTF_D, /* utf dec digit, non-digit */
- UTF_s , UTF_S, /* utf8 white space */
- UTF_w , UTF_W, /* utf8 word */
- UTF_al , UTF_AL, /* utf8 letter cased */
- UTF_lo , UTF_LO, /* utf8 letter lower */
- UTF_up , UTF_UP, /* utf8 letter upper */
- UTF_xd , UTF_XD, /* utf8 hex digit */
UTF_an , UTF_AN, /* utf8 alphanumeric */
+ UTF_wr , UTF_WR, /* utf8 word */
+ UTF_xd , UTF_XD, /* utf8 hex digit */
+ U8G_tmp , U8G = U8G_tmp + (U8G_tmp & 1), /* force even */
+ UTF_cc = U8G, UTF_CC, /* utf8 control char */
+ UTF_lc , UTF_LC, /* utf8 letter cased */
+ UTF_ll , UTF_LL, /* utf8 letter lowercase */
+ UTF_lt , UTF_LT, /* utf8 letter titlecase */
+ UTF_lu , UTF_LU, /* utf8 letter uppercase */
+ UTF_nd , UTF_ND, /* utf8 number decimal */
+ UTF_nl , UTF_NL, /* utf8 number letter */
+ UTF_pc , UTF_PC, /* utf8 punct connector */
+ UTF_pd , UTF_PD, /* utf8 punct dash */
+ UTF_pf , UTF_PF, /* utf8 punct final */
+ UTF_pi , UTF_PI, /* utf8 punct initial */
+ UTF_sc , UTF_SC, /* utf8 symbol currency */
+ UTF_zl , UTF_ZL, /* utf8 separator line */
+ UTF_zp , UTF_ZP, /* utf8 separator paragraph */
+ UTF_zs , UTF_ZS, /* utf8 separator space */
TOK_ANY = 0x8200000, /* Any character except newline, . */
TOK_ANYNL , /* Any character including newline, . */
TOK_NOP , /* No operation, internal use only */
@@ -625,19 +636,24 @@ static void
_lexutfclass(_Parser *par, _Rune *rp)
{
static struct { const char* c; int n, r; } cls[] = {
- {"{Space}", 7, UTF_s}, {"{Zs}", 4, UTF_s},
- {"{Digit}", 7, UTF_d}, {"{Nd}", 4, UTF_d},
- {"{Alpha}", 7, UTF_al}, {"{LC}", 4, UTF_al},
- {"{Lower}", 7, UTF_lo}, {"{Ll}", 4, UTF_lo},
- {"{Upper}", 7, UTF_up}, {"{Lu}", 4, UTF_up},
+ {"{Alpha}", 7, UTF_lc}, {"{L&}", 4, UTF_lc},
+ {"{Digit}", 7, UTF_nd}, {"{Nd}", 4, UTF_nd},
+ {"{Lower}", 7, UTF_ll}, {"{Ll}", 4, UTF_ll},
+ {"{Upper}", 7, UTF_lu}, {"{Lu}", 4, UTF_lu},
+ {"{Space}", 7, UTF_zs}, {"{Zs}", 4, UTF_zs},
{"{Alnum}", 7, UTF_an},
{"{XDigit}", 8, UTF_xd},
+ {"{Cc}", 4, UTF_cc}, {"{Sc}", 4, UTF_sc},
+ {"{Lt}", 4, UTF_lt}, {"{Nl}", 4, UTF_nl},
+ {"{Pc}", 4, UTF_pc}, {"{Pd}", 4, UTF_pd},
+ {"{Pf}", 4, UTF_pf}, {"{Pi}", 4, UTF_pi},
+ {"{Zl}", 4, UTF_zl}, {"{Zp}", 4, UTF_zp},
};
int inv = (*rp == 'P');
for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i) {
if (!strncmp(par->exprp, cls[i].c, (size_t)cls[i].n)) {
- if (par->rune_type == TOK_IRUNE && (cls[i].r == UTF_lo || cls[i].r == UTF_up))
- *rp = (_Rune)(UTF_al + inv);
+ if (par->rune_type == TOK_IRUNE && (cls[i].r == UTF_ll || cls[i].r == UTF_lu))
+ *rp = (_Rune)(UTF_lc + inv);
else
*rp = (_Rune)(cls[i].r + inv);
par->exprp += cls[i].n;
@@ -653,12 +669,12 @@ _lexutfclass(_Parser *par, _Rune *rp)
case 'v': rune = '\v'; break; \
case 'f': rune = '\f'; break; \
case 'a': rune = '\a'; break; \
- case 'd': rune = UTF_d; break; \
- case 'D': rune = UTF_D; break; \
- case 's': rune = UTF_s; break; \
- case 'S': rune = UTF_S; break; \
- case 'w': rune = UTF_w; break; \
- case 'W': rune = UTF_W; break
+ case 'd': rune = UTF_nd; break; \
+ case 'D': rune = UTF_ND; break; \
+ case 's': rune = UTF_zs; break; \
+ case 'S': rune = UTF_ZS; break; \
+ case 'w': rune = UTF_wr; break; \
+ case 'W': rune = UTF_WR; break
static _Token
@@ -880,52 +896,43 @@ out:
static int
_runematch(_Rune s, _Rune r)
{
- int inv = 0;
+ int inv = 0, n;
switch (s) {
- case ASC_D: inv = 1; /* fallthrough */
- case ASC_d: return inv ^ (isdigit(r) != 0);
- case ASC_S: inv = 1;
- case ASC_s: return inv ^ (isspace(r) != 0);
- case ASC_W: inv = 1;
- case ASC_w: return inv ^ ((isalnum(r) != 0) | (r == '_'));
- case ASC_AL: inv = 1;
- case ASC_al: return inv ^ (isalpha(r) != 0);
- case ASC_AN: inv = 1;
- case ASC_an: return inv ^ (isalnum(r) != 0);
- case ASC_AS: return (r >= 128);
- case ASC_as: return (r < 128);
- case ASC_BL: inv = 1;
- case ASC_bl: return inv ^ ((r == ' ') | (r == '\t'));
- case ASC_CT: inv = 1;
- case ASC_ct: return inv ^ (iscntrl(r) != 0);
- case ASC_GR: inv = 1;
- case ASC_gr: return inv ^ (isgraph(r) != 0);
- case ASC_PR: inv = 1;
- case ASC_pr: return inv ^ (isprint(r) != 0);
- case ASC_PU: inv = 1;
- case ASC_pu: return inv ^ (ispunct(r) != 0);
- case ASC_LO: inv = 1;
- case ASC_lo: return inv ^ (islower(r) != 0);
- case ASC_UP: inv = 1;
- case ASC_up: return inv ^ (isupper(r) != 0);
- case ASC_XD: inv = 1;
- case ASC_xd: return inv ^ (isxdigit(r) != 0);
- case UTF_D: inv = 1;
- case UTF_d: return inv ^ (utf8_isdigit(r));
- case UTF_S: inv = 1;
- case UTF_s: return inv ^ utf8_isspace(r);
- case UTF_W: inv = 1;
- case UTF_w: return inv ^ (utf8_isalnum(r) | (r == '_'));
- case UTF_AL: inv = 1;
- case UTF_al: return inv ^ utf8_isalpha(r);
- case UTF_AN: inv = 1;
- case UTF_an: return inv ^ utf8_isalnum(r);
- case UTF_LO: inv = 1;
- case UTF_lo: return inv ^ utf8_islower(r);
- case UTF_UP: inv = 1;
- case UTF_up: return inv ^ utf8_isupper(r);
- case UTF_XD: inv = 1;
- case UTF_xd: return inv ^ utf8_isxdigit(r);
+ case ASC_D: inv = 1; case ASC_d: return inv ^ (isdigit(r) != 0);
+ case ASC_S: inv = 1; case ASC_s: return inv ^ (isspace(r) != 0);
+ case ASC_W: inv = 1; case ASC_w: return inv ^ ((isalnum(r) != 0) | (r == '_'));
+ case ASC_AL: inv = 1; case ASC_al: return inv ^ (isalpha(r) != 0);
+ case ASC_AN: inv = 1; case ASC_an: return inv ^ (isalnum(r) != 0);
+ case ASC_AS: return (r >= 128); case ASC_as: return (r < 128);
+ case ASC_BL: inv = 1; case ASC_bl: return inv ^ ((r == ' ') | (r == '\t'));
+ case ASC_CT: inv = 1; case ASC_ct: return inv ^ (iscntrl(r) != 0);
+ case ASC_GR: inv = 1; case ASC_gr: return inv ^ (isgraph(r) != 0);
+ case ASC_PR: inv = 1; case ASC_pr: return inv ^ (isprint(r) != 0);
+ case ASC_PU: inv = 1; case ASC_pu: return inv ^ (ispunct(r) != 0);
+ case ASC_LO: inv = 1; case ASC_lo: return inv ^ (islower(r) != 0);
+ case ASC_UP: inv = 1; case ASC_up: return inv ^ (isupper(r) != 0);
+ case ASC_XD: inv = 1; case ASC_xd: return inv ^ (isxdigit(r) != 0);
+ case UTF_AN: inv = 1; case UTF_an: return inv ^ utf8_isalnum(r);
+ case UTF_WR: inv = 1; case UTF_wr: return inv ^ (utf8_isalnum(r) | (r == '_'));
+ case UTF_XD: inv = 1; case UTF_xd: return inv ^ utf8_isxdigit(r);
+ case UTF_LC: inv = 1; case UTF_lc: return inv ^ utf8_isalpha(r);
+ case UTF_CC: case UTF_cc:
+ case UTF_LL: case UTF_ll:
+ case UTF_LT: case UTF_lt:
+ case UTF_LU: case UTF_lu:
+ case UTF_ND: case UTF_nd:
+ case UTF_NL: case UTF_nl:
+ case UTF_PC: case UTF_pc:
+ case UTF_PD: case UTF_pd:
+ case UTF_PF: case UTF_pf:
+ case UTF_PI: case UTF_pi:
+ case UTF_SC: case UTF_sc:
+ case UTF_ZL: case UTF_zl:
+ case UTF_ZP: case UTF_zp:
+ case UTF_ZS: case UTF_zs:
+ n = s - U8G;
+ inv = n & 1;
+ return inv ^ utf8_isgroup(n / 2, r);
}
return s == r;
}
diff --git a/src/utf8code.c b/src/utf8code.c
index 5dfb7d30..6fe8515e 100644
--- a/src/utf8code.c
+++ b/src/utf8code.c
@@ -112,17 +112,29 @@ int utf8_icmp_sv(const csview s1, const csview s2) {
return (int)(s1.size - s2.size);
}
-bool utf8_isspace(uint32_t c) {
- static uint16_t t[] = {0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xA0,
- 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000};
- for (size_t i=0; i<sizeof t/sizeof *t; ++i)
- if (c == t[i]) return true;
- return (c >= 0x2000) & (c <= 0x200A);
-}
+/* One inclusive range [lo, hi] of 16-bit code points. */
+typedef struct {
+ uint16_t lo;
+ uint16_t hi;
+} URange16;
+
+/* A named group of code points: `r16` points at a table of `nr16` ranges.
+ * utf8_isgroup() stops at the first range whose `lo` is above the code point
+ * it is testing, so each table has to be sorted in ascending order. */
+typedef struct {
+ const char *name;
+ const URange16 *r16;
+ int nr16;
+} UGroup;
+
+/* Forward declarations; the definitions follow the range tables further down.
+ * NOTE(review): `unicode_groups[]` is a tentative definition with internal
+ * linkage and incomplete type, which ISO C (6.9.2) does not permit; some
+ * compilers accept it as an extension, others reject it. Confirm on all
+ * supported toolchains or move the tables above their first use. */
+static const UGroup unicode_groups[];
+static const int num_unicode_groups;
+static const int Lt_group;
-bool utf8_isdigit(uint32_t c) {
- return ((c >= '0') & (c <= '9')) ||
- ((c >= 0xFF10) & (c <= 0xFF19));
+/* Return true when code point c falls inside one of the inclusive ranges of
+ * unicode_groups[group]; `group` is the row index in that table.
+ * An index outside the table yields false instead of reading out of bounds. */
+bool utf8_isgroup(int group, uint32_t c) {
+    if (group < 0 || group >= num_unicode_groups)
+        return false;
+    const UGroup *g = &unicode_groups[group];
+    for (int j = 0; j < g->nr16; ++j) {
+        /* Ranges are sorted ascending: once c is below a range's start,
+         * no later range can contain it. */
+        if (c < g->r16[j].lo)
+            return false;
+        if (c <= g->r16[j].hi)
+            return true;
+    }
+    return false;
}
bool utf8_isxdigit(uint32_t c) {
@@ -141,6 +153,192 @@ bool utf8_isalnum(uint32_t c) {
+/* True for a cased letter: ASCII is answered by isalpha(); any other code
+ * point must satisfy utf8_islower(), utf8_isupper(), or be listed in the
+ * titlecase (Lt) range table. */
bool utf8_isalpha(uint32_t c) {
if (c < 128) return isalpha(c) != 0;
- return utf8_islower(c) || utf8_isupper(c);
+ return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(Lt_group, c);
}
+
+static const URange16 Cc_range16[] = { // Control
+ { 0, 31 },
+ { 127, 159 },
+};
+
+static const URange16 Lt_range16[] = { // Title case
+ { 453, 453 },
+ { 456, 456 },
+ { 459, 459 },
+ { 498, 498 },
+ { 8072, 8079 },
+ { 8088, 8095 },
+ { 8104, 8111 },
+ { 8124, 8124 },
+ { 8140, 8140 },
+ { 8188, 8188 },
+};
+
+static const URange16 Nd_range16[] = { // Decimal number
+ { 48, 57 },
+ { 1632, 1641 },
+ { 1776, 1785 },
+ { 1984, 1993 },
+ { 2406, 2415 },
+ { 2534, 2543 },
+ { 2662, 2671 },
+ { 2790, 2799 },
+ { 2918, 2927 },
+ { 3046, 3055 },
+ { 3174, 3183 },
+ { 3302, 3311 },
+ { 3430, 3439 },
+ { 3558, 3567 },
+ { 3664, 3673 },
+ { 3792, 3801 },
+ { 3872, 3881 },
+ { 4160, 4169 },
+ { 4240, 4249 },
+ { 6112, 6121 },
+ { 6160, 6169 },
+ { 6470, 6479 },
+ { 6608, 6617 },
+ { 6784, 6793 },
+ { 6800, 6809 },
+ { 6992, 7001 },
+ { 7088, 7097 },
+ { 7232, 7241 },
+ { 7248, 7257 },
+ { 42528, 42537 },
+ { 43216, 43225 },
+ { 43264, 43273 },
+ { 43472, 43481 },
+ { 43504, 43513 },
+ { 43600, 43609 },
+ { 44016, 44025 },
+ { 65296, 65305 },
+};
+
+static const URange16 Nl_range16[] = { // Number letter
+ { 5870, 5872 },
+ { 8544, 8578 },
+ { 8581, 8584 },
+ { 12295, 12295 },
+ { 12321, 12329 },
+ { 12344, 12346 },
+ { 42726, 42735 },
+};
+
+static const URange16 Pc_range16[] = { // Connector punctuation
+ { 95, 95 },
+ { 8255, 8256 },
+ { 8276, 8276 },
+ { 65075, 65076 },
+ { 65101, 65103 },
+ { 65343, 65343 },
+};
+
+static const URange16 Pd_range16[] = { // Dash punctuation
+ { 45, 45 },
+ { 1418, 1418 },
+ { 1470, 1470 },
+ { 5120, 5120 },
+ { 6150, 6150 },
+ { 8208, 8213 },
+ { 11799, 11799 },
+ { 11802, 11802 },
+ { 11834, 11835 },
+ { 11840, 11840 },
+ { 11869, 11869 },
+ { 12316, 12316 },
+ { 12336, 12336 },
+ { 12448, 12448 },
+ { 65073, 65074 },
+ { 65112, 65112 },
+ { 65123, 65123 },
+ { 65293, 65293 },
+};
+
+static const URange16 Pf_range16[] = { // Final punctuation
+ { 187, 187 },
+ { 8217, 8217 },
+ { 8221, 8221 },
+ { 8250, 8250 },
+ { 11779, 11779 },
+ { 11781, 11781 },
+ { 11786, 11786 },
+ { 11789, 11789 },
+ { 11805, 11805 },
+ { 11809, 11809 },
+};
+
+static const URange16 Pi_range16[] = { // Initial punctuation
+ { 171, 171 },
+ { 8216, 8216 },
+ { 8219, 8220 },
+ { 8223, 8223 },
+ { 8249, 8249 },
+ { 11778, 11778 },
+ { 11780, 11780 },
+ { 11785, 11785 },
+ { 11788, 11788 },
+ { 11804, 11804 },
+ { 11808, 11808 },
+};
+
+static const URange16 Sc_range16[] = { // Currency symbol
+ { 36, 36 },
+ { 162, 165 },
+ { 1423, 1423 },
+ { 1547, 1547 },
+ { 2046, 2047 },
+ { 2546, 2547 },
+ { 2555, 2555 },
+ { 2801, 2801 },
+ { 3065, 3065 },
+ { 3647, 3647 },
+ { 6107, 6107 },
+ { 8352, 8384 },
+ { 43064, 43064 },
+ { 65020, 65020 },
+ { 65129, 65129 },
+ { 65284, 65284 },
+ { 65504, 65505 },
+ { 65509, 65510 },
+};
+
+static const URange16 Zl_range16[] = { // Line separator
+ { 8232, 8232 },
+};
+
+static const URange16 Zp_range16[] = { // Paragraph separator
+ { 8233, 8233 },
+};
+
+static const URange16 Zs_range16[] = { // Space separator
+ { 32, 32 },
+ { 160, 160 },
+ { 5760, 5760 },
+ { 8192, 8202 },
+ { 8239, 8239 },
+ { 8287, 8287 },
+ { 12288, 12288 },
+};
+
+/* Build one unicode_groups[] row for category `Code`: its name as a string,
+ * its range table, and the number of URange16 entries in that table.
+ * The count divides by the element size; dividing by 2 (bytes) would report
+ * twice as many entries as exist, since each URange16 holds two uint16_t. */
+#define UNI_ENTRY(Code) \
+    { #Code, Code##_range16, (int)(sizeof(Code##_range16)/sizeof(Code##_range16[0])) }
+
+/* Row order is significant: groups are addressed by index (see Lt_group). */
+static const UGroup unicode_groups[] = {
+    UNI_ENTRY(Cc),
+    UNI_ENTRY(Lt),
+    UNI_ENTRY(Nd),
+    UNI_ENTRY(Nl),
+    UNI_ENTRY(Pc),
+    UNI_ENTRY(Pd),
+    UNI_ENTRY(Pf),
+    UNI_ENTRY(Pi),
+    UNI_ENTRY(Sc),
+    UNI_ENTRY(Zl),
+    UNI_ENTRY(Zp),
+    UNI_ENTRY(Zs),
+};
+
+/* Index of the Lt row above; keep in sync when reordering the table. */
+static const int Lt_group = 1;
+static const int num_unicode_groups = (int)(sizeof unicode_groups / sizeof unicode_groups[0]);
+
#endif