Some improvements and cleanup: CRegex, CMap.

author: Tyge Løvset <[email protected]> 2022-02-18 16:25:25 +0100
committer: Tyge Løvset <[email protected]> 2022-02-18 16:25:25 +0100
commit: 4ac5ef68726057ff97e56c951e4faeaa29995e40 (patch)
tree: cc8f82362d1fa6a94c8e279d7ce0b6f3581225c2
parent: fde8f4a37c42a3611b544185806155a02e24227e (diff)
download: STC-modified-4ac5ef68726057ff97e56c951e4faeaa29995e40.tar.gz STC-modified-4ac5ef68726057ff97e56c951e4faeaa29995e40.zip
9 files changed, 77 insertions, 99 deletions
diff --git a/benchmarks/picobench/picobench_cmap.cpp b/benchmarks/picobench/picobench_cmap.cpp
index f54b3d32..15ab2aa7 100644
--- a/benchmarks/picobench/picobench_cmap.cpp
+++ b/benchmarks/picobench/picobench_cmap.cpp
@@ -208,7 +208,8 @@ static void ins_and_access_cmap_s(picobench::state& s)
         randomize(str.str, cstr_size(str));
         //if (cstr_size(str) > 30) { printf("%s\n", str.str); exit(0); }
         cmap_str_emplace(&map, str.str, str.str);
-        //randomize(str.str, cstr_size(str));
+
+        randomize(str.str, cstr_size(str));
         result += cmap_str_erase(&map, str.str);
     }
     s.set_result(result + cmap_str_size(map));
diff --git a/benchmarks/shootout_hashmaps.cpp b/benchmarks/shootout_hashmaps.cpp
index 5307da0c..6c8c1f00 100644
--- a/benchmarks/shootout_hashmaps.cpp
+++ b/benchmarks/shootout_hashmaps.cpp
@@ -209,7 +209,7 @@ size_t seed;
         M##_PUT(X, RAND(keybits), i); \
     size_t x = 300000000/M##_SIZE(X); \
     clock_t difference, before = clock(); \
-    for (int k=0; k < x; k++) M##_FOR (X, it) \
+    for (size_t k=0; k < x; k++) M##_FOR (X, it) \
         sum += M##_ITEM(X, it); \
     difference = clock() - before; \
     printf(#M ": time: %5.02f, size: %zu, buckets: %8zu, repeats: %zu, sum: %zu\n", \
@@ -256,10 +256,10 @@ enum {
 
 int main(int argc, char* argv[])
 {
-    int n_mill = argc >= 2 ? atoi(argv[1]) : DEFAULT_N_MILL;
-    int keybits = argc >= 3 ? atoi(argv[2]) : DEFAULT_KEYBITS;
-    int n = n_mill * 1000000;
-    int N0 = n, N1 = n/2, N2 = n/2, N3 = n, N4 = n, N5 = n;
+    unsigned n_mill = argc >= 2 ? atoi(argv[1]) : DEFAULT_N_MILL;
+    unsigned keybits = argc >= 3 ? atoi(argv[2]) : DEFAULT_KEYBITS;
+    unsigned n = n_mill * 1000000;
+    unsigned N0 = n, N1 = n/2, N2 = n/2, N3 = n, N4 = n, N5 = n;
     seed = time(NULL); // 1636306010;
 
     printf("\nUnordered hash map shootout\n");
diff --git a/examples/new_map.c b/examples/new_map.c
index a7008e4f..97fce008 100644
--- a/examples/new_map.c
+++ b/examples/new_map.c
@@ -1,7 +1,7 @@
 #include <stc/cstr.h>
 #include <stc/forward.h>
 
-forward_cmap(cmap_pnt, struct Point, int, uint32_t);
+forward_cmap(cmap_pnt, struct Point, int);
 
 struct MyStruct {
     cmap_pnt pntmap;
diff --git a/examples/new_smap.c b/examples/new_smap.c
index 431f18c4..382d27ae 100644
--- a/examples/new_smap.c
+++ b/examples/new_smap.c
@@ -1,7 +1,7 @@
 #include <stc/cstr.h>
 #include <stc/forward.h>
 
-forward_csmap(PMap, struct Point, int, uint32_t);
+forward_csmap(PMap, struct Point, int);
 
 // Use forward declared PMap in struct
 struct MyStruct {
diff --git a/include/stc/ccommon.h b/include/stc/ccommon.h
index 292ac9cc..e5a595e0 100644
--- a/include/stc/ccommon.h
+++ b/include/stc/ccommon.h
@@ -122,13 +122,11 @@ STC_INLINE uint64_t c_default_hash(const void* key, size_t len) {
     while (--len) h = (h << 10) - h + *x++;
     return _c_ROTL(h, 26) ^ h;
 }
-STC_INLINE uint64_t c_hash32(const void* key, size_t len) {
-    uint32_t x; memcpy(&x, key, 4);
-    return x*0xc6a4a7935bd1e99d >> 15;
+STC_INLINE uint64_t c_hash32(const void* key, size_t n) {
+    return *(uint32_t *)key*0xc6a4a7935bd1e99d;
 }
-STC_INLINE uint64_t c_hash64(const void* key, size_t len) {
-    uint64_t x; memcpy(&x, key, 8);
-    return x*0xc6a4a7935bd1e99d;
+STC_INLINE uint64_t c_hash64(const void* key, size_t n) {
+    return *(uint64_t *)key*0xc6a4a7935bd1e99d;
 }
 
 STC_INLINE char* c_strnstrn(const char *s, const char *needle, size_t slen, const size_t nlen) {
diff --git a/include/stc/cmap.h b/include/stc/cmap.h
index af247ffc..a5ca353e 100644
--- a/include/stc/cmap.h
+++ b/include/stc/cmap.h
@@ -233,10 +233,10 @@ _cx_memb(_erase_at)(_cx_self* self, _cx_iter it) {
 #if defined(_i_implement)
 
 #ifndef CMAP_H_INCLUDED
-//STC_INLINE size_t fastrange_uint64_t(uint64_t x, uint64_t n)
-//    { uint64_t lo, hi; c_umul128(x, n, &lo, &hi); return hi; }
-#define fastrange_uint32_t(x, n) (uint32_t)((uint32_t)(x)*(uint64_t)(n) >> 32)
-#define chash_index_(h, entryPtr) ((entryPtr) - (h).table)
+STC_INLINE size_t fastrange_size_t(uint64_t x, uint64_t n)
+    { uint64_t lo, hi; c_umul128(x, n, &lo, &hi); return (size_t)hi; }
+STC_INLINE size_t fastrange_uint32_t(uint64_t x, uint64_t n)
+    { return (size_t)((uint32_t)x*n >> 32); }
 #endif // CMAP_H_INCLUDED
 
 STC_DEF _cx_self
@@ -362,11 +362,11 @@ _cx_memb(_reserve)(_cx_self* self, const size_t _newcap) {
 
 STC_DEF void
 _cx_memb(_erase_entry)(_cx_self* self, _cx_value* _val) {
-    _cx_size i = chash_index_(*self, _val), j = i, k;
+    _cx_size i = _val - self->table, j = i, k;
     const _cx_size _cap = self->bucket_count;
     _cx_value* _slot = self->table;
     uint8_t* _hashx = self->_hashx;
-    _cx_memb(_value_drop)(&_slot[i]);
+    _cx_memb(_value_drop)(_val);
     for (;;) { /* delete without leaving tombstone */
         if (++j == _cap)
             j = 0;
diff --git a/include/stc/forward.h b/include/stc/forward.h
index ca1fd540..e57aedfd 100644
--- a/include/stc/forward.h
+++ b/include/stc/forward.h
@@ -29,10 +29,12 @@
 #define forward_carr3(CX, VAL) _c_carr3_types(CX, VAL)
 #define forward_cdeq(CX, VAL) _c_cdeq_types(CX, VAL)
 #define forward_clist(CX, VAL) _c_clist_types(CX, VAL)
-#define forward_cmap(CX, KEY, VAL, SZ) _c_chash_types(CX, KEY, VAL, SZ, c_true, c_false)
-#define forward_csmap(CX, KEY, VAL, SZ) _c_aatree_types(CX, KEY, VAL, SZ, c_true, c_false)
-#define forward_cset(CX, KEY, SZ) _c_chash_types(CX, cset, KEY, KEY, SZ, c_false, c_true)
-#define forward_csset(CX, KEY, SZ) _c_aatree_types(CX, KEY, KEY, SZ, c_false, c_true)
+#define forward_cmap(CX, KEY, VAL) _c_chash_types(CX, KEY, VAL, uint32_t, c_true, c_false)
+#define forward_cmap_big(CX, KEY, VAL) _c_chash_types(CX, KEY, VAL, size_t, c_true, c_false)
+#define forward_cset(CX, KEY) _c_chash_types(CX, cset, KEY, KEY, uint32_t, c_false, c_true)
+#define forward_cset_big(CX, KEY) _c_chash_types(CX, cset, KEY, KEY, size_t, c_false, c_true)
+#define forward_csmap(CX, KEY, VAL) _c_aatree_types(CX, KEY, VAL, uint32_t, c_true, c_false)
+#define forward_csset(CX, KEY) _c_aatree_types(CX, KEY, KEY, uint32_t, c_false, c_true)
 #define forward_cbox(CX, VAL) _c_cbox_types(CX, VAL)
 #define forward_carc(CX, VAL) _c_carc_types(CX, VAL)
 #define forward_cpque(CX, VAL) _c_cpque_types(CX, VAL)
diff --git a/src/cregex.c b/src/cregex.c
index 804e8047..be185a40 100644
--- a/src/cregex.c
+++ b/src/cregex.c
@@ -120,26 +120,19 @@ enum {
     QUEST       ,           /* a? == a|nothing, i.e. 0 or 1 a's */
     RUNE        = 0x8100000,
     IRUNE,
-    ASC_d       , ASC_D,    /* dec digit, non-digit */
-    ASC_s       , ASC_S,    /* utf8 space, non-space */
-    ASC_w       , ASC_W,    /* utf8 word, non-word */
-    ASC_an      , ASC_AN,   /* alnum */
-    ASC_al      , ASC_AL,   /* alpha */
     ASC_bl      , ASC_BL,   /* blank */
     ASC_ct      , ASC_CT,   /* ctrl */
     ASC_gr      , ASC_GR,   /* graphic */
-    ASC_lo      , ASC_LO,   /* lower */
-    ASC_up      , ASC_UP,   /* upper */
     ASC_pr      , ASC_PR,   /* print */
     ASC_pt      , ASC_PT,   /* punct */
-    ASC_xd      , ASC_XD,   /* xdigit */
-    U8_La       , NU8_La,   /* utf8 alpha */
-    U8_Ll       , NU8_Ll,   /* utf8 lower */
-    U8_Lu       , NU8_Lu,   /* utf8 upper */
-    U8_Zs       , NU8_Zs,   /* utf8 white space */
-    U8_Xnx      , NU8_Xnx,  /* utf8 hex digit */
-    U8_Xan      , NU8_Xan,  /* utf8 alphanumeric */
-    U8_Xw       , NU8_Xw,   /* utf8 word */
+    U8_Nd       , U8N_Nd,    /* dec digit, non-digit */
+    U8_LC       , U8N_LC,   /* utf8 letter cased */
+    U8_Ll       , U8N_Ll,   /* utf8 letter lower */
+    U8_Lu       , U8N_Lu,   /* utf8 letter upper */
+    U8_Zs       , U8N_Zs,   /* utf8 white space */
+    U8_Xnx      , U8N_Xnx,  /* utf8 hex digit */
+    U8_Xan      , U8N_Xan,  /* utf8 alphanumeric */
+    U8_Xw       , U8N_Xw,   /* utf8 word */
     ANY         = 0x8200000, /* Any character except newline, . */
     ANYNL       ,           /* Any character including newline, . */
     NOP         ,           /* No operation, internal use only */
@@ -584,27 +577,35 @@ nextc(Parser *par, Rune *rp)
             case 'r': *rp = '\r'; break;
             case 'v': *rp = '\v'; break;
             case 'f': *rp = '\f'; break;
-            case 'd': *rp = ASC_d; break;
-            case 'D': *rp = ASC_D; break;
+            case 'd': *rp = U8_Nd; break;
+            case 'D': *rp = U8N_Nd; break;
             case 's': *rp = U8_Zs; break;
-            case 'S': *rp = NU8_Zs; break;
+            case 'S': *rp = U8N_Zs; break;
             case 'w': *rp = U8_Xw; break;
-            case 'W': *rp = NU8_Xw; break;
+            case 'W': *rp = U8N_Xw; break;
+            case 'x': if (*par->exprp != '{') break;
+                *rp = 0; sscanf(++par->exprp, "%x", rp);
+                while (*par->exprp) if (*par->exprp++ == '}') break;
+                break;
             case 'p': case 'P': { /* https://www.regular-expressions.info/unicode.html */
                 static struct { const char* c; int n, r; } cls[] = {
-                    {"{Ll}", 4, U8_Ll}, {"{Lowercase_Letter}", 18, U8_Ll},
-                    {"{Lu}", 4, U8_Lu}, {"{Uppercase_Letter}", 18, U8_Lu},
-                    {"{L}", 3, U8_La}, {"{L&}", 4, U8_La}, {"{Cased_Letter}", 14, U8_La},
-                    {"{Zs}", 4, U8_Zs}, {"{Space_Separator}", 17, U8_Zs},
-                    {"{Xnx}", 5, U8_Xnx}, {"{Hex_Digit}", 11, U8_Xnx},
-                    {"{Xan}", 5, U8_Xan}, {"{Alphanumeric}", 14, U8_Xan},
-                    {"{Xw}", 4, U8_Xw}, {"{Word}", 6, U8_Xw},
+                    {"{Alpha}", 7, U8_LC}, {"{LC}", 4, U8_LC}, 
+                    {"{Alnum}", 7, U8_Xan},
+                    {"{Digit}", 7, U8_Nd}, {"{Nd}", 4, U8_Nd},
+                    {"{Lower}", 7, U8_Ll}, {"{Ll}", 4, U8_Ll},
+                    {"{Space}", 7, U8_Zs}, {"{Zs}", 4, U8_Zs},
+                    {"{Upper}", 7, U8_Lu}, {"{Lu}", 4, U8_Lu},
+                    {"{XDigit}", 8, U8_Xnx},
+                    {"{Blank}", 7, ASC_bl},
+                    {"{Graph}", 7, ASC_gr},
+                    {"{Print}", 7, ASC_pr},
+                    {"{Punct}", 7, ASC_pt},
                 };
                 int inv = *rp == 'P';
                 for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i)
                     if (!strncmp(par->exprp, cls[i].c, cls[i].n)) {
                         if (par->rune_type == IRUNE && (cls[i].r == U8_Ll || cls[i].r == U8_Lu))
-                            *rp = U8_La + inv;
+                            *rp = U8_LC + inv;
                         else
                             *rp = cls[i].r + inv;
                         par->exprp += cls[i].n;
@@ -716,24 +717,6 @@ bldcclass(Parser *par)
                     ep[-1] = rune;
                     continue;
                 }
-            } else if (rune == '[' && *par->exprp == ':') {
-                static struct { const char* c; int n, r; } cls[] = {
-                    {"alnum:]", 7, ASC_an}, {"alpha:]", 7, ASC_al}, {"blank:]", 7, ASC_bl},
-                    {"cntrl:]", 7, ASC_ct}, {"digit:]", 7, ASC_d}, {"graph:]", 7, ASC_gr},
-                    {"lower:]", 7, ASC_lo}, {"print:]", 7, ASC_pr}, {"punct:]", 7, ASC_pt},
-                    {"space:]", 7, ASC_s}, {"upper:]", 7, ASC_up}, {"xdigit:]", 8, ASC_xd},
-                    {"word:]", 6, ASC_w},
-                };
-                int inv = par->exprp[1] == '^', off = 1 + inv;
-                for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i)
-                    if (!strncmp(par->exprp + off, cls[i].c, cls[i].n)) {
-                        if (par->rune_type == IRUNE && (cls[i].r == ASC_lo || cls[i].r == ASC_up))
-                            rune = ASC_al + inv;
-                        else
-                            rune = cls[i].r + inv;
-                        par->exprp += off + cls[i].n;
-                        break;
-                    }
             }
         }
         *ep++ = rune;
@@ -858,45 +841,31 @@ runematch(Rune s, Rune r, bool icase)
 {
     int inv = 0;
     switch (s) {
-    case ASC_D: inv = 1; /* fallthrough */
-    case ASC_d: return inv ^ (isdigit(r) != 0);
-    case ASC_S: inv = 1;
-    case ASC_s: return inv ^ (isspace(r) != 0);
-    case ASC_W: inv = 1;
-    case ASC_w: return inv ^ (isalnum(r) | (r == '_'));
-    case ASC_AL: inv = 1;
-    case ASC_al: return inv ^ (isalpha(r) != 0);
-    case ASC_LO: inv = 1;
-    case ASC_lo: return inv ^ (islower(r) != 0);
-    case ASC_UP: inv = 1;
-    case ASC_up: return inv ^ (isupper(r) != 0);
-    case ASC_BL: inv = 1;
+    case ASC_BL: inv = 1; /* fallthrough */
     case ASC_bl: return inv ^ ((r == ' ') | (r == '\t'));
     case ASC_CT: inv = 1;
     case ASC_ct: return inv ^ (iscntrl(r) != 0);
     case ASC_GR: inv = 1;
     case ASC_gr: return inv ^ (isgraph(r) != 0);
-    case ASC_AN: inv = 1;
-    case ASC_an: return inv ^ (isalnum(r) != 0);
     case ASC_PR: inv = 1;
     case ASC_pr: return inv ^ (isprint(r) != 0);
     case ASC_PT: inv = 1;
     case ASC_pt: return inv ^ (ispunct(r) != 0);
-    case ASC_XD: inv = 1;
-    case ASC_xd: return inv ^ (isxdigit(r) != 0);
-    case NU8_La: inv = 1;
-    case U8_La: return inv ^ utf8_isalpha(r);
-    case NU8_Ll: inv = 1;
+    case U8N_Nd: inv = 1;
+    case U8_Nd: return inv ^ (utf8_isdigit(r));
+    case U8N_LC: inv = 1;
+    case U8_LC: return inv ^ utf8_isalpha(r);
+    case U8N_Ll: inv = 1;
     case U8_Ll: return inv ^ utf8_islower(r);
-    case NU8_Lu: inv = 1;
+    case U8N_Lu: inv = 1;
     case U8_Lu: return inv ^ utf8_isupper(r);
-    case NU8_Zs: inv = 1;
+    case U8N_Zs: inv = 1;
     case U8_Zs: return inv ^ utf8_isspace(r);
-    case NU8_Xan: inv = 1;
+    case U8N_Xan: inv = 1;
     case U8_Xan: return inv ^ utf8_isalnum(r);
-    case NU8_Xnx: inv = 1;
+    case U8N_Xnx: inv = 1;
     case U8_Xnx: return inv ^ utf8_isxdigit(r);
-    case NU8_Xw: inv = 1;
+    case U8N_Xw: inv = 1;
     case U8_Xw: return inv ^ (utf8_isalnum(r) | (r == '_'));
     }
     return icase ? utf8_tolower(s) == utf8_tolower(r) : s == r;
diff --git a/src/cregex_utf8.c b/src/cregex_utf8.c
index dac38ce7..afdda635 100644
--- a/src/cregex_utf8.c
+++ b/src/cregex_utf8.c
@@ -1062,11 +1062,6 @@ static inline int utf8_isalpha(uint32_t codep) {
     return cfold_lookup(codep) != -1;
 }
 
-static inline int utf8_isalnum(uint32_t codep) {
-    if (codep < 128) return isalnum(codep) != 0;
-    return cfold_lookup(codep) != -1;
-}
-
 static inline int utf8_isspace(uint32_t codep) {
     static uint16_t t[] = {0x09, 0x0D, 0x20, 0x85, 0xA0, 0x1680,
                            0x2028, 0x2029, 0x202F, 0x205F, 0x3000};
@@ -1075,6 +1070,12 @@ static inline int utf8_isspace(uint32_t codep) {
     return (codep >= 0x2000) & (codep <= 0x200A);
 }
 
+static inline int utf8_isdigit(uint32_t codep) {
+    return ((codep >= '0') & (codep <= '9')) || 
+           ((codep >= 0xFF10) & (codep <= 0xFF19)) ||
+           ((codep >= 0x1D7CE) & (codep <= 0x1D7FF));
+}
+
 static inline int utf8_isxdigit(uint32_t codep) {
     static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66,
                            0xFF10, 0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46};
@@ -1083,6 +1084,13 @@ static inline int utf8_isxdigit(uint32_t codep) {
     return false;
 }
 
+static inline int utf8_isalnum(uint32_t codep) {
+    if (codep < 128) return isalnum(codep) != 0;
+    if ((codep >= 0xFF10) & (codep <= 0xFF19) ||
+       ((codep >= 0x1D7CE) & (codep <= 0x1D7FF))) return true;
+    return cfold_lookup(codep) != -1;
+}
+
 // ------------------------------------------------------------
 #if 0
 #include <stdio.h>
author	Tyge Løvset <[email protected]>	2022-02-18 16:25:25 +0100
committer	Tyge Løvset <[email protected]>	2022-02-18 16:25:25 +0100
commit	4ac5ef68726057ff97e56c951e4faeaa29995e40 (patch)
tree	cc8f82362d1fa6a94c8e279d7ce0b6f3581225c2
parent	fde8f4a37c42a3611b544185806155a02e24227e (diff)
download	STC-modified-4ac5ef68726057ff97e56c951e4faeaa29995e40.tar.gz STC-modified-4ac5ef68726057ff97e56c951e4faeaa29995e40.zip