summary refs log tree commit diff homepage
diff options
context:
space:
mode:
author Tyge Løvset <[email protected]> 2022-02-18 16:25:25 +0100
committer Tyge Løvset <[email protected]> 2022-02-18 16:25:25 +0100
commit 4ac5ef68726057ff97e56c951e4faeaa29995e40 (patch)
tree cc8f82362d1fa6a94c8e279d7ce0b6f3581225c2
parent fde8f4a37c42a3611b544185806155a02e24227e (diff)
download STC-modified-4ac5ef68726057ff97e56c951e4faeaa29995e40.tar.gz
STC-modified-4ac5ef68726057ff97e56c951e4faeaa29995e40.zip
Some improvements and cleanup: CRegex, CMap.
-rw-r--r-- benchmarks/picobench/picobench_cmap.cpp 3
-rw-r--r-- benchmarks/shootout_hashmaps.cpp 10
-rw-r--r-- examples/new_map.c 2
-rw-r--r-- examples/new_smap.c 2
-rw-r--r-- include/stc/ccommon.h 10
-rw-r--r-- include/stc/cmap.h 12
-rw-r--r-- include/stc/forward.h 10
-rw-r--r-- src/cregex.c 109
-rw-r--r-- src/cregex_utf8.c 18
9 files changed, 77 insertions, 99 deletions
diff --git a/benchmarks/picobench/picobench_cmap.cpp b/benchmarks/picobench/picobench_cmap.cpp
index f54b3d32..15ab2aa7 100644
--- a/benchmarks/picobench/picobench_cmap.cpp
+++ b/benchmarks/picobench/picobench_cmap.cpp
@@ -208,7 +208,8 @@ static void ins_and_access_cmap_s(picobench::state& s)
randomize(str.str, cstr_size(str));
//if (cstr_size(str) > 30) { printf("%s\n", str.str); exit(0); }
cmap_str_emplace(&map, str.str, str.str);
- //randomize(str.str, cstr_size(str));
+
+ randomize(str.str, cstr_size(str));
result += cmap_str_erase(&map, str.str);
}
s.set_result(result + cmap_str_size(map));
diff --git a/benchmarks/shootout_hashmaps.cpp b/benchmarks/shootout_hashmaps.cpp
index 5307da0c..6c8c1f00 100644
--- a/benchmarks/shootout_hashmaps.cpp
+++ b/benchmarks/shootout_hashmaps.cpp
@@ -209,7 +209,7 @@ size_t seed;
M##_PUT(X, RAND(keybits), i); \
size_t x = 300000000/M##_SIZE(X); \
clock_t difference, before = clock(); \
- for (int k=0; k < x; k++) M##_FOR (X, it) \
+ for (size_t k=0; k < x; k++) M##_FOR (X, it) \
sum += M##_ITEM(X, it); \
difference = clock() - before; \
printf(#M ": time: %5.02f, size: %zu, buckets: %8zu, repeats: %zu, sum: %zu\n", \
@@ -256,10 +256,10 @@ enum {
int main(int argc, char* argv[])
{
- int n_mill = argc >= 2 ? atoi(argv[1]) : DEFAULT_N_MILL;
- int keybits = argc >= 3 ? atoi(argv[2]) : DEFAULT_KEYBITS;
- int n = n_mill * 1000000;
- int N0 = n, N1 = n/2, N2 = n/2, N3 = n, N4 = n, N5 = n;
+ unsigned n_mill = argc >= 2 ? atoi(argv[1]) : DEFAULT_N_MILL;
+ unsigned keybits = argc >= 3 ? atoi(argv[2]) : DEFAULT_KEYBITS;
+ unsigned n = n_mill * 1000000;
+ unsigned N0 = n, N1 = n/2, N2 = n/2, N3 = n, N4 = n, N5 = n;
seed = time(NULL); // 1636306010;
printf("\nUnordered hash map shootout\n");
diff --git a/examples/new_map.c b/examples/new_map.c
index a7008e4f..97fce008 100644
--- a/examples/new_map.c
+++ b/examples/new_map.c
@@ -1,7 +1,7 @@
#include <stc/cstr.h>
#include <stc/forward.h>
-forward_cmap(cmap_pnt, struct Point, int, uint32_t);
+forward_cmap(cmap_pnt, struct Point, int);
struct MyStruct {
cmap_pnt pntmap;
diff --git a/examples/new_smap.c b/examples/new_smap.c
index 431f18c4..382d27ae 100644
--- a/examples/new_smap.c
+++ b/examples/new_smap.c
@@ -1,7 +1,7 @@
#include <stc/cstr.h>
#include <stc/forward.h>
-forward_csmap(PMap, struct Point, int, uint32_t);
+forward_csmap(PMap, struct Point, int);
// Use forward declared PMap in struct
struct MyStruct {
diff --git a/include/stc/ccommon.h b/include/stc/ccommon.h
index 292ac9cc..e5a595e0 100644
--- a/include/stc/ccommon.h
+++ b/include/stc/ccommon.h
@@ -122,13 +122,11 @@ STC_INLINE uint64_t c_default_hash(const void* key, size_t len) {
while (--len) h = (h << 10) - h + *x++;
return _c_ROTL(h, 26) ^ h;
}
-STC_INLINE uint64_t c_hash32(const void* key, size_t len) {
- uint32_t x; memcpy(&x, key, 4);
- return x*0xc6a4a7935bd1e99d >> 15;
+STC_INLINE uint64_t c_hash32(const void* key, size_t n) {
+ return *(uint32_t *)key*0xc6a4a7935bd1e99d;
}
-STC_INLINE uint64_t c_hash64(const void* key, size_t len) {
- uint64_t x; memcpy(&x, key, 8);
- return x*0xc6a4a7935bd1e99d;
+STC_INLINE uint64_t c_hash64(const void* key, size_t n) {
+ return *(uint64_t *)key*0xc6a4a7935bd1e99d;
}
STC_INLINE char* c_strnstrn(const char *s, const char *needle, size_t slen, const size_t nlen) {
diff --git a/include/stc/cmap.h b/include/stc/cmap.h
index af247ffc..a5ca353e 100644
--- a/include/stc/cmap.h
+++ b/include/stc/cmap.h
@@ -233,10 +233,10 @@ _cx_memb(_erase_at)(_cx_self* self, _cx_iter it) {
#if defined(_i_implement)
#ifndef CMAP_H_INCLUDED
-//STC_INLINE size_t fastrange_uint64_t(uint64_t x, uint64_t n)
-// { uint64_t lo, hi; c_umul128(x, n, &lo, &hi); return hi; }
-#define fastrange_uint32_t(x, n) (uint32_t)((uint32_t)(x)*(uint64_t)(n) >> 32)
-#define chash_index_(h, entryPtr) ((entryPtr) - (h).table)
+STC_INLINE size_t fastrange_size_t(uint64_t x, uint64_t n)
+ { uint64_t lo, hi; c_umul128(x, n, &lo, &hi); return (size_t)hi; }
+STC_INLINE size_t fastrange_uint32_t(uint64_t x, uint64_t n)
+ { return (size_t)((uint32_t)x*n >> 32); }
#endif // CMAP_H_INCLUDED
STC_DEF _cx_self
@@ -362,11 +362,11 @@ _cx_memb(_reserve)(_cx_self* self, const size_t _newcap) {
STC_DEF void
_cx_memb(_erase_entry)(_cx_self* self, _cx_value* _val) {
- _cx_size i = chash_index_(*self, _val), j = i, k;
+ _cx_size i = _val - self->table, j = i, k;
const _cx_size _cap = self->bucket_count;
_cx_value* _slot = self->table;
uint8_t* _hashx = self->_hashx;
- _cx_memb(_value_drop)(&_slot[i]);
+ _cx_memb(_value_drop)(_val);
for (;;) { /* delete without leaving tombstone */
if (++j == _cap)
j = 0;
diff --git a/include/stc/forward.h b/include/stc/forward.h
index ca1fd540..e57aedfd 100644
--- a/include/stc/forward.h
+++ b/include/stc/forward.h
@@ -29,10 +29,12 @@
#define forward_carr3(CX, VAL) _c_carr3_types(CX, VAL)
#define forward_cdeq(CX, VAL) _c_cdeq_types(CX, VAL)
#define forward_clist(CX, VAL) _c_clist_types(CX, VAL)
-#define forward_cmap(CX, KEY, VAL, SZ) _c_chash_types(CX, KEY, VAL, SZ, c_true, c_false)
-#define forward_csmap(CX, KEY, VAL, SZ) _c_aatree_types(CX, KEY, VAL, SZ, c_true, c_false)
-#define forward_cset(CX, KEY, SZ) _c_chash_types(CX, cset, KEY, KEY, SZ, c_false, c_true)
-#define forward_csset(CX, KEY, SZ) _c_aatree_types(CX, KEY, KEY, SZ, c_false, c_true)
+#define forward_cmap(CX, KEY, VAL) _c_chash_types(CX, KEY, VAL, uint32_t, c_true, c_false)
+#define forward_cmap_big(CX, KEY, VAL) _c_chash_types(CX, KEY, VAL, size_t, c_true, c_false)
+#define forward_cset(CX, KEY) _c_chash_types(CX, cset, KEY, KEY, uint32_t, c_false, c_true)
+#define forward_cset_big(CX, KEY) _c_chash_types(CX, cset, KEY, KEY, size_t, c_false, c_true)
+#define forward_csmap(CX, KEY, VAL) _c_aatree_types(CX, KEY, VAL, uint32_t, c_true, c_false)
+#define forward_csset(CX, KEY) _c_aatree_types(CX, KEY, KEY, uint32_t, c_false, c_true)
#define forward_cbox(CX, VAL) _c_cbox_types(CX, VAL)
#define forward_carc(CX, VAL) _c_carc_types(CX, VAL)
#define forward_cpque(CX, VAL) _c_cpque_types(CX, VAL)
diff --git a/src/cregex.c b/src/cregex.c
index 804e8047..be185a40 100644
--- a/src/cregex.c
+++ b/src/cregex.c
@@ -120,26 +120,19 @@ enum {
QUEST , /* a? == a|nothing, i.e. 0 or 1 a's */
RUNE = 0x8100000,
IRUNE,
- ASC_d , ASC_D, /* dec digit, non-digit */
- ASC_s , ASC_S, /* utf8 space, non-space */
- ASC_w , ASC_W, /* utf8 word, non-word */
- ASC_an , ASC_AN, /* alnum */
- ASC_al , ASC_AL, /* alpha */
ASC_bl , ASC_BL, /* blank */
ASC_ct , ASC_CT, /* ctrl */
ASC_gr , ASC_GR, /* graphic */
- ASC_lo , ASC_LO, /* lower */
- ASC_up , ASC_UP, /* upper */
ASC_pr , ASC_PR, /* print */
ASC_pt , ASC_PT, /* punct */
- ASC_xd , ASC_XD, /* xdigit */
- U8_La , NU8_La, /* utf8 alpha */
- U8_Ll , NU8_Ll, /* utf8 lower */
- U8_Lu , NU8_Lu, /* utf8 upper */
- U8_Zs , NU8_Zs, /* utf8 white space */
- U8_Xnx , NU8_Xnx, /* utf8 hex digit */
- U8_Xan , NU8_Xan, /* utf8 alphanumeric */
- U8_Xw , NU8_Xw, /* utf8 word */
+ U8_Nd , U8N_Nd, /* dec digit, non-digit */
+ U8_LC , U8N_LC, /* utf8 letter cased */
+ U8_Ll , U8N_Ll, /* utf8 letter lower */
+ U8_Lu , U8N_Lu, /* utf8 letter upper */
+ U8_Zs , U8N_Zs, /* utf8 white space */
+ U8_Xnx , U8N_Xnx, /* utf8 hex digit */
+ U8_Xan , U8N_Xan, /* utf8 alphanumeric */
+ U8_Xw , U8N_Xw, /* utf8 word */
ANY = 0x8200000, /* Any character except newline, . */
ANYNL , /* Any character including newline, . */
NOP , /* No operation, internal use only */
@@ -584,27 +577,35 @@ nextc(Parser *par, Rune *rp)
case 'r': *rp = '\r'; break;
case 'v': *rp = '\v'; break;
case 'f': *rp = '\f'; break;
- case 'd': *rp = ASC_d; break;
- case 'D': *rp = ASC_D; break;
+ case 'd': *rp = U8_Nd; break;
+ case 'D': *rp = U8N_Nd; break;
case 's': *rp = U8_Zs; break;
- case 'S': *rp = NU8_Zs; break;
+ case 'S': *rp = U8N_Zs; break;
case 'w': *rp = U8_Xw; break;
- case 'W': *rp = NU8_Xw; break;
+ case 'W': *rp = U8N_Xw; break;
+ case 'x': if (*par->exprp != '{') break;
+ *rp = 0; sscanf(++par->exprp, "%x", rp);
+ while (*par->exprp) if (*par->exprp++ == '}') break;
+ break;
case 'p': case 'P': { /* https://www.regular-expressions.info/unicode.html */
static struct { const char* c; int n, r; } cls[] = {
- {"{Ll}", 4, U8_Ll}, {"{Lowercase_Letter}", 18, U8_Ll},
- {"{Lu}", 4, U8_Lu}, {"{Uppercase_Letter}", 18, U8_Lu},
- {"{L}", 3, U8_La}, {"{L&}", 4, U8_La}, {"{Cased_Letter}", 14, U8_La},
- {"{Zs}", 4, U8_Zs}, {"{Space_Separator}", 17, U8_Zs},
- {"{Xnx}", 5, U8_Xnx}, {"{Hex_Digit}", 11, U8_Xnx},
- {"{Xan}", 5, U8_Xan}, {"{Alphanumeric}", 14, U8_Xan},
- {"{Xw}", 4, U8_Xw}, {"{Word}", 6, U8_Xw},
+ {"{Alpha}", 7, U8_LC}, {"{LC}", 4, U8_LC},
+ {"{Alnum}", 7, U8_Xan},
+ {"{Digit}", 7, U8_Nd}, {"{Nd}", 4, U8_Nd},
+ {"{Lower}", 7, U8_Ll}, {"{Ll}", 4, U8_Ll},
+ {"{Space}", 7, U8_Zs}, {"{Zs}", 4, U8_Zs},
+ {"{Upper}", 7, U8_Lu}, {"{Lu}", 4, U8_Lu},
+ {"{XDigit}", 8, U8_Xnx},
+ {"{Blank}", 7, ASC_bl},
+ {"{Graph}", 7, ASC_gr},
+ {"{Print}", 7, ASC_pr},
+ {"{Punct}", 7, ASC_pt},
};
int inv = *rp == 'P';
for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i)
if (!strncmp(par->exprp, cls[i].c, cls[i].n)) {
if (par->rune_type == IRUNE && (cls[i].r == U8_Ll || cls[i].r == U8_Lu))
- *rp = U8_La + inv;
+ *rp = U8_LC + inv;
else
*rp = cls[i].r + inv;
par->exprp += cls[i].n;
@@ -716,24 +717,6 @@ bldcclass(Parser *par)
ep[-1] = rune;
continue;
}
- } else if (rune == '[' && *par->exprp == ':') {
- static struct { const char* c; int n, r; } cls[] = {
- {"alnum:]", 7, ASC_an}, {"alpha:]", 7, ASC_al}, {"blank:]", 7, ASC_bl},
- {"cntrl:]", 7, ASC_ct}, {"digit:]", 7, ASC_d}, {"graph:]", 7, ASC_gr},
- {"lower:]", 7, ASC_lo}, {"print:]", 7, ASC_pr}, {"punct:]", 7, ASC_pt},
- {"space:]", 7, ASC_s}, {"upper:]", 7, ASC_up}, {"xdigit:]", 8, ASC_xd},
- {"word:]", 6, ASC_w},
- };
- int inv = par->exprp[1] == '^', off = 1 + inv;
- for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i)
- if (!strncmp(par->exprp + off, cls[i].c, cls[i].n)) {
- if (par->rune_type == IRUNE && (cls[i].r == ASC_lo || cls[i].r == ASC_up))
- rune = ASC_al + inv;
- else
- rune = cls[i].r + inv;
- par->exprp += off + cls[i].n;
- break;
- }
}
}
*ep++ = rune;
@@ -858,45 +841,31 @@ runematch(Rune s, Rune r, bool icase)
{
int inv = 0;
switch (s) {
- case ASC_D: inv = 1; /* fallthrough */
- case ASC_d: return inv ^ (isdigit(r) != 0);
- case ASC_S: inv = 1;
- case ASC_s: return inv ^ (isspace(r) != 0);
- case ASC_W: inv = 1;
- case ASC_w: return inv ^ (isalnum(r) | (r == '_'));
- case ASC_AL: inv = 1;
- case ASC_al: return inv ^ (isalpha(r) != 0);
- case ASC_LO: inv = 1;
- case ASC_lo: return inv ^ (islower(r) != 0);
- case ASC_UP: inv = 1;
- case ASC_up: return inv ^ (isupper(r) != 0);
- case ASC_BL: inv = 1;
+ case ASC_BL: inv = 1; /* fallthrough */
case ASC_bl: return inv ^ ((r == ' ') | (r == '\t'));
case ASC_CT: inv = 1;
case ASC_ct: return inv ^ (iscntrl(r) != 0);
case ASC_GR: inv = 1;
case ASC_gr: return inv ^ (isgraph(r) != 0);
- case ASC_AN: inv = 1;
- case ASC_an: return inv ^ (isalnum(r) != 0);
case ASC_PR: inv = 1;
case ASC_pr: return inv ^ (isprint(r) != 0);
case ASC_PT: inv = 1;
case ASC_pt: return inv ^ (ispunct(r) != 0);
- case ASC_XD: inv = 1;
- case ASC_xd: return inv ^ (isxdigit(r) != 0);
- case NU8_La: inv = 1;
- case U8_La: return inv ^ utf8_isalpha(r);
- case NU8_Ll: inv = 1;
+ case U8N_Nd: inv = 1;
+ case U8_Nd: return inv ^ (utf8_isdigit(r));
+ case U8N_LC: inv = 1;
+ case U8_LC: return inv ^ utf8_isalpha(r);
+ case U8N_Ll: inv = 1;
case U8_Ll: return inv ^ utf8_islower(r);
- case NU8_Lu: inv = 1;
+ case U8N_Lu: inv = 1;
case U8_Lu: return inv ^ utf8_isupper(r);
- case NU8_Zs: inv = 1;
+ case U8N_Zs: inv = 1;
case U8_Zs: return inv ^ utf8_isspace(r);
- case NU8_Xan: inv = 1;
+ case U8N_Xan: inv = 1;
case U8_Xan: return inv ^ utf8_isalnum(r);
- case NU8_Xnx: inv = 1;
+ case U8N_Xnx: inv = 1;
case U8_Xnx: return inv ^ utf8_isxdigit(r);
- case NU8_Xw: inv = 1;
+ case U8N_Xw: inv = 1;
case U8_Xw: return inv ^ (utf8_isalnum(r) | (r == '_'));
}
return icase ? utf8_tolower(s) == utf8_tolower(r) : s == r;
diff --git a/src/cregex_utf8.c b/src/cregex_utf8.c
index dac38ce7..afdda635 100644
--- a/src/cregex_utf8.c
+++ b/src/cregex_utf8.c
@@ -1062,11 +1062,6 @@ static inline int utf8_isalpha(uint32_t codep) {
return cfold_lookup(codep) != -1;
}
-static inline int utf8_isalnum(uint32_t codep) {
- if (codep < 128) return isalnum(codep) != 0;
- return cfold_lookup(codep) != -1;
-}
-
static inline int utf8_isspace(uint32_t codep) {
static uint16_t t[] = {0x09, 0x0D, 0x20, 0x85, 0xA0, 0x1680,
0x2028, 0x2029, 0x202F, 0x205F, 0x3000};
@@ -1075,6 +1070,12 @@ static inline int utf8_isspace(uint32_t codep) {
return (codep >= 0x2000) & (codep <= 0x200A);
}
+static inline int utf8_isdigit(uint32_t codep) {
+ return ((codep >= '0') & (codep <= '9')) ||
+ ((codep >= 0xFF10) & (codep <= 0xFF19)) ||
+ ((codep >= 0x1D7CE) & (codep <= 0x1D7FF));
+}
+
static inline int utf8_isxdigit(uint32_t codep) {
static uint16_t t[] = {0x30, 0x39, 0x41, 0x46, 0x61, 0x66,
0xFF10, 0xFF19, 0xFF21, 0xFF26, 0xFF41, 0xFF46};
@@ -1083,6 +1084,13 @@ static inline int utf8_isxdigit(uint32_t codep) {
return false;
}
+static inline int utf8_isalnum(uint32_t codep) {
+ if (codep < 128) return isalnum(codep) != 0;
+ if ((codep >= 0xFF10) & (codep <= 0xFF19) ||
+ ((codep >= 0x1D7CE) & (codep <= 0x1D7FF))) return true;
+ return cfold_lookup(codep) != -1;
+}
+
// ------------------------------------------------------------
#if 0
#include <stdio.h>