diff options
| author | Tyge Løvset <[email protected]> | 2022-01-07 14:33:29 +0100 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-01-07 14:33:29 +0100 |
| commit | bcd76fdeb1b7b5ac01ac9a204db74b537361c8b0 (patch) | |
| tree | aa294be93eeee868b6c27189f2238378a5e0eada | |
| parent | 8d5a9ea8b743253bd33b6ecca8e7e4e650aa6f07 (diff) | |
| download | STC-modified-bcd76fdeb1b7b5ac01ac9a204db74b537361c8b0.tar.gz STC-modified-bcd76fdeb1b7b5ac01ac9a204db74b537361c8b0.zip | |
Some renaming in cregex API. Added match function. Made utf8 operations branchless.
| -rw-r--r-- | examples/regex_match.c | 14 | ||||
| -rw-r--r-- | include/stc/cregex.h | 70 |
2 files changed, 46 insertions, 38 deletions
diff --git a/examples/regex_match.c b/examples/regex_match.c index dad28f84..20d331e2 100644 --- a/examples/regex_match.c +++ b/examples/regex_match.c @@ -15,22 +15,24 @@ int main() c_auto (cregex, re)
{
- re = cregex_compile("[+-]?([0-9]*\\.)?[0-9]+([Ee][-+]?[0-9]+)?");
- cregex_match_t m;
- if (cregex_match(re, s, &m)) {
+ re = cregex_new("[+-]?([0-9]*\\.)?[0-9]+([Ee][-+]?[0-9]+)?");
+ cregex_match m;
+ if (cregex_find(re, s, &m)) {
printf("Found digits at position %u-%u\n", m.match_begin, m.match_end);
} else {
printf("Could not find any digits\n");
}
- c_auto (cregex_res, matches) {
- matches = cregex_match_all(re, s);
+ c_auto (cregex_result, matches) {
+ matches = cregex_find_all(re, s);
csview sv = csview_from(s);
- c_foreach (i, cregex_res, matches) {
+ c_foreach (i, cregex_result, matches) {
csview r = csview_slice(sv, i.ref->match_begin, i.ref->match_end);
printf(c_svfmt " / ", c_svarg(r));
}
}
+
+
puts("");
}
}
\ No newline at end of file diff --git a/include/stc/cregex.h b/include/stc/cregex.h index cf9a9ba0..036508db 100644 --- a/include/stc/cregex.h +++ b/include/stc/cregex.h @@ -38,10 +38,10 @@ typedef struct { typedef struct { size_t match_begin; size_t match_end; -} cregex_match_t; +} cregex_match; -#define i_type cregex_res -#define i_val cregex_match_t +#define i_type cregex_result +#define i_val cregex_match #include "cstack.h" typedef enum { @@ -64,24 +64,27 @@ STC_INLINE cregex cregex_init(void) { cregex re = {NULL}; return re; } /* compile regular expression */ -STC_API cregex cregex_compile(const char *re); +STC_API cregex cregex_new(const char *re); /* get error type if a function failed */ STC_API cregex_error_t cregex_error(void); +/* check if input s matches re */ +STC_DEF bool cregex_is_match(cregex re, const char *s); + /* find the first matching substring in s */ -STC_API bool cregex_match(cregex re, const char *s, cregex_match_t *m); +STC_API bool cregex_find(cregex re, const char *s, cregex_match *m); /* get all non-overlapping matches in string s. returns NULL * if no matches are found. returned value must be freed */ -STC_API cregex_res cregex_match_all(cregex re, const char *s); +STC_API cregex_result cregex_find_all(cregex re, const char *s); /* get amount of capture groups inside of * a regular expression */ STC_API size_t cregex_captures_len(cregex re); /* get captured slice from capture group number index */ -STC_API const cregex_match_t *cregex_capture(cregex re, size_t index); +STC_API cregex_match cregex_capture(cregex re, size_t index); /* free regular expression */ STC_API void cregex_drop(cregex *re); @@ -114,9 +117,11 @@ static const uint8_t _rx_utf8d[] = { static inline uint32_t _rx_utf8_decode(uint32_t *state, uint32_t *codep, const uint32_t byte) { - uint32_t type = _rx_utf8d[byte]; - *codep = (*state != _rx_UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) - : (0xff >> type) & (byte); + const uint32_t type = _rx_utf8d[byte]; + const uint32_t x = (uint32_t) -(*state != _rx_UTF8_ACCEPT); + + *codep = (x & ((byte & 0x3fu) | (*codep << 6))) + | (~x & ((0xff >> type) & (byte))); *state = _rx_utf8d[256 + (*state << 4) + type]; return *state; @@ -125,9 +130,10 @@ static inline uint32_t _rx_utf8_decode(uint32_t *state, uint32_t *codep, static bool _rx_utf8_count_codepoints(size_t *count, const uint8_t *s) { uint32_t state = _rx_UTF8_ACCEPT, codepoint; + for (*count = 0; *s; ++s) *count += !_rx_utf8_decode(&state, &codepoint, *s); - return state == _rx_UTF8_ACCEPT; // NB! valid == true + return state == _rx_UTF8_ACCEPT; } STC_DEF bool cregex_valid_utf8(const char *s) @@ -156,10 +162,11 @@ static inline uint32_t _rx_utf8_char_width(uint8_t c) static inline const char *_rx_utf8_next(const char *s) { - if (*s == 0) - return NULL; - - return s + _rx_utf8_char_width((uint8_t)s[0]); + const char* t = s + _rx_utf8_char_width((uint8_t)s[0]); + + uintptr_t p = (uintptr_t)t; + p &= (uintptr_t) -(*s != 0); + return (const char *)p; } /* function pointer type used to evaluate if a regex node @@ -198,7 +205,7 @@ typedef struct { typedef struct { _rx_GenericNode generic; union cregex_node *subexp; - cregex_match_t cap; + cregex_match cap; } _rx_CapNode; typedef struct { @@ -897,7 +904,7 @@ static cregex_node *_rx_compile(const char *re, const char *end, cregex_node *no return cur; } -STC_DEF cregex cregex_compile(const char *re) +STC_DEF cregex cregex_new(const char *re) { cregex ret = {NULL}; @@ -934,15 +941,14 @@ STC_DEF cregex_error_t cregex_error(void) return _rx_CompileException.err; } -STC_DEF bool cregex_match(cregex re, const char *s, cregex_match_t *m) +STC_DEF bool cregex_is_match(cregex re, const char *s) { - clear_compile_exception(); - - if (re.nodes == NULL || s == NULL || m == NULL) { - _rx_CompileException.err = cregex_INVALID_PARAMS; - return false; - } + const char *next = NULL; + return _rx_is_match(re.nodes, s, s, &next); +} +STC_DEF bool cregex_find(cregex re, const char *s, cregex_match *m) +{ m->match_begin = SIZE_MAX; m->match_end = SIZE_MAX; @@ -963,22 +969,22 @@ STC_DEF void cregex_drop(cregex *re) free(re->nodes); } -STC_DEF cregex_res cregex_match_all(cregex re, const char *s) +STC_DEF cregex_result cregex_find_all(cregex re, const char *s) { - cregex_res matches = cregex_res_init(); + cregex_result matches = cregex_result_init(); size_t offset = 0; const char *s_end = s + strlen(s); while (s < s_end) { - cregex_match_t tmp; - if (cregex_match(re, s, &tmp)) { + cregex_match tmp; + if (cregex_find(re, s, &tmp)) { size_t end = tmp.match_end; s += end; tmp.match_begin += offset; tmp.match_end += offset; offset += end; - cregex_res_push(&matches, tmp); + cregex_result_push(&matches, tmp); } else { break; } @@ -1040,15 +1046,15 @@ static cregex_node *_rx_find_capture_node(cregex_node *node, size_t index) } } -STC_DEF const cregex_match_t *cregex_capture(cregex re, size_t index) +STC_DEF cregex_match cregex_capture(cregex re, size_t index) { _rx_CapNode *cap = (_rx_CapNode *)_rx_find_capture_node(re.nodes, index); if (cap == NULL) { - return NULL; + return c_make(cregex_match){0, 0}; } - return &cap->cap; + return cap->cap; } #endif |
