diff options
| -rw-r--r-- | examples/regex1.c | 26 | ||||
| -rw-r--r-- | examples/regex2.c | 31 | ||||
| -rw-r--r-- | examples/regex_match.c | 35 | ||||
| -rw-r--r-- | examples/stack.c | 1 | ||||
| -rw-r--r-- | include/stc/cregex.h | 993 | ||||
| -rw-r--r-- | tests/cregex_test.c | 370 |
6 files changed, 1455 insertions, 1 deletions
diff --git a/examples/regex1.c b/examples/regex1.c new file mode 100644 index 00000000..7385d69e --- /dev/null +++ b/examples/regex1.c @@ -0,0 +1,26 @@ +#include <stc/cstr.h> +#include <stc/cregex.h> + +int main() +{ + c_auto (cstr, input) + c_auto (cregex, float_expr) + { + float_expr = cregex_new("[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)"); + // Until "q" is given, ask for another number + while (true) + { + printf("Enter float number (q for quit): "); + cstr_getline(&input, stdin); + + // Exit when the user inputs q + if (cstr_equals(input, "q")) + break; + + if (cregex_matches(&float_expr, input.str)) + printf("Input is a float\n"); + else + printf("Invalid input : Not a float\n"); + } + } +} diff --git a/examples/regex2.c b/examples/regex2.c new file mode 100644 index 00000000..5509baa5 --- /dev/null +++ b/examples/regex2.c @@ -0,0 +1,31 @@ +#include <stc/cregex.h> +#include <stc/csview.h> +#include <stc/cstr.h> + +int main() +{ + const char* inputs[] = {"birthday: 1964-12-05", "https://en.cppreference.com/w/cpp/regex/regex_search", "!123abcabc!"}; + const char* patterns[] = {"([0-9]{4})-(1[0-2]|0[1-9])-(3[01]|[12][0-9]|0[1-9])", + "(https?:\\/\\/|ftp:\\/\\/|www\\.)([0-9A-Za-z-@:%_+~#=]+\\.)+([a-z]{2,3})(\\/[\\/0-9A-Za-z-\\.@:%_+~#=\\?&]*)?", + "!((abc|123)+)!", + }; + c_forrange (i, c_arraylen(inputs)) + { + c_auto (cregex, re) + { + re = cregex_new(patterns[i]); + csview m; + if (cregex_find_sv(&re, inputs[i], &m)) + { + printf("input: %s\n", inputs[i]); + c_forrange (j, cregex_capture_size(re)) + { + csview cap; + bool res = cregex_capture_sv(&re, j, &cap); + if (res) printf(" submatch %zu: " c_PRIsv "\n", j, c_ARGsv(cap)); + } + puts(""); + } + } + } +} diff --git a/examples/regex_match.c b/examples/regex_match.c new file mode 100644 index 00000000..17e3355f --- /dev/null +++ b/examples/regex_match.c @@ -0,0 +1,35 @@ +#include <stdio.h> +#include <stc/csview.h> +#include <stc/cregex.h> +#include <stc/crandom.h> +#define i_val double +#define i_type Vecu64 +#include <stc/cstack.h> +#include <time.h> + + +int main() +{ + // Lets find the first sequence of digits in a string + const char *s = "Hello numeric world, there are 24 hours in a day, 3600 seconds in an hour." + " Around 365.25 days a year, and 52 weeks in a year." + " Boltzmann const: 1.38064852E-23, is very small." + " Bohrradius is 5.29177210903e-11, and Avogadros number is 6.02214076e23."; + + c_auto (cregex, re) + { + re = cregex_new("[+-]?([0-9]*\\.)?[0-9]+([Ee][-+]?[0-9]+)?"); + cregex_match match; + if (cregex_find(&re, s, &match)) { + printf("Found digits at position %zu-%zu\n", match.start, match.end); + } else { + printf("Could not find any digits\n"); + } + + csview sv = {0}; + while (cregex_find_next_sv(&re, s, &sv)) { + printf(c_PRIsv " ; ", c_ARGsv(sv)); + } + puts(""); + } +} diff --git a/examples/stack.c b/examples/stack.c index f392ecd1..29b39aef 100644 --- a/examples/stack.c +++ b/examples/stack.c @@ -1,6 +1,5 @@ #include <stdio.h>
-#include <stc/cstr.h>
#define i_tag i
#define i_val int
diff --git a/include/stc/cregex.h b/include/stc/cregex.h new file mode 100644 index 00000000..11b286d2 --- /dev/null +++ b/include/stc/cregex.h @@ -0,0 +1,993 @@ +/* + * Copyright (c) 2020 Fabian van Rissenbeck + * https://github.com/deinernstjetzt/mregexp + * 2009 Bjoern Hoehrmann <[email protected]> + * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + * 2022 Tyge Løvset + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +*/ +#ifndef CREGEX_INCLUDED +#define CREGEX_INCLUDED + +#include "csview.h" +#include <stdlib.h> +#include <setjmp.h> +#include <stdarg.h> + +typedef struct { + size_t start; + size_t end; +} cregex_match; + +typedef struct { + union cregex_node *nodes; + const char* input; + cregex_match match; +} cregex; + +typedef enum { + cregex_OK = 0, + cregex_FAILED_ALLOC, + cregex_INVALID_UTF8, + cregex_INVALID_PARAMS, + cregex_EARLY_QUANTIFIER, + cregex_INVALID_COMPLEX_QUANT, + cregex_UNEXPECTED_EOL, + cregex_INVALID_COMPLEX_CLASS, + cregex_UNCLOSED_SUBEXPRESSION, +} cregex_error_t; + +/* create an empty expression */ +STC_INLINE cregex cregex_init(void) + { cregex rx = {NULL}; return rx; } + +/* compile regular expression */ +STC_API cregex cregex_new(const char *re); + +/* get error type if a function failed */ +STC_API cregex_error_t cregex_error(void); + +/* check if input s matches re */ +STC_API bool cregex_matches(cregex *rx, const char *s); + +/* find the next matching substring in s */ +STC_API bool cregex_find_next(cregex *rx, const char *s, cregex_match *m); + +/* find the next matching substring in s, return as csview */ +STC_API bool cregex_find_next_sv(cregex *rx, const char *s, csview *sv); + +/* find the first matching substring in s */ +STC_INLINE bool cregex_find(cregex *rx, const char *s, cregex_match *m) { + m->start = m->end = 0; + return cregex_find_next(rx, s, m); +} + +STC_INLINE bool cregex_find_sv(cregex *rx, const char *s, csview *sv) { + sv->str = s, sv->size = 0; + return cregex_find_next_sv(rx, s, sv); +} + +/* get captured slice from capture group number index */ +STC_API bool cregex_capture(cregex *rx, size_t index, cregex_match *m); + +/* get captured slice from capture group number index as a csview */ +STC_INLINE bool cregex_capture_sv(cregex *rx, size_t index, csview *sv) { + cregex_match m; + bool ret = cregex_capture(rx, index, &m); + *sv = c_make(csview){rx->input + m.start, m.end - m.start}; + return ret; +} + +/* get amount of capture groups inside of + * the regular expression */ +STC_API size_t cregex_capture_size(cregex rx); + +/* free regular expression */ +STC_API void cregex_drop(cregex *rx); + +/* -------------------------- IMPLEMENTATION ------------------------- */ +#if defined(_i_implement) + + +/* function pointer type used to evaluate if a regex node + * matched a given string */ +typedef bool (*_rx_MatchFunc)(union cregex_node *node, const char *orig, + const char *cur, const char **next); + +typedef struct _rx_GenericNode { + union cregex_node *prev; + union cregex_node *next; + _rx_MatchFunc match; +} _rx_GenericNode; + +typedef struct { + _rx_GenericNode generic; + uint32_t chr; +} _rx_CharNode; + +typedef struct { + _rx_GenericNode generic; + union cregex_node *subexp; + size_t min, max; +} _rx_QuantNode; + +typedef struct { + _rx_GenericNode generic; + uint32_t first, last; +} _rx_RangeNode; + +typedef struct { + _rx_GenericNode generic; + _rx_RangeNode *ranges; + bool negate; +} _rx_ClassNode; + +typedef struct { + _rx_GenericNode generic; + union cregex_node *subexp; + cregex_match cap; +} _rx_CapNode; + +typedef struct { + _rx_GenericNode generic; + union cregex_node *left; + union cregex_node *right; +} _rx_OrNode; + +typedef union cregex_node { + _rx_GenericNode generic; + _rx_CharNode chr; + _rx_QuantNode quant; + _rx_ClassNode cls; + _rx_RangeNode range; + _rx_CapNode cap; + _rx_OrNode ior; +} cregex_node; + +static bool _rx_is_match(cregex_node *node, const char *orig, + const char *cur, const char **next) +{ + if (node == NULL) { + *next = cur; + return true; + } else { + return ((node->generic.match)(node, orig, cur, next)) && + _rx_is_match(node->generic.next, orig, *next, next); + } +} + +static bool _rx_char_is_match(cregex_node *node, const char *orig, + const char *cur, const char **next) +{ + if (*cur == 0) { + return false; + } + + *next = utf8_next(cur); + return node->chr.chr == utf8_peek(cur); +} + +static bool _rx_start_is_match(cregex_node *node, const char *orig, + const char *cur, const char **next) +{ + *next = cur; + return true; +} + +static bool _rx_anchor_begin_is_match(cregex_node *node, const char *orig, + const char *cur, const char **next) +{ + *next = cur; + return strlen(orig) == strlen(cur); +} + +static bool _rx_anchor_end_is_match(cregex_node *node, const char *orig, + const char *cur, const char **next) +{ + *next = cur; + return strlen(cur) == 0; +} + +static bool _rx_any_is_match(cregex_node *node, const char *orig, + const char *cur, const char **next) +{ + if (*cur) { + *next = utf8_next(cur); + return true; + } + + return false; +} + + + +static bool _rx_quant_is_match(cregex_node *node, const char *orig, + const char *cur, const char **next) +{ + _rx_QuantNode *quant = (_rx_QuantNode *)node; + size_t matches = 0; + + while (_rx_is_match(quant->subexp, orig, cur, next)) { + matches++; + cur = *next; + + if (matches >= quant->max) + break; + } + + *next = cur; + return matches >= quant->min; +} + +static bool _rx_class_is_match(cregex_node *node, const char *orig, + const char *cur, const char **next) +{ + _rx_ClassNode *cls = (_rx_ClassNode *)node; + + if (*cur == 0) + return false; + + const uint32_t chr = utf8_peek(cur); + *next = utf8_next(cur); + + bool found = false; + for (_rx_RangeNode *range = cls->ranges; range != NULL; + range = (_rx_RangeNode *)range->generic.next) { + if (chr >= range->first && chr <= range->last) { + found = true; + break; + } + } + + if (cls->negate) + found = !found; + + return found; +} +#include <stdio.h> +static bool _rx_cap_is_match(cregex_node *node, const char *orig, + const char *cur, const char **next) +{ + _rx_CapNode *cap = (_rx_CapNode *)node; + + if (_rx_is_match(cap->subexp, orig, cur, next)) { + cap->cap.start = cur - orig; + cap->cap.end = (*next) - orig; + //printf("Cap: %.*s\n", cap->cap.end - cap->cap.start, orig + cap->cap.start); + return true; + } + + return false; +} + +static bool _rx_or_is_match(cregex_node *node, const char *orig, + const char *cur, const char **next) +{ + _rx_OrNode *ior = (_rx_OrNode *)node; + + if (ior->generic.next != NULL) { + ior->right = ior->generic.next; + ior->generic.next = NULL; + } + + if (_rx_is_match(ior->left, orig, cur, next) && ior->left != NULL) { + return true; + } + + return _rx_is_match(ior->right, orig, cur, next) && ior->right != NULL; +} + +/* Global error value with callback address */ +struct { + cregex_error_t err; + const char *s; + jmp_buf buf; +} _rx_CompileException; + +/* set global error value to the default value */ +static inline void _rx_clear_compile_exception(void) +{ + _rx_CompileException.err = cregex_OK; + _rx_CompileException.s = NULL; +} + +/* set global error value and jump back to the exception handler */ +static void _rx_throw_compile_exception(cregex_error_t err, const char *s) +{ + _rx_CompileException.err = err; + _rx_CompileException.s = s; + longjmp(_rx_CompileException.buf, 1); +} + +static size_t _rx_calc_compiled_escaped_len(const char *s, const char **leftover) +{ + if (*s == 0) + _rx_throw_compile_exception(cregex_UNEXPECTED_EOL, s); + + const uint32_t chr = utf8_peek(s); + *leftover = utf8_next(s); + + switch (chr) { + case 's': + return 5; + + case 'S': + return 5; + + case 'd': + return 2; + + case 'D': + return 2; + + case 'w': + return 5; + + case 'W': + return 5; + + default: + return 1; + } +} + +static const size_t _rx_calc_compiled_class_len(const char *s, + const char **leftover) +{ + if (*s == '^') + s++; + + size_t ret = 1; + + while (*s && *s != ']') { + uint32_t chr = utf8_peek(s); + s = utf8_next(s); + if (chr == '\\') { + s = utf8_next(s); + } + + if (*s == '-' && s[1] != ']') { + s++; + chr = utf8_peek(s); + s = utf8_next(s); + + if (chr == '\\') + s = utf8_next(s); + } + + ret++; + } + + if (*s == ']') { + s++; + *leftover = s; + } else { + _rx_throw_compile_exception(cregex_INVALID_COMPLEX_CLASS, s); + } + + return ret; +} + +/* get required amount of memory in amount of nodes + * to _rx_compile regular expressions */ +static const size_t _rx_calc_compiled_len(const char *s) +{ + if (*s == 0) { + return 1; + } else { + const uint32_t chr = utf8_peek(s); + size_t ret = 0; + s = utf8_next(s); + + switch (chr) { + case '{': { + const char *end = strstr(s, "}"); + + if (end == NULL) + _rx_throw_compile_exception( + cregex_INVALID_COMPLEX_QUANT, s); + + s = end + 1; + ret = 1; + break; + } + + case '\\': + ret = _rx_calc_compiled_escaped_len(s, &s); + break; + + case '[': + ret = _rx_calc_compiled_class_len(s, &s); + break; + + default: + ret = 1; + break; + } + + return ret + _rx_calc_compiled_len(s); + } +} + +static void _rx_append_quant(cregex_node **prev, cregex_node *cur, size_t min, + size_t max, const char *re) +{ + cur->generic.match = _rx_quant_is_match; + cur->generic.next = NULL; + cur->generic.prev = NULL; + + cur->quant.max = max; + cur->quant.min = min; + cur->quant.subexp = *prev; + + *prev = (*prev)->generic.prev; + if (*prev == NULL) + _rx_throw_compile_exception(cregex_EARLY_QUANTIFIER, re); + + cur->quant.subexp->generic.next = NULL; + cur->quant.subexp->generic.prev = NULL; +} + +static inline bool _rx_is_digit(uint32_t c) +{ + return c >= '0' && c <= '9'; +} + +static size_t _rx_parse_digit(const char *s, const char **leftover) +{ + size_t ret = 0; + + while (*s) { + uint32_t chr = utf8_peek(s); + + if (_rx_is_digit(chr)) { + ret *= 10; + ret += chr - '0'; + s = utf8_next(s); + } else { + break; + } + } + + *leftover = s; + return ret; +} + +/* parse complex quantifier of format {m,n} + * valid formats: {,} {m,} {,n} {m} {m,n} */ +static void _rx_parse_complex_quant(const char *re, const char **leftover, + size_t *min_p, size_t *max_p) +{ + if (*re == 0) + _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re); + + uint32_t tmp = utf8_peek(re); + size_t min = 0, max = SIZE_MAX; + + if (_rx_is_digit(tmp)) { + min = _rx_parse_digit(re, &re); + } else if (tmp != ',') { + _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re); + } + + tmp = utf8_peek(re); + + if (tmp == ',') { + re = utf8_next(re); + if (_rx_is_digit(utf8_peek(re))) + max = _rx_parse_digit(re, &re); + else + max = SIZE_MAX; + } else { + max = min; + } + + tmp = utf8_peek(re); + if (tmp == '}') { + *leftover = re + 1; + *min_p = min; + *max_p = max; + } else { + _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re); + } +} + +/* append character class to linked list of nodes with + * ranges given as optional arguments. Returns pointer + * to next */ +static cregex_node *_rx_append_class(cregex_node *cur, bool negate, size_t n, ...) +{ + cur->cls.negate = negate; + cur->cls.ranges = (_rx_RangeNode *)(n ? cur + 1 : NULL); + cur->generic.match = _rx_class_is_match; + cur->generic.next = NULL; + cur->generic.prev = NULL; + + va_list ap; + va_start(ap, n); + cregex_node *prev = NULL; + cur = cur + 1; + + for (size_t i = 0; i < n; ++i) { + const uint32_t first = va_arg(ap, uint32_t); + const uint32_t last = va_arg(ap, uint32_t); + + cur->generic.next = NULL; + cur->generic.prev = prev; + + if (prev) + prev->generic.next = cur; + + cur->range.first = first; + cur->range.last = last; + + prev = cur; + cur = cur + 1; + } + + va_end(ap); + + return cur; +} + +/** _rx_compile escaped characters. return pointer to the next free node. */ +static cregex_node *_rx_compile_next_escaped(const char *re, const char **leftover, + cregex_node *cur) +{ + if (*re == 0) + _rx_throw_compile_exception(cregex_UNEXPECTED_EOL, re); + + const uint32_t chr = utf8_peek(re); + *leftover = utf8_next(re); + cregex_node *ret = cur + 1; + + switch (chr) { + case 'n': + cur->chr.chr = '\n'; + cur->generic.match = _rx_char_is_match; + break; + + case 't': + cur->chr.chr = '\t'; + cur->generic.match = _rx_char_is_match; + break; + + case 'r': + cur->chr.chr = '\r'; + cur->generic.match = _rx_char_is_match; + break; + + case 's': + ret = _rx_append_class(cur, false, 4, ' ', ' ', '\t', '\t', '\r', + '\r', '\n', '\n'); + break; + + case 'S': + ret = _rx_append_class(cur, true, 4, ' ', ' ', '\t', '\t', '\r', + '\r', '\n', '\n'); + break; + + case 'w': + ret = _rx_append_class(cur, false, 4, 'a', 'z', 'A', 'Z', '0', '9', + '_', '_'); + break; + + case 'W': + ret = _rx_append_class(cur, true, 4, 'a', 'z', 'A', 'Z', '0', '9', + '_', '_'); + break; + + case 'd': + ret = _rx_append_class(cur, false, 1, '0', '9'); + break; + + case 'D': + ret = _rx_append_class(cur, true, 1, '0', '9'); + break; + + default: + cur->chr.chr = chr; + cur->generic.match = _rx_char_is_match; + break; + } + + return ret; +} + +static cregex_node *_rx_compile_next_complex_class(const char *re, + const char **leftover, + cregex_node *cur) +{ + cur->generic.match = _rx_class_is_match; + cur->generic.next = NULL; + cur->generic.prev = NULL; + + if (*re == '^') { + re++; + cur->cls.negate = true; + } else { + cur->cls.negate = false; + } + + cur->cls.ranges = NULL; + + cur = cur + 1; + cregex_node *prev = NULL; + + while (*re && *re != ']') { + uint32_t first = 0, last = 0; + + first = utf8_peek(re); + re = utf8_next(re); + if (first == '\\') { + if (*re == 0) + _rx_throw_compile_exception( + cregex_INVALID_COMPLEX_CLASS, re); + + first = utf8_peek(re); + re = utf8_next(re); + } + + if (*re == '-' && re[1] != ']' && re[1]) { + re++; + last = utf8_peek(re); + re = utf8_next(re); + + if (last == '\\') { + if (*re == 0) + _rx_throw_compile_exception( + cregex_INVALID_COMPLEX_CLASS, + re); + + last = utf8_peek(re); + re = utf8_next(re); + } + } else { + last = first; + } + + cur->range.first = first; + cur->range.last = last; + cur->generic.prev = prev; + cur->generic.next = NULL; + + if (prev == NULL) { + (cur - 1)->cls.ranges = (_rx_RangeNode *)cur; + } else { + prev->generic.next = cur; + } + + prev = cur; + cur++; + } + + if (*re == ']') { + *leftover = re + 1; + return cur; + } else { + _rx_throw_compile_exception(cregex_INVALID_COMPLEX_CLASS, re); + return NULL; // Unreachable + } +} + +static const char *_rx_find_closing_par(const char *s) +{ + size_t level = 1; + + for (; *s && level != 0; ++s) { + if (*s == '\\') + s++; + else if (*s == '(') + level++; + else if (*s == ')') + level--; + } + + if (level == 0) + return s; + else + return NULL; +} + +static cregex_node *_rx_compile(const char *re, const char *end, cregex_node *nodes); + +static cregex_node *_rx_compile_next_cap(const char *re, const char **leftover, + cregex_node *cur) +{ + cur->cap.cap.start = 0; + cur->cap.cap.end = 0; + cur->cap.subexp = cur + 1; + cur->generic.next = NULL; + cur->generic.prev = NULL; + cur->generic.match = _rx_cap_is_match; + + const char *end = _rx_find_closing_par(re); + + if (end == NULL) + _rx_throw_compile_exception(cregex_UNCLOSED_SUBEXPRESSION, re); + + *leftover = end; + return _rx_compile(re, end - 1, cur + 1); +} + +static cregex_node *insert_or(cregex_node *cur, cregex_node **prev) { + cur->generic.match = _rx_or_is_match; + cur->generic.next = NULL; + cur->generic.prev = NULL; + + // Find last start node + cregex_node *begin = *prev; + + while (begin->generic.match != _rx_start_is_match) { + begin = begin->generic.prev; + } + + cur->ior.left = begin->generic.next; + *prev = begin; + + return cur + 1; +} + +/* _rx_compile next node. returns address of next available node. + * returns NULL if re is empty */ +static cregex_node *_rx_compile_next(const char *re, const char **leftover, + cregex_node *prev, cregex_node *cur) +{ + if (*re == 0) + return NULL; + + const uint32_t chr = utf8_peek(re); + re = utf8_next(re); + cregex_node *next = cur + 1; + + switch (chr) { + case '^': + cur->generic.match = _rx_anchor_begin_is_match; + break; + + case '$': + cur->generic.match = _rx_anchor_end_is_match; + break; + + case '.': + cur->generic.match = _rx_any_is_match; + break; + + case '*': + _rx_append_quant(&prev, cur, 0, SIZE_MAX, re); + break; + + case '+': + _rx_append_quant(&prev, cur, 1, SIZE_MAX, re); + break; + + case '?': + _rx_append_quant(&prev, cur, 0, 1, re); + break; + + case '{': { + size_t min = 0, max = SIZE_MAX; + const char *leftover = NULL; + _rx_parse_complex_quant(re, &leftover, &min, &max); + + _rx_append_quant(&prev, cur, min, max, re); + re = leftover; + break; + } + + case '[': + next = _rx_compile_next_complex_class(re, &re, cur); + break; + + case '(': + next = _rx_compile_next_cap(re, &re, cur); + break; + + case '\\': + next = _rx_compile_next_escaped(re, &re, cur); + break; + + case '|': + next = insert_or(cur, &prev); + break; + + default: + cur->chr.chr = chr; + cur->generic.match = _rx_char_is_match; + break; + } + + cur->generic.next = NULL; + cur->generic.prev = prev; + prev->generic.next = cur; + *leftover = re; + + return next; +} + +/* _rx_compile raw regular expression into a linked list of nodes. return leftover nodes */ +static cregex_node *_rx_compile(const char *re, const char *end, cregex_node *nodes) +{ + cregex_node *prev = nodes; + cregex_node *cur = nodes + 1; + + prev->generic.next = NULL; + prev->generic.prev = NULL; + prev->generic.match = _rx_start_is_match; + + while (cur != NULL && re != NULL && re < end) { + const char *next = NULL; + cregex_node *next_node = _rx_compile_next(re, &next, prev, cur); + + prev = cur; + cur = next_node; + re = next; + } + + return cur; +} + +STC_DEF cregex cregex_new(const char *re) +{ + cregex ret = {NULL}; + + _rx_clear_compile_exception(); + if (re == NULL) { + _rx_CompileException.err = cregex_INVALID_PARAMS; + return ret; + } + + if (!utf8_valid(re)) { + _rx_CompileException.err = cregex_INVALID_UTF8; + _rx_CompileException.s = NULL; + return ret; + } + + cregex_node *nodes = NULL; + + if (setjmp(_rx_CompileException.buf)) { + // Error callback + free(nodes); + return ret; + } + + const size_t compile_len = _rx_calc_compiled_len(re); + nodes = (cregex_node *)calloc(compile_len, sizeof(cregex_node)); + _rx_compile(re, re + strlen(re), nodes); + ret.nodes = nodes; + + return ret; +} + +STC_DEF cregex_error_t cregex_error(void) +{ + return _rx_CompileException.err; +} + +STC_DEF bool cregex_matches(cregex* rx, const char *s) +{ + const char* next; + bool res = _rx_is_match(rx->nodes, s, s, &next); + if (res && *next == 0) { + rx->match.start = 0; + rx->match.end = next - s; + rx->input = s; + return true; + } + rx->input = NULL; + return false; +} + +STC_DEF bool cregex_find_next(cregex *rx, const char *s, cregex_match *m) +{ + const char *it = s + m->end, *next; + + for (; *it; it = utf8_next(it)) { + if (_rx_is_match(rx->nodes, s, it, &next)) { + m->start = it - s; + m->end = next - s; + rx->match = *m; + rx->input = s; + return true; + } + } + rx->input = NULL; + return false; +} + +STC_API bool cregex_find_next_sv(cregex *rx, const char *s, csview *sv) +{ + if (!sv->str) sv->str = s; + cregex_match m = {(size_t)(sv->str - s), m.start + sv->size}; + + bool res = cregex_find_next(rx, s, &m); + if (res) *sv = c_make(csview){s + m.start, m.end - m.start}; + return res; +} + + +STC_DEF void cregex_drop(cregex *rx) +{ + free(rx->nodes); +} + +/* calculate amount of capture groups + * inside a regular expression */ +static size_t _rx_cap_node_count(cregex_node *nodes) +{ + if (nodes == NULL) { + return 0; + } else if (nodes->generic.match == _rx_quant_is_match) { + return _rx_cap_node_count(nodes->quant.subexp) + + _rx_cap_node_count(nodes->generic.next); + } else if (nodes->generic.match == _rx_cap_is_match) { + return _rx_cap_node_count(nodes->quant.subexp) + + _rx_cap_node_count(nodes->generic.next) + 1; + } else { + return _rx_cap_node_count(nodes->generic.next); + } +} + +STC_DEF size_t cregex_capture_size(cregex rx) +{ + return _rx_cap_node_count(rx.nodes) + (rx.input != NULL); +} + +static cregex_node *_rx_find_capture_node(cregex_node *node, size_t index) +{ + if (node == NULL) { + return NULL; + } else if (node->generic.match == _rx_cap_is_match) { + if (index == 0) { + return node; + } else { + const size_t subexp_len = + _rx_cap_node_count(node->cap.subexp); + if (index <= subexp_len) { + return _rx_find_capture_node(node->cap.subexp, + index - subexp_len); + } else { + return _rx_find_capture_node(node->generic.next, + index - subexp_len - 1); + } + } + } else if (node->generic.match == _rx_quant_is_match) { + const size_t subexp_len = _rx_cap_node_count(node->quant.subexp); + if (index < subexp_len) { + return _rx_find_capture_node(node->quant.subexp, index); + } else { + return _rx_find_capture_node(node->generic.next, index - 1); // FIX by Tyge, added: - 1 + } + } else { + return _rx_find_capture_node(node->generic.next, index); + } +} + +STC_DEF bool cregex_capture(cregex *rx, size_t index, cregex_match *m) +{ + if (index == 0) { *m = rx->match; return m->end != 0; } + + _rx_CapNode *cap = (_rx_CapNode *)_rx_find_capture_node(rx->nodes, index - 1); + if (cap) *m = cap->cap; + return cap != NULL; +} + +#endif +#endif +#undef i_opt diff --git a/tests/cregex_test.c b/tests/cregex_test.c new file mode 100644 index 00000000..a9549f96 --- /dev/null +++ b/tests/cregex_test.c @@ -0,0 +1,370 @@ +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +//#include <check.h> +#include <stc/cregex.h> + +#define START_TEST(f) void f(void) +#define END_TEST + +#define ck_assert(x) assert(x) +#define ck_assert_uint_ne(a, b) assert(a != b) +#define ck_assert_uint_eq(a, b) if (!((a) == (b))) printf("%u == %u: ", a, b), assert((a) == (b)) +#define ck_assert_int_ne(a, b) assert(a != b) +#define ck_assert_int_eq(a, b) assert(a == b) +#define ck_assert_ptr_ne(a, b) assert(a != b) +#define ck_assert_ptr_eq(a, b) assert(a == b) + + +START_TEST(compile_match_char) +{ + cregex re = cregex_new("äsdf"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(&re, "äsdf", &match)); + ck_assert_uint_eq(match.start, 0); + ck_assert_uint_eq(match.end, 5); // ä is two bytes wide + + ck_assert(cregex_find(&re, "zäsdf", &match)); + ck_assert_uint_eq(match.start, 1); + ck_assert_uint_eq(match.end, 6); + + ck_assert(cregex_find(&re, "äsdf", &match)); + ck_assert_uint_eq(match.start, 0); + ck_assert_uint_eq(match.end, 5); + + cregex_drop(&re); +} +END_TEST + +START_TEST(compile_match_anchors) +{ + cregex re[1] = {cregex_new("^äs.f$")}; + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(re, "äsdf", &match)); + ck_assert_uint_eq(match.start, 0); + ck_assert_uint_eq(match.end, 5); + + ck_assert(cregex_find(re, "äs♥f", &match)); + ck_assert(cregex_find(re, "äsöf", &match)); + + cregex_drop(re); +} +END_TEST + +START_TEST(compile_match_quantifiers) +{ + c_auto (cregex, re) { + re = cregex_new("ä+"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(&re, "ääb", &match)); + ck_assert_uint_eq(match.start, 0); + ck_assert_uint_eq(match.end, 4); + + ck_assert(cregex_find(&re, "bäbb", &match)); + ck_assert_uint_eq(match.start, 1); + ck_assert_uint_eq(match.end, 3); + + ck_assert(!cregex_find(&re, "bbb", &match)); + } + c_auto (cregex, re) { + re = cregex_new("bä*"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(&re, "bääb", &match)); + ck_assert_uint_eq(match.start, 0); + ck_assert_uint_eq(match.end, 5); + + ck_assert(cregex_find(&re, "bäbb", &match)); + ck_assert_uint_eq(match.start, 0); + ck_assert_uint_eq(match.end, 3); + + ck_assert(cregex_find(&re, "bbb", &match)); + ck_assert_uint_eq(match.start, 0); + ck_assert_uint_eq(match.end, 1); + } +} +END_TEST + +START_TEST(compile_match_complex_quants) +{ + c_auto (cregex, re1, re2, re3, re4) + { + re1 = cregex_new("ä{1,3}"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + re2 = cregex_new("ä{1}"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + re3 = cregex_new("ä{,}"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + re4 = cregex_new("ä{,3}"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(&re1, "ääb", &match)); + ck_assert_uint_eq(match.start, 0); + ck_assert_uint_eq(match.end, 4); + ck_assert(cregex_find(&re1, "äääb", &match)); + ck_assert(cregex_find(&re1, "äb", &match)); + ck_assert(!cregex_find(&re1, "b", &match)); + + ck_assert(cregex_find(&re2, "ää", &match)); + ck_assert_uint_eq(match.start, 0); + ck_assert_uint_eq(match.end, 2); + ck_assert(cregex_find(&re2, "bbäb", &match)); + ck_assert(!cregex_find(&re2, "bbbb", &match)); + + ck_assert(cregex_find(&re3, "ääääääääääb", &match)); + ck_assert_uint_eq(match.start, 0); + ck_assert_uint_eq(match.end, 20); + ck_assert(cregex_find(&re3, "b", &match)); + + ck_assert(cregex_find(&re4, "bä", &match)); + ck_assert(cregex_find(&re4, "bää", &match)); + ck_assert(cregex_find(&re4, "bäää", &match)); + } +} +END_TEST + +START_TEST(compile_match_escaped_chars) +{ + cregex re = cregex_new("\\n\\r\\t\\{"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(&re, "\n\r\t{", &match)); + ck_assert(!cregex_find(&re, "\n\r\t", &match)); + + cregex_drop(&re); +} +END_TEST + +START_TEST(compile_match_class_simple) +{ + c_auto (cregex, re1, re2, re3) + { + re1 = cregex_new("\\s"); + ck_assert_int_eq(cregex_error(), cregex_OK); + re2 = cregex_new("\\w"); + ck_assert_int_eq(cregex_error(), cregex_OK); + re3 = cregex_new("\\D"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(&re1, " ", &match)); + ck_assert(cregex_find(&re1, "\r", &match)); + ck_assert(cregex_find(&re1, "\n", &match)); + + ck_assert(cregex_find(&re2, "a", &match)); + ck_assert(cregex_find(&re2, "0", &match)); + ck_assert(cregex_find(&re2, "_", &match)); + + ck_assert(cregex_find(&re3, "k", &match)); + ck_assert(!cregex_find(&re3, "0", &match)); + } +} +END_TEST + +START_TEST(compile_match_or) +{ + c_auto (cregex, re, re2) + { + re = cregex_new("as|df"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(&re, "as", &match)); + ck_assert(cregex_find(&re, "df", &match)); + + re2 = cregex_new("(as|df)"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + ck_assert(cregex_find(&re2, "as", &match)); + ck_assert(cregex_find(&re2, "df", &match)); + } +} +END_TEST + +START_TEST(compile_match_class_complex_0) +{ + cregex re = cregex_new("[asdf]"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(&re, "a", &match)); + ck_assert(cregex_find(&re, "s", &match)); + ck_assert(cregex_find(&re, "d", &match)); + ck_assert(cregex_find(&re, "f", &match)); + + cregex_drop(&re); +} +END_TEST + +START_TEST(compile_match_class_complex_1) +{ + cregex re = cregex_new("[a-zä0-9öA-Z]"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(&re, "a", &match)); + ck_assert(cregex_find(&re, "5", &match)); + ck_assert(cregex_find(&re, "A", &match)); + ck_assert(cregex_find(&re, "ä", &match)); + ck_assert(cregex_find(&re, "ö", &match)); + + cregex_drop(&re); +} +END_TEST + +START_TEST(compile_match_cap) +{ + cregex re = cregex_new("(abc)d"); + ck_assert_int_eq(cregex_error(), cregex_OK); + + cregex_match match; + ck_assert(cregex_find(&re, "abcd", &match)); + ck_assert(cregex_find(&re, "llljabcdkk", &match)); + ck_assert(!cregex_find(&re, "abc", &match)); + + cregex_drop(&re); +} +END_TEST + +START_TEST(invalid_quantifier) +{ + ck_assert_ptr_eq(cregex_new("+").nodes, NULL); + ck_assert_int_eq(cregex_error(), cregex_EARLY_QUANTIFIER); +} +END_TEST + +/* Test that invalid utf8 sequences cause a + * cregex_INVALID_UTF8 error */ +START_TEST(invalid_utf8) +{ + char s1[3] = "ä"; + s1[1] = 65; // invalid continuation byte + + ck_assert_ptr_eq(cregex_new(s1).nodes, NULL); + ck_assert_int_eq(cregex_error(), cregex_INVALID_UTF8); +} +END_TEST + +START_TEST(search_all) +{ + c_auto (cregex, re) + { + re = cregex_new("ab"); + cregex_match m = {0}; + bool res; + + res = cregex_find_next(&re, "ab,ab,ab", &m); + ck_assert(res && m.start == 0); + res = cregex_find_next(&re, "ab,ab,ab", &m); + ck_assert(res && m.start == 3); + res = cregex_find_next(&re, "ab,ab,ab", &m); + ck_assert(res && m.start == 6); + res = cregex_find_next(&re, "ab,ab,ab", &m); + ck_assert(!res); + } +} +END_TEST + +START_TEST(captures_len) +{ + c_auto (cregex, re) { + cregex re = cregex_new("(ab(cd))(ef)"); + ck_assert_uint_eq(cregex_capture_size(re), 3); + } +} +END_TEST + +START_TEST(captures_cap) +{ + c_auto (cregex, re) { + re = cregex_new("(ab)((cd)+)"); + ck_assert_uint_eq(cregex_capture_size(re), 3); + + cregex_match match; + ck_assert(cregex_find(&re, "xxabcdcde", &match)); + + cregex_match cap0, cap1, cap2; + cregex_capture(&re, 0, &cap0); + cregex_capture(&re, 1, &cap1); + cregex_capture(&re, 2, &cap2); + + ck_assert_uint_eq(cap0.start, 2); + ck_assert_uint_eq(cap0.end, 8); + ck_assert_uint_eq(cap1.start, 2); + ck_assert_uint_eq(cap1.end, 4); + ck_assert_uint_eq(cap2.start, 4); + ck_assert_uint_eq(cap2.end, 8); + + ck_assert(!cregex_matches(&re, "abcdcde")); + ck_assert(cregex_matches(&re, "abcdcdcd")); + } +} +END_TEST + +/* +Suite *cregex_test_suite(void) +{ + Suite *ret = suite_create("mregexp"); + TCase *tcase = tcase_create("mregexp"); + + tcase_add_test(tcase, compile_match_char); + tcase_add_test(tcase, invalid_utf8); + tcase_add_test(tcase, compile_match_anchors); + tcase_add_test(tcase, compile_match_quantifiers); + tcase_add_test(tcase, invalid_quantifier); + tcase_add_test(tcase, compile_match_complex_quants); + tcase_add_test(tcase, compile_match_escaped_chars); + tcase_add_test(tcase, compile_match_class_simple); + tcase_add_test(tcase, compile_match_class_complex_0); + tcase_add_test(tcase, compile_match_class_complex_1); + tcase_add_test(tcase, compile_match_cap); + tcase_add_test(tcase, search_all); + //tcase_add_test(tcase, captures_len); + //tcase_add_test(tcase, captures_cap); + tcase_add_test(tcase, compile_match_or); + + suite_add_tcase(ret, tcase); + return ret; +} + +int main(void) +{ + SRunner *sr = srunner_create(cregex_test_suite()); + srunner_run_all(sr, CK_NORMAL); + int fails = srunner_ntests_failed(sr); + srunner_free(sr); + return fails ? EXIT_FAILURE : EXIT_SUCCESS; +} +*/ + +int main() +{ + compile_match_char(); + invalid_utf8(); + compile_match_anchors(); + compile_match_quantifiers(); + invalid_quantifier(); + compile_match_complex_quants(); + compile_match_escaped_chars(); + compile_match_class_simple(); + compile_match_class_complex_0(); + compile_match_class_complex_1(); + compile_match_cap(); + search_all(); + //captures_len(); + //captures_cap(); + compile_match_or(); + printf("All tests succesful.\n"); +} |
