diff options
| author | Tyge Løvset <[email protected]> | 2022-01-11 00:27:44 +0100 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-01-11 00:27:44 +0100 |
| commit | af1b2068b39deebfbf723532a2eddc49da61894a (patch) | |
| tree | f26d06d6be52e280bea0f82558e3e040a89b4335 | |
| parent | 14c0c1b9ab2ce6a4f98baf389d712515b7c68933 (diff) | |
| download | STC-modified-af1b2068b39deebfbf723532a2eddc49da61894a.tar.gz STC-modified-af1b2068b39deebfbf723532a2eddc49da61894a.zip | |
Removed regex for now.
| -rw-r--r-- | examples/regex1.c | 26 | ||||
| -rw-r--r-- | examples/regex2.c | 26 | ||||
| -rw-r--r-- | examples/regex_match.c | 35 | ||||
| -rw-r--r-- | include/stc/cregex.h | 981 | ||||
| -rw-r--r-- | tests/cregex_test.c | 367 |
5 files changed, 0 insertions, 1435 deletions
diff --git a/examples/regex1.c b/examples/regex1.c deleted file mode 100644 index 9b450453..00000000 --- a/examples/regex1.c +++ /dev/null @@ -1,26 +0,0 @@ -#include <stc/cstr.h>
-#include <stc/cregex.h>
-
-int main()
-{
- c_auto (cstr, input)
- c_auto (cregex, float_expr)
- {
- float_expr = cregex_new("[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)");
- // Until "q" is given, ask for another number
- while (true)
- {
- printf("Enter float number (q for quit): ");
- cstr_getline(&input, stdin);
-
- // Exit when the user inputs q
- if (cstr_equals(input, "q"))
- break;
-
- if (cregex_is_match(&float_expr, input.str))
- printf("Input is a float\n");
- else
- printf("Invalid input : Not a float\n");
- }
- }
-}
\ No newline at end of file diff --git a/examples/regex2.c b/examples/regex2.c deleted file mode 100644 index 0562af00..00000000 --- a/examples/regex2.c +++ /dev/null @@ -1,26 +0,0 @@ -#include <stc/cregex.h>
-#include <stc/csview.h>
-#include <stc/cstr.h>
-
-int main()
-{
- const char* fnames[] = {"home/foofile.txt", "cool/barfile.txt", "test/bazboy.dat", "hello/zoidberg"};
- c_auto (cregex, re)
- {
- re = cregex_new("([a-z]+)\\/([a-z]+)\\.([a-z]+)");
-
- c_forrange (i, c_arraylen(fnames))
- {
- printf("%s\n", fnames[i]);
- if (cregex_is_match(&re, fnames[i]))
- {
- c_forrange (j, cregex_capture_size(re))
- {
- csview cap; cregex_capture_v(&re, j, &cap);
- printf(" submatch %zu: " c_PRIsv "\n", j, c_ARGsv(cap));
- }
- puts("");
- }
- }
- }
-}
diff --git a/examples/regex_match.c b/examples/regex_match.c deleted file mode 100644 index 72adeb8e..00000000 --- a/examples/regex_match.c +++ /dev/null @@ -1,35 +0,0 @@ -#include <stdio.h>
-#include <stc/csview.h>
-#include <stc/cregex.h>
-#include <stc/crandom.h>
-#define i_val double
-#define i_type Vecu64
-#include <stc/cstack.h>
-#include <time.h>
-
-
-int main()
-{
- // Lets find the first sequence of digits in a string
- const char *s = "Hello numeric world, there are 24 hours in a day, 3600 seconds in an hour."
- " Around 365.25 days a year, and 52 weeks in a year."
- " Boltzmann const: 1.38064852E-23, is very small."
- " Bohrradius is 5.29177210903e-11, and Avogadros number is 6.02214076e23.";
-
- c_auto (cregex, re)
- {
- re = cregex_new("[+-]?([0-9]*\\.)?[0-9]+([Ee][-+]?[0-9]+)?");
- cregex_match match;
- if (cregex_find(&re, s, &match)) {
- printf("Found digits at position %zu-%zu\n", match.start, match.end);
- } else {
- printf("Could not find any digits\n");
- }
-
- csview sv = {0};
- while (cregex_find_next_v(&re, s, &sv)) {
- printf(c_PRIsv " ; ", c_ARGsv(sv));
- }
- puts("");
- }
-}
\ No newline at end of file diff --git a/include/stc/cregex.h b/include/stc/cregex.h deleted file mode 100644 index 84799d75..00000000 --- a/include/stc/cregex.h +++ /dev/null @@ -1,981 +0,0 @@ -/* - * Copyright (c) 2020 Fabian van Rissenbeck - * https://github.com/deinernstjetzt/mregexp - * 2009 Bjoern Hoehrmann <[email protected]> - * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ - * 2022 Tyge Løvset - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -*/ -#ifndef CREGEX_INCLUDED -#define CREGEX_INCLUDED - -#include "csview.h" -#include "utf8.h" -#include <stdlib.h> -#include <setjmp.h> -#include <stdarg.h> - -typedef struct { - union cregex_node *nodes; - const char* input; -} cregex; - -typedef struct { - size_t start; - size_t end; -} cregex_match; - -typedef enum { - cregex_OK = 0, - cregex_FAILED_ALLOC, - cregex_INVALID_UTF8, - cregex_INVALID_PARAMS, - cregex_EARLY_QUANTIFIER, - cregex_INVALID_COMPLEX_QUANT, - cregex_UNEXPECTED_EOL, - cregex_INVALID_COMPLEX_CLASS, - cregex_UNCLOSED_SUBEXPRESSION, -} cregex_error_t; - -/* create an empty expression */ -STC_INLINE cregex cregex_init(void) - { cregex rx = {NULL}; return rx; } - -/* compile regular expression */ -STC_API cregex cregex_new(const char *re); - -/* get error type if a function failed */ -STC_API cregex_error_t cregex_error(void); - -/* check if input s matches re */ -STC_API bool cregex_is_match(cregex *rx, const char *s); - -/* find the next matching substring in s */ -STC_API bool cregex_find_next(cregex *rx, const char *s, cregex_match *m); - -/* find the next matching substring in s, return as csview */ -STC_API bool cregex_find_next_v(cregex *rx, const char *s, csview *sv); - -/* find the first matching substring in s */ -STC_INLINE bool cregex_find(cregex *rx, const char *s, cregex_match *m) { - m->start = m->end = 0; - return cregex_find_next(rx, s, m); -} - -STC_INLINE bool cregex_find_v(cregex *rx, const char *s, csview *sv) { - sv->str = s, sv->size = 0; - return cregex_find_next_v(rx, s, sv); -} - -/* get captured slice from capture group number index */ -STC_API bool cregex_capture(cregex *rx, size_t index, cregex_match *m); - -/* get captured slice from capture group number index as a csview */ -STC_INLINE bool cregex_capture_v(cregex *rx, size_t index, csview *sv) { - cregex_match m; - bool ret = cregex_capture(rx, index, &m); - *sv = c_make(csview){rx->input + m.start, m.end - m.start}; - return ret; -} - -/* get amount of capture groups inside of - * the regular expression */ -STC_API size_t cregex_capture_size(cregex rx); - -/* free regular expression */ -STC_API void cregex_drop(cregex *rx); - -/* -------------------------- IMPLEMENTATION ------------------------- */ -#if defined(_i_implement) - - -/* function pointer type used to evaluate if a regex node - * matched a given string */ -typedef bool (*_rx_MatchFunc)(union cregex_node *node, const char *orig, - const char *cur, const char **next); - -typedef struct _rx_GenericNode { - union cregex_node *prev; - union cregex_node *next; - _rx_MatchFunc match; -} _rx_GenericNode; - -typedef struct { - _rx_GenericNode generic; - uint32_t chr; -} _rx_CharNode; - -typedef struct { - _rx_GenericNode generic; - union cregex_node *subexp; - size_t min, max; -} _rx_QuantNode; - -typedef struct { - _rx_GenericNode generic; - uint32_t first, last; -} _rx_RangeNode; - -typedef struct { - _rx_GenericNode generic; - _rx_RangeNode *ranges; - bool negate; -} _rx_ClassNode; - -typedef struct { - _rx_GenericNode generic; - union cregex_node *subexp; - cregex_match cap; -} _rx_CapNode; - -typedef struct { - _rx_GenericNode generic; - union cregex_node *left; - union cregex_node *right; -} _rx_OrNode; - -typedef union cregex_node { - _rx_GenericNode generic; - _rx_CharNode chr; - _rx_QuantNode quant; - _rx_ClassNode cls; - _rx_RangeNode range; - _rx_CapNode cap; - _rx_OrNode ior; -} cregex_node; - -static bool _rx_is_match(cregex_node *node, const char *orig, - const char *cur, const char **next) -{ - if (node == NULL) { - *next = cur; - return true; - } else { - return ((node->generic.match)(node, orig, cur, next)) && - _rx_is_match(node->generic.next, orig, *next, next); - } -} - -static bool _rx_char_is_match(cregex_node *node, const char *orig, - const char *cur, const char **next) -{ - if (*cur == 0) { - return false; - } - - *next = utf8_next(cur); - return node->chr.chr == utf8_peek(cur); -} - -static bool _rx_start_is_match(cregex_node *node, const char *orig, - const char *cur, const char **next) -{ - *next = cur; - return true; -} - -static bool _rx_anchor_begin_is_match(cregex_node *node, const char *orig, - const char *cur, const char **next) -{ - *next = cur; - return strlen(orig) == strlen(cur); -} - -static bool _rx_anchor_end_is_match(cregex_node *node, const char *orig, - const char *cur, const char **next) -{ - *next = cur; - return strlen(cur) == 0; -} - -static bool _rx_any_is_match(cregex_node *node, const char *orig, - const char *cur, const char **next) -{ - if (*cur) { - *next = utf8_next(cur); - return true; - } - - return false; -} - -static bool _rx_quant_is_match(cregex_node *node, const char *orig, - const char *cur, const char **next) -{ - _rx_QuantNode *quant = (_rx_QuantNode *)node; - size_t matches = 0; - - while (_rx_is_match(quant->subexp, orig, cur, next)) { - matches++; - cur = *next; - - if (matches >= quant->max) - break; - } - - *next = cur; - return matches >= quant->min; -} - -static bool _rx_class_is_match(cregex_node *node, const char *orig, - const char *cur, const char **next) -{ - _rx_ClassNode *cls = (_rx_ClassNode *)node; - - if (*cur == 0) - return false; - - const uint32_t chr = utf8_peek(cur); - *next = utf8_next(cur); - - bool found = false; - for (_rx_RangeNode *range = cls->ranges; range != NULL; - range = (_rx_RangeNode *)range->generic.next) { - if (chr >= range->first && chr <= range->last) { - found = true; - break; - } - } - - if (cls->negate) - found = !found; - - return found; -} - -static bool _rx_cap_is_match(cregex_node *node, const char *orig, - const char *cur, const char **next) -{ - _rx_CapNode *cap = (_rx_CapNode *)node; - - if (_rx_is_match(cap->subexp, orig, cur, next)) { - cap->cap.start = cur - orig; - cap->cap.end = (*next) - orig; - return true; - } - - return false; -} - -static bool _rx_or_is_match(cregex_node *node, const char *orig, - const char *cur, const char **next) -{ - _rx_OrNode *ior = (_rx_OrNode *)node; - - if (ior->generic.next != NULL) { - ior->right = ior->generic.next; - ior->generic.next = NULL; - } - - if (_rx_is_match(ior->left, orig, cur, next) && ior->left != NULL) { - return true; - } - - return _rx_is_match(ior->right, orig, cur, next) && ior->right != NULL; -} - -/* Global error value with callback address */ -struct { - cregex_error_t err; - const char *s; - jmp_buf buf; -} _rx_CompileException; - -/* set global error value to the default value */ -static inline void _rx_clear_compile_exception(void) -{ - _rx_CompileException.err = cregex_OK; - _rx_CompileException.s = NULL; -} - -/* set global error value and jump back to the exception handler */ -static void _rx_throw_compile_exception(cregex_error_t err, const char *s) -{ - _rx_CompileException.err = err; - _rx_CompileException.s = s; - longjmp(_rx_CompileException.buf, 1); -} - -static size_t _rx_calc_compiled_escaped_len(const char *s, const char **leftover) -{ - if (*s == 0) - _rx_throw_compile_exception(cregex_UNEXPECTED_EOL, s); - - const uint32_t chr = utf8_peek(s); - *leftover = utf8_next(s); - - switch (chr) { - case 's': - return 5; - - case 'S': - return 5; - - case 'd': - return 2; - - case 'D': - return 2; - - case 'w': - return 5; - - case 'W': - return 5; - - default: - return 1; - } -} - -static const size_t _rx_calc_compiled_class_len(const char *s, - const char **leftover) -{ - if (*s == '^') - s++; - - size_t ret = 1; - - while (*s && *s != ']') { - uint32_t chr = utf8_peek(s); - s = utf8_next(s); - if (chr == '\\') { - s = utf8_next(s); - } - - if (*s == '-' && s[1] != ']') { - s++; - chr = utf8_peek(s); - s = utf8_next(s); - - if (chr == '\\') - s = utf8_next(s); - } - - ret++; - } - - if (*s == ']') { - s++; - *leftover = s; - } else { - _rx_throw_compile_exception(cregex_INVALID_COMPLEX_CLASS, s); - } - - return ret; -} - -/* get required amount of memory in amount of nodes - * to _rx_compile regular expressions */ -static const size_t _rx_calc_compiled_len(const char *s) -{ - if (*s == 0) { - return 1; - } else { - const uint32_t chr = utf8_peek(s); - size_t ret = 0; - s = utf8_next(s); - - switch (chr) { - case '{': { - const char *end = strstr(s, "}"); - - if (end == NULL) - _rx_throw_compile_exception( - cregex_INVALID_COMPLEX_QUANT, s); - - s = end + 1; - ret = 1; - break; - } - - case '\\': - ret = _rx_calc_compiled_escaped_len(s, &s); - break; - - case '[': - ret = _rx_calc_compiled_class_len(s, &s); - break; - - default: - ret = 1; - break; - } - - return ret + _rx_calc_compiled_len(s); - } -} - -static void _rx_append_quant(cregex_node **prev, cregex_node *cur, size_t min, - size_t max, const char *re) -{ - cur->generic.match = _rx_quant_is_match; - cur->generic.next = NULL; - cur->generic.prev = NULL; - - cur->quant.max = max; - cur->quant.min = min; - cur->quant.subexp = *prev; - - *prev = (*prev)->generic.prev; - if (*prev == NULL) - _rx_throw_compile_exception(cregex_EARLY_QUANTIFIER, re); - - cur->quant.subexp->generic.next = NULL; - cur->quant.subexp->generic.prev = NULL; -} - -static inline bool _rx_is_digit(uint32_t c) -{ - return c >= '0' && c <= '9'; -} - -static size_t _rx_parse_digit(const char *s, const char **leftover) -{ - size_t ret = 0; - - while (*s) { - uint32_t chr = utf8_peek(s); - - if (_rx_is_digit(chr)) { - ret *= 10; - ret += chr - '0'; - s = utf8_next(s); - } else { - break; - } - } - - *leftover = s; - return ret; -} - -/* parse complex quantifier of format {m,n} - * valid formats: {,} {m,} {,n} {m} {m,n} */ -static void _rx_parse_complex_quant(const char *re, const char **leftover, - size_t *min_p, size_t *max_p) -{ - if (*re == 0) - _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re); - - uint32_t tmp = utf8_peek(re); - size_t min = 0, max = SIZE_MAX; - - if (_rx_is_digit(tmp)) { - min = _rx_parse_digit(re, &re); - } else if (tmp != ',') { - _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re); - } - - tmp = utf8_peek(re); - - if (tmp == ',') { - re = utf8_next(re); - if (_rx_is_digit(utf8_peek(re))) - max = _rx_parse_digit(re, &re); - else - max = SIZE_MAX; - } else { - max = min; - } - - tmp = utf8_peek(re); - if (tmp == '}') { - *leftover = re + 1; - *min_p = min; - *max_p = max; - } else { - _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re); - } -} - -/* append character class to linked list of nodes with - * ranges given as optional arguments. Returns pointer - * to next */ -static cregex_node *_rx_append_class(cregex_node *cur, bool negate, size_t n, ...) -{ - cur->cls.negate = negate; - cur->cls.ranges = (_rx_RangeNode *)(n ? cur + 1 : NULL); - cur->generic.match = _rx_class_is_match; - cur->generic.next = NULL; - cur->generic.prev = NULL; - - va_list ap; - va_start(ap, n); - cregex_node *prev = NULL; - cur = cur + 1; - - for (size_t i = 0; i < n; ++i) { - const uint32_t first = va_arg(ap, uint32_t); - const uint32_t last = va_arg(ap, uint32_t); - - cur->generic.next = NULL; - cur->generic.prev = prev; - - if (prev) - prev->generic.next = cur; - - cur->range.first = first; - cur->range.last = last; - - prev = cur; - cur = cur + 1; - } - - va_end(ap); - - return cur; -} - -/** _rx_compile escaped characters. return pointer to the next free node. */ -static cregex_node *_rx_compile_next_escaped(const char *re, const char **leftover, - cregex_node *cur) -{ - if (*re == 0) - _rx_throw_compile_exception(cregex_UNEXPECTED_EOL, re); - - const uint32_t chr = utf8_peek(re); - *leftover = utf8_next(re); - cregex_node *ret = cur + 1; - - switch (chr) { - case 'n': - cur->chr.chr = '\n'; - cur->generic.match = _rx_char_is_match; - break; - - case 't': - cur->chr.chr = '\t'; - cur->generic.match = _rx_char_is_match; - break; - - case 'r': - cur->chr.chr = '\r'; - cur->generic.match = _rx_char_is_match; - break; - - case 's': - ret = _rx_append_class(cur, false, 4, ' ', ' ', '\t', '\t', '\r', - '\r', '\n', '\n'); - break; - - case 'S': - ret = _rx_append_class(cur, true, 4, ' ', ' ', '\t', '\t', '\r', - '\r', '\n', '\n'); - break; - - case 'w': - ret = _rx_append_class(cur, false, 4, 'a', 'z', 'A', 'Z', '0', '9', - '_', '_'); - break; - - case 'W': - ret = _rx_append_class(cur, true, 4, 'a', 'z', 'A', 'Z', '0', '9', - '_', '_'); - break; - - case 'd': - ret = _rx_append_class(cur, false, 1, '0', '9'); - break; - - case 'D': - ret = _rx_append_class(cur, true, 1, '0', '9'); - break; - - default: - cur->chr.chr = chr; - cur->generic.match = _rx_char_is_match; - break; - } - - return ret; -} - -static cregex_node *_rx_compile_next_complex_class(const char *re, - const char **leftover, - cregex_node *cur) -{ - cur->generic.match = _rx_class_is_match; - cur->generic.next = NULL; - cur->generic.prev = NULL; - - if (*re == '^') { - re++; - cur->cls.negate = true; - } else { - cur->cls.negate = false; - } - - cur->cls.ranges = NULL; - - cur = cur + 1; - cregex_node *prev = NULL; - - while (*re && *re != ']') { - uint32_t first = 0, last = 0; - - first = utf8_peek(re); - re = utf8_next(re); - if (first == '\\') { - if (*re == 0) - _rx_throw_compile_exception( - cregex_INVALID_COMPLEX_CLASS, re); - - first = utf8_peek(re); - re = utf8_next(re); - } - - if (*re == '-' && re[1] != ']' && re[1]) { - re++; - last = utf8_peek(re); - re = utf8_next(re); - - if (last == '\\') { - if (*re == 0) - _rx_throw_compile_exception( - cregex_INVALID_COMPLEX_CLASS, - re); - - last = utf8_peek(re); - re = utf8_next(re); - } - } else { - last = first; - } - - cur->range.first = first; - cur->range.last = last; - cur->generic.prev = prev; - cur->generic.next = NULL; - - if (prev == NULL) { - (cur - 1)->cls.ranges = (_rx_RangeNode *)cur; - } else { - prev->generic.next = cur; - } - - prev = cur; - cur++; - } - - if (*re == ']') { - *leftover = re + 1; - return cur; - } else { - _rx_throw_compile_exception(cregex_INVALID_COMPLEX_CLASS, re); - return NULL; // Unreachable - } -} - -static const char *_rx_find_closing_par(const char *s) -{ - size_t level = 1; - - for (; *s && level != 0; ++s) { - if (*s == '\\') - s++; - else if (*s == '(') - level++; - else if (*s == ')') - level--; - } - - if (level == 0) - return s; - else - return NULL; -} - -static cregex_node *_rx_compile(const char *re, const char *end, cregex_node *nodes); - -static cregex_node *_rx_compile_next_cap(const char *re, const char **leftover, - cregex_node *cur) -{ - cur->cap.cap.start = 0; - cur->cap.cap.end = 0; - cur->cap.subexp = cur + 1; - cur->generic.next = NULL; - cur->generic.prev = NULL; - cur->generic.match = _rx_cap_is_match; - - const char *end = _rx_find_closing_par(re); - - if (end == NULL) - _rx_throw_compile_exception(cregex_UNCLOSED_SUBEXPRESSION, re); - - *leftover = end; - return _rx_compile(re, end - 1, cur + 1); -} - -static cregex_node *insert_or(cregex_node *cur, cregex_node **prev) { - cur->generic.match = _rx_or_is_match; - cur->generic.next = NULL; - cur->generic.prev = NULL; - - // Find last start node - cregex_node *begin = *prev; - - while (begin->generic.match != _rx_start_is_match) { - begin = begin->generic.prev; - } - - cur->ior.left = begin->generic.next; - *prev = begin; - - return cur + 1; -} - -/* _rx_compile next node. returns address of next available node. - * returns NULL if re is empty */ -static cregex_node *_rx_compile_next(const char *re, const char **leftover, - cregex_node *prev, cregex_node *cur) -{ - if (*re == 0) - return NULL; - - const uint32_t chr = utf8_peek(re); - re = utf8_next(re); - cregex_node *next = cur + 1; - - switch (chr) { - case '^': - cur->generic.match = _rx_anchor_begin_is_match; - break; - - case '$': - cur->generic.match = _rx_anchor_end_is_match; - break; - - case '.': - cur->generic.match = _rx_any_is_match; - break; - - case '*': - _rx_append_quant(&prev, cur, 0, SIZE_MAX, re); - break; - - case '+': - _rx_append_quant(&prev, cur, 1, SIZE_MAX, re); - break; - - case '?': - _rx_append_quant(&prev, cur, 0, 1, re); - break; - - case '{': { - size_t min = 0, max = SIZE_MAX; - const char *leftover = NULL; - _rx_parse_complex_quant(re, &leftover, &min, &max); - - _rx_append_quant(&prev, cur, min, max, re); - re = leftover; - break; - } - - case '[': - next = _rx_compile_next_complex_class(re, &re, cur); - break; - - case '(': - next = _rx_compile_next_cap(re, &re, cur); - break; - - case '\\': - next = _rx_compile_next_escaped(re, &re, cur); - break; - - case '|': - next = insert_or(cur, &prev); - break; - - default: - cur->chr.chr = chr; - cur->generic.match = _rx_char_is_match; - break; - } - - cur->generic.next = NULL; - cur->generic.prev = prev; - prev->generic.next = cur; - *leftover = re; - - return next; -} - -/* _rx_compile raw regular expression into a linked list of nodes. return leftover nodes */ -static cregex_node *_rx_compile(const char *re, const char *end, cregex_node *nodes) -{ - cregex_node *prev = nodes; - cregex_node *cur = nodes + 1; - - prev->generic.next = NULL; - prev->generic.prev = NULL; - prev->generic.match = _rx_start_is_match; - - while (cur != NULL && re != NULL && re < end) { - const char *next = NULL; - cregex_node *next_node = _rx_compile_next(re, &next, prev, cur); - - prev = cur; - cur = next_node; - re = next; - } - - return cur; -} - -STC_DEF cregex cregex_new(const char *re) -{ - cregex ret = {NULL}; - - _rx_clear_compile_exception(); - if (re == NULL) { - _rx_CompileException.err = cregex_INVALID_PARAMS; - return ret; - } - - if (!utf8_is_valid(re)) { - _rx_CompileException.err = cregex_INVALID_UTF8; - _rx_CompileException.s = NULL; - return ret; - } - - cregex_node *nodes = NULL; - - if (setjmp(_rx_CompileException.buf)) { - // Error callback - free(nodes); - return ret; - } - - const size_t compile_len = _rx_calc_compiled_len(re); - nodes = (cregex_node *)calloc(compile_len, sizeof(cregex_node)); - _rx_compile(re, re + strlen(re), nodes); - ret.nodes = nodes; - - return ret; -} - -STC_DEF cregex_error_t cregex_error(void) -{ - return _rx_CompileException.err; -} - -STC_DEF bool cregex_is_match(cregex* rx, const char *s) -{ - rx->input = s; - bool res = _rx_is_match(rx->nodes, s, s, &s); - return res && *s == 0; -} - -STC_DEF bool cregex_find_next(cregex *rx, const char *s, cregex_match *m) -{ - const char *it = s + m->end, *next; - rx->input = s; - - for (; *it; it = utf8_next(it)) { - if (_rx_is_match(rx->nodes, s, it, &next)) { - m->start = it - s; - m->end = next - s; - return true; - } - } - return false; -} - -STC_API bool cregex_find_next_v(cregex *rx, const char *s, csview *sv) -{ - cregex_match m = {(size_t)(sv->str - s), m.start + sv->size}; - - bool res = cregex_find_next(rx, s, &m); - *sv = c_make(csview){s + m.start, m.end - m.start}; - return res; -} - - -STC_DEF void cregex_drop(cregex *rx) -{ - free(rx->nodes); -} - -/* calculate amount of capture groups - * inside a regular expression */ -static size_t _rx_cap_node_count(cregex_node *nodes) -{ - if (nodes == NULL) { - return 0; - } else if (nodes->generic.match == _rx_quant_is_match) { - return _rx_cap_node_count(nodes->quant.subexp) + - _rx_cap_node_count(nodes->generic.next); - } else if (nodes->generic.match == _rx_cap_is_match) { - return _rx_cap_node_count(nodes->quant.subexp) + - _rx_cap_node_count(nodes->generic.next) + 1; - } else { - return _rx_cap_node_count(nodes->generic.next); - } -} - -STC_DEF size_t cregex_capture_size(cregex rx) -{ - return _rx_cap_node_count(rx.nodes); -} - -static cregex_node *_rx_find_capture_node(cregex_node *node, size_t index) -{ - if (node == NULL) { - return NULL; - } else if (node->generic.match == _rx_cap_is_match) { - if (index == 0) { - return node; - } else { - const size_t subexp_len = - _rx_cap_node_count(node->cap.subexp); - if (index <= subexp_len) { - return _rx_find_capture_node(node->cap.subexp, - index - subexp_len); - } else { - return _rx_find_capture_node(node->generic.next, - index - 1 - subexp_len); - } - } - } else if (node->generic.match == _rx_quant_is_match) { - const size_t subexp_len = _rx_cap_node_count(node->quant.subexp); - if (index < subexp_len) { - return _rx_find_capture_node(node->quant.subexp, index); - } else { - return _rx_find_capture_node(node->generic.next, index); - } - } else { - return _rx_find_capture_node(node->generic.next, index); - } -} - -STC_DEF bool cregex_capture(cregex *rx, size_t index, cregex_match *m) -{ - _rx_CapNode *cap = (_rx_CapNode *)_rx_find_capture_node(rx->nodes, index); - - if (!cap) return false; - - *m = cap->cap; - return true; -} - -#endif -#endif -#undef i_opt diff --git a/tests/cregex_test.c b/tests/cregex_test.c deleted file mode 100644 index a2691152..00000000 --- a/tests/cregex_test.c +++ /dev/null @@ -1,367 +0,0 @@ -#include <stdio.h> -#include <stdlib.h> -#include <assert.h> -//#include <check.h> -#include <stc/cregex.h> - -#define START_TEST(f) void f(void) -#define END_TEST - -#define ck_assert(x) assert(x) -#define ck_assert_uint_ne(a, b) assert(a != b) -#define ck_assert_uint_eq(a, b) if (!((a) == (b))) printf("%u == %u: ", a, b), assert((a) == (b)) -#define ck_assert_int_ne(a, b) assert(a != b) -#define ck_assert_int_eq(a, b) assert(a == b) -#define ck_assert_ptr_ne(a, b) assert(a != b) -#define ck_assert_ptr_eq(a, b) assert(a == b) - - -START_TEST(compile_match_char) -{ - cregex re = cregex_new("äsdf"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(&re, "äsdf", &match)); - ck_assert_uint_eq(match.start, 0); - ck_assert_uint_eq(match.end, 5); // ä is two bytes wide - - ck_assert(cregex_find(&re, "zäsdf", &match)); - ck_assert_uint_eq(match.start, 1); - ck_assert_uint_eq(match.end, 6); - - ck_assert(cregex_find(&re, "äsdf", &match)); - ck_assert_uint_eq(match.start, 0); - ck_assert_uint_eq(match.end, 5); - - cregex_drop(&re); -} -END_TEST - -START_TEST(compile_match_anchors) -{ - cregex re[1] = {cregex_new("^äs.f$")}; - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(re, "äsdf", &match)); - ck_assert_uint_eq(match.start, 0); - ck_assert_uint_eq(match.end, 5); - - ck_assert(cregex_find(re, "äs♥f", &match)); - ck_assert(cregex_find(re, "äsöf", &match)); - - cregex_drop(re); -} -END_TEST - -START_TEST(compile_match_quantifiers) -{ - c_auto (cregex, re) { - re = cregex_new("ä+"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(&re, "ääb", &match)); - ck_assert_uint_eq(match.start, 0); - ck_assert_uint_eq(match.end, 4); - - ck_assert(cregex_find(&re, "bäbb", &match)); - ck_assert_uint_eq(match.start, 1); - ck_assert_uint_eq(match.end, 3); - - ck_assert(!cregex_find(&re, "bbb", &match)); - } - c_auto (cregex, re) { - re = cregex_new("bä*"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(&re, "bääb", &match)); - ck_assert_uint_eq(match.start, 0); - ck_assert_uint_eq(match.end, 5); - - ck_assert(cregex_find(&re, "bäbb", &match)); - ck_assert_uint_eq(match.start, 0); - ck_assert_uint_eq(match.end, 3); - - ck_assert(cregex_find(&re, "bbb", &match)); - ck_assert_uint_eq(match.start, 0); - ck_assert_uint_eq(match.end, 1); - } -} -END_TEST - -START_TEST(compile_match_complex_quants) -{ - c_auto (cregex, re1, re2, re3, re4) - { - re1 = cregex_new("ä{1,3}"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - re2 = cregex_new("ä{1}"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - re3 = cregex_new("ä{,}"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - re4 = cregex_new("ä{,3}"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(&re1, "ääb", &match)); - ck_assert_uint_eq(match.start, 0); - ck_assert_uint_eq(match.end, 4); - ck_assert(cregex_find(&re1, "äääb", &match)); - ck_assert(cregex_find(&re1, "äb", &match)); - ck_assert(!cregex_find(&re1, "b", &match)); - - ck_assert(cregex_find(&re2, "ää", &match)); - ck_assert_uint_eq(match.start, 0); - ck_assert_uint_eq(match.end, 2); - ck_assert(cregex_find(&re2, "bbäb", &match)); - ck_assert(!cregex_find(&re2, "bbbb", &match)); - - ck_assert(cregex_find(&re3, "ääääääääääb", &match)); - ck_assert_uint_eq(match.start, 0); - ck_assert_uint_eq(match.end, 20); - ck_assert(cregex_find(&re3, "b", &match)); - - ck_assert(cregex_find(&re4, "bä", &match)); - ck_assert(cregex_find(&re4, "bää", &match)); - ck_assert(cregex_find(&re4, "bäää", &match)); - } -} -END_TEST - -START_TEST(compile_match_escaped_chars) -{ - cregex re = cregex_new("\\n\\r\\t\\{"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(&re, "\n\r\t{", &match)); - ck_assert(!cregex_find(&re, "\n\r\t", &match)); - - cregex_drop(&re); -} -END_TEST - -START_TEST(compile_match_class_simple) -{ - c_auto (cregex, re1, re2, re3) - { - re1 = cregex_new("\\s"); - ck_assert_int_eq(cregex_error(), cregex_OK); - re2 = cregex_new("\\w"); - ck_assert_int_eq(cregex_error(), cregex_OK); - re3 = cregex_new("\\D"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(&re1, " ", &match)); - ck_assert(cregex_find(&re1, "\r", &match)); - ck_assert(cregex_find(&re1, "\n", &match)); - - ck_assert(cregex_find(&re2, "a", &match)); - ck_assert(cregex_find(&re2, "0", &match)); - ck_assert(cregex_find(&re2, "_", &match)); - - ck_assert(cregex_find(&re3, "k", &match)); - ck_assert(!cregex_find(&re3, "0", &match)); - } -} -END_TEST - -START_TEST(compile_match_or) -{ - c_auto (cregex, re, re2) - { - re = cregex_new("as|df"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(&re, "as", &match)); - ck_assert(cregex_find(&re, "df", &match)); - - re2 = cregex_new("(as|df)"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - ck_assert(cregex_find(&re2, "as", &match)); - ck_assert(cregex_find(&re2, "df", &match)); - } -} -END_TEST - -START_TEST(compile_match_class_complex_0) -{ - cregex re = cregex_new("[asdf]"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(&re, "a", &match)); - ck_assert(cregex_find(&re, "s", &match)); - ck_assert(cregex_find(&re, "d", &match)); - ck_assert(cregex_find(&re, "f", &match)); - - cregex_drop(&re); -} -END_TEST - -START_TEST(compile_match_class_complex_1) -{ - cregex re = cregex_new("[a-zä0-9öA-Z]"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(&re, "a", &match)); - ck_assert(cregex_find(&re, "5", &match)); - ck_assert(cregex_find(&re, "A", &match)); - ck_assert(cregex_find(&re, "ä", &match)); - ck_assert(cregex_find(&re, "ö", &match)); - - cregex_drop(&re); -} -END_TEST - -START_TEST(compile_match_cap) -{ - cregex re = cregex_new("(abc)d"); - ck_assert_int_eq(cregex_error(), cregex_OK); - - cregex_match match; - ck_assert(cregex_find(&re, "abcd", &match)); - ck_assert(cregex_find(&re, "llljabcdkk", &match)); - ck_assert(!cregex_find(&re, "abc", &match)); - - cregex_drop(&re); -} -END_TEST - -START_TEST(invalid_quantifier) -{ - ck_assert_ptr_eq(cregex_new("+").nodes, NULL); - ck_assert_int_eq(cregex_error(), cregex_EARLY_QUANTIFIER); -} -END_TEST - -/* Test that invalid utf8 sequences cause a - * cregex_INVALID_UTF8 error */ -START_TEST(invalid_utf8) -{ - char s1[3] = "ä"; - s1[1] = 65; // invalid continuation byte - - ck_assert_ptr_eq(cregex_new(s1).nodes, NULL); - ck_assert_int_eq(cregex_error(), cregex_INVALID_UTF8); -} -END_TEST - -START_TEST(search_all) -{ - c_auto (cregex, re) - { - re = cregex_new("ab"); - cregex_match m = {0}; - bool res; - - res = cregex_find_next(&re, "ab,ab,ab", &m); - ck_assert(res && m.start == 0); - res = cregex_find_next(&re, "ab,ab,ab", &m); - ck_assert(res && m.start == 3); - res = cregex_find_next(&re, "ab,ab,ab", &m); - ck_assert(res && m.start == 6); - res = cregex_find_next(&re, "ab,ab,ab", &m); - ck_assert(!res); - } -} -END_TEST - -START_TEST(captures_len) -{ - c_auto (cregex, re) { - cregex re = cregex_new("(ab(cd))(ef)"); - ck_assert_uint_eq(cregex_capture_size(re), 3); - } -} -END_TEST - -START_TEST(captures_cap) -{ - c_auto (cregex, re) { - re = cregex_new("(ab)((cd)+)"); - ck_assert_uint_eq(cregex_capture_size(re), 3); - - cregex_match match; - ck_assert(cregex_find(&re, "xxabcdcde", &match)); - - cregex_match cap0, cap1; - cregex_capture(&re, 0, &cap0); - cregex_capture(&re, 1, &cap1); - - ck_assert_uint_eq(cap0.start, 2); - ck_assert_uint_eq(cap0.end, 4); - ck_assert_uint_eq(cap1.start, 4); - ck_assert_uint_eq(cap1.end, 8); - - ck_assert(!cregex_is_match(&re, "abcdcde")); - ck_assert(cregex_is_match(&re, "abcdcdcd")); - } -} -END_TEST - -/* -Suite *cregex_test_suite(void) -{ - Suite *ret = suite_create("mregexp"); - TCase *tcase = tcase_create("mregexp"); - - tcase_add_test(tcase, compile_match_char); - tcase_add_test(tcase, invalid_utf8); - tcase_add_test(tcase, compile_match_anchors); - tcase_add_test(tcase, compile_match_quantifiers); - tcase_add_test(tcase, invalid_quantifier); - tcase_add_test(tcase, compile_match_complex_quants); - tcase_add_test(tcase, compile_match_escaped_chars); - tcase_add_test(tcase, compile_match_class_simple); - tcase_add_test(tcase, compile_match_class_complex_0); - tcase_add_test(tcase, compile_match_class_complex_1); - tcase_add_test(tcase, compile_match_cap); - tcase_add_test(tcase, search_all); - tcase_add_test(tcase, captures_len); - tcase_add_test(tcase, captures_cap); - tcase_add_test(tcase, compile_match_or); - - suite_add_tcase(ret, tcase); - return ret; -} - -int main(void) -{ - SRunner *sr = srunner_create(cregex_test_suite()); - srunner_run_all(sr, CK_NORMAL); - int fails = srunner_ntests_failed(sr); - srunner_free(sr); - return fails ? EXIT_FAILURE : EXIT_SUCCESS; -} -*/ - -int main() -{ - compile_match_char(); - invalid_utf8(); - compile_match_anchors(); - compile_match_quantifiers(); - invalid_quantifier(); - compile_match_complex_quants(); - compile_match_escaped_chars(); - compile_match_class_simple(); - compile_match_class_complex_0(); - compile_match_class_complex_1(); - compile_match_cap(); - search_all(); - captures_len(); - captures_cap(); - compile_match_or(); - printf("All tests succesful.\n"); -} |
