summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--examples/regex1.c26
-rw-r--r--examples/regex2.c31
-rw-r--r--examples/regex_match.c35
-rw-r--r--examples/stack.c1
-rw-r--r--include/stc/cregex.h993
-rw-r--r--tests/cregex_test.c370
6 files changed, 1455 insertions, 1 deletions
diff --git a/examples/regex1.c b/examples/regex1.c
new file mode 100644
index 00000000..7385d69e
--- /dev/null
+++ b/examples/regex1.c
@@ -0,0 +1,26 @@
+#include <stc/cstr.h>
+#include <stc/cregex.h>
+
+int main()
+{
+ c_auto (cstr, input)
+ c_auto (cregex, float_expr)
+ {
+ float_expr = cregex_new("[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)");
+ // Until "q" is given, ask for another number
+ while (true)
+ {
+ printf("Enter float number (q for quit): ");
+ cstr_getline(&input, stdin);
+
+ // Exit when the user inputs q
+ if (cstr_equals(input, "q"))
+ break;
+
+ if (cregex_matches(&float_expr, input.str))
+ printf("Input is a float\n");
+ else
+ printf("Invalid input : Not a float\n");
+ }
+ }
+}
diff --git a/examples/regex2.c b/examples/regex2.c
new file mode 100644
index 00000000..5509baa5
--- /dev/null
+++ b/examples/regex2.c
@@ -0,0 +1,31 @@
+#include <stc/cregex.h>
+#include <stc/csview.h>
+#include <stc/cstr.h>
+
+int main()
+{
+ const char* inputs[] = {"birthday: 1964-12-05", "https://en.cppreference.com/w/cpp/regex/regex_search", "!123abcabc!"};
+ const char* patterns[] = {"([0-9]{4})-(1[0-2]|0[1-9])-(3[01]|[12][0-9]|0[1-9])",
+ "(https?:\\/\\/|ftp:\\/\\/|www\\.)([0-9A-Za-z-@:%_+~#=]+\\.)+([a-z]{2,3})(\\/[\\/0-9A-Za-z-\\.@:%_+~#=\\?&]*)?",
+ "!((abc|123)+)!",
+ };
+ c_forrange (i, c_arraylen(inputs))
+ {
+ c_auto (cregex, re)
+ {
+ re = cregex_new(patterns[i]);
+ csview m;
+ if (cregex_find_sv(&re, inputs[i], &m))
+ {
+ printf("input: %s\n", inputs[i]);
+ c_forrange (j, cregex_capture_size(re))
+ {
+ csview cap;
+ bool res = cregex_capture_sv(&re, j, &cap);
+ if (res) printf(" submatch %zu: " c_PRIsv "\n", j, c_ARGsv(cap));
+ }
+ puts("");
+ }
+ }
+ }
+}
diff --git a/examples/regex_match.c b/examples/regex_match.c
new file mode 100644
index 00000000..17e3355f
--- /dev/null
+++ b/examples/regex_match.c
@@ -0,0 +1,35 @@
+#include <stdio.h>
+#include <stc/csview.h>
+#include <stc/cregex.h>
+#include <stc/crandom.h>
+#define i_val double
+#define i_type Vecu64
+#include <stc/cstack.h>
+#include <time.h>
+
+
+int main()
+{
+ // Lets find the first sequence of digits in a string
+ const char *s = "Hello numeric world, there are 24 hours in a day, 3600 seconds in an hour."
+ " Around 365.25 days a year, and 52 weeks in a year."
+ " Boltzmann const: 1.38064852E-23, is very small."
+ " Bohrradius is 5.29177210903e-11, and Avogadros number is 6.02214076e23.";
+
+ c_auto (cregex, re)
+ {
+ re = cregex_new("[+-]?([0-9]*\\.)?[0-9]+([Ee][-+]?[0-9]+)?");
+ cregex_match match;
+ if (cregex_find(&re, s, &match)) {
+ printf("Found digits at position %zu-%zu\n", match.start, match.end);
+ } else {
+ printf("Could not find any digits\n");
+ }
+
+ csview sv = {0};
+ while (cregex_find_next_sv(&re, s, &sv)) {
+ printf(c_PRIsv " ; ", c_ARGsv(sv));
+ }
+ puts("");
+ }
+}
diff --git a/examples/stack.c b/examples/stack.c
index f392ecd1..29b39aef 100644
--- a/examples/stack.c
+++ b/examples/stack.c
@@ -1,6 +1,5 @@
#include <stdio.h>
-#include <stc/cstr.h>
#define i_tag i
#define i_val int
diff --git a/include/stc/cregex.h b/include/stc/cregex.h
new file mode 100644
index 00000000..11b286d2
--- /dev/null
+++ b/include/stc/cregex.h
@@ -0,0 +1,993 @@
+/*
+ * Copyright (c) 2020 Fabian van Rissenbeck
+ * https://github.com/deinernstjetzt/mregexp
+ * 2009 Bjoern Hoehrmann <[email protected]>
+ * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ * 2022 Tyge Løvset
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+*/
+#ifndef CREGEX_INCLUDED
+#define CREGEX_INCLUDED
+
+#include "csview.h"
+#include <stdlib.h>
+#include <setjmp.h>
+#include <stdarg.h>
+
+typedef struct {
+ size_t start;
+ size_t end;
+} cregex_match;
+
+typedef struct {
+ union cregex_node *nodes;
+ const char* input;
+ cregex_match match;
+} cregex;
+
+typedef enum {
+ cregex_OK = 0,
+ cregex_FAILED_ALLOC,
+ cregex_INVALID_UTF8,
+ cregex_INVALID_PARAMS,
+ cregex_EARLY_QUANTIFIER,
+ cregex_INVALID_COMPLEX_QUANT,
+ cregex_UNEXPECTED_EOL,
+ cregex_INVALID_COMPLEX_CLASS,
+ cregex_UNCLOSED_SUBEXPRESSION,
+} cregex_error_t;
+
+/* create an empty expression */
+STC_INLINE cregex cregex_init(void)
+ { cregex rx = {NULL}; return rx; }
+
+/* compile regular expression */
+STC_API cregex cregex_new(const char *re);
+
+/* get error type if a function failed */
+STC_API cregex_error_t cregex_error(void);
+
+/* check if input s matches re */
+STC_API bool cregex_matches(cregex *rx, const char *s);
+
+/* find the next matching substring in s */
+STC_API bool cregex_find_next(cregex *rx, const char *s, cregex_match *m);
+
+/* find the next matching substring in s, return as csview */
+STC_API bool cregex_find_next_sv(cregex *rx, const char *s, csview *sv);
+
+/* find the first matching substring in s */
+STC_INLINE bool cregex_find(cregex *rx, const char *s, cregex_match *m) {
+ m->start = m->end = 0;
+ return cregex_find_next(rx, s, m);
+}
+
+STC_INLINE bool cregex_find_sv(cregex *rx, const char *s, csview *sv) {
+ sv->str = s, sv->size = 0;
+ return cregex_find_next_sv(rx, s, sv);
+}
+
+/* get captured slice from capture group number index */
+STC_API bool cregex_capture(cregex *rx, size_t index, cregex_match *m);
+
+/* get captured slice from capture group number index as a csview */
+STC_INLINE bool cregex_capture_sv(cregex *rx, size_t index, csview *sv) {
+ cregex_match m;
+ bool ret = cregex_capture(rx, index, &m);
+ *sv = c_make(csview){rx->input + m.start, m.end - m.start};
+ return ret;
+}
+
+/* get amount of capture groups inside of
+ * the regular expression */
+STC_API size_t cregex_capture_size(cregex rx);
+
+/* free regular expression */
+STC_API void cregex_drop(cregex *rx);
+
+/* -------------------------- IMPLEMENTATION ------------------------- */
+#if defined(_i_implement)
+
+
+/* function pointer type used to evaluate if a regex node
+ * matched a given string */
+typedef bool (*_rx_MatchFunc)(union cregex_node *node, const char *orig,
+ const char *cur, const char **next);
+
+typedef struct _rx_GenericNode {
+ union cregex_node *prev;
+ union cregex_node *next;
+ _rx_MatchFunc match;
+} _rx_GenericNode;
+
+typedef struct {
+ _rx_GenericNode generic;
+ uint32_t chr;
+} _rx_CharNode;
+
+typedef struct {
+ _rx_GenericNode generic;
+ union cregex_node *subexp;
+ size_t min, max;
+} _rx_QuantNode;
+
+typedef struct {
+ _rx_GenericNode generic;
+ uint32_t first, last;
+} _rx_RangeNode;
+
+typedef struct {
+ _rx_GenericNode generic;
+ _rx_RangeNode *ranges;
+ bool negate;
+} _rx_ClassNode;
+
+typedef struct {
+ _rx_GenericNode generic;
+ union cregex_node *subexp;
+ cregex_match cap;
+} _rx_CapNode;
+
+typedef struct {
+ _rx_GenericNode generic;
+ union cregex_node *left;
+ union cregex_node *right;
+} _rx_OrNode;
+
+typedef union cregex_node {
+ _rx_GenericNode generic;
+ _rx_CharNode chr;
+ _rx_QuantNode quant;
+ _rx_ClassNode cls;
+ _rx_RangeNode range;
+ _rx_CapNode cap;
+ _rx_OrNode ior;
+} cregex_node;
+
+static bool _rx_is_match(cregex_node *node, const char *orig,
+ const char *cur, const char **next)
+{
+ if (node == NULL) {
+ *next = cur;
+ return true;
+ } else {
+ return ((node->generic.match)(node, orig, cur, next)) &&
+ _rx_is_match(node->generic.next, orig, *next, next);
+ }
+}
+
+static bool _rx_char_is_match(cregex_node *node, const char *orig,
+ const char *cur, const char **next)
+{
+ if (*cur == 0) {
+ return false;
+ }
+
+ *next = utf8_next(cur);
+ return node->chr.chr == utf8_peek(cur);
+}
+
+static bool _rx_start_is_match(cregex_node *node, const char *orig,
+ const char *cur, const char **next)
+{
+ *next = cur;
+ return true;
+}
+
+static bool _rx_anchor_begin_is_match(cregex_node *node, const char *orig,
+ const char *cur, const char **next)
+{
+ *next = cur;
+ return strlen(orig) == strlen(cur);
+}
+
+static bool _rx_anchor_end_is_match(cregex_node *node, const char *orig,
+ const char *cur, const char **next)
+{
+ *next = cur;
+ return strlen(cur) == 0;
+}
+
+static bool _rx_any_is_match(cregex_node *node, const char *orig,
+ const char *cur, const char **next)
+{
+ if (*cur) {
+ *next = utf8_next(cur);
+ return true;
+ }
+
+ return false;
+}
+
+
+
+static bool _rx_quant_is_match(cregex_node *node, const char *orig,
+ const char *cur, const char **next)
+{
+ _rx_QuantNode *quant = (_rx_QuantNode *)node;
+ size_t matches = 0;
+
+ while (_rx_is_match(quant->subexp, orig, cur, next)) {
+ matches++;
+ cur = *next;
+
+ if (matches >= quant->max)
+ break;
+ }
+
+ *next = cur;
+ return matches >= quant->min;
+}
+
+static bool _rx_class_is_match(cregex_node *node, const char *orig,
+ const char *cur, const char **next)
+{
+ _rx_ClassNode *cls = (_rx_ClassNode *)node;
+
+ if (*cur == 0)
+ return false;
+
+ const uint32_t chr = utf8_peek(cur);
+ *next = utf8_next(cur);
+
+ bool found = false;
+ for (_rx_RangeNode *range = cls->ranges; range != NULL;
+ range = (_rx_RangeNode *)range->generic.next) {
+ if (chr >= range->first && chr <= range->last) {
+ found = true;
+ break;
+ }
+ }
+
+ if (cls->negate)
+ found = !found;
+
+ return found;
+}
+#include <stdio.h>
+static bool _rx_cap_is_match(cregex_node *node, const char *orig,
+ const char *cur, const char **next)
+{
+ _rx_CapNode *cap = (_rx_CapNode *)node;
+
+ if (_rx_is_match(cap->subexp, orig, cur, next)) {
+ cap->cap.start = cur - orig;
+ cap->cap.end = (*next) - orig;
+ //printf("Cap: %.*s\n", cap->cap.end - cap->cap.start, orig + cap->cap.start);
+ return true;
+ }
+
+ return false;
+}
+
+static bool _rx_or_is_match(cregex_node *node, const char *orig,
+ const char *cur, const char **next)
+{
+ _rx_OrNode *ior = (_rx_OrNode *)node;
+
+ if (ior->generic.next != NULL) {
+ ior->right = ior->generic.next;
+ ior->generic.next = NULL;
+ }
+
+ if (_rx_is_match(ior->left, orig, cur, next) && ior->left != NULL) {
+ return true;
+ }
+
+ return _rx_is_match(ior->right, orig, cur, next) && ior->right != NULL;
+}
+
+/* Global error value with callback address */
+struct {
+ cregex_error_t err;
+ const char *s;
+ jmp_buf buf;
+} _rx_CompileException;
+
+/* set global error value to the default value */
+static inline void _rx_clear_compile_exception(void)
+{
+ _rx_CompileException.err = cregex_OK;
+ _rx_CompileException.s = NULL;
+}
+
+/* set global error value and jump back to the exception handler */
+static void _rx_throw_compile_exception(cregex_error_t err, const char *s)
+{
+ _rx_CompileException.err = err;
+ _rx_CompileException.s = s;
+ longjmp(_rx_CompileException.buf, 1);
+}
+
+static size_t _rx_calc_compiled_escaped_len(const char *s, const char **leftover)
+{
+ if (*s == 0)
+ _rx_throw_compile_exception(cregex_UNEXPECTED_EOL, s);
+
+ const uint32_t chr = utf8_peek(s);
+ *leftover = utf8_next(s);
+
+ switch (chr) {
+ case 's':
+ return 5;
+
+ case 'S':
+ return 5;
+
+ case 'd':
+ return 2;
+
+ case 'D':
+ return 2;
+
+ case 'w':
+ return 5;
+
+ case 'W':
+ return 5;
+
+ default:
+ return 1;
+ }
+}
+
+static const size_t _rx_calc_compiled_class_len(const char *s,
+ const char **leftover)
+{
+ if (*s == '^')
+ s++;
+
+ size_t ret = 1;
+
+ while (*s && *s != ']') {
+ uint32_t chr = utf8_peek(s);
+ s = utf8_next(s);
+ if (chr == '\\') {
+ s = utf8_next(s);
+ }
+
+ if (*s == '-' && s[1] != ']') {
+ s++;
+ chr = utf8_peek(s);
+ s = utf8_next(s);
+
+ if (chr == '\\')
+ s = utf8_next(s);
+ }
+
+ ret++;
+ }
+
+ if (*s == ']') {
+ s++;
+ *leftover = s;
+ } else {
+ _rx_throw_compile_exception(cregex_INVALID_COMPLEX_CLASS, s);
+ }
+
+ return ret;
+}
+
+/* get required amount of memory in amount of nodes
+ * to _rx_compile regular expressions */
+static const size_t _rx_calc_compiled_len(const char *s)
+{
+ if (*s == 0) {
+ return 1;
+ } else {
+ const uint32_t chr = utf8_peek(s);
+ size_t ret = 0;
+ s = utf8_next(s);
+
+ switch (chr) {
+ case '{': {
+ const char *end = strstr(s, "}");
+
+ if (end == NULL)
+ _rx_throw_compile_exception(
+ cregex_INVALID_COMPLEX_QUANT, s);
+
+ s = end + 1;
+ ret = 1;
+ break;
+ }
+
+ case '\\':
+ ret = _rx_calc_compiled_escaped_len(s, &s);
+ break;
+
+ case '[':
+ ret = _rx_calc_compiled_class_len(s, &s);
+ break;
+
+ default:
+ ret = 1;
+ break;
+ }
+
+ return ret + _rx_calc_compiled_len(s);
+ }
+}
+
+static void _rx_append_quant(cregex_node **prev, cregex_node *cur, size_t min,
+ size_t max, const char *re)
+{
+ cur->generic.match = _rx_quant_is_match;
+ cur->generic.next = NULL;
+ cur->generic.prev = NULL;
+
+ cur->quant.max = max;
+ cur->quant.min = min;
+ cur->quant.subexp = *prev;
+
+ *prev = (*prev)->generic.prev;
+ if (*prev == NULL)
+ _rx_throw_compile_exception(cregex_EARLY_QUANTIFIER, re);
+
+ cur->quant.subexp->generic.next = NULL;
+ cur->quant.subexp->generic.prev = NULL;
+}
+
+static inline bool _rx_is_digit(uint32_t c)
+{
+ return c >= '0' && c <= '9';
+}
+
+static size_t _rx_parse_digit(const char *s, const char **leftover)
+{
+ size_t ret = 0;
+
+ while (*s) {
+ uint32_t chr = utf8_peek(s);
+
+ if (_rx_is_digit(chr)) {
+ ret *= 10;
+ ret += chr - '0';
+ s = utf8_next(s);
+ } else {
+ break;
+ }
+ }
+
+ *leftover = s;
+ return ret;
+}
+
+/* parse complex quantifier of format {m,n}
+ * valid formats: {,} {m,} {,n} {m} {m,n} */
+static void _rx_parse_complex_quant(const char *re, const char **leftover,
+ size_t *min_p, size_t *max_p)
+{
+ if (*re == 0)
+ _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re);
+
+ uint32_t tmp = utf8_peek(re);
+ size_t min = 0, max = SIZE_MAX;
+
+ if (_rx_is_digit(tmp)) {
+ min = _rx_parse_digit(re, &re);
+ } else if (tmp != ',') {
+ _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re);
+ }
+
+ tmp = utf8_peek(re);
+
+ if (tmp == ',') {
+ re = utf8_next(re);
+ if (_rx_is_digit(utf8_peek(re)))
+ max = _rx_parse_digit(re, &re);
+ else
+ max = SIZE_MAX;
+ } else {
+ max = min;
+ }
+
+ tmp = utf8_peek(re);
+ if (tmp == '}') {
+ *leftover = re + 1;
+ *min_p = min;
+ *max_p = max;
+ } else {
+ _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re);
+ }
+}
+
+/* append character class to linked list of nodes with
+ * ranges given as optional arguments. Returns pointer
+ * to next */
+static cregex_node *_rx_append_class(cregex_node *cur, bool negate, size_t n, ...)
+{
+ cur->cls.negate = negate;
+ cur->cls.ranges = (_rx_RangeNode *)(n ? cur + 1 : NULL);
+ cur->generic.match = _rx_class_is_match;
+ cur->generic.next = NULL;
+ cur->generic.prev = NULL;
+
+ va_list ap;
+ va_start(ap, n);
+ cregex_node *prev = NULL;
+ cur = cur + 1;
+
+ for (size_t i = 0; i < n; ++i) {
+ const uint32_t first = va_arg(ap, uint32_t);
+ const uint32_t last = va_arg(ap, uint32_t);
+
+ cur->generic.next = NULL;
+ cur->generic.prev = prev;
+
+ if (prev)
+ prev->generic.next = cur;
+
+ cur->range.first = first;
+ cur->range.last = last;
+
+ prev = cur;
+ cur = cur + 1;
+ }
+
+ va_end(ap);
+
+ return cur;
+}
+
+/** _rx_compile escaped characters. return pointer to the next free node. */
+static cregex_node *_rx_compile_next_escaped(const char *re, const char **leftover,
+ cregex_node *cur)
+{
+ if (*re == 0)
+ _rx_throw_compile_exception(cregex_UNEXPECTED_EOL, re);
+
+ const uint32_t chr = utf8_peek(re);
+ *leftover = utf8_next(re);
+ cregex_node *ret = cur + 1;
+
+ switch (chr) {
+ case 'n':
+ cur->chr.chr = '\n';
+ cur->generic.match = _rx_char_is_match;
+ break;
+
+ case 't':
+ cur->chr.chr = '\t';
+ cur->generic.match = _rx_char_is_match;
+ break;
+
+ case 'r':
+ cur->chr.chr = '\r';
+ cur->generic.match = _rx_char_is_match;
+ break;
+
+ case 's':
+ ret = _rx_append_class(cur, false, 4, ' ', ' ', '\t', '\t', '\r',
+ '\r', '\n', '\n');
+ break;
+
+ case 'S':
+ ret = _rx_append_class(cur, true, 4, ' ', ' ', '\t', '\t', '\r',
+ '\r', '\n', '\n');
+ break;
+
+ case 'w':
+ ret = _rx_append_class(cur, false, 4, 'a', 'z', 'A', 'Z', '0', '9',
+ '_', '_');
+ break;
+
+ case 'W':
+ ret = _rx_append_class(cur, true, 4, 'a', 'z', 'A', 'Z', '0', '9',
+ '_', '_');
+ break;
+
+ case 'd':
+ ret = _rx_append_class(cur, false, 1, '0', '9');
+ break;
+
+ case 'D':
+ ret = _rx_append_class(cur, true, 1, '0', '9');
+ break;
+
+ default:
+ cur->chr.chr = chr;
+ cur->generic.match = _rx_char_is_match;
+ break;
+ }
+
+ return ret;
+}
+
+static cregex_node *_rx_compile_next_complex_class(const char *re,
+ const char **leftover,
+ cregex_node *cur)
+{
+ cur->generic.match = _rx_class_is_match;
+ cur->generic.next = NULL;
+ cur->generic.prev = NULL;
+
+ if (*re == '^') {
+ re++;
+ cur->cls.negate = true;
+ } else {
+ cur->cls.negate = false;
+ }
+
+ cur->cls.ranges = NULL;
+
+ cur = cur + 1;
+ cregex_node *prev = NULL;
+
+ while (*re && *re != ']') {
+ uint32_t first = 0, last = 0;
+
+ first = utf8_peek(re);
+ re = utf8_next(re);
+ if (first == '\\') {
+ if (*re == 0)
+ _rx_throw_compile_exception(
+ cregex_INVALID_COMPLEX_CLASS, re);
+
+ first = utf8_peek(re);
+ re = utf8_next(re);
+ }
+
+ if (*re == '-' && re[1] != ']' && re[1]) {
+ re++;
+ last = utf8_peek(re);
+ re = utf8_next(re);
+
+ if (last == '\\') {
+ if (*re == 0)
+ _rx_throw_compile_exception(
+ cregex_INVALID_COMPLEX_CLASS,
+ re);
+
+ last = utf8_peek(re);
+ re = utf8_next(re);
+ }
+ } else {
+ last = first;
+ }
+
+ cur->range.first = first;
+ cur->range.last = last;
+ cur->generic.prev = prev;
+ cur->generic.next = NULL;
+
+ if (prev == NULL) {
+ (cur - 1)->cls.ranges = (_rx_RangeNode *)cur;
+ } else {
+ prev->generic.next = cur;
+ }
+
+ prev = cur;
+ cur++;
+ }
+
+ if (*re == ']') {
+ *leftover = re + 1;
+ return cur;
+ } else {
+ _rx_throw_compile_exception(cregex_INVALID_COMPLEX_CLASS, re);
+ return NULL; // Unreachable
+ }
+}
+
+static const char *_rx_find_closing_par(const char *s)
+{
+ size_t level = 1;
+
+ for (; *s && level != 0; ++s) {
+ if (*s == '\\')
+ s++;
+ else if (*s == '(')
+ level++;
+ else if (*s == ')')
+ level--;
+ }
+
+ if (level == 0)
+ return s;
+ else
+ return NULL;
+}
+
+static cregex_node *_rx_compile(const char *re, const char *end, cregex_node *nodes);
+
+static cregex_node *_rx_compile_next_cap(const char *re, const char **leftover,
+ cregex_node *cur)
+{
+ cur->cap.cap.start = 0;
+ cur->cap.cap.end = 0;
+ cur->cap.subexp = cur + 1;
+ cur->generic.next = NULL;
+ cur->generic.prev = NULL;
+ cur->generic.match = _rx_cap_is_match;
+
+ const char *end = _rx_find_closing_par(re);
+
+ if (end == NULL)
+ _rx_throw_compile_exception(cregex_UNCLOSED_SUBEXPRESSION, re);
+
+ *leftover = end;
+ return _rx_compile(re, end - 1, cur + 1);
+}
+
+static cregex_node *insert_or(cregex_node *cur, cregex_node **prev) {
+ cur->generic.match = _rx_or_is_match;
+ cur->generic.next = NULL;
+ cur->generic.prev = NULL;
+
+ // Find last start node
+ cregex_node *begin = *prev;
+
+ while (begin->generic.match != _rx_start_is_match) {
+ begin = begin->generic.prev;
+ }
+
+ cur->ior.left = begin->generic.next;
+ *prev = begin;
+
+ return cur + 1;
+}
+
+/* _rx_compile next node. returns address of next available node.
+ * returns NULL if re is empty */
+static cregex_node *_rx_compile_next(const char *re, const char **leftover,
+ cregex_node *prev, cregex_node *cur)
+{
+ if (*re == 0)
+ return NULL;
+
+ const uint32_t chr = utf8_peek(re);
+ re = utf8_next(re);
+ cregex_node *next = cur + 1;
+
+ switch (chr) {
+ case '^':
+ cur->generic.match = _rx_anchor_begin_is_match;
+ break;
+
+ case '$':
+ cur->generic.match = _rx_anchor_end_is_match;
+ break;
+
+ case '.':
+ cur->generic.match = _rx_any_is_match;
+ break;
+
+ case '*':
+ _rx_append_quant(&prev, cur, 0, SIZE_MAX, re);
+ break;
+
+ case '+':
+ _rx_append_quant(&prev, cur, 1, SIZE_MAX, re);
+ break;
+
+ case '?':
+ _rx_append_quant(&prev, cur, 0, 1, re);
+ break;
+
+ case '{': {
+ size_t min = 0, max = SIZE_MAX;
+ const char *leftover = NULL;
+ _rx_parse_complex_quant(re, &leftover, &min, &max);
+
+ _rx_append_quant(&prev, cur, min, max, re);
+ re = leftover;
+ break;
+ }
+
+ case '[':
+ next = _rx_compile_next_complex_class(re, &re, cur);
+ break;
+
+ case '(':
+ next = _rx_compile_next_cap(re, &re, cur);
+ break;
+
+ case '\\':
+ next = _rx_compile_next_escaped(re, &re, cur);
+ break;
+
+ case '|':
+ next = insert_or(cur, &prev);
+ break;
+
+ default:
+ cur->chr.chr = chr;
+ cur->generic.match = _rx_char_is_match;
+ break;
+ }
+
+ cur->generic.next = NULL;
+ cur->generic.prev = prev;
+ prev->generic.next = cur;
+ *leftover = re;
+
+ return next;
+}
+
+/* _rx_compile raw regular expression into a linked list of nodes. return leftover nodes */
+static cregex_node *_rx_compile(const char *re, const char *end, cregex_node *nodes)
+{
+ cregex_node *prev = nodes;
+ cregex_node *cur = nodes + 1;
+
+ prev->generic.next = NULL;
+ prev->generic.prev = NULL;
+ prev->generic.match = _rx_start_is_match;
+
+ while (cur != NULL && re != NULL && re < end) {
+ const char *next = NULL;
+ cregex_node *next_node = _rx_compile_next(re, &next, prev, cur);
+
+ prev = cur;
+ cur = next_node;
+ re = next;
+ }
+
+ return cur;
+}
+
+STC_DEF cregex cregex_new(const char *re)
+{
+ cregex ret = {NULL};
+
+ _rx_clear_compile_exception();
+ if (re == NULL) {
+ _rx_CompileException.err = cregex_INVALID_PARAMS;
+ return ret;
+ }
+
+ if (!utf8_valid(re)) {
+ _rx_CompileException.err = cregex_INVALID_UTF8;
+ _rx_CompileException.s = NULL;
+ return ret;
+ }
+
+ cregex_node *nodes = NULL;
+
+ if (setjmp(_rx_CompileException.buf)) {
+ // Error callback
+ free(nodes);
+ return ret;
+ }
+
+ const size_t compile_len = _rx_calc_compiled_len(re);
+ nodes = (cregex_node *)calloc(compile_len, sizeof(cregex_node));
+ _rx_compile(re, re + strlen(re), nodes);
+ ret.nodes = nodes;
+
+ return ret;
+}
+
+STC_DEF cregex_error_t cregex_error(void)
+{
+ return _rx_CompileException.err;
+}
+
+STC_DEF bool cregex_matches(cregex* rx, const char *s)
+{
+ const char* next;
+ bool res = _rx_is_match(rx->nodes, s, s, &next);
+ if (res && *next == 0) {
+ rx->match.start = 0;
+ rx->match.end = next - s;
+ rx->input = s;
+ return true;
+ }
+ rx->input = NULL;
+ return false;
+}
+
+STC_DEF bool cregex_find_next(cregex *rx, const char *s, cregex_match *m)
+{
+ const char *it = s + m->end, *next;
+
+ for (; *it; it = utf8_next(it)) {
+ if (_rx_is_match(rx->nodes, s, it, &next)) {
+ m->start = it - s;
+ m->end = next - s;
+ rx->match = *m;
+ rx->input = s;
+ return true;
+ }
+ }
+ rx->input = NULL;
+ return false;
+}
+
+STC_API bool cregex_find_next_sv(cregex *rx, const char *s, csview *sv)
+{
+ if (!sv->str) sv->str = s;
+ cregex_match m = {(size_t)(sv->str - s), m.start + sv->size};
+
+ bool res = cregex_find_next(rx, s, &m);
+ if (res) *sv = c_make(csview){s + m.start, m.end - m.start};
+ return res;
+}
+
+
+STC_DEF void cregex_drop(cregex *rx)
+{
+ free(rx->nodes);
+}
+
+/* calculate amount of capture groups
+ * inside a regular expression */
+static size_t _rx_cap_node_count(cregex_node *nodes)
+{
+ if (nodes == NULL) {
+ return 0;
+ } else if (nodes->generic.match == _rx_quant_is_match) {
+ return _rx_cap_node_count(nodes->quant.subexp) +
+ _rx_cap_node_count(nodes->generic.next);
+ } else if (nodes->generic.match == _rx_cap_is_match) {
+ return _rx_cap_node_count(nodes->quant.subexp) +
+ _rx_cap_node_count(nodes->generic.next) + 1;
+ } else {
+ return _rx_cap_node_count(nodes->generic.next);
+ }
+}
+
+STC_DEF size_t cregex_capture_size(cregex rx)
+{
+ return _rx_cap_node_count(rx.nodes) + (rx.input != NULL);
+}
+
+static cregex_node *_rx_find_capture_node(cregex_node *node, size_t index)
+{
+ if (node == NULL) {
+ return NULL;
+ } else if (node->generic.match == _rx_cap_is_match) {
+ if (index == 0) {
+ return node;
+ } else {
+ const size_t subexp_len =
+ _rx_cap_node_count(node->cap.subexp);
+ if (index <= subexp_len) {
+ return _rx_find_capture_node(node->cap.subexp,
+ index - subexp_len);
+ } else {
+ return _rx_find_capture_node(node->generic.next,
+ index - subexp_len - 1);
+ }
+ }
+ } else if (node->generic.match == _rx_quant_is_match) {
+ const size_t subexp_len = _rx_cap_node_count(node->quant.subexp);
+ if (index < subexp_len) {
+ return _rx_find_capture_node(node->quant.subexp, index);
+ } else {
+ return _rx_find_capture_node(node->generic.next, index - 1); // FIX by Tyge, added: - 1
+ }
+ } else {
+ return _rx_find_capture_node(node->generic.next, index);
+ }
+}
+
+STC_DEF bool cregex_capture(cregex *rx, size_t index, cregex_match *m)
+{
+ if (index == 0) { *m = rx->match; return m->end != 0; }
+
+ _rx_CapNode *cap = (_rx_CapNode *)_rx_find_capture_node(rx->nodes, index - 1);
+ if (cap) *m = cap->cap;
+ return cap != NULL;
+}
+
+#endif
+#endif
+#undef i_opt
diff --git a/tests/cregex_test.c b/tests/cregex_test.c
new file mode 100644
index 00000000..a9549f96
--- /dev/null
+++ b/tests/cregex_test.c
@@ -0,0 +1,370 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+//#include <check.h>
+#include <stc/cregex.h>
+
+#define START_TEST(f) void f(void)
+#define END_TEST
+
+#define ck_assert(x) assert(x)
+#define ck_assert_uint_ne(a, b) assert(a != b)
+#define ck_assert_uint_eq(a, b) if (!((a) == (b))) printf("%u == %u: ", a, b), assert((a) == (b))
+#define ck_assert_int_ne(a, b) assert(a != b)
+#define ck_assert_int_eq(a, b) assert(a == b)
+#define ck_assert_ptr_ne(a, b) assert(a != b)
+#define ck_assert_ptr_eq(a, b) assert(a == b)
+
+
+START_TEST(compile_match_char)
+{
+ cregex re = cregex_new("äsdf");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re, "äsdf", &match));
+ ck_assert_uint_eq(match.start, 0);
+ ck_assert_uint_eq(match.end, 5); // ä is two bytes wide
+
+ ck_assert(cregex_find(&re, "zäsdf", &match));
+ ck_assert_uint_eq(match.start, 1);
+ ck_assert_uint_eq(match.end, 6);
+
+ ck_assert(cregex_find(&re, "äsdf", &match));
+ ck_assert_uint_eq(match.start, 0);
+ ck_assert_uint_eq(match.end, 5);
+
+ cregex_drop(&re);
+}
+END_TEST
+
+START_TEST(compile_match_anchors)
+{
+ cregex re[1] = {cregex_new("^äs.f$")};
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(re, "äsdf", &match));
+ ck_assert_uint_eq(match.start, 0);
+ ck_assert_uint_eq(match.end, 5);
+
+ ck_assert(cregex_find(re, "äs♥f", &match));
+ ck_assert(cregex_find(re, "äsöf", &match));
+
+ cregex_drop(re);
+}
+END_TEST
+
+START_TEST(compile_match_quantifiers)
+{
+ c_auto (cregex, re) {
+ re = cregex_new("ä+");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re, "ääb", &match));
+ ck_assert_uint_eq(match.start, 0);
+ ck_assert_uint_eq(match.end, 4);
+
+ ck_assert(cregex_find(&re, "bäbb", &match));
+ ck_assert_uint_eq(match.start, 1);
+ ck_assert_uint_eq(match.end, 3);
+
+ ck_assert(!cregex_find(&re, "bbb", &match));
+ }
+ c_auto (cregex, re) {
+ re = cregex_new("bä*");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re, "bääb", &match));
+ ck_assert_uint_eq(match.start, 0);
+ ck_assert_uint_eq(match.end, 5);
+
+ ck_assert(cregex_find(&re, "bäbb", &match));
+ ck_assert_uint_eq(match.start, 0);
+ ck_assert_uint_eq(match.end, 3);
+
+ ck_assert(cregex_find(&re, "bbb", &match));
+ ck_assert_uint_eq(match.start, 0);
+ ck_assert_uint_eq(match.end, 1);
+ }
+}
+END_TEST
+
+START_TEST(compile_match_complex_quants)
+{
+ c_auto (cregex, re1, re2, re3, re4)
+ {
+ re1 = cregex_new("ä{1,3}");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ re2 = cregex_new("ä{1}");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ re3 = cregex_new("ä{,}");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ re4 = cregex_new("ä{,3}");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re1, "ääb", &match));
+ ck_assert_uint_eq(match.start, 0);
+ ck_assert_uint_eq(match.end, 4);
+ ck_assert(cregex_find(&re1, "äääb", &match));
+ ck_assert(cregex_find(&re1, "äb", &match));
+ ck_assert(!cregex_find(&re1, "b", &match));
+
+ ck_assert(cregex_find(&re2, "ää", &match));
+ ck_assert_uint_eq(match.start, 0);
+ ck_assert_uint_eq(match.end, 2);
+ ck_assert(cregex_find(&re2, "bbäb", &match));
+ ck_assert(!cregex_find(&re2, "bbbb", &match));
+
+ ck_assert(cregex_find(&re3, "ääääääääääb", &match));
+ ck_assert_uint_eq(match.start, 0);
+ ck_assert_uint_eq(match.end, 20);
+ ck_assert(cregex_find(&re3, "b", &match));
+
+ ck_assert(cregex_find(&re4, "bä", &match));
+ ck_assert(cregex_find(&re4, "bää", &match));
+ ck_assert(cregex_find(&re4, "bäää", &match));
+ }
+}
+END_TEST
+
+START_TEST(compile_match_escaped_chars)
+{
+ cregex re = cregex_new("\\n\\r\\t\\{");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re, "\n\r\t{", &match));
+ ck_assert(!cregex_find(&re, "\n\r\t", &match));
+
+ cregex_drop(&re);
+}
+END_TEST
+
+START_TEST(compile_match_class_simple)
+{
+ c_auto (cregex, re1, re2, re3)
+ {
+ re1 = cregex_new("\\s");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+ re2 = cregex_new("\\w");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+ re3 = cregex_new("\\D");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re1, " ", &match));
+ ck_assert(cregex_find(&re1, "\r", &match));
+ ck_assert(cregex_find(&re1, "\n", &match));
+
+ ck_assert(cregex_find(&re2, "a", &match));
+ ck_assert(cregex_find(&re2, "0", &match));
+ ck_assert(cregex_find(&re2, "_", &match));
+
+ ck_assert(cregex_find(&re3, "k", &match));
+ ck_assert(!cregex_find(&re3, "0", &match));
+ }
+}
+END_TEST
+
+START_TEST(compile_match_or)
+{
+ c_auto (cregex, re, re2)
+ {
+ re = cregex_new("as|df");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re, "as", &match));
+ ck_assert(cregex_find(&re, "df", &match));
+
+ re2 = cregex_new("(as|df)");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ ck_assert(cregex_find(&re2, "as", &match));
+ ck_assert(cregex_find(&re2, "df", &match));
+ }
+}
+END_TEST
+
+START_TEST(compile_match_class_complex_0)
+{
+ cregex re = cregex_new("[asdf]");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re, "a", &match));
+ ck_assert(cregex_find(&re, "s", &match));
+ ck_assert(cregex_find(&re, "d", &match));
+ ck_assert(cregex_find(&re, "f", &match));
+
+ cregex_drop(&re);
+}
+END_TEST
+
+START_TEST(compile_match_class_complex_1)
+{
+ cregex re = cregex_new("[a-zä0-9öA-Z]");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re, "a", &match));
+ ck_assert(cregex_find(&re, "5", &match));
+ ck_assert(cregex_find(&re, "A", &match));
+ ck_assert(cregex_find(&re, "ä", &match));
+ ck_assert(cregex_find(&re, "ö", &match));
+
+ cregex_drop(&re);
+}
+END_TEST
+
+START_TEST(compile_match_cap)
+{
+ cregex re = cregex_new("(abc)d");
+ ck_assert_int_eq(cregex_error(), cregex_OK);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re, "abcd", &match));
+ ck_assert(cregex_find(&re, "llljabcdkk", &match));
+ ck_assert(!cregex_find(&re, "abc", &match));
+
+ cregex_drop(&re);
+}
+END_TEST
+
+START_TEST(invalid_quantifier)
+{
+ ck_assert_ptr_eq(cregex_new("+").nodes, NULL);
+ ck_assert_int_eq(cregex_error(), cregex_EARLY_QUANTIFIER);
+}
+END_TEST
+
+/* Test that invalid utf8 sequences cause a
+ * cregex_INVALID_UTF8 error */
+START_TEST(invalid_utf8)
+{
+ char s1[3] = "ä";
+ s1[1] = 65; // invalid continuation byte
+
+ ck_assert_ptr_eq(cregex_new(s1).nodes, NULL);
+ ck_assert_int_eq(cregex_error(), cregex_INVALID_UTF8);
+}
+END_TEST
+
+START_TEST(search_all)
+{
+ c_auto (cregex, re)
+ {
+ re = cregex_new("ab");
+ cregex_match m = {0};
+ bool res;
+
+ res = cregex_find_next(&re, "ab,ab,ab", &m);
+ ck_assert(res && m.start == 0);
+ res = cregex_find_next(&re, "ab,ab,ab", &m);
+ ck_assert(res && m.start == 3);
+ res = cregex_find_next(&re, "ab,ab,ab", &m);
+ ck_assert(res && m.start == 6);
+ res = cregex_find_next(&re, "ab,ab,ab", &m);
+ ck_assert(!res);
+ }
+}
+END_TEST
+
+START_TEST(captures_len)
+{
+ c_auto (cregex, re) {
+ cregex re = cregex_new("(ab(cd))(ef)");
+ ck_assert_uint_eq(cregex_capture_size(re), 3);
+ }
+}
+END_TEST
+
+START_TEST(captures_cap)
+{
+ c_auto (cregex, re) {
+ re = cregex_new("(ab)((cd)+)");
+ ck_assert_uint_eq(cregex_capture_size(re), 3);
+
+ cregex_match match;
+ ck_assert(cregex_find(&re, "xxabcdcde", &match));
+
+ cregex_match cap0, cap1, cap2;
+ cregex_capture(&re, 0, &cap0);
+ cregex_capture(&re, 1, &cap1);
+ cregex_capture(&re, 2, &cap2);
+
+ ck_assert_uint_eq(cap0.start, 2);
+ ck_assert_uint_eq(cap0.end, 8);
+ ck_assert_uint_eq(cap1.start, 2);
+ ck_assert_uint_eq(cap1.end, 4);
+ ck_assert_uint_eq(cap2.start, 4);
+ ck_assert_uint_eq(cap2.end, 8);
+
+ ck_assert(!cregex_matches(&re, "abcdcde"));
+ ck_assert(cregex_matches(&re, "abcdcdcd"));
+ }
+}
+END_TEST
+
+/*
+Suite *cregex_test_suite(void)
+{
+ Suite *ret = suite_create("mregexp");
+ TCase *tcase = tcase_create("mregexp");
+
+ tcase_add_test(tcase, compile_match_char);
+ tcase_add_test(tcase, invalid_utf8);
+ tcase_add_test(tcase, compile_match_anchors);
+ tcase_add_test(tcase, compile_match_quantifiers);
+ tcase_add_test(tcase, invalid_quantifier);
+ tcase_add_test(tcase, compile_match_complex_quants);
+ tcase_add_test(tcase, compile_match_escaped_chars);
+ tcase_add_test(tcase, compile_match_class_simple);
+ tcase_add_test(tcase, compile_match_class_complex_0);
+ tcase_add_test(tcase, compile_match_class_complex_1);
+ tcase_add_test(tcase, compile_match_cap);
+ tcase_add_test(tcase, search_all);
+ //tcase_add_test(tcase, captures_len);
+ //tcase_add_test(tcase, captures_cap);
+ tcase_add_test(tcase, compile_match_or);
+
+ suite_add_tcase(ret, tcase);
+ return ret;
+}
+
+int main(void)
+{
+ SRunner *sr = srunner_create(cregex_test_suite());
+ srunner_run_all(sr, CK_NORMAL);
+ int fails = srunner_ntests_failed(sr);
+ srunner_free(sr);
+ return fails ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+*/
+
+int main()
+{
+ compile_match_char();
+ invalid_utf8();
+ compile_match_anchors();
+ compile_match_quantifiers();
+ invalid_quantifier();
+ compile_match_complex_quants();
+ compile_match_escaped_chars();
+ compile_match_class_simple();
+ compile_match_class_complex_0();
+ compile_match_class_complex_1();
+ compile_match_cap();
+ search_all();
+ //captures_len();
+ //captures_cap();
+ compile_match_or();
+ printf("All tests succesful.\n");
+}