summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorTyge Løvset <[email protected]>2022-01-11 00:27:44 +0100
committerTyge Løvset <[email protected]>2022-01-11 00:27:44 +0100
commitaf1b2068b39deebfbf723532a2eddc49da61894a (patch)
treef26d06d6be52e280bea0f82558e3e040a89b4335
parent14c0c1b9ab2ce6a4f98baf389d712515b7c68933 (diff)
downloadSTC-modified-af1b2068b39deebfbf723532a2eddc49da61894a.tar.gz
STC-modified-af1b2068b39deebfbf723532a2eddc49da61894a.zip
Removed regex for now.
-rw-r--r--examples/regex1.c26
-rw-r--r--examples/regex2.c26
-rw-r--r--examples/regex_match.c35
-rw-r--r--include/stc/cregex.h981
-rw-r--r--tests/cregex_test.c367
5 files changed, 0 insertions, 1435 deletions
diff --git a/examples/regex1.c b/examples/regex1.c
deleted file mode 100644
index 9b450453..00000000
--- a/examples/regex1.c
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <stc/cstr.h>
-#include <stc/cregex.h>
-
-int main()
-{
- c_auto (cstr, input)
- c_auto (cregex, float_expr)
- {
- float_expr = cregex_new("[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)");
- // Until "q" is given, ask for another number
- while (true)
- {
- printf("Enter float number (q for quit): ");
- cstr_getline(&input, stdin);
-
- // Exit when the user inputs q
- if (cstr_equals(input, "q"))
- break;
-
- if (cregex_is_match(&float_expr, input.str))
- printf("Input is a float\n");
- else
- printf("Invalid input : Not a float\n");
- }
- }
-} \ No newline at end of file
diff --git a/examples/regex2.c b/examples/regex2.c
deleted file mode 100644
index 0562af00..00000000
--- a/examples/regex2.c
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <stc/cregex.h>
-#include <stc/csview.h>
-#include <stc/cstr.h>
-
-int main()
-{
- const char* fnames[] = {"home/foofile.txt", "cool/barfile.txt", "test/bazboy.dat", "hello/zoidberg"};
- c_auto (cregex, re)
- {
- re = cregex_new("([a-z]+)\\/([a-z]+)\\.([a-z]+)");
-
- c_forrange (i, c_arraylen(fnames))
- {
- printf("%s\n", fnames[i]);
- if (cregex_is_match(&re, fnames[i]))
- {
- c_forrange (j, cregex_capture_size(re))
- {
- csview cap; cregex_capture_v(&re, j, &cap);
- printf(" submatch %zu: " c_PRIsv "\n", j, c_ARGsv(cap));
- }
- puts("");
- }
- }
- }
-}
diff --git a/examples/regex_match.c b/examples/regex_match.c
deleted file mode 100644
index 72adeb8e..00000000
--- a/examples/regex_match.c
+++ /dev/null
@@ -1,35 +0,0 @@
-#include <stdio.h>
-#include <stc/csview.h>
-#include <stc/cregex.h>
-#include <stc/crandom.h>
-#define i_val double
-#define i_type Vecu64
-#include <stc/cstack.h>
-#include <time.h>
-
-
-int main()
-{
- // Lets find the first sequence of digits in a string
- const char *s = "Hello numeric world, there are 24 hours in a day, 3600 seconds in an hour."
- " Around 365.25 days a year, and 52 weeks in a year."
- " Boltzmann const: 1.38064852E-23, is very small."
- " Bohrradius is 5.29177210903e-11, and Avogadros number is 6.02214076e23.";
-
- c_auto (cregex, re)
- {
- re = cregex_new("[+-]?([0-9]*\\.)?[0-9]+([Ee][-+]?[0-9]+)?");
- cregex_match match;
- if (cregex_find(&re, s, &match)) {
- printf("Found digits at position %zu-%zu\n", match.start, match.end);
- } else {
- printf("Could not find any digits\n");
- }
-
- csview sv = {0};
- while (cregex_find_next_v(&re, s, &sv)) {
- printf(c_PRIsv " ; ", c_ARGsv(sv));
- }
- puts("");
- }
-} \ No newline at end of file
diff --git a/include/stc/cregex.h b/include/stc/cregex.h
deleted file mode 100644
index 84799d75..00000000
--- a/include/stc/cregex.h
+++ /dev/null
@@ -1,981 +0,0 @@
-/*
- * Copyright (c) 2020 Fabian van Rissenbeck
- * https://github.com/deinernstjetzt/mregexp
- * 2009 Bjoern Hoehrmann <[email protected]>
- * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
- * 2022 Tyge Løvset
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
-
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
-
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-*/
-#ifndef CREGEX_INCLUDED
-#define CREGEX_INCLUDED
-
-#include "csview.h"
-#include "utf8.h"
-#include <stdlib.h>
-#include <setjmp.h>
-#include <stdarg.h>
-
-typedef struct {
- union cregex_node *nodes;
- const char* input;
-} cregex;
-
-typedef struct {
- size_t start;
- size_t end;
-} cregex_match;
-
-typedef enum {
- cregex_OK = 0,
- cregex_FAILED_ALLOC,
- cregex_INVALID_UTF8,
- cregex_INVALID_PARAMS,
- cregex_EARLY_QUANTIFIER,
- cregex_INVALID_COMPLEX_QUANT,
- cregex_UNEXPECTED_EOL,
- cregex_INVALID_COMPLEX_CLASS,
- cregex_UNCLOSED_SUBEXPRESSION,
-} cregex_error_t;
-
-/* create an empty expression */
-STC_INLINE cregex cregex_init(void)
- { cregex rx = {NULL}; return rx; }
-
-/* compile regular expression */
-STC_API cregex cregex_new(const char *re);
-
-/* get error type if a function failed */
-STC_API cregex_error_t cregex_error(void);
-
-/* check if input s matches re */
-STC_API bool cregex_is_match(cregex *rx, const char *s);
-
-/* find the next matching substring in s */
-STC_API bool cregex_find_next(cregex *rx, const char *s, cregex_match *m);
-
-/* find the next matching substring in s, return as csview */
-STC_API bool cregex_find_next_v(cregex *rx, const char *s, csview *sv);
-
-/* find the first matching substring in s */
-STC_INLINE bool cregex_find(cregex *rx, const char *s, cregex_match *m) {
- m->start = m->end = 0;
- return cregex_find_next(rx, s, m);
-}
-
-STC_INLINE bool cregex_find_v(cregex *rx, const char *s, csview *sv) {
- sv->str = s, sv->size = 0;
- return cregex_find_next_v(rx, s, sv);
-}
-
-/* get captured slice from capture group number index */
-STC_API bool cregex_capture(cregex *rx, size_t index, cregex_match *m);
-
-/* get captured slice from capture group number index as a csview */
-STC_INLINE bool cregex_capture_v(cregex *rx, size_t index, csview *sv) {
- cregex_match m;
- bool ret = cregex_capture(rx, index, &m);
- *sv = c_make(csview){rx->input + m.start, m.end - m.start};
- return ret;
-}
-
-/* get amount of capture groups inside of
- * the regular expression */
-STC_API size_t cregex_capture_size(cregex rx);
-
-/* free regular expression */
-STC_API void cregex_drop(cregex *rx);
-
-/* -------------------------- IMPLEMENTATION ------------------------- */
-#if defined(_i_implement)
-
-
-/* function pointer type used to evaluate if a regex node
- * matched a given string */
-typedef bool (*_rx_MatchFunc)(union cregex_node *node, const char *orig,
- const char *cur, const char **next);
-
-typedef struct _rx_GenericNode {
- union cregex_node *prev;
- union cregex_node *next;
- _rx_MatchFunc match;
-} _rx_GenericNode;
-
-typedef struct {
- _rx_GenericNode generic;
- uint32_t chr;
-} _rx_CharNode;
-
-typedef struct {
- _rx_GenericNode generic;
- union cregex_node *subexp;
- size_t min, max;
-} _rx_QuantNode;
-
-typedef struct {
- _rx_GenericNode generic;
- uint32_t first, last;
-} _rx_RangeNode;
-
-typedef struct {
- _rx_GenericNode generic;
- _rx_RangeNode *ranges;
- bool negate;
-} _rx_ClassNode;
-
-typedef struct {
- _rx_GenericNode generic;
- union cregex_node *subexp;
- cregex_match cap;
-} _rx_CapNode;
-
-typedef struct {
- _rx_GenericNode generic;
- union cregex_node *left;
- union cregex_node *right;
-} _rx_OrNode;
-
-typedef union cregex_node {
- _rx_GenericNode generic;
- _rx_CharNode chr;
- _rx_QuantNode quant;
- _rx_ClassNode cls;
- _rx_RangeNode range;
- _rx_CapNode cap;
- _rx_OrNode ior;
-} cregex_node;
-
-static bool _rx_is_match(cregex_node *node, const char *orig,
- const char *cur, const char **next)
-{
- if (node == NULL) {
- *next = cur;
- return true;
- } else {
- return ((node->generic.match)(node, orig, cur, next)) &&
- _rx_is_match(node->generic.next, orig, *next, next);
- }
-}
-
-static bool _rx_char_is_match(cregex_node *node, const char *orig,
- const char *cur, const char **next)
-{
- if (*cur == 0) {
- return false;
- }
-
- *next = utf8_next(cur);
- return node->chr.chr == utf8_peek(cur);
-}
-
-static bool _rx_start_is_match(cregex_node *node, const char *orig,
- const char *cur, const char **next)
-{
- *next = cur;
- return true;
-}
-
-static bool _rx_anchor_begin_is_match(cregex_node *node, const char *orig,
- const char *cur, const char **next)
-{
- *next = cur;
- return strlen(orig) == strlen(cur);
-}
-
-static bool _rx_anchor_end_is_match(cregex_node *node, const char *orig,
- const char *cur, const char **next)
-{
- *next = cur;
- return strlen(cur) == 0;
-}
-
-static bool _rx_any_is_match(cregex_node *node, const char *orig,
- const char *cur, const char **next)
-{
- if (*cur) {
- *next = utf8_next(cur);
- return true;
- }
-
- return false;
-}
-
-static bool _rx_quant_is_match(cregex_node *node, const char *orig,
- const char *cur, const char **next)
-{
- _rx_QuantNode *quant = (_rx_QuantNode *)node;
- size_t matches = 0;
-
- while (_rx_is_match(quant->subexp, orig, cur, next)) {
- matches++;
- cur = *next;
-
- if (matches >= quant->max)
- break;
- }
-
- *next = cur;
- return matches >= quant->min;
-}
-
-static bool _rx_class_is_match(cregex_node *node, const char *orig,
- const char *cur, const char **next)
-{
- _rx_ClassNode *cls = (_rx_ClassNode *)node;
-
- if (*cur == 0)
- return false;
-
- const uint32_t chr = utf8_peek(cur);
- *next = utf8_next(cur);
-
- bool found = false;
- for (_rx_RangeNode *range = cls->ranges; range != NULL;
- range = (_rx_RangeNode *)range->generic.next) {
- if (chr >= range->first && chr <= range->last) {
- found = true;
- break;
- }
- }
-
- if (cls->negate)
- found = !found;
-
- return found;
-}
-
-static bool _rx_cap_is_match(cregex_node *node, const char *orig,
- const char *cur, const char **next)
-{
- _rx_CapNode *cap = (_rx_CapNode *)node;
-
- if (_rx_is_match(cap->subexp, orig, cur, next)) {
- cap->cap.start = cur - orig;
- cap->cap.end = (*next) - orig;
- return true;
- }
-
- return false;
-}
-
-static bool _rx_or_is_match(cregex_node *node, const char *orig,
- const char *cur, const char **next)
-{
- _rx_OrNode *ior = (_rx_OrNode *)node;
-
- if (ior->generic.next != NULL) {
- ior->right = ior->generic.next;
- ior->generic.next = NULL;
- }
-
- if (_rx_is_match(ior->left, orig, cur, next) && ior->left != NULL) {
- return true;
- }
-
- return _rx_is_match(ior->right, orig, cur, next) && ior->right != NULL;
-}
-
-/* Global error value with callback address */
-struct {
- cregex_error_t err;
- const char *s;
- jmp_buf buf;
-} _rx_CompileException;
-
-/* set global error value to the default value */
-static inline void _rx_clear_compile_exception(void)
-{
- _rx_CompileException.err = cregex_OK;
- _rx_CompileException.s = NULL;
-}
-
-/* set global error value and jump back to the exception handler */
-static void _rx_throw_compile_exception(cregex_error_t err, const char *s)
-{
- _rx_CompileException.err = err;
- _rx_CompileException.s = s;
- longjmp(_rx_CompileException.buf, 1);
-}
-
-static size_t _rx_calc_compiled_escaped_len(const char *s, const char **leftover)
-{
- if (*s == 0)
- _rx_throw_compile_exception(cregex_UNEXPECTED_EOL, s);
-
- const uint32_t chr = utf8_peek(s);
- *leftover = utf8_next(s);
-
- switch (chr) {
- case 's':
- return 5;
-
- case 'S':
- return 5;
-
- case 'd':
- return 2;
-
- case 'D':
- return 2;
-
- case 'w':
- return 5;
-
- case 'W':
- return 5;
-
- default:
- return 1;
- }
-}
-
-static const size_t _rx_calc_compiled_class_len(const char *s,
- const char **leftover)
-{
- if (*s == '^')
- s++;
-
- size_t ret = 1;
-
- while (*s && *s != ']') {
- uint32_t chr = utf8_peek(s);
- s = utf8_next(s);
- if (chr == '\\') {
- s = utf8_next(s);
- }
-
- if (*s == '-' && s[1] != ']') {
- s++;
- chr = utf8_peek(s);
- s = utf8_next(s);
-
- if (chr == '\\')
- s = utf8_next(s);
- }
-
- ret++;
- }
-
- if (*s == ']') {
- s++;
- *leftover = s;
- } else {
- _rx_throw_compile_exception(cregex_INVALID_COMPLEX_CLASS, s);
- }
-
- return ret;
-}
-
-/* get required amount of memory in amount of nodes
- * to _rx_compile regular expressions */
-static const size_t _rx_calc_compiled_len(const char *s)
-{
- if (*s == 0) {
- return 1;
- } else {
- const uint32_t chr = utf8_peek(s);
- size_t ret = 0;
- s = utf8_next(s);
-
- switch (chr) {
- case '{': {
- const char *end = strstr(s, "}");
-
- if (end == NULL)
- _rx_throw_compile_exception(
- cregex_INVALID_COMPLEX_QUANT, s);
-
- s = end + 1;
- ret = 1;
- break;
- }
-
- case '\\':
- ret = _rx_calc_compiled_escaped_len(s, &s);
- break;
-
- case '[':
- ret = _rx_calc_compiled_class_len(s, &s);
- break;
-
- default:
- ret = 1;
- break;
- }
-
- return ret + _rx_calc_compiled_len(s);
- }
-}
-
-static void _rx_append_quant(cregex_node **prev, cregex_node *cur, size_t min,
- size_t max, const char *re)
-{
- cur->generic.match = _rx_quant_is_match;
- cur->generic.next = NULL;
- cur->generic.prev = NULL;
-
- cur->quant.max = max;
- cur->quant.min = min;
- cur->quant.subexp = *prev;
-
- *prev = (*prev)->generic.prev;
- if (*prev == NULL)
- _rx_throw_compile_exception(cregex_EARLY_QUANTIFIER, re);
-
- cur->quant.subexp->generic.next = NULL;
- cur->quant.subexp->generic.prev = NULL;
-}
-
-static inline bool _rx_is_digit(uint32_t c)
-{
- return c >= '0' && c <= '9';
-}
-
-static size_t _rx_parse_digit(const char *s, const char **leftover)
-{
- size_t ret = 0;
-
- while (*s) {
- uint32_t chr = utf8_peek(s);
-
- if (_rx_is_digit(chr)) {
- ret *= 10;
- ret += chr - '0';
- s = utf8_next(s);
- } else {
- break;
- }
- }
-
- *leftover = s;
- return ret;
-}
-
-/* parse complex quantifier of format {m,n}
- * valid formats: {,} {m,} {,n} {m} {m,n} */
-static void _rx_parse_complex_quant(const char *re, const char **leftover,
- size_t *min_p, size_t *max_p)
-{
- if (*re == 0)
- _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re);
-
- uint32_t tmp = utf8_peek(re);
- size_t min = 0, max = SIZE_MAX;
-
- if (_rx_is_digit(tmp)) {
- min = _rx_parse_digit(re, &re);
- } else if (tmp != ',') {
- _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re);
- }
-
- tmp = utf8_peek(re);
-
- if (tmp == ',') {
- re = utf8_next(re);
- if (_rx_is_digit(utf8_peek(re)))
- max = _rx_parse_digit(re, &re);
- else
- max = SIZE_MAX;
- } else {
- max = min;
- }
-
- tmp = utf8_peek(re);
- if (tmp == '}') {
- *leftover = re + 1;
- *min_p = min;
- *max_p = max;
- } else {
- _rx_throw_compile_exception(cregex_INVALID_COMPLEX_QUANT, re);
- }
-}
-
-/* append character class to linked list of nodes with
- * ranges given as optional arguments. Returns pointer
- * to next */
-static cregex_node *_rx_append_class(cregex_node *cur, bool negate, size_t n, ...)
-{
- cur->cls.negate = negate;
- cur->cls.ranges = (_rx_RangeNode *)(n ? cur + 1 : NULL);
- cur->generic.match = _rx_class_is_match;
- cur->generic.next = NULL;
- cur->generic.prev = NULL;
-
- va_list ap;
- va_start(ap, n);
- cregex_node *prev = NULL;
- cur = cur + 1;
-
- for (size_t i = 0; i < n; ++i) {
- const uint32_t first = va_arg(ap, uint32_t);
- const uint32_t last = va_arg(ap, uint32_t);
-
- cur->generic.next = NULL;
- cur->generic.prev = prev;
-
- if (prev)
- prev->generic.next = cur;
-
- cur->range.first = first;
- cur->range.last = last;
-
- prev = cur;
- cur = cur + 1;
- }
-
- va_end(ap);
-
- return cur;
-}
-
-/** _rx_compile escaped characters. return pointer to the next free node. */
-static cregex_node *_rx_compile_next_escaped(const char *re, const char **leftover,
- cregex_node *cur)
-{
- if (*re == 0)
- _rx_throw_compile_exception(cregex_UNEXPECTED_EOL, re);
-
- const uint32_t chr = utf8_peek(re);
- *leftover = utf8_next(re);
- cregex_node *ret = cur + 1;
-
- switch (chr) {
- case 'n':
- cur->chr.chr = '\n';
- cur->generic.match = _rx_char_is_match;
- break;
-
- case 't':
- cur->chr.chr = '\t';
- cur->generic.match = _rx_char_is_match;
- break;
-
- case 'r':
- cur->chr.chr = '\r';
- cur->generic.match = _rx_char_is_match;
- break;
-
- case 's':
- ret = _rx_append_class(cur, false, 4, ' ', ' ', '\t', '\t', '\r',
- '\r', '\n', '\n');
- break;
-
- case 'S':
- ret = _rx_append_class(cur, true, 4, ' ', ' ', '\t', '\t', '\r',
- '\r', '\n', '\n');
- break;
-
- case 'w':
- ret = _rx_append_class(cur, false, 4, 'a', 'z', 'A', 'Z', '0', '9',
- '_', '_');
- break;
-
- case 'W':
- ret = _rx_append_class(cur, true, 4, 'a', 'z', 'A', 'Z', '0', '9',
- '_', '_');
- break;
-
- case 'd':
- ret = _rx_append_class(cur, false, 1, '0', '9');
- break;
-
- case 'D':
- ret = _rx_append_class(cur, true, 1, '0', '9');
- break;
-
- default:
- cur->chr.chr = chr;
- cur->generic.match = _rx_char_is_match;
- break;
- }
-
- return ret;
-}
-
-static cregex_node *_rx_compile_next_complex_class(const char *re,
- const char **leftover,
- cregex_node *cur)
-{
- cur->generic.match = _rx_class_is_match;
- cur->generic.next = NULL;
- cur->generic.prev = NULL;
-
- if (*re == '^') {
- re++;
- cur->cls.negate = true;
- } else {
- cur->cls.negate = false;
- }
-
- cur->cls.ranges = NULL;
-
- cur = cur + 1;
- cregex_node *prev = NULL;
-
- while (*re && *re != ']') {
- uint32_t first = 0, last = 0;
-
- first = utf8_peek(re);
- re = utf8_next(re);
- if (first == '\\') {
- if (*re == 0)
- _rx_throw_compile_exception(
- cregex_INVALID_COMPLEX_CLASS, re);
-
- first = utf8_peek(re);
- re = utf8_next(re);
- }
-
- if (*re == '-' && re[1] != ']' && re[1]) {
- re++;
- last = utf8_peek(re);
- re = utf8_next(re);
-
- if (last == '\\') {
- if (*re == 0)
- _rx_throw_compile_exception(
- cregex_INVALID_COMPLEX_CLASS,
- re);
-
- last = utf8_peek(re);
- re = utf8_next(re);
- }
- } else {
- last = first;
- }
-
- cur->range.first = first;
- cur->range.last = last;
- cur->generic.prev = prev;
- cur->generic.next = NULL;
-
- if (prev == NULL) {
- (cur - 1)->cls.ranges = (_rx_RangeNode *)cur;
- } else {
- prev->generic.next = cur;
- }
-
- prev = cur;
- cur++;
- }
-
- if (*re == ']') {
- *leftover = re + 1;
- return cur;
- } else {
- _rx_throw_compile_exception(cregex_INVALID_COMPLEX_CLASS, re);
- return NULL; // Unreachable
- }
-}
-
-static const char *_rx_find_closing_par(const char *s)
-{
- size_t level = 1;
-
- for (; *s && level != 0; ++s) {
- if (*s == '\\')
- s++;
- else if (*s == '(')
- level++;
- else if (*s == ')')
- level--;
- }
-
- if (level == 0)
- return s;
- else
- return NULL;
-}
-
-static cregex_node *_rx_compile(const char *re, const char *end, cregex_node *nodes);
-
-static cregex_node *_rx_compile_next_cap(const char *re, const char **leftover,
- cregex_node *cur)
-{
- cur->cap.cap.start = 0;
- cur->cap.cap.end = 0;
- cur->cap.subexp = cur + 1;
- cur->generic.next = NULL;
- cur->generic.prev = NULL;
- cur->generic.match = _rx_cap_is_match;
-
- const char *end = _rx_find_closing_par(re);
-
- if (end == NULL)
- _rx_throw_compile_exception(cregex_UNCLOSED_SUBEXPRESSION, re);
-
- *leftover = end;
- return _rx_compile(re, end - 1, cur + 1);
-}
-
-static cregex_node *insert_or(cregex_node *cur, cregex_node **prev) {
- cur->generic.match = _rx_or_is_match;
- cur->generic.next = NULL;
- cur->generic.prev = NULL;
-
- // Find last start node
- cregex_node *begin = *prev;
-
- while (begin->generic.match != _rx_start_is_match) {
- begin = begin->generic.prev;
- }
-
- cur->ior.left = begin->generic.next;
- *prev = begin;
-
- return cur + 1;
-}
-
-/* _rx_compile next node. returns address of next available node.
- * returns NULL if re is empty */
-static cregex_node *_rx_compile_next(const char *re, const char **leftover,
- cregex_node *prev, cregex_node *cur)
-{
- if (*re == 0)
- return NULL;
-
- const uint32_t chr = utf8_peek(re);
- re = utf8_next(re);
- cregex_node *next = cur + 1;
-
- switch (chr) {
- case '^':
- cur->generic.match = _rx_anchor_begin_is_match;
- break;
-
- case '$':
- cur->generic.match = _rx_anchor_end_is_match;
- break;
-
- case '.':
- cur->generic.match = _rx_any_is_match;
- break;
-
- case '*':
- _rx_append_quant(&prev, cur, 0, SIZE_MAX, re);
- break;
-
- case '+':
- _rx_append_quant(&prev, cur, 1, SIZE_MAX, re);
- break;
-
- case '?':
- _rx_append_quant(&prev, cur, 0, 1, re);
- break;
-
- case '{': {
- size_t min = 0, max = SIZE_MAX;
- const char *leftover = NULL;
- _rx_parse_complex_quant(re, &leftover, &min, &max);
-
- _rx_append_quant(&prev, cur, min, max, re);
- re = leftover;
- break;
- }
-
- case '[':
- next = _rx_compile_next_complex_class(re, &re, cur);
- break;
-
- case '(':
- next = _rx_compile_next_cap(re, &re, cur);
- break;
-
- case '\\':
- next = _rx_compile_next_escaped(re, &re, cur);
- break;
-
- case '|':
- next = insert_or(cur, &prev);
- break;
-
- default:
- cur->chr.chr = chr;
- cur->generic.match = _rx_char_is_match;
- break;
- }
-
- cur->generic.next = NULL;
- cur->generic.prev = prev;
- prev->generic.next = cur;
- *leftover = re;
-
- return next;
-}
-
-/* _rx_compile raw regular expression into a linked list of nodes. return leftover nodes */
-static cregex_node *_rx_compile(const char *re, const char *end, cregex_node *nodes)
-{
- cregex_node *prev = nodes;
- cregex_node *cur = nodes + 1;
-
- prev->generic.next = NULL;
- prev->generic.prev = NULL;
- prev->generic.match = _rx_start_is_match;
-
- while (cur != NULL && re != NULL && re < end) {
- const char *next = NULL;
- cregex_node *next_node = _rx_compile_next(re, &next, prev, cur);
-
- prev = cur;
- cur = next_node;
- re = next;
- }
-
- return cur;
-}
-
-STC_DEF cregex cregex_new(const char *re)
-{
- cregex ret = {NULL};
-
- _rx_clear_compile_exception();
- if (re == NULL) {
- _rx_CompileException.err = cregex_INVALID_PARAMS;
- return ret;
- }
-
- if (!utf8_is_valid(re)) {
- _rx_CompileException.err = cregex_INVALID_UTF8;
- _rx_CompileException.s = NULL;
- return ret;
- }
-
- cregex_node *nodes = NULL;
-
- if (setjmp(_rx_CompileException.buf)) {
- // Error callback
- free(nodes);
- return ret;
- }
-
- const size_t compile_len = _rx_calc_compiled_len(re);
- nodes = (cregex_node *)calloc(compile_len, sizeof(cregex_node));
- _rx_compile(re, re + strlen(re), nodes);
- ret.nodes = nodes;
-
- return ret;
-}
-
-STC_DEF cregex_error_t cregex_error(void)
-{
- return _rx_CompileException.err;
-}
-
-STC_DEF bool cregex_is_match(cregex* rx, const char *s)
-{
- rx->input = s;
- bool res = _rx_is_match(rx->nodes, s, s, &s);
- return res && *s == 0;
-}
-
-STC_DEF bool cregex_find_next(cregex *rx, const char *s, cregex_match *m)
-{
- const char *it = s + m->end, *next;
- rx->input = s;
-
- for (; *it; it = utf8_next(it)) {
- if (_rx_is_match(rx->nodes, s, it, &next)) {
- m->start = it - s;
- m->end = next - s;
- return true;
- }
- }
- return false;
-}
-
-STC_API bool cregex_find_next_v(cregex *rx, const char *s, csview *sv)
-{
- cregex_match m = {(size_t)(sv->str - s), m.start + sv->size};
-
- bool res = cregex_find_next(rx, s, &m);
- *sv = c_make(csview){s + m.start, m.end - m.start};
- return res;
-}
-
-
-STC_DEF void cregex_drop(cregex *rx)
-{
- free(rx->nodes);
-}
-
-/* calculate amount of capture groups
- * inside a regular expression */
-static size_t _rx_cap_node_count(cregex_node *nodes)
-{
- if (nodes == NULL) {
- return 0;
- } else if (nodes->generic.match == _rx_quant_is_match) {
- return _rx_cap_node_count(nodes->quant.subexp) +
- _rx_cap_node_count(nodes->generic.next);
- } else if (nodes->generic.match == _rx_cap_is_match) {
- return _rx_cap_node_count(nodes->quant.subexp) +
- _rx_cap_node_count(nodes->generic.next) + 1;
- } else {
- return _rx_cap_node_count(nodes->generic.next);
- }
-}
-
-STC_DEF size_t cregex_capture_size(cregex rx)
-{
- return _rx_cap_node_count(rx.nodes);
-}
-
-static cregex_node *_rx_find_capture_node(cregex_node *node, size_t index)
-{
- if (node == NULL) {
- return NULL;
- } else if (node->generic.match == _rx_cap_is_match) {
- if (index == 0) {
- return node;
- } else {
- const size_t subexp_len =
- _rx_cap_node_count(node->cap.subexp);
- if (index <= subexp_len) {
- return _rx_find_capture_node(node->cap.subexp,
- index - subexp_len);
- } else {
- return _rx_find_capture_node(node->generic.next,
- index - 1 - subexp_len);
- }
- }
- } else if (node->generic.match == _rx_quant_is_match) {
- const size_t subexp_len = _rx_cap_node_count(node->quant.subexp);
- if (index < subexp_len) {
- return _rx_find_capture_node(node->quant.subexp, index);
- } else {
- return _rx_find_capture_node(node->generic.next, index);
- }
- } else {
- return _rx_find_capture_node(node->generic.next, index);
- }
-}
-
-STC_DEF bool cregex_capture(cregex *rx, size_t index, cregex_match *m)
-{
- _rx_CapNode *cap = (_rx_CapNode *)_rx_find_capture_node(rx->nodes, index);
-
- if (!cap) return false;
-
- *m = cap->cap;
- return true;
-}
-
-#endif
-#endif
-#undef i_opt
diff --git a/tests/cregex_test.c b/tests/cregex_test.c
deleted file mode 100644
index a2691152..00000000
--- a/tests/cregex_test.c
+++ /dev/null
@@ -1,367 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-//#include <check.h>
-#include <stc/cregex.h>
-
-#define START_TEST(f) void f(void)
-#define END_TEST
-
-#define ck_assert(x) assert(x)
-#define ck_assert_uint_ne(a, b) assert(a != b)
-#define ck_assert_uint_eq(a, b) if (!((a) == (b))) printf("%u == %u: ", a, b), assert((a) == (b))
-#define ck_assert_int_ne(a, b) assert(a != b)
-#define ck_assert_int_eq(a, b) assert(a == b)
-#define ck_assert_ptr_ne(a, b) assert(a != b)
-#define ck_assert_ptr_eq(a, b) assert(a == b)
-
-
-START_TEST(compile_match_char)
-{
- cregex re = cregex_new("äsdf");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(&re, "äsdf", &match));
- ck_assert_uint_eq(match.start, 0);
- ck_assert_uint_eq(match.end, 5); // ä is two bytes wide
-
- ck_assert(cregex_find(&re, "zäsdf", &match));
- ck_assert_uint_eq(match.start, 1);
- ck_assert_uint_eq(match.end, 6);
-
- ck_assert(cregex_find(&re, "äsdf", &match));
- ck_assert_uint_eq(match.start, 0);
- ck_assert_uint_eq(match.end, 5);
-
- cregex_drop(&re);
-}
-END_TEST
-
-START_TEST(compile_match_anchors)
-{
- cregex re[1] = {cregex_new("^äs.f$")};
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(re, "äsdf", &match));
- ck_assert_uint_eq(match.start, 0);
- ck_assert_uint_eq(match.end, 5);
-
- ck_assert(cregex_find(re, "äs♥f", &match));
- ck_assert(cregex_find(re, "äsöf", &match));
-
- cregex_drop(re);
-}
-END_TEST
-
-START_TEST(compile_match_quantifiers)
-{
- c_auto (cregex, re) {
- re = cregex_new("ä+");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(&re, "ääb", &match));
- ck_assert_uint_eq(match.start, 0);
- ck_assert_uint_eq(match.end, 4);
-
- ck_assert(cregex_find(&re, "bäbb", &match));
- ck_assert_uint_eq(match.start, 1);
- ck_assert_uint_eq(match.end, 3);
-
- ck_assert(!cregex_find(&re, "bbb", &match));
- }
- c_auto (cregex, re) {
- re = cregex_new("bä*");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(&re, "bääb", &match));
- ck_assert_uint_eq(match.start, 0);
- ck_assert_uint_eq(match.end, 5);
-
- ck_assert(cregex_find(&re, "bäbb", &match));
- ck_assert_uint_eq(match.start, 0);
- ck_assert_uint_eq(match.end, 3);
-
- ck_assert(cregex_find(&re, "bbb", &match));
- ck_assert_uint_eq(match.start, 0);
- ck_assert_uint_eq(match.end, 1);
- }
-}
-END_TEST
-
-START_TEST(compile_match_complex_quants)
-{
- c_auto (cregex, re1, re2, re3, re4)
- {
- re1 = cregex_new("ä{1,3}");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- re2 = cregex_new("ä{1}");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- re3 = cregex_new("ä{,}");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- re4 = cregex_new("ä{,3}");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(&re1, "ääb", &match));
- ck_assert_uint_eq(match.start, 0);
- ck_assert_uint_eq(match.end, 4);
- ck_assert(cregex_find(&re1, "äääb", &match));
- ck_assert(cregex_find(&re1, "äb", &match));
- ck_assert(!cregex_find(&re1, "b", &match));
-
- ck_assert(cregex_find(&re2, "ää", &match));
- ck_assert_uint_eq(match.start, 0);
- ck_assert_uint_eq(match.end, 2);
- ck_assert(cregex_find(&re2, "bbäb", &match));
- ck_assert(!cregex_find(&re2, "bbbb", &match));
-
- ck_assert(cregex_find(&re3, "ääääääääääb", &match));
- ck_assert_uint_eq(match.start, 0);
- ck_assert_uint_eq(match.end, 20);
- ck_assert(cregex_find(&re3, "b", &match));
-
- ck_assert(cregex_find(&re4, "bä", &match));
- ck_assert(cregex_find(&re4, "bää", &match));
- ck_assert(cregex_find(&re4, "bäää", &match));
- }
-}
-END_TEST
-
-START_TEST(compile_match_escaped_chars)
-{
- cregex re = cregex_new("\\n\\r\\t\\{");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(&re, "\n\r\t{", &match));
- ck_assert(!cregex_find(&re, "\n\r\t", &match));
-
- cregex_drop(&re);
-}
-END_TEST
-
-START_TEST(compile_match_class_simple)
-{
- c_auto (cregex, re1, re2, re3)
- {
- re1 = cregex_new("\\s");
- ck_assert_int_eq(cregex_error(), cregex_OK);
- re2 = cregex_new("\\w");
- ck_assert_int_eq(cregex_error(), cregex_OK);
- re3 = cregex_new("\\D");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(&re1, " ", &match));
- ck_assert(cregex_find(&re1, "\r", &match));
- ck_assert(cregex_find(&re1, "\n", &match));
-
- ck_assert(cregex_find(&re2, "a", &match));
- ck_assert(cregex_find(&re2, "0", &match));
- ck_assert(cregex_find(&re2, "_", &match));
-
- ck_assert(cregex_find(&re3, "k", &match));
- ck_assert(!cregex_find(&re3, "0", &match));
- }
-}
-END_TEST
-
-START_TEST(compile_match_or)
-{
- c_auto (cregex, re, re2)
- {
- re = cregex_new("as|df");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(&re, "as", &match));
- ck_assert(cregex_find(&re, "df", &match));
-
- re2 = cregex_new("(as|df)");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- ck_assert(cregex_find(&re2, "as", &match));
- ck_assert(cregex_find(&re2, "df", &match));
- }
-}
-END_TEST
-
-START_TEST(compile_match_class_complex_0)
-{
- cregex re = cregex_new("[asdf]");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(&re, "a", &match));
- ck_assert(cregex_find(&re, "s", &match));
- ck_assert(cregex_find(&re, "d", &match));
- ck_assert(cregex_find(&re, "f", &match));
-
- cregex_drop(&re);
-}
-END_TEST
-
-START_TEST(compile_match_class_complex_1)
-{
- cregex re = cregex_new("[a-zä0-9öA-Z]");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(&re, "a", &match));
- ck_assert(cregex_find(&re, "5", &match));
- ck_assert(cregex_find(&re, "A", &match));
- ck_assert(cregex_find(&re, "ä", &match));
- ck_assert(cregex_find(&re, "ö", &match));
-
- cregex_drop(&re);
-}
-END_TEST
-
-START_TEST(compile_match_cap)
-{
- cregex re = cregex_new("(abc)d");
- ck_assert_int_eq(cregex_error(), cregex_OK);
-
- cregex_match match;
- ck_assert(cregex_find(&re, "abcd", &match));
- ck_assert(cregex_find(&re, "llljabcdkk", &match));
- ck_assert(!cregex_find(&re, "abc", &match));
-
- cregex_drop(&re);
-}
-END_TEST
-
-START_TEST(invalid_quantifier)
-{
- ck_assert_ptr_eq(cregex_new("+").nodes, NULL);
- ck_assert_int_eq(cregex_error(), cregex_EARLY_QUANTIFIER);
-}
-END_TEST
-
-/* Test that invalid utf8 sequences cause a
- * cregex_INVALID_UTF8 error */
-START_TEST(invalid_utf8)
-{
- char s1[3] = "ä";
- s1[1] = 65; // invalid continuation byte
-
- ck_assert_ptr_eq(cregex_new(s1).nodes, NULL);
- ck_assert_int_eq(cregex_error(), cregex_INVALID_UTF8);
-}
-END_TEST
-
-START_TEST(search_all)
-{
- c_auto (cregex, re)
- {
- re = cregex_new("ab");
- cregex_match m = {0};
- bool res;
-
- res = cregex_find_next(&re, "ab,ab,ab", &m);
- ck_assert(res && m.start == 0);
- res = cregex_find_next(&re, "ab,ab,ab", &m);
- ck_assert(res && m.start == 3);
- res = cregex_find_next(&re, "ab,ab,ab", &m);
- ck_assert(res && m.start == 6);
- res = cregex_find_next(&re, "ab,ab,ab", &m);
- ck_assert(!res);
- }
-}
-END_TEST
-
-START_TEST(captures_len)
-{
- c_auto (cregex, re) {
- cregex re = cregex_new("(ab(cd))(ef)");
- ck_assert_uint_eq(cregex_capture_size(re), 3);
- }
-}
-END_TEST
-
-START_TEST(captures_cap)
-{
- c_auto (cregex, re) {
- re = cregex_new("(ab)((cd)+)");
- ck_assert_uint_eq(cregex_capture_size(re), 3);
-
- cregex_match match;
- ck_assert(cregex_find(&re, "xxabcdcde", &match));
-
- cregex_match cap0, cap1;
- cregex_capture(&re, 0, &cap0);
- cregex_capture(&re, 1, &cap1);
-
- ck_assert_uint_eq(cap0.start, 2);
- ck_assert_uint_eq(cap0.end, 4);
- ck_assert_uint_eq(cap1.start, 4);
- ck_assert_uint_eq(cap1.end, 8);
-
- ck_assert(!cregex_is_match(&re, "abcdcde"));
- ck_assert(cregex_is_match(&re, "abcdcdcd"));
- }
-}
-END_TEST
-
-/*
-Suite *cregex_test_suite(void)
-{
- Suite *ret = suite_create("mregexp");
- TCase *tcase = tcase_create("mregexp");
-
- tcase_add_test(tcase, compile_match_char);
- tcase_add_test(tcase, invalid_utf8);
- tcase_add_test(tcase, compile_match_anchors);
- tcase_add_test(tcase, compile_match_quantifiers);
- tcase_add_test(tcase, invalid_quantifier);
- tcase_add_test(tcase, compile_match_complex_quants);
- tcase_add_test(tcase, compile_match_escaped_chars);
- tcase_add_test(tcase, compile_match_class_simple);
- tcase_add_test(tcase, compile_match_class_complex_0);
- tcase_add_test(tcase, compile_match_class_complex_1);
- tcase_add_test(tcase, compile_match_cap);
- tcase_add_test(tcase, search_all);
- tcase_add_test(tcase, captures_len);
- tcase_add_test(tcase, captures_cap);
- tcase_add_test(tcase, compile_match_or);
-
- suite_add_tcase(ret, tcase);
- return ret;
-}
-
-int main(void)
-{
- SRunner *sr = srunner_create(cregex_test_suite());
- srunner_run_all(sr, CK_NORMAL);
- int fails = srunner_ntests_failed(sr);
- srunner_free(sr);
- return fails ? EXIT_FAILURE : EXIT_SUCCESS;
-}
-*/
-
-int main()
-{
- compile_match_char();
- invalid_utf8();
- compile_match_anchors();
- compile_match_quantifiers();
- invalid_quantifier();
- compile_match_complex_quants();
- compile_match_escaped_chars();
- compile_match_class_simple();
- compile_match_class_complex_0();
- compile_match_class_complex_1();
- compile_match_cap();
- search_all();
- captures_len();
- captures_cap();
- compile_match_or();
- printf("All tests succesful.\n");
-}