summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Tyge Lovset <[email protected]> 2022-07-20 23:13:11 +0200
committer Tyge Lovset <[email protected]> 2022-07-20 23:13:11 +0200
commit 3f89c290700618eae78eaa289bdb88d1cfb3514d (patch)
tree 32dd44f8ca210aee69039f8779be611f7885cd5b
parent 78cb61301df13fee995d3afd1bd1d8d63310d819 (diff)
download STC-modified-3f89c290700618eae78eaa289bdb88d1cfb3514d.tar.gz
STC-modified-3f89c290700618eae78eaa289bdb88d1cfb3514d.zip
Added cregex_replace*() [implemented in utf8code.c]. Added examples/regex_replace.c. Docs not ready, i.e. API not fixed. Some other refactoring and minor fixes/improvements. cstr_assign_sv() now returns char* like the other cstr_assign*().
-rw-r--r-- examples/regex2.c 2
-rw-r--r-- examples/regex_match.c 6
-rw-r--r-- examples/regex_replace.c 35
-rw-r--r-- include/stc/ccommon.h 2
-rw-r--r-- include/stc/cregex.h 60
-rw-r--r-- include/stc/cstr.h 10
-rw-r--r-- include/stc/csview.h 4
-rw-r--r-- src/cregex.c 95
-rw-r--r-- src/utf8code.c 75
9 files changed, 181 insertions, 108 deletions
diff --git a/examples/regex2.c b/examples/regex2.c
index 82247da5..1f3163f7 100644
--- a/examples/regex2.c
+++ b/examples/regex2.c
@@ -22,7 +22,7 @@ int main()
printf("input: %s\n", inputs[i]);
if (cregex_match(&re, inputs[i], 20, m, 0) > 0)
{
- c_forrange (j, cregex_captures(re))
+ c_forrange (j, cregex_captures(&re))
{
printf(" submatch %" PRIuMAX ": %" c_PRIsv "\n", j, c_ARGsv(m[j]));
}
diff --git a/examples/regex_match.c b/examples/regex_match.c
index 2b135bb7..5680b55e 100644
--- a/examples/regex_match.c
+++ b/examples/regex_match.c
@@ -13,14 +13,14 @@ int main()
{
int res = cregex_compile(&re, "[+-]?([0-9]*\\.)?\\d+([Ee][+-]?\\d+)?", 0);
printf("%d\n", res);
- cregmatch m[10];
+ csview m[10];
if (cregex_match(&re, s, 10, m, 0) > 0) {
printf("Found digits at position %" PRIuMAX "-%" PRIuMAX "\n", m[0].str - s, m[0].str - s + m[0].size);
} else {
printf("Could not find any digits\n");
}
- while (cregex_match(&re, s, 10, m, creg_next) > 0) {
+ while (cregex_match(&re, s, 10, m, cregex_NEXT) > 0) {
printf("%" c_PRIsv " ; ", c_ARGsv(m[0]));
}
puts("");
@@ -28,7 +28,7 @@ int main()
res = cregex_compile(&re, "(.+)\\b(.+)", 0);
printf("groups: %d\n", res);
if ((res = cregex_match(&re, "hello@wørld", 10, m, 0)) > 0) {
- c_forrange (i, res)
+ c_forrange (i, res)
printf("match: [%" c_PRIsv "]\n", c_ARGsv(m[i]));
} else
printf("err: %d\n", res);
diff --git a/examples/regex_replace.c b/examples/regex_replace.c
new file mode 100644
index 00000000..1216701f
--- /dev/null
+++ b/examples/regex_replace.c
@@ -0,0 +1,35 @@
+#define i_implement
+#include <stc/cstr.h>
+#include <stc/cregex.h>
+#include <stc/csview.h>
+
+/* Match callback for cregex_replace_fn(): group 1 (the year) is rewritten as the year
+ * minus 20; every other group, or a year that fails to parse, is returned unchanged. */
+cstr sub_20y(int i, csview m) {
+    int year = 0;
+    if (i == 1 && sscanf(m.str, "%4d", &year) == 1)
+        return cstr_from_fmt("%04d", year - 20);
+    return cstr_from_sv(m);
+}
+
+int main(void)
+{
+    const char* pattern = "\\b(\\d\\d\\d\\d)-(1[0-2]|0[1-9])-(3[01]|[12][0-9]|0[1-9])\\b";
+    const char* input = "start date: 2015-12-31, end date: 2022-02-28";
+
+    /* no cregex object is needed here: cregex_replace*() compile the pattern string themselves */
+    c_auto (cstr, str1, str2)
+    {
+        printf("input: %s\n", input);
+        /* European date format */
+        str1 = cregex_replace(input, pattern, "\\3.\\2.\\1");
+        printf("euros: %s\n", cstr_str(&str1));
+
+        /* US date format, and subtract 20 years: */
+        str2 = cregex_replace_fn(input, pattern, "\\1/\\3/\\2", sub_20y, 0, 0);
+        printf("us-20: %s\n", cstr_str(&str2));
+    }
+}
+
+#include "../src/cregex.c"
+#include "../src/utf8code.c"
diff --git a/include/stc/ccommon.h b/include/stc/ccommon.h
index 3a6d8f4e..e87e7678 100644
--- a/include/stc/ccommon.h
+++ b/include/stc/ccommon.h
@@ -112,7 +112,7 @@ typedef const char* crawstr;
#define crawstr_cmp(xp, yp) strcmp(*(xp), *(yp))
#define crawstr_hash(p) c_strhash(*(p))
#define c_strlen_lit(literal) (sizeof "" literal - 1U)
-#define c_sv(lit) c_make(csview){lit, c_strlen_lit(lit)}
+#define c_sv(lit) (c_make(csview){lit, c_strlen_lit(lit)})
#define c_PRIsv ".*s"
#define c_ARGsv(sv) (int)(sv).size, (sv).str
diff --git a/include/stc/cregex.h b/include/stc/cregex.h
index 448f9405..11e21b06 100644
--- a/include/stc/cregex.h
+++ b/include/stc/cregex.h
@@ -34,32 +34,33 @@ THE SOFTWARE.
#include "forward.h" // csview
typedef enum {
- creg_nomatch = -1,
- creg_matcherror = -2,
- creg_outofmemory = -3,
- creg_unmatchedleftparenthesis = -4,
- creg_unmatchedrightparenthesis = -5,
- creg_toomanysubexpressions = -6,
- creg_toomanycharacterclasses = -7,
- creg_malformedcharacterclass = -8,
- creg_missingoperand = -9,
- creg_unknownoperator = -10,
- creg_operandstackoverflow = -11,
- creg_operatorstackoverflow = -12,
- creg_operatorstackunderflow = -13,
+ creg_success = 1,
+ creg_nomatch = 0,
+ creg_matcherror = -1,
+ creg_outofmemory = -2,
+ creg_unmatchedleftparenthesis = -3,
+ creg_unmatchedrightparenthesis = -4,
+ creg_toomanysubexpressions = -5,
+ creg_toomanycharacterclasses = -6,
+ creg_malformedcharacterclass = -7,
+ creg_missingoperand = -8,
+ creg_unknownoperator = -9,
+ creg_operandstackoverflow = -10,
+ creg_operatorstackoverflow = -11,
+ creg_operatorstackunderflow = -12,
} cregex_error_t;
enum {
/* compile flags */
- creg_dotall = 1<<0,
- creg_caseless = 1<<1,
+ cregex_DOTALL = 1<<0,
+ cregex_CASELESS = 1<<1,
/* execution flags */
- creg_fullmatch = 1<<2,
- creg_next = 1<<3,
- creg_startend = 1<<4,
+ cregex_FULLMATCH = 1<<2,
+ cregex_NEXT = 1<<3,
+ cregex_STARTEND = 1<<4,
/* limits */
- creg_max_classes = 16,
- creg_max_captures = 32,
+ cregex_MAXCLASSES = 16,
+ cregex_MAXCAPTURES = 32,
};
typedef struct {
@@ -76,15 +77,26 @@ static inline cregex cregex_init(void) {
int cregex_compile(cregex *self, const char* pattern, int cflags);
/* number of capture groups in a regex pattern */
-int cregex_captures(cregex rx);
+int cregex_captures(const cregex* self);
/* return 1 (creg_success) on match, 0 (creg_nomatch) if nothing matched, or a negative error code on failure. */
int cregex_match(const cregex *self, const char* string,
- size_t nmatch, cregmatch match[], int mflags);
+ unsigned nmatch, csview match[], int mflags);
-void cregex_replace(const char* src, char* dst, int dsize,
- int nmatch, const cregmatch match[]);
+/* replace regular expression */
+void cregex_build_replace(const char* repl, unsigned nmatch, const csview match[],
+ cstr (*mfun)(int i, csview match), cstr* out);
+cstr cregex_replace_re(const char* input, const cregex* re, const char* repl,
+ cstr (*mfun)(int i, csview match), int cflags, unsigned count);
+
+cstr cregex_replace_fn(const char* input, const char* pattern, const char* replace,
+ cstr (*mfun)(int i, csview match), int cflags, unsigned count);
+/* Convenience form of cregex_replace_fn(): no match callback (NULL), cflags 0, count 0. */
+static inline
+cstr cregex_replace(const char* input, const char* pattern, const char* replace)
+ { return cregex_replace_fn(input, pattern, replace, NULL, 0, 0); }
+
+/* destroy regex */
void cregex_drop(cregex* self);
#endif
diff --git a/include/stc/cstr.h b/include/stc/cstr.h
index 441fe94a..8395f127 100644
--- a/include/stc/cstr.h
+++ b/include/stc/cstr.h
@@ -404,9 +404,9 @@ STC_DEF char* cstr_reserve(cstr* self, const size_t cap) {
if (cap > cstr_s_cap) {
char* data = (char *)c_malloc(cap + 1);
const size_t len = cstr_s_size(self);
- memcpy(data, self->sml.data, len);
+ memcpy(data, self->sml.data, cstr_s_cap + 1);
self->lon.data = data;
- cstr_l_set_size(self, len);
+ self->lon.size = len;
cstr_l_set_cap(self, cap);
return data;
}
@@ -525,7 +525,8 @@ STC_DEF int cstr_vfmt(cstr* self, const char* fmt, va_list args) {
STC_DEF cstr cstr_from_fmt(const char* fmt, ...) {
cstr s = cstr_null;
- va_list args; va_start(args, fmt);
+ va_list args;
+ va_start(args, fmt);
cstr_vfmt(&s, fmt, args);
va_end(args);
return s;
@@ -533,7 +534,8 @@ STC_DEF cstr cstr_from_fmt(const char* fmt, ...) {
STC_DEF int cstr_printf(cstr* self, const char* fmt, ...) {
cstr s = cstr_null;
- va_list args; va_start(args, fmt);
+ va_list args;
+ va_start(args, fmt);
const int n = cstr_vfmt(&s, fmt, args);
va_end(args);
cstr_drop(self); *self = s;
diff --git a/include/stc/csview.h b/include/stc/csview.h
index e74ce844..39bfa354 100644
--- a/include/stc/csview.h
+++ b/include/stc/csview.h
@@ -120,8 +120,8 @@ STC_INLINE csview cstr_substr_ex(const cstr* self, intptr_t pos, size_t n)
STC_INLINE csview cstr_slice_ex(const cstr* self, intptr_t p1, intptr_t p2)
{ return csview_slice_ex(csview_from_s(self), p1, p2); }
-STC_INLINE csview cstr_assign_sv(cstr* self, csview sv)
- { return c_make(csview){cstr_assign_n(self, sv.str, sv.size), sv.size}; }
+/* Assign the bytes of the view `sv` to `self`; forwards to cstr_assign_n() and returns its char* result. */
+STC_INLINE char* cstr_assign_sv(cstr* self, csview sv)
+ { return cstr_assign_n(self, sv.str, sv.size); }
STC_INLINE void cstr_append_sv(cstr* self, csview sv)
{ cstr_append_n(self, sv.str, sv.size); }
diff --git a/src/cregex.c b/src/cregex.c
index b326b4fc..c30b66da 100644
--- a/src/cregex.c
+++ b/src/cregex.c
@@ -38,9 +38,9 @@ THE SOFTWARE.
typedef uint32_t Rune; /* Utf8 code point */
typedef int32_t Token;
/* max character classes per program */
-#define NCLASS creg_max_classes
+#define NCLASS cregex_MAXCLASSES
/* max subexpressions */
-#define NSUBEXP creg_max_captures
+#define NSUBEXP cregex_MAXCAPTURES
/* max rune ranges per character class */
#define NCCRUNE (NSUBEXP * 2)
@@ -91,7 +91,7 @@ typedef struct Reprog
/*
* Sub expression matches
*/
-typedef cregmatch Resub;
+typedef csview Resub;
/*
* substitution list
@@ -228,11 +228,11 @@ utfruneicase(const char *s, Rune c)
* save a new match in mp
*/
static void
-_renewmatch(Resub *mp, int ms, Resublist *sp, int nsubids)
+_renewmatch(Resub *mp, unsigned ms, Resublist *sp, int nsubids)
{
int i;
- if (mp==NULL || ms<=0)
+ if (mp==NULL || ms==0)
return;
if (mp[0].str == NULL || sp->m[0].str < mp[0].str ||
(sp->m[0].str == mp[0].str && sp->m[0].size > mp[0].size)) {
@@ -249,7 +249,7 @@ _renewmatch(Resub *mp, int ms, Resublist *sp, int nsubids)
static Relist*
_renewthread(Relist *lp, /* _relist to add to */
Reinst *ip, /* instruction to add */
- int ms,
+ unsigned ms,
Resublist *sep) /* pointers to subexpressions */
{
Relist *p;
@@ -281,7 +281,7 @@ _renewthread(Relist *lp, /* _relist to add to */
static Relist*
_renewemptythread(Relist *lp, /* _relist to add to */
Reinst *ip, /* instruction to add */
- int ms,
+ unsigned ms,
const char *sp) /* pointers to subexpressions */
{
Relist *p;
@@ -806,8 +806,8 @@ regcomp1(Reprog *progp, Parser *par, const char *s, int cflags)
free(progp);
return NULL;
}
- pp->flags.caseless = (cflags & creg_caseless) != 0;
- pp->flags.dotall = (cflags & creg_dotall) != 0;
+ pp->flags.caseless = (cflags & cregex_CASELESS) != 0;
+ pp->flags.dotall = (cflags & cregex_DOTALL) != 0;
par->freep = pp->firstinst;
par->classp = pp->cclass;
par->errors = 0;
@@ -930,10 +930,10 @@ runematch(Rune s, Rune r, bool icase)
* <0 if we ran out of _relist space
*/
static int
-regexec1(const Reprog *progp, /* program to run */
+regexec1(const Reprog *progp, /* program to run */
const char *bol, /* string to run machine on */
- Resub *mp, /* subexpression elements */
- int ms, /* number of elements at mp */
+ Resub *mp, /* subexpression elements */
+ unsigned ms, /* number of elements at mp */
Reljunk *j,
int mflags
)
@@ -1057,7 +1057,7 @@ regexec1(const Reprog *progp, /* program to run */
/* efficiency: advance and re-evaluate */
continue;
case END: /* Match! */
- match = !(mflags & creg_fullmatch) ||
+ match = !(mflags & cregex_FULLMATCH) ||
((s == j->eol || r == 0 || r == '\n') &&
(tlp->se.m[0].str == bol || tlp->se.m[0].str[-1] == '\n'));
tlp->se.m[0].size = s - tlp->se.m[0].str;
@@ -1082,8 +1082,8 @@ regexec1(const Reprog *progp, /* program to run */
static int
regexec2(const Reprog *progp, /* program to run */
const char *bol, /* string to run machine on */
- Resub *mp, /* subexpression elements */
- int ms, /* number of elements at mp */
+ Resub *mp, /* subexpression elements */
+ unsigned ms, /* number of elements at mp */
Reljunk *j,
int mflags
)
@@ -1109,7 +1109,7 @@ regexec2(const Reprog *progp, /* program to run */
static int
regexec(const Reprog *progp, /* program to run */
const char *bol, /* string to run machine on */
- int ms, /* number of elements at mp */
+ unsigned ms, /* number of elements at mp */
Resub mp[], /* subexpression elements */
int mflags)
{
@@ -1123,10 +1123,10 @@ regexec(const Reprog *progp, /* program to run */
j.starts = bol;
j.eol = NULL;
- if (mp && mp->str && ms>0) {
- if (mflags & creg_startend)
+ if (ms && mp->size) {
+ if (mflags & cregex_STARTEND)
j.starts = mp->str, j.eol = mp->str + mp->size;
- else if (mflags & creg_next)
+ else if (mflags & cregex_NEXT)
j.starts = mp->str + mp->size;
}
@@ -1157,55 +1157,6 @@ regexec(const Reprog *progp, /* program to run */
* API functions
*/
-/* substitute into one string using the matches from the last regexec() */
-void cregex_replace(
- const char *sp, /* source string */
- char *dp, /* destination string */
- int dlen,
- int ms, /* number of elements pointed to by mp */
- const cregmatch mp[]) /* subexpression elements */
-{
- const char *ssp, *ep;
- int i;
-
- ep = dp+dlen-1;
- while (*sp != '\0') {
- if (*sp == '\\') {
- switch (*++sp) {
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- i = *sp - '0';
- if (mp[i].str != NULL && mp != NULL && ms > i)
- for (ssp = mp[i].str; ssp < (mp[i].str + mp[i].size); ssp++)
- if (dp < ep)
- *dp++ = *ssp;
- break;
- case '\\':
- if (dp < ep)
- *dp++ = '\\';
- break;
- case '\0':
- sp--;
- break;
- default:
- if (dp < ep)
- *dp++ = *sp;
- break;
- }
- } else if (*sp == '&') {
- if (mp[0].str != NULL && mp != NULL && ms > 0)
- for (ssp = mp[0].str; ssp < (mp[0].str + mp[0].size); ssp++)
- if (dp < ep)
- *dp++ = *ssp;
- } else {
- if (dp < ep)
- *dp++ = *sp;
- }
- sp++;
- }
- *dp = '\0';
-}
-
int cregex_compile(cregex *rx, const char* pattern, int cflags) {
Parser par;
rx->prog = regcomp1(rx->prog, &par, pattern, cflags);
@@ -1214,15 +1165,15 @@ int cregex_compile(cregex *rx, const char* pattern, int cflags) {
return par.errors;
}
-int cregex_captures(cregex rx) {
- return rx.prog ? 1 + rx.prog->nsubids : 0;
+int cregex_captures(const cregex* self) {
+ return self->prog ? 1 + self->prog->nsubids : 0;
}
int cregex_match(const cregex *rx, const char* string,
- size_t nmatch, cregmatch match[], int mflags) {
+ unsigned nmatch, csview match[], int mflags) {
int res = regexec(rx->prog, string, nmatch, match, mflags);
switch (res) {
- case 1: return 1 + rx->prog->nsubids;
+ case 1: return creg_success;
case 0: return creg_nomatch;
default: return creg_matcherror;
}
diff --git a/src/utf8code.c b/src/utf8code.c
index dff10409..44120cee 100644
--- a/src/utf8code.c
+++ b/src/utf8code.c
@@ -1,6 +1,7 @@
#include <ctype.h>
#define i_header
#include <stc/cstr.h>
+#include <stc/cregex.h>
#include "utf8tabs.inc"
const uint8_t utf8_dtab[] = {
@@ -16,7 +17,7 @@ const uint8_t utf8_dtab[] = {
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
- 12,36,12,12,12,12,12,12,12,12,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
};
unsigned utf8_encode(char *out, uint32_t c)
@@ -220,3 +221,75 @@ void cstr_lowercase(cstr* self) {
void cstr_uppercase(cstr* self) {
cstr_take(self, cstr_tocase(self, fn_toupper));
}
+
+
+/* Expand the replacement template `repl` into `sub` (which is cleared first).
+ * "\0".."\9" insert match[i], passed through mfun when mfun is not NULL; any other
+ * escaped character is copied without its backslash. */
+void cregex_build_replace(const char* repl, unsigned nmatch, const csview match[],
+                          cstr (*mfun)(int i, csview match), cstr* sub) {
+    cstr_clear(sub);
+    unsigned len = 0, cap = cstr_capacity(*sub);
+    char* dst = cstr_data(sub);
+
+    while (*repl != '\0') {
+        if (*repl == '\\') {
+            const char esc = *++repl;
+            if (esc == '\0') /* a lone trailing backslash is dropped */
+                break;
+            if (esc >= '0' && esc <= '9') {
+                const int i = esc - '0';
+                if (i < nmatch) { /* references to groups >= nmatch expand to nothing */
+                    cstr s = cstr_null;
+                    csview m = match[i];
+                    if (mfun) { s = mfun(i, m); m = cstr_sv(&s); }
+                    if (len + m.size >= cap)
+                        dst = cstr_reserve(sub, cap = cap*3/2 + m.size);
+                    for (size_t k = 0; k != m.size; ++k)
+                        dst[len++] = m.str[k];
+                    cstr_drop(&s);
+                }
+                ++repl;
+                continue;
+            }
+        }
+        if (len == cap)
+            dst = cstr_reserve(sub, cap = cap*3/2 + 4);
+        dst[len++] = *repl++;
+    }
+    _cstr_set_size(sub, len);
+}
+
+
+/* Replace matches of the compiled regex `re` in `input` with the expansion of `repl`
+ * (see cregex_build_replace()); `mfun`, when not NULL, transforms each inserted group.
+ * At most `count` matches are replaced, 0 meaning all of them. `cflags` is not used here.
+ * Returns a new cstr holding the result. */
+cstr cregex_replace_re(const char* input, const cregex* re, const char* repl,
+                       cstr (*mfun)(int i, csview match), int cflags, unsigned count) {
+    cstr out = cstr_null;
+    cstr sub = cstr_null;
+    size_t from = 0;
+    /* zeroed because regexec() inspects match[0].size before filling the array */
+    csview match[cregex_MAXCAPTURES] = {{0}};
+    unsigned nmatch = cregex_captures(re);
+    (void)cflags;
+    if (!count) count = ~0u;
+
+    while (count-- && cregex_match(re, input + from, nmatch, match, 0) > 0) {
+        cregex_build_replace(repl, nmatch, match, mfun, &sub);
+        const size_t pos = match[0].str - input;
+        cstr_append_n(&out, input + from, pos - from);
+        cstr_append_s(&out, sub);
+        from = pos + match[0].size;
+        if (match[0].size == 0) {
+            /* An empty match consumes nothing: copy one UTF-8 character and step over
+             * it, otherwise the same position would match again on every iteration. */
+            if (input[from] == '\0')
+                break;
+            size_t n = 1;
+            while ((input[from + n] & 0xC0) == 0x80)
+                ++n;
+            cstr_append_n(&out, input + from, n);
+            from += n;
+        }
+    }
+    cstr_append(&out, input + from);
+    cstr_drop(&sub);
+    return out;
+}
+
+
+/* Compile `pattern` with `cflags` and replace its matches in `input` via cregex_replace_re().
+ * If the pattern does not compile, the returned string is the text
+ * "[[cregex_replace_fn]]: invalid pattern". The temporary regex is released on every path. */
+cstr cregex_replace_fn(const char* input, const char* pattern, const char* repl,
+                       cstr (*mfun)(int i, csview match), int cflags, unsigned count) {
+    cregex re = cregex_init();
+    cstr out;
+    if (cregex_compile(&re, pattern, cflags) < 0)
+        out = cstr_new("[[cregex_replace_fn]]: invalid pattern");
+    else
+        out = cregex_replace_re(input, &re, repl, mfun, cflags, count);
+    cregex_drop(&re);
+    return out;
+}