summaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorTyge Lovset <[email protected]>2022-07-20 23:13:11 +0200
committerTyge Lovset <[email protected]>2022-07-20 23:13:11 +0200
commit3f89c290700618eae78eaa289bdb88d1cfb3514d (patch)
tree32dd44f8ca210aee69039f8779be611f7885cd5b /src
parent78cb61301df13fee995d3afd1bd1d8d63310d819 (diff)
downloadSTC-modified-3f89c290700618eae78eaa289bdb88d1cfb3514d.tar.gz
STC-modified-3f89c290700618eae78eaa289bdb88d1cfb3514d.zip
Added cregex_replace*() [implemented in utf8code.c]. Added examples/regex_replace.c. Docs are not ready yet, i.e. the API is not finalized. Some other refactoring and minor fixes/improvements. cstr_assign_sv() now returns char* like the other cstr_assign*() functions.
Diffstat (limited to 'src')
-rw-r--r--src/cregex.c95
-rw-r--r--src/utf8code.c75
2 files changed, 97 insertions, 73 deletions
diff --git a/src/cregex.c b/src/cregex.c
index b326b4fc..c30b66da 100644
--- a/src/cregex.c
+++ b/src/cregex.c
@@ -38,9 +38,9 @@ THE SOFTWARE.
typedef uint32_t Rune; /* Utf8 code point */
typedef int32_t Token;
/* max character classes per program */
-#define NCLASS creg_max_classes
+#define NCLASS cregex_MAXCLASSES
/* max subexpressions */
-#define NSUBEXP creg_max_captures
+#define NSUBEXP cregex_MAXCAPTURES
/* max rune ranges per character class */
#define NCCRUNE (NSUBEXP * 2)
@@ -91,7 +91,7 @@ typedef struct Reprog
/*
* Sub expression matches
*/
-typedef cregmatch Resub;
+typedef csview Resub;
/*
* substitution list
@@ -228,11 +228,11 @@ utfruneicase(const char *s, Rune c)
* save a new match in mp
*/
static void
-_renewmatch(Resub *mp, int ms, Resublist *sp, int nsubids)
+_renewmatch(Resub *mp, unsigned ms, Resublist *sp, int nsubids)
{
int i;
- if (mp==NULL || ms<=0)
+ if (mp==NULL || ms==0)
return;
if (mp[0].str == NULL || sp->m[0].str < mp[0].str ||
(sp->m[0].str == mp[0].str && sp->m[0].size > mp[0].size)) {
@@ -249,7 +249,7 @@ _renewmatch(Resub *mp, int ms, Resublist *sp, int nsubids)
static Relist*
_renewthread(Relist *lp, /* _relist to add to */
Reinst *ip, /* instruction to add */
- int ms,
+ unsigned ms,
Resublist *sep) /* pointers to subexpressions */
{
Relist *p;
@@ -281,7 +281,7 @@ _renewthread(Relist *lp, /* _relist to add to */
static Relist*
_renewemptythread(Relist *lp, /* _relist to add to */
Reinst *ip, /* instruction to add */
- int ms,
+ unsigned ms,
const char *sp) /* pointers to subexpressions */
{
Relist *p;
@@ -806,8 +806,8 @@ regcomp1(Reprog *progp, Parser *par, const char *s, int cflags)
free(progp);
return NULL;
}
- pp->flags.caseless = (cflags & creg_caseless) != 0;
- pp->flags.dotall = (cflags & creg_dotall) != 0;
+ pp->flags.caseless = (cflags & cregex_CASELESS) != 0;
+ pp->flags.dotall = (cflags & cregex_DOTALL) != 0;
par->freep = pp->firstinst;
par->classp = pp->cclass;
par->errors = 0;
@@ -930,10 +930,10 @@ runematch(Rune s, Rune r, bool icase)
* <0 if we ran out of _relist space
*/
static int
-regexec1(const Reprog *progp, /* program to run */
+regexec1(const Reprog *progp, /* program to run */
const char *bol, /* string to run machine on */
- Resub *mp, /* subexpression elements */
- int ms, /* number of elements at mp */
+ Resub *mp, /* subexpression elements */
+ unsigned ms, /* number of elements at mp */
Reljunk *j,
int mflags
)
@@ -1057,7 +1057,7 @@ regexec1(const Reprog *progp, /* program to run */
/* efficiency: advance and re-evaluate */
continue;
case END: /* Match! */
- match = !(mflags & creg_fullmatch) ||
+ match = !(mflags & cregex_FULLMATCH) ||
((s == j->eol || r == 0 || r == '\n') &&
(tlp->se.m[0].str == bol || tlp->se.m[0].str[-1] == '\n'));
tlp->se.m[0].size = s - tlp->se.m[0].str;
@@ -1082,8 +1082,8 @@ regexec1(const Reprog *progp, /* program to run */
static int
regexec2(const Reprog *progp, /* program to run */
const char *bol, /* string to run machine on */
- Resub *mp, /* subexpression elements */
- int ms, /* number of elements at mp */
+ Resub *mp, /* subexpression elements */
+ unsigned ms, /* number of elements at mp */
Reljunk *j,
int mflags
)
@@ -1109,7 +1109,7 @@ regexec2(const Reprog *progp, /* program to run */
static int
regexec(const Reprog *progp, /* program to run */
const char *bol, /* string to run machine on */
- int ms, /* number of elements at mp */
+ unsigned ms, /* number of elements at mp */
Resub mp[], /* subexpression elements */
int mflags)
{
@@ -1123,10 +1123,10 @@ regexec(const Reprog *progp, /* program to run */
j.starts = bol;
j.eol = NULL;
- if (mp && mp->str && ms>0) {
- if (mflags & creg_startend)
+ if (ms && mp->size) {
+ if (mflags & cregex_STARTEND)
j.starts = mp->str, j.eol = mp->str + mp->size;
- else if (mflags & creg_next)
+ else if (mflags & cregex_NEXT)
j.starts = mp->str + mp->size;
}
@@ -1157,55 +1157,6 @@ regexec(const Reprog *progp, /* program to run */
* API functions
*/
-/* substitute into one string using the matches from the last regexec() */
-void cregex_replace(
- const char *sp, /* source string */
- char *dp, /* destination string */
- int dlen,
- int ms, /* number of elements pointed to by mp */
- const cregmatch mp[]) /* subexpression elements */
-{
- const char *ssp, *ep;
- int i;
-
- ep = dp+dlen-1;
- while (*sp != '\0') {
- if (*sp == '\\') {
- switch (*++sp) {
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- i = *sp - '0';
- if (mp[i].str != NULL && mp != NULL && ms > i)
- for (ssp = mp[i].str; ssp < (mp[i].str + mp[i].size); ssp++)
- if (dp < ep)
- *dp++ = *ssp;
- break;
- case '\\':
- if (dp < ep)
- *dp++ = '\\';
- break;
- case '\0':
- sp--;
- break;
- default:
- if (dp < ep)
- *dp++ = *sp;
- break;
- }
- } else if (*sp == '&') {
- if (mp[0].str != NULL && mp != NULL && ms > 0)
- for (ssp = mp[0].str; ssp < (mp[0].str + mp[0].size); ssp++)
- if (dp < ep)
- *dp++ = *ssp;
- } else {
- if (dp < ep)
- *dp++ = *sp;
- }
- sp++;
- }
- *dp = '\0';
-}
-
int cregex_compile(cregex *rx, const char* pattern, int cflags) {
Parser par;
rx->prog = regcomp1(rx->prog, &par, pattern, cflags);
@@ -1214,15 +1165,15 @@ int cregex_compile(cregex *rx, const char* pattern, int cflags) {
return par.errors;
}
-int cregex_captures(cregex rx) {
- return rx.prog ? 1 + rx.prog->nsubids : 0;
+int cregex_captures(const cregex* self) {
+ return self->prog ? 1 + self->prog->nsubids : 0;
}
int cregex_match(const cregex *rx, const char* string,
- size_t nmatch, cregmatch match[], int mflags) {
+ unsigned nmatch, csview match[], int mflags) {
int res = regexec(rx->prog, string, nmatch, match, mflags);
switch (res) {
- case 1: return 1 + rx->prog->nsubids;
+ case 1: return creg_success;
case 0: return creg_nomatch;
default: return creg_matcherror;
}
diff --git a/src/utf8code.c b/src/utf8code.c
index dff10409..44120cee 100644
--- a/src/utf8code.c
+++ b/src/utf8code.c
@@ -1,6 +1,7 @@
#include <ctype.h>
#define i_header
#include <stc/cstr.h>
+#include <stc/cregex.h>
#include "utf8tabs.inc"
const uint8_t utf8_dtab[] = {
@@ -16,7 +17,7 @@ const uint8_t utf8_dtab[] = {
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
- 12,36,12,12,12,12,12,12,12,12,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
};
unsigned utf8_encode(char *out, uint32_t c)
@@ -220,3 +221,75 @@ void cstr_lowercase(cstr* self) {
void cstr_uppercase(cstr* self) {
cstr_take(self, cstr_tocase(self, fn_toupper));
}
+
+
+/*
+ * Expand the replacement template `repl` into `sub`, discarding whatever
+ * `sub` held before.
+ *
+ *   repl   - NUL-terminated template. A backslash followed by a digit d
+ *            inserts capture d when d < nmatch and inserts nothing otherwise.
+ *            A backslash followed by any other character emits that character
+ *            literally; a lone trailing backslash is dropped.
+ *   nmatch - number of usable entries in match[].
+ *   match  - capture views; match[d] supplies the text for "\d".
+ *   mfun   - optional transform. When non-NULL it is called with the capture
+ *            index and view, and the string it returns is inserted instead
+ *            of the raw capture (that string is dropped here afterwards).
+ *   sub    - destination string; its size is set to the expanded length.
+ */
+void cregex_build_replace(const char* repl, unsigned nmatch, const csview match[],
+                          cstr (*mfun)(int i, csview match), cstr* sub) {
+    cstr_clear(sub);
+    unsigned used = 0, room = cstr_capacity(*sub);
+    char* out = cstr_data(sub);
+
+    for (;;) {
+        char ch = *repl;
+        if (ch == '\0')
+            break;
+
+        if (ch == '\\') {
+            ch = *++repl;
+            if (ch == '\0')
+                break;                  /* lone trailing backslash: emit nothing */
+
+            if (ch >= '0' && ch <= '9') {
+                const int idx = ch - '0';
+                if (idx < nmatch) {
+                    cstr tmp = cstr_null;
+                    csview piece;
+                    if (mfun) {
+                        tmp = mfun(idx, match[idx]);
+                        piece = cstr_sv(&tmp);
+                    } else {
+                        piece = match[idx];
+                    }
+                    /* grow before copying so the whole piece fits */
+                    if (used + piece.size >= room)
+                        out = cstr_reserve(sub, room = room*3/2 + piece.size);
+                    const char* src = piece.str;
+                    const char* const stop = piece.str + piece.size;
+                    while (src != stop)
+                        out[used++] = *src++;
+                    cstr_drop(&tmp);
+                }
+                ++repl;                 /* step past the digit */
+                continue;
+            }
+            /* any other escaped character is copied literally below */
+        }
+
+        if (used == room)
+            out = cstr_reserve(sub, room = room*3/2 + 4);
+        out[used++] = ch;
+        ++repl;
+    }
+    _cstr_set_size(sub, used);
+}
+
+
+/*
+ * Build a new string from `input` in which matches of the compiled regex
+ * `re` are replaced by the expansion of `repl` (see cregex_build_replace()).
+ *
+ *   input  - NUL-terminated subject string.
+ *   re     - compiled regex. If cregex_captures() reports 0 for it (no
+ *            compiled program), no matching is attempted and a copy of
+ *            `input` is returned.
+ *   repl   - replacement template, forwarded to cregex_build_replace().
+ *   mfun   - optional per-capture transform, forwarded as well; may be NULL.
+ *   cflags - not used by this function.
+ *   count  - maximum number of replacements; 0 is turned into ~0u, i.e. as
+ *            many as the unsigned range allows.
+ *
+ * Each search restarts at the end of the previous match with match flags 0.
+ * The result is returned by value; the caller is expected to drop it.
+ */
+cstr cregex_replace_re(const char* input, const cregex* re, const char* repl,
+                       cstr (*mfun)(int i, csview match), int cflags, unsigned count) {
+    cstr out = cstr_null;
+    cstr sub = cstr_null;
+    size_t from = 0;
+    csview match[cregex_MAXCAPTURES];
+    unsigned nmatch = cregex_captures(re);
+    if (!count) count = ~0;
+
+    /* nmatch == 0 means there is no program to run and match[0] would never
+     * be filled in, so skip the search loop entirely. */
+    while (nmatch && count-- && cregex_match(re, input + from, nmatch, match, 0) > 0) {
+        cregex_build_replace(repl, nmatch, match, mfun, &sub);
+        const size_t pos = match[0].str - input;
+        cstr_append_n(&out, input + from, pos - from);
+        cstr_append_s(&out, sub);
+        from = pos + match[0].size;
+
+        if (match[0].size == 0) {
+            /* An empty match does not consume input. Without forcing progress
+             * the next search would find the same empty match again and `sub`
+             * would be appended until `count` runs out. */
+            if (input[from] == '\0')
+                break;
+            /* Copy one whole UTF-8 character: the lead byte plus any
+             * continuation bytes of the form 10xxxxxx. */
+            size_t n = 1;
+            while ((input[from + n] & 0xC0) == 0x80)
+                ++n;
+            cstr_append_n(&out, input + from, n);
+            from += n;
+        }
+    }
+    cstr_append(&out, input + from);
+    cstr_drop(&sub);
+    return out;
+}
+
+
+/*
+ * Convenience wrapper: compile `pattern` with `cflags`, run
+ * cregex_replace_re() on `input`, release the compiled regex and return
+ * the resulting string.
+ *
+ * When cregex_compile() reports a negative result, the returned string is
+ * the diagnostic text below rather than a transformed copy of `input`.
+ * `repl`, `mfun`, `cflags` and `count` are forwarded unchanged.
+ */
+cstr cregex_replace_fn(const char* input, const char* pattern, const char* repl,
+                       cstr (*mfun)(int i, csview match), int cflags, unsigned count) {
+    cregex rx = cregex_init();
+    if (cregex_compile(&rx, pattern, cflags) < 0) {
+        return cstr_new("[[cregex_replace_fn]]: invalid pattern");
+    }
+
+    cstr result = cregex_replace_re(input, &rx, repl, mfun, cflags, count);
+    cregex_drop(&rx);
+    return result;
+}