diff options
| author | Tyge Lovset <[email protected]> | 2022-07-20 23:13:11 +0200 |
|---|---|---|
| committer | Tyge Lovset <[email protected]> | 2022-07-20 23:13:11 +0200 |
| commit | 3f89c290700618eae78eaa289bdb88d1cfb3514d (patch) | |
| tree | 32dd44f8ca210aee69039f8779be611f7885cd5b /src | |
| parent | 78cb61301df13fee995d3afd1bd1d8d63310d819 (diff) | |
| download | STC-modified-3f89c290700618eae78eaa289bdb88d1cfb3514d.tar.gz STC-modified-3f89c290700618eae78eaa289bdb88d1cfb3514d.zip | |
Added cregex_replace*() [implemented in utf8code.c]. Added examples/regex_replace.c. Docs not ready, i.e. API not fixed. Some other refactoring and minor fixes/improvements. cstr_assign_sv() now returns char* like the other cstr_assign*().
Diffstat (limited to 'src')
| -rw-r--r-- | src/cregex.c | 95 | ||||
| -rw-r--r-- | src/utf8code.c | 75 |
2 files changed, 97 insertions, 73 deletions
diff --git a/src/cregex.c b/src/cregex.c index b326b4fc..c30b66da 100644 --- a/src/cregex.c +++ b/src/cregex.c @@ -38,9 +38,9 @@ THE SOFTWARE. typedef uint32_t Rune; /* Utf8 code point */ typedef int32_t Token; /* max character classes per program */ -#define NCLASS creg_max_classes +#define NCLASS cregex_MAXCLASSES /* max subexpressions */ -#define NSUBEXP creg_max_captures +#define NSUBEXP cregex_MAXCAPTURES /* max rune ranges per character class */ #define NCCRUNE (NSUBEXP * 2) @@ -91,7 +91,7 @@ typedef struct Reprog /* * Sub expression matches */ -typedef cregmatch Resub; +typedef csview Resub; /* * substitution list @@ -228,11 +228,11 @@ utfruneicase(const char *s, Rune c) * save a new match in mp */ static void -_renewmatch(Resub *mp, int ms, Resublist *sp, int nsubids) +_renewmatch(Resub *mp, unsigned ms, Resublist *sp, int nsubids) { int i; - if (mp==NULL || ms<=0) + if (mp==NULL || ms==0) return; if (mp[0].str == NULL || sp->m[0].str < mp[0].str || (sp->m[0].str == mp[0].str && sp->m[0].size > mp[0].size)) { @@ -249,7 +249,7 @@ _renewmatch(Resub *mp, int ms, Resublist *sp, int nsubids) static Relist* _renewthread(Relist *lp, /* _relist to add to */ Reinst *ip, /* instruction to add */ - int ms, + unsigned ms, Resublist *sep) /* pointers to subexpressions */ { Relist *p; @@ -281,7 +281,7 @@ _renewthread(Relist *lp, /* _relist to add to */ static Relist* _renewemptythread(Relist *lp, /* _relist to add to */ Reinst *ip, /* instruction to add */ - int ms, + unsigned ms, const char *sp) /* pointers to subexpressions */ { Relist *p; @@ -806,8 +806,8 @@ regcomp1(Reprog *progp, Parser *par, const char *s, int cflags) free(progp); return NULL; } - pp->flags.caseless = (cflags & creg_caseless) != 0; - pp->flags.dotall = (cflags & creg_dotall) != 0; + pp->flags.caseless = (cflags & cregex_CASELESS) != 0; + pp->flags.dotall = (cflags & cregex_DOTALL) != 0; par->freep = pp->firstinst; par->classp = pp->cclass; par->errors = 0; @@ -930,10 +930,10 @@ runematch(Rune s, Rune r, bool icase) * <0 if we ran out of _relist space */ static int -regexec1(const Reprog *progp, /* program to run */ +regexec1(const Reprog *progp, /* program to run */ const char *bol, /* string to run machine on */ - Resub *mp, /* subexpression elements */ - int ms, /* number of elements at mp */ + Resub *mp, /* subexpression elements */ + unsigned ms, /* number of elements at mp */ Reljunk *j, int mflags ) @@ -1057,7 +1057,7 @@ regexec1(const Reprog *progp, /* program to run */ /* efficiency: advance and re-evaluate */ continue; case END: /* Match! */ - match = !(mflags & creg_fullmatch) || + match = !(mflags & cregex_FULLMATCH) || ((s == j->eol || r == 0 || r == '\n') && (tlp->se.m[0].str == bol || tlp->se.m[0].str[-1] == '\n')); tlp->se.m[0].size = s - tlp->se.m[0].str; @@ -1082,8 +1082,8 @@ regexec1(const Reprog *progp, /* program to run */ static int regexec2(const Reprog *progp, /* program to run */ const char *bol, /* string to run machine on */ - Resub *mp, /* subexpression elements */ - int ms, /* number of elements at mp */ + Resub *mp, /* subexpression elements */ + unsigned ms, /* number of elements at mp */ Reljunk *j, int mflags ) @@ -1109,7 +1109,7 @@ regexec2(const Reprog *progp, /* program to run */ static int regexec(const Reprog *progp, /* program to run */ const char *bol, /* string to run machine on */ - int ms, /* number of elements at mp */ + unsigned ms, /* number of elements at mp */ Resub mp[], /* subexpression elements */ int mflags) { @@ -1123,10 +1123,10 @@ regexec(const Reprog *progp, /* program to run */ j.starts = bol; j.eol = NULL; - if (mp && mp->str && ms>0) { - if (mflags & creg_startend) + if (ms && mp->size) { + if (mflags & cregex_STARTEND) j.starts = mp->str, j.eol = mp->str + mp->size; - else if (mflags & creg_next) + else if (mflags & cregex_NEXT) j.starts = mp->str + mp->size; } @@ -1157,55 +1157,6 @@ regexec(const Reprog *progp, /* program to run */ * API functions */ -/* substitute into one string using the matches from the last regexec() */ -void cregex_replace( - const char *sp, /* source string */ - char *dp, /* destination string */ - int dlen, - int ms, /* number of elements pointed to by mp */ - const cregmatch mp[]) /* subexpression elements */ -{ - const char *ssp, *ep; - int i; - - ep = dp+dlen-1; - while (*sp != '\0') { - if (*sp == '\\') { - switch (*++sp) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - i = *sp - '0'; - if (mp[i].str != NULL && mp != NULL && ms > i) - for (ssp = mp[i].str; ssp < (mp[i].str + mp[i].size); ssp++) - if (dp < ep) - *dp++ = *ssp; - break; - case '\\': - if (dp < ep) - *dp++ = '\\'; - break; - case '\0': - sp--; - break; - default: - if (dp < ep) - *dp++ = *sp; - break; - } - } else if (*sp == '&') { - if (mp[0].str != NULL && mp != NULL && ms > 0) - for (ssp = mp[0].str; ssp < (mp[0].str + mp[0].size); ssp++) - if (dp < ep) - *dp++ = *ssp; - } else { - if (dp < ep) - *dp++ = *sp; - } - sp++; - } - *dp = '\0'; -} - int cregex_compile(cregex *rx, const char* pattern, int cflags) { Parser par; rx->prog = regcomp1(rx->prog, &par, pattern, cflags); @@ -1214,15 +1165,15 @@ int cregex_compile(cregex *rx, const char* pattern, int cflags) { return par.errors; } -int cregex_captures(cregex rx) { - return rx.prog ? 1 + rx.prog->nsubids : 0; +int cregex_captures(const cregex* self) { + return self->prog ? 1 + self->prog->nsubids : 0; } int cregex_match(const cregex *rx, const char* string, - size_t nmatch, cregmatch match[], int mflags) { + unsigned nmatch, csview match[], int mflags) { int res = regexec(rx->prog, string, nmatch, match, mflags); switch (res) { - case 1: return 1 + rx->prog->nsubids; + case 1: return creg_success; case 0: return creg_nomatch; default: return creg_matcherror; } diff --git a/src/utf8code.c b/src/utf8code.c index dff10409..44120cee 100644 --- a/src/utf8code.c +++ b/src/utf8code.c @@ -1,6 +1,7 @@ #include <ctype.h> #define i_header #include <stc/cstr.h> +#include <stc/cregex.h> #include "utf8tabs.inc" const uint8_t utf8_dtab[] = { @@ -16,7 +17,7 @@ const uint8_t utf8_dtab[] = { 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, }; unsigned utf8_encode(char *out, uint32_t c) @@ -220,3 +221,75 @@ void cstr_lowercase(cstr* self) { void cstr_uppercase(cstr* self) { cstr_take(self, cstr_tocase(self, fn_toupper)); } + + +void cregex_build_replace(const char* repl, unsigned nmatch, const csview match[], + cstr (*mfun)(int i, csview match), cstr* sub) { + cstr_clear(sub); + unsigned len = 0, cap = cstr_capacity(*sub); + char* dst = cstr_data(sub); + + while (*repl != '\0') { + if (*repl == '\\') { + const char num = *++repl; + int i; + switch (num) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + i = num - '0'; + if (i < nmatch) { + csview m; + cstr s = cstr_null; + if (mfun) { s = mfun(i, match[i]); m = cstr_sv(&s); } + else m = match[i]; + if (len + m.size >= cap) + dst = cstr_reserve(sub, cap = cap*3/2 + m.size); + for (const char* rp = m.str; rp != (m.str + m.size); ++rp) + dst[len++] = *rp; + cstr_drop(&s); + } + ++repl; + case '\0': + continue; + } + } + if (len == cap) + dst = cstr_reserve(sub, cap = cap*3/2 + 4); + dst[len++] = *repl++; + } + _cstr_set_size(sub, len); +} + + +cstr cregex_replace_re(const char* input, const cregex* re, const char* repl, + cstr (*mfun)(int i, csview match), int cflags, unsigned count) { + cstr out = cstr_null; + cstr sub = cstr_null; + size_t from = 0; + csview match[cregex_MAXCAPTURES]; + unsigned nmatch = cregex_captures(re); + if (!count) count = ~0; + + while (count-- && cregex_match(re, input + from, nmatch, match, 0) > 0) { + cregex_build_replace(repl, nmatch, match, mfun, &sub); + const size_t pos = match[0].str - input; + cstr_append_n(&out, input + from, pos - from); + cstr_append_s(&out, sub); + from = pos + match[0].size; + } + cstr_append(&out, input + from); + cstr_drop(&sub); + return out; +} + + +cstr cregex_replace_fn(const char* input, const char* pattern, const char* repl, + cstr (*mfun)(int i, csview match), int cflags, unsigned count) { + cregex re = cregex_init(); + int res = cregex_compile(&re, pattern, cflags); + if (res < 0) + return cstr_new("[[cregex_replace_fn]]: invalid pattern"); + cstr out = cregex_replace_re(input, &re, repl, mfun, cflags, count); + cregex_drop(&re); + return out; +} |
