diff options
| author | Tyge Løvset <[email protected]> | 2022-02-05 23:21:51 +0100 |
|---|---|---|
| committer | Tyge Løvset <[email protected]> | 2022-02-05 23:21:51 +0100 |
| commit | 29409b257d9144010bd608afc19f66ee2fbaa337 (patch) | |
| tree | 229c17af426a76d6fc7d773a9ac8c98ef7352405 /src/cregex.c | |
| parent | 53de8174dd788db62c219477e9cca0ee3ce757cb (diff) | |
| download | STC-modified-29409b257d9144010bd608afc19f66ee2fbaa337.tar.gz STC-modified-29409b257d9144010bd608afc19f66ee2fbaa337.zip | |
Switched to heavily modified version of Rob Pike's plan9 regexp9. -> now renamed to cregex, with new API.
Diffstat (limited to 'src/cregex.c')
| -rw-r--r-- | src/cregex.c | 1167 |
1 files changed, 1167 insertions, 0 deletions
diff --git a/src/cregex.c b/src/cregex.c new file mode 100644 index 00000000..aaa6e62a --- /dev/null +++ b/src/cregex.c @@ -0,0 +1,1167 @@ +/*
+This is a Unix port of the Plan 9 regular expression library, by Rob Pike.
+Please send comments about the packaging to Russ Cox <[email protected]>.
+
+Copyright © 2021 Plan 9 Foundation
+Copyright © 2022 Tyge Løvset, for additions made in 2022.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <setjmp.h>
+#include <string.h>
+#include <ctype.h>
+#include <stc/cregex.h>
+#include "cregex_utf8.c"
+
+typedef uint32_t Rune;
+/* max character classes per program */
+#define NCLASS creg_max_classes
+/* max subexpressions */
+#define NSUBEXP creg_max_captures
+/* max rune ranges per character class */
+#define NCCRUNE (NSUBEXP * 2)
+
+/*
+ * character class, each pair of rune's defines a range
+ */
+typedef struct
+{
+ Rune *end;
+ Rune spans[NCCRUNE];
+} Reclass;
+
+/*
+ * Machine instructions
+ */
+typedef struct Reinst
+{
+ int type;
+ union {
+ Reclass *classp; /* class pointer */
+ Rune rune; /* character */
+ int subid; /* sub-expression id for RBRA and LBRA */
+ struct Reinst *right; /* right child of OR */
+ } r;
+ union { /* regexp relies on these two being in the same union */
+ struct Reinst *left; /* left child of OR */
+ struct Reinst *next; /* next instruction for CAT & LBRA */
+ } l;
+} Reinst;
+
+typedef struct {
+ bool ignorecase;
+ bool dotall;
+} Reflags;
+
+/*
+ * Reprogram definition
+ */
+typedef struct Reprog
+{
+ Reinst *startinst; /* start pc */
+ Reflags flags;
+ int nsubids;
+ Reclass cclass[NCLASS]; /* .data */
+ Reinst firstinst[]; /* .text : originally 5 elements? */
+} Reprog;
+
+/*
+ * Sub expression matches
+ */
+typedef cregmatch Resub;
+
+/*
+ * substitution list
+ */
+typedef struct Resublist
+{
+ Resub m[NSUBEXP];
+} Resublist;
+
+/*
+ * Actions and Tokens (Reinst types)
+ *
+ * 0x80-0x8F: operators, value => precedence
+ * 0x90-0xAF: RUNE and char classes.
+ * 0xB0-0xBF: tokens, i.e. operands for operators
+ */
+enum {
+ OPERATOR = 0x80, /* Bitmask of all operators */
+ START = 0x80, /* Start, used for marker on stack */
+ RBRA , /* Right bracket, ) */
+ LBRA , /* Left bracket, ( */
+ OR , /* Alternation, | */
+ CAT , /* Concatentation, implicit operator */
+ STAR , /* Closure, * */
+ PLUS , /* a+ == aa* */
+ QUEST , /* a? == a|nothing, i.e. 0 or 1 a's */
+ RUNE = 0x90,
+ CLS_d , CLS_D, /* digit, non-digit */
+ CLS_s , CLS_S, /* space, non-space */
+ CLS_w , CLS_W, /* word, non-word */
+ CLS_an , CLS_AN, /* alphanum */
+ CLS_al , CLS_AL, /* alpha */
+ CLS_bl , CLS_BL, /* blank */
+ CLS_pu , CLS_PU, /* punct */
+ CLS_ct , CLS_CT, /* ctrl */
+ CLS_gr , CLS_GR, /* graphic */
+ CLS_lo , CLS_LO, /* lower */
+ CLS_pr , CLS_PR, /* print */
+ CLS_up , CLS_UP, /* upper */
+ CLS_xd , CLS_XD, /* xdigit */
+ ANY = 0xB0, /* Any character except newline, . */
+ ANYNL , /* Any character including newline, . */
+ NOP , /* No operation, internal use only */
+ BOL , /* Beginning of line, ^ */
+ EOL , /* End of line, $ */
+ CCLASS , /* Character class, [] */
+ NCCLASS , /* Negated character class, [] */
+ WBOUND , /* Non-word boundary, not consuming meta char */
+ NWBOUND , /* Word boundary, not consuming meta char */
+ END = 0xBF, /* Terminate: match found */
+};
+
+/*
+ * regexec execution lists
+ */
+#define LISTSIZE 10
+#define BIGLISTSIZE (10*LISTSIZE)
+
+typedef struct Relist
+{
+ Reinst* inst; /* Reinstruction of the thread */
+ Resublist se; /* matched subexpressions in this thread */
+} Relist;
+
+typedef struct Reljunk
+{
+ Relist* relist[2];
+ Relist* reliste[2];
+ int starttype;
+ Rune startchar;
+ const char* starts;
+ const char* eol;
+} Reljunk;
+
+/*
+ * utf8 and Rune code
+ */
+
+static int
+chartorune(Rune *rune, const char *s)
+{
+ utf8_decode_t ctx = {UTF8_OK};
+ const uint8_t *b = (const uint8_t*)s;
+ utf8_decode(&ctx, *b++);
+ switch (ctx.len) {
+ case 4: utf8_decode(&ctx, *b++);
+ case 3: utf8_decode(&ctx, *b++);
+ case 2: utf8_decode(&ctx, *b++);
+ }
+ *rune = ctx.codep;
+ return ctx.len;
+}
+
+static const char*
+utfrune(const char *s, Rune c)
+{
+ Rune r;
+
+ if (c < 128) /* ascii */
+ return strchr((char *)s, c);
+
+ for (;;) {
+ int n = chartorune(&r, s);
+ if (r == c) return s;
+ if ((r == 0) | (n == 0)) return NULL;
+ s += n;
+ }
+}
+
+static const char*
+utfruneicase(const char *s, Rune c)
+{
+ Rune r;
+ c = utf8_tolower(c);
+ for (;;) {
+ int n = chartorune(&r, s);
+ if (utf8_tolower(r) == c) return s;
+ if ((r == 0) | (n == 0)) return NULL;
+ s += n;
+ }
+}
+
+/************
+ * regaux.c *
+ ************/
+
+/*
+ * save a new match in mp
+ */
+static void
+_renewmatch(Resub *mp, int ms, Resublist *sp, int nsubids)
+{
+ int i;
+
+ if (mp==NULL || ms<=0)
+ return;
+ if (mp[0].str == NULL || sp->m[0].str < mp[0].str ||
+ (sp->m[0].str == mp[0].str && sp->m[0].len > mp[0].len)) {
+ for (i=0; i<ms && i<=nsubids; i++)
+ mp[i] = sp->m[i];
+ }
+}
+
+/*
+ * Note optimization in _renewthread:
+ * *lp must be pending when _renewthread called; if *l has been looked
+ * at already, the optimization is a bug.
+ */
+static Relist*
+_renewthread(Relist *lp, /* _relist to add to */
+ Reinst *ip, /* instruction to add */
+ int ms,
+ Resublist *sep) /* pointers to subexpressions */
+{
+ Relist *p;
+
+ for (p=lp; p->inst; p++) {
+ if (p->inst == ip) {
+ if (sep->m[0].str < p->se.m[0].str) {
+ if (ms > 1)
+ p->se = *sep;
+ else
+ p->se.m[0] = sep->m[0];
+ }
+ return 0;
+ }
+ }
+ p->inst = ip;
+ if (ms > 1)
+ p->se = *sep;
+ else
+ p->se.m[0] = sep->m[0];
+ (++p)->inst = NULL;
+ return p;
+}
+
+/*
+ * same as renewthread, but called with
+ * initial empty start pointer.
+ */
+static Relist*
+_renewemptythread(Relist *lp, /* _relist to add to */
+ Reinst *ip, /* instruction to add */
+ int ms,
+ const char *sp) /* pointers to subexpressions */
+{
+ Relist *p;
+
+ for (p=lp; p->inst; p++) {
+ if (p->inst == ip) {
+ if (sp < p->se.m[0].str) {
+ if (ms > 1)
+ memset(&p->se, 0, sizeof(p->se));
+ p->se.m[0].str = sp;
+ }
+ return 0;
+ }
+ }
+ p->inst = ip;
+ if (ms > 1)
+ memset(&p->se, 0, sizeof(p->se));
+ p->se.m[0].str = sp;
+ (++p)->inst = NULL;
+ return p;
+}
+
+/*
+ * Parser Information
+ */
+typedef struct Node
+{
+ Reinst* first;
+ Reinst* last;
+} Node;
+
+#define NSTACK 20
+typedef struct Parser
+{
+ const char* exprp; /* pointer to next character in source expression */
+ Node andstack[NSTACK];
+ Node* andp;
+ short atorstack[NSTACK];
+ short* atorp;
+ short subidstack[NSTACK]; /* parallel to atorstack */
+ short* subidp;
+ short cursubid; /* id of current subexpression */
+ int errors;
+ bool ignorecase;
+ bool lastwasand; /* Last token was operand */
+ bool lexdone;
+ short nbra;
+ short nclass;
+ Rune yyrune; /* last lex'd rune */
+ Reclass *yyclassp; /* last lex'd class */
+ Reclass* classp;
+ Reinst* freep;
+ jmp_buf regkaboom;
+} Parser;
+
+/* predeclared crap */
+static void _operator(Parser *par, int type);
+static void pushand(Parser *par, Reinst *first, Reinst *last);
+static void pushator(Parser *par, int type);
+static void evaluntil(Parser *par, int type);
+static int bldcclass(Parser *par);
+
+static void
+rcerror(Parser *par, cregex_error_t err)
+{
+ par->errors = err;
+ longjmp(par->regkaboom, 1);
+}
+
+static Reinst*
+newinst(Parser *par, int t)
+{
+ par->freep->type = t;
+ par->freep->l.left = 0;
+ par->freep->r.right = 0;
+ return par->freep++;
+}
+
+static void
+operand(Parser *par, int t)
+{
+ Reinst *i;
+
+ if (par->lastwasand)
+ _operator(par, CAT); /* catenate is implicit */
+ i = newinst(par, t);
+
+ if (t == CCLASS || t == NCCLASS)
+ i->r.classp = par->yyclassp;
+ if (t == RUNE)
+ i->r.rune = par->yyrune;
+
+ pushand(par, i, i);
+ par->lastwasand = true;
+}
+
+static void
+_operator(Parser *par, int t)
+{
+ if (t==RBRA && --par->nbra<0)
+ rcerror(par, creg_unmatchedrightparenthesis);
+ if (t==LBRA) {
+ if (++par->cursubid >= NSUBEXP)
+ rcerror(par, creg_toomanysubexpressions);
+ par->nbra++;
+ if (par->lastwasand)
+ _operator(par, CAT);
+ } else
+ evaluntil(par, t);
+ if (t != RBRA)
+ pushator(par, t);
+ par->lastwasand = 0;
+ if (t==STAR || t==QUEST || t==PLUS || t==RBRA)
+ par->lastwasand = true; /* these look like operands */
+}
+
+static void
+pushand(Parser *par, Reinst *f, Reinst *l)
+{
+ if (par->andp >= &par->andstack[NSTACK])
+ rcerror(par, creg_operandstackoverflow);
+ par->andp->first = f;
+ par->andp->last = l;
+ par->andp++;
+}
+
+static void
+pushator(Parser *par, int t)
+{
+ if (par->atorp >= &par->atorstack[NSTACK])
+ rcerror(par, creg_operatorstackoverflow);
+ *par->atorp++ = t;
+ *par->subidp++ = par->cursubid;
+}
+
+static Node*
+popand(Parser *par, int op)
+{
+ Reinst *inst;
+
+ if (par->andp <= &par->andstack[0]) {
+ rcerror(par, creg_missingoperand);
+ inst = newinst(par, NOP);
+ pushand(par, inst, inst);
+ }
+ return --par->andp;
+}
+
+static int
+popator(Parser *par)
+{
+ if (par->atorp <= &par->atorstack[0])
+ rcerror(par, creg_operatorstackunderflow);
+ --par->subidp;
+ return *--par->atorp;
+}
+
+static void
+evaluntil(Parser *par, int pri)
+{
+ Node *op1, *op2;
+ Reinst *inst1, *inst2;
+
+ while (pri==RBRA || par->atorp[-1]>=pri) {
+ switch (popator(par)) {
+ default:
+ rcerror(par, creg_unknownoperator);
+ break;
+ case LBRA: /* must have been RBRA */
+ op1 = popand(par, '(');
+ inst2 = newinst(par, RBRA);
+ inst2->r.subid = *par->subidp;
+ op1->last->l.next = inst2;
+ inst1 = newinst(par, LBRA);
+ inst1->r.subid = *par->subidp;
+ inst1->l.next = op1->first;
+ pushand(par, inst1, inst2);
+ return;
+ case OR:
+ op2 = popand(par, '|');
+ op1 = popand(par, '|');
+ inst2 = newinst(par, NOP);
+ op2->last->l.next = inst2;
+ op1->last->l.next = inst2;
+ inst1 = newinst(par, OR);
+ inst1->r.right = op1->first;
+ inst1->l.left = op2->first;
+ pushand(par, inst1, inst2);
+ break;
+ case CAT:
+ op2 = popand(par, 0);
+ op1 = popand(par, 0);
+ op1->last->l.next = op2->first;
+ pushand(par, op1->first, op2->last);
+ break;
+ case STAR:
+ op2 = popand(par, '*');
+ inst1 = newinst(par, OR);
+ op2->last->l.next = inst1;
+ inst1->r.right = op2->first;
+ pushand(par, inst1, inst1);
+ break;
+ case PLUS:
+ op2 = popand(par, '+');
+ inst1 = newinst(par, OR);
+ op2->last->l.next = inst1;
+ inst1->r.right = op2->first;
+ pushand(par, op2->first, inst1);
+ break;
+ case QUEST:
+ op2 = popand(par, '?');
+ inst1 = newinst(par, OR);
+ inst2 = newinst(par, NOP);
+ inst1->l.left = inst2;
+ inst1->r.right = op2->first;
+ op2->last->l.next = inst2;
+ pushand(par, inst1, inst2);
+ break;
+ }
+ }
+}
+
+static Reprog*
+optimize(Parser *par, Reprog *pp)
+{
+ Reinst *inst, *target;
+ size_t size;
+ Reprog *npp;
+ Reclass *cl;
+ ptrdiff_t diff;
+
+ /*
+ * get rid of NOOP chains
+ */
+ for (inst = pp->firstinst; inst->type != END; inst++) {
+ target = inst->l.next;
+ while (target->type == NOP)
+ target = target->l.next;
+ inst->l.next = target;
+ }
+
+ /*
+ * The original allocation is for an area larger than
+ * necessary. Reallocate to the actual space used
+ * and then relocate the code.
+ */
+ size = sizeof(Reprog) + (par->freep - pp->firstinst)*sizeof(Reinst);
+ npp = (Reprog *)realloc(pp, size);
+ if (npp==NULL || npp==pp)
+ return pp;
+ diff = (char *)npp - (char *)pp;
+ par->freep = (Reinst *)((char *)par->freep + diff);
+ for (inst = npp->firstinst; inst < par->freep; inst++) {
+ switch (inst->type) {
+ case OR:
+ case STAR:
+ case PLUS:
+ case QUEST:
+ inst->r.right = (Reinst *)((char*)inst->r.right + diff);
+ break;
+ case CCLASS:
+ case NCCLASS:
+ inst->r.right = (Reinst *)((char*)inst->r.right + diff);
+ cl = inst->r.classp;
+ cl->end = (Rune *)((char*)cl->end + diff);
+ break;
+ }
+ inst->l.left = (Reinst *)((char*)inst->l.left + diff);
+ }
+ npp->startinst = (Reinst *)((char*)npp->startinst + diff);
+ return npp;
+}
+
+#ifdef DEBUG
+static void
+dumpstack(Parser *par) {
+ Node *stk;
+ int *ip;
+
+ print("operators\n");
+ for (ip = par->atorstack; ip < par->atorp; ip++)
+ print("0%o\n", *ip);
+ print("operands\n");
+ for (stk = par->andstack; stk < par->andp; stk++)
+ print("0%o\t0%o\n", stk->first->type, stk->last->type);
+}
+
+static void
+dump(Reprog *pp)
+{
+ Reinst *l;
+ Rune *p;
+
+ l = pp->firstinst;
+ do {
+ print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
+ l->l.left-pp->firstinst, l->r.right-pp->firstinst);
+ if (l->type == RUNE)
+ print("\t%C\n", l->r.rune);
+ else if (l->type == CCLASS || l->type == NCCLASS) {
+ print("\t[");
+ if (l->type == NCCLASS)
+ print("^");
+ for (p = l->r.classp->spans; p < l->r.classp->end; p += 2)
+ if (p[0] == p[1])
+ print("%C", p[0]);
+ else
+ print("%C-%C", p[0], p[1]);
+ print("]\n");
+ } else
+ print("\n");
+ } while (l++->type);
+}
+#endif
+
+static Reclass*
+newclass(Parser *par)
+{
+ if (par->nclass >= NCLASS)
+ rcerror(par, creg_toomanycharacterclasses);
+ return &(par->classp[par->nclass++]);
+}
+
+static int
+nextc(Parser *par, Rune *rp)
+{
+ if (par->lexdone) {
+ *rp = 0;
+ return true;
+ }
+ par->exprp += chartorune(rp, par->exprp);
+ if (*rp == '\\') {
+ par->exprp += chartorune(rp, par->exprp);
+ switch (*rp) {
+ case 't': *rp = '\t'; break;
+ case 'n': *rp = '\n'; break;
+ case 'r': *rp = '\r'; break;
+ case 'v': *rp = '\v'; break;
+ case 'f': *rp = '\f'; break;
+ case 'd': *rp = CLS_d; break;
+ case 'D': *rp = CLS_D; break;
+ case 's': *rp = CLS_s; break;
+ case 'S': *rp = CLS_S; break;
+ case 'w': *rp = CLS_w; break;
+ case 'W': *rp = CLS_W; break;
+ }
+ return true;
+ }
+ if (*rp == 0)
+ par->lexdone = true;
+ return false;
+}
+
+static int
+lex(Parser *par, int* dot_type)
+{
+ int quoted;
+ start:
+ quoted = nextc(par, &par->yyrune);
+ if (quoted) switch (par->yyrune) {
+ case 0 : return END;
+ case 'b': return WBOUND;
+ case 'B': return NWBOUND;
+ default : return RUNE;
+ }
+
+ switch (par->yyrune) {
+ case 0 : return END;
+ case '*': return STAR;
+ case '?': return QUEST;
+ case '+': return PLUS;
+ case '|': return OR;
+ case '.': return *dot_type;
+ case '(':
+ if (par->exprp[0] == '?') {
+ for (int k = 1, inv = 0; ; ++k) switch (par->exprp[k]) {
+ case 0 : par->exprp += k; return END;
+ case ')': par->exprp += k + 1; goto start;
+ case '-': inv = 1; break;
+ case 's': *dot_type = inv ? ANY : ANYNL; break;
+ case 'i': par->ignorecase = !inv; break;
+ }
+ }
+ return LBRA;
+ case ')': return RBRA;
+ case '^': return BOL;
+ case '$': return EOL;
+ case '[': return bldcclass(par);
+ }
+ return RUNE;
+}
+
+static int
+bldcclass(Parser *par)
+{
+ int type;
+ Rune r[NCCRUNE];
+ Rune *p, *ep, *np;
+ Rune rune;
+ int quoted;
+
+ /* we have already seen the '[' */
+ type = CCLASS;
+ par->yyclassp = newclass(par);
+
+ /* look ahead for negation */
+ /* SPECIAL CASE!!! negated classes don't match \n */
+ ep = r;
+ quoted = nextc(par, &rune);
+ if (!quoted && rune == '^') {
+ type = NCCLASS;
+ quoted = nextc(par, &rune);
+ *ep++ = '\n';
+ *ep++ = '\n';
+ }
+
+ /* parse class into a set of spans */
+ for (; ep < &r[NCCRUNE]; quoted = nextc(par, &rune)) {
+ if (rune == 0) {
+ rcerror(par, creg_malformedcharacterclass);
+ return 0;
+ }
+ if (!quoted) {
+ if (rune == ']')
+ break;
+ if (rune == '-') {
+ if (ep != r && *par->exprp != ']') {
+ quoted = nextc(par, &rune);
+ if (rune == 0) {
+ rcerror(par, creg_malformedcharacterclass);
+ return 0;
+ }
+ ep[-1] = rune;
+ continue;
+ }
+ }
+ if (rune == '[' && *par->exprp == ':') {
+ static struct { const char* c; int n, r; } cls[] = {
+ {":alnum:]", 8, CLS_an}, {":alpha:]", 8, CLS_al}, {":blank:]", 8, CLS_bl},
+ {":cntrl:]", 8, CLS_ct}, {":digit:]", 8, CLS_d}, {":graph:]", 8, CLS_gr},
+ {":lower:]", 8, CLS_lo}, {":print:]", 8, CLS_pr}, {":punct:]", 8, CLS_pu},
+ {":space:]", 8, CLS_s}, {":upper:]", 8, CLS_up}, {":xdigit:]", 9, CLS_xd},
+ {":word:]", 7, CLS_w},
+ };
+ for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i)
+ if (!strncmp(par->exprp, cls[i].c, cls[i].n)) {
+ rune = cls[i].r;
+ par->exprp += cls[i].n;
+ break;
+ }
+ }
+ }
+ *ep++ = rune;
+ *ep++ = rune;
+ }
+
+ /* sort on span start */
+ for (p = r; p < ep; p += 2) {
+ for (np = p; np < ep; np += 2)
+ if (*np < *p) {
+ rune = np[0];
+ np[0] = p[0];
+ p[0] = rune;
+ rune = np[1];
+ np[1] = p[1];
+ p[1] = rune;
+ }
+ }
+
+ /* merge spans */
+ np = par->yyclassp->spans;
+ p = r;
+ if (r == ep)
+ par->yyclassp->end = np;
+ else {
+ np[0] = *p++;
+ np[1] = *p++;
+ for (; p < ep; p += 2)
+ if (p[0] <= np[1]) {
+ if (p[1] > np[1])
+ np[1] = p[1];
+ } else {
+ np += 2;
+ np[0] = p[0];
+ np[1] = p[1];
+ }
+ par->yyclassp->end = np+2;
+ }
+
+ return type;
+}
+
+static Reprog*
+regcomp1(Parser *par, const char *s, int dot_type)
+{
+ int token;
+ Reprog *volatile pp;
+
+ /* get memory for the program. estimated max usage */
+ const int instcap = 5 + 6*strlen(s);
+ pp = (Reprog *)malloc(sizeof(Reprog) + instcap*sizeof(Reinst));
+ if (pp == NULL) {
+ rcerror(par, creg_outofmemory);
+ return NULL;
+ }
+ pp->flags.ignorecase = false;
+ pp->flags.dotall = (dot_type == ANYNL);
+ par->freep = pp->firstinst;
+ par->classp = pp->cclass;
+ par->errors = 0;
+
+ if (setjmp(par->regkaboom))
+ goto out;
+
+ /* go compile the sucker */
+ par->lexdone = false;
+ par->ignorecase = false;
+ par->exprp = s;
+ par->nclass = 0;
+ par->nbra = 0;
+ par->atorp = par->atorstack;
+ par->andp = par->andstack;
+ par->subidp = par->subidstack;
+ par->lastwasand = false;
+ par->cursubid = 0;
+
+ /* Start with a low priority operator to prime parser */
+ pushator(par, START-1);
+ while ((token = lex(par, &dot_type)) != END) {
+ if ((token & 0xF0) == OPERATOR)
+ _operator(par, token);
+ else
+ operand(par, token);
+ }
+
+ /* Close with a low priority operator */
+ evaluntil(par, START);
+
+ /* Force END */
+ operand(par, END);
+ evaluntil(par, START);
+#ifdef DEBUG
+ dumpstack(par);
+#endif
+ if (par->nbra)
+ rcerror(par, creg_unmatchedleftparenthesis);
+ --par->andp; /* points to first and only operand */
+ pp->startinst = par->andp->first;
+#ifdef DEBUG
+ dump(pp);
+#endif
+ pp = optimize(par, pp);
+ pp->flags.ignorecase |= par->ignorecase;
+ pp->nsubids = par->cursubid;
+#ifdef DEBUG
+ print("start: %d\n", par->andp->first-pp->firstinst);
+ dump(pp);
+#endif
+out:
+ if (par->errors) {
+ free(pp);
+ pp = NULL;
+ }
+ return pp;
+}
+
+
+static int
+runematch(Rune s, Rune r, bool icase)
+{
+ int inv = 0;
+ switch (s) {
+ case CLS_D: inv = true; /* fallthrough */
+ case CLS_d: return inv ^ (isdigit(r) != 0);
+ case CLS_S: inv = true;
+ case CLS_s: return inv ^ (isspace(r) != 0);
+ case CLS_W: inv = true;
+ case CLS_w: return inv ^ (utf8_isalnum(r) | (r == '_'));
+ case CLS_al: return utf8_isalpha(r);
+ case CLS_bl: return ((r == ' ') | (r == '\t'));
+ case CLS_ct: return iscntrl(r) != 0;
+ case CLS_gr: return isgraph(r) != 0;
+ case CLS_an: return utf8_isalnum(r);
+ case CLS_pr: return isprint(r) != 0;
+ case CLS_pu: return ispunct(r) != 0;
+ case CLS_xd: return isxdigit(r) != 0;
+ case CLS_lo: return icase ? utf8_isalpha(s) : utf8_islower(r);
+ case CLS_up: return icase ? utf8_isalpha(s) : utf8_isupper(r);
+ }
+ return icase ? utf8_tolower(s) == utf8_tolower(r) : s == r;
+}
+
+/*
+ * return 0 if no match
+ * >0 if a match
+ * <0 if we ran out of _relist space
+ */
+static int
+regexec1(const Reprog *progp, /* program to run */
+ const char *bol, /* string to run machine on */
+ Resub *mp, /* subexpression elements */
+ int ms, /* number of elements at mp */
+ Reljunk *j,
+ int mflags
+)
+{
+ int flag=0;
+ Reinst *inst;
+ Relist *tlp;
+ Relist *tl, *nl; /* This list, next list */
+ Relist *tle, *nle; /* Ends of this and next list */
+ const char *s, *p;
+ int i, n, checkstart;
+ Rune r, *rp, *ep;
+ int match = 0;
+
+ bool icase = progp->flags.ignorecase || (mflags & creg_caseless);
+ checkstart = j->starttype;
+ if (mp)
+ for (i=0; i<ms; i++) {
+ mp[i].str = NULL;
+ mp[i].len = 0;
+ }
+ j->relist[0][0].inst = NULL;
+ j->relist[1][0].inst = NULL;
+
+ /* Execute machine once for each character, including terminal NUL */
+ s = j->starts;
+ do {
+ /* fast check for first char */
+ if (checkstart) {
+ switch (j->starttype) {
+ case RUNE:
+ p = icase ? utfruneicase(s, j->startchar)
+ : utfrune(s, j->startchar);
+ if (p == NULL || s == j->eol)
+ return match;
+ s = p;
+ break;
+ case BOL:
+ if (s == bol)
+ break;
+ p = utfrune(s, '\n');
+ if (p == NULL || s == j->eol)
+ return match;
+ s = p+1;
+ break;
+ }
+ }
+ n = chartorune(&r, s);
+
+ /* switch run lists */
+ tl = j->relist[flag];
+ tle = j->reliste[flag];
+ nl = j->relist[flag^=1];
+ nle = j->reliste[flag];
+ nl->inst = NULL;
+
+ /* Add first instruction to current list */
+ if (match == 0)
+ _renewemptythread(tl, progp->startinst, ms, s);
+
+ /* Execute machine until current list is empty */
+ for (tlp=tl; tlp->inst; tlp++) { /* assignment = */
+ for (inst = tlp->inst; ; inst = inst->l.next) {
+ int ok = false;
+
+ switch (inst->type) {
+ case RUNE: /* regular character */
+ ok = runematch(inst->r.rune, r, icase);
+ break;
+ case LBRA:
+ tlp->se.m[inst->r.subid].str = s;
+ continue;
+ case RBRA:
+ tlp->se.m[inst->r.subid].len = s - tlp->se.m[inst->r.subid].str;
+ continue;
+ case ANY:
+ ok = (r != '\n');
+ break;
+ case ANYNL:
+ ok = true;
+ break;
+ case BOL:
+ if (s == bol || *(s-1) == '\n')
+ continue;
+ break;
+ case EOL:
+ if (s == j->eol || r == 0 || r == '\n')
+ continue;
+ break;
+ case NWBOUND:
+ ok = true; /* fallthrough */
+ case WBOUND:
+ if (ok ^ (s == bol || s == j->eol || ((isalnum(s[-1]) || s[-1] == '_')
+ ^ (isalnum(s[ 0]) || s[ 0] == '_'))))
+ continue;
+ break;
+ case NCCLASS:
+ ok = true; /* fallthrough */
+ case CCLASS:
+ ep = inst->r.classp->end;
+ for (rp = inst->r.classp->spans; rp < ep; rp += 2) {
+ if ((r >= rp[0] && r <= rp[1]) || (rp[0] == rp[1] && runematch(rp[0], r, icase)))
+ break;
+ }
+ ok ^= (rp < ep);
+ break;
+ case OR:
+ /* evaluate right choice later */
+ if (_renewthread(tlp, inst->r.right, ms, &tlp->se) == tle)
+ return -1;
+ /* efficiency: advance and re-evaluate */
+ continue;
+ case END: /* Match! */
+ match = !(mflags & creg_fullmatch) ||
+ ((s == j->eol || r == 0 || r == '\n') &&
+ (tlp->se.m[0].str == bol || tlp->se.m[0].str[-1] == '\n'));
+ tlp->se.m[0].len = s - tlp->se.m[0].str;
+ if (mp != NULL)
+ _renewmatch(mp, ms, &tlp->se, progp->nsubids);
+ break;
+ }
+
+ if (ok && _renewthread(nl, inst->l.next, ms, &tlp->se) == nle)
+ return -1;
+ break;
+ }
+ }
+ if (s == j->eol)
+ break;
+ checkstart = j->starttype && nl->inst==NULL;
+ s += n;
+ } while (r);
+ return match;
+}
+
+static int
+regexec2(const Reprog *progp, /* program to run */
+ const char *bol, /* string to run machine on */
+ Resub *mp, /* subexpression elements */
+ int ms, /* number of elements at mp */
+ Reljunk *j,
+ int mflags
+)
+{
+ int rv;
+ Relist *relists;
+
+ /* mark space */
+ relists = (Relist *)malloc(2 * BIGLISTSIZE*sizeof(Relist));
+ if (relists == NULL)
+ return -1;
+
+ j->relist[0] = relists;
+ j->relist[1] = relists + BIGLISTSIZE;
+ j->reliste[0] = relists + BIGLISTSIZE - 2;
+ j->reliste[1] = relists + 2*BIGLISTSIZE - 2;
+
+ rv = regexec1(progp, bol, mp, ms, j, mflags);
+ free(relists);
+ return rv;
+}
+
+static int
+regexec9(const Reprog *progp, /* program to run */
+ const char *bol, /* string to run machine on */
+ int ms, /* number of elements at mp */
+ Resub mp[], /* subexpression elements */
+ int mflags)
+{
+ Reljunk j;
+ Relist relist0[LISTSIZE], relist1[LISTSIZE];
+ int rv;
+
+ /*
+ * use user-specified starting/ending location if specified
+ */
+ j.starts = bol;
+ j.eol = NULL;
+
+ if (mp && mp->str && ms>0) {
+ if (mflags & creg_startend)
+ j.starts = mp->str, j.eol = mp->str + mp->len;
+ else if (mflags & creg_next)
+ j.starts = mp->str + mp->len;
+ }
+
+ j.starttype = 0;
+ j.startchar = 0;
+ if (progp->startinst->type == RUNE && progp->startinst->r.rune < 128) {
+ j.starttype = RUNE;
+ j.startchar = progp->startinst->r.rune;
+ }
+ if (progp->startinst->type == BOL)
+ j.starttype = BOL;
+
+ /* mark space */
+ j.relist[0] = relist0;
+ j.relist[1] = relist1;
+ j.reliste[0] = relist0 + LISTSIZE - 2;
+ j.reliste[1] = relist1 + LISTSIZE - 2;
+
+ rv = regexec1(progp, bol, mp, ms, &j, mflags);
+ if (rv >= 0)
+ return rv;
+ rv = regexec2(progp, bol, mp, ms, &j, mflags);
+ return rv;
+}
+
+/*
+ * API functions
+ */
+
+/* substitute into one string using the matches from the last regexec() */
+void cregex_replace(
+ const char *sp, /* source string */
+ char *dp, /* destination string */
+ int dlen,
+ int ms, /* number of elements pointed to by mp */
+ const cregmatch mp[]) /* subexpression elements */
+{
+ const char *ssp, *ep;
+ int i;
+
+ ep = dp+dlen-1;
+ while (*sp != '\0') {
+ if (*sp == '\\') {
+ switch (*++sp) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ i = *sp - '0';
+ if (mp[i].str != NULL && mp != NULL && ms > i)
+ for (ssp = mp[i].str; ssp < (mp[i].str + mp[i].len); ssp++)
+ if (dp < ep)
+ *dp++ = *ssp;
+ break;
+ case '\\':
+ if (dp < ep)
+ *dp++ = '\\';
+ break;
+ case '\0':
+ sp--;
+ break;
+ default:
+ if (dp < ep)
+ *dp++ = *sp;
+ break;
+ }
+ } else if (*sp == '&') {
+ if (mp[0].str != NULL && mp != NULL && ms > 0)
+ for (ssp = mp[0].str; ssp < (mp[0].str + mp[0].len); ssp++)
+ if (dp < ep)
+ *dp++ = *ssp;
+ } else {
+ if (dp < ep)
+ *dp++ = *sp;
+ }
+ sp++;
+ }
+ *dp = '\0';
+}
+
+int cregex_compile(cregex *rx, const char* pattern, int cflags) {
+ Parser par;
+ rx->prog = regcomp1(&par, pattern, cflags & creg_dotall ? ANYNL : ANY);
+ if (rx->prog) {
+ if (cflags & creg_caseless)
+ rx->prog->flags.ignorecase = true;
+ return 1 + rx->prog->nsubids;
+ }
+ return par.errors;
+}
+
+int cregex_captures(cregex rx) {
+ return rx.prog ? 1 + rx.prog->nsubids : 0;
+}
+
+int cregex_find(const cregex *rx, const char* string,
+ size_t nmatch, cregmatch match[], int mflags) {
+ int res = regexec9(rx->prog, string, nmatch, match, mflags);
+ switch (res) {
+ case 1: return 1 + rx->prog->nsubids;
+ case 0: return creg_nomatch;
+ default: return creg_matcherror;
+ }
+}
+
+void cregex_drop(cregex* self) {
+ free(self->prog);
+}
|
