From 29409b257d9144010bd608afc19f66ee2fbaa337 Mon Sep 17 00:00:00 2001 From: Tyge Løvset Date: Sat, 5 Feb 2022 23:21:51 +0100 Subject: Switched to heavily modified version of Rob Pike's plan9 regexp9. -> now renamed to cregex, with new API. --- src/cregex.c | 1167 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/cregex_utf8.c | 1165 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 2332 insertions(+) create mode 100644 src/cregex.c create mode 100644 src/cregex_utf8.c (limited to 'src') diff --git a/src/cregex.c b/src/cregex.c new file mode 100644 index 00000000..aaa6e62a --- /dev/null +++ b/src/cregex.c @@ -0,0 +1,1167 @@ +/* +This is a Unix port of the Plan 9 regular expression library, by Rob Pike. +Please send comments about the packaging to Russ Cox . + +Copyright © 2021 Plan 9 Foundation +Copyright © 2022 Tyge Løvset, for additions made in 2022. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cregex_utf8.c" + +typedef uint32_t Rune; +/* max character classes per program */ +#define NCLASS creg_max_classes +/* max subexpressions */ +#define NSUBEXP creg_max_captures +/* max rune ranges per character class */ +#define NCCRUNE (NSUBEXP * 2) + +/* + * character class, each pair of rune's defines a range + */ +typedef struct +{ + Rune *end; + Rune spans[NCCRUNE]; +} Reclass; + +/* + * Machine instructions + */ +typedef struct Reinst +{ + int type; + union { + Reclass *classp; /* class pointer */ + Rune rune; /* character */ + int subid; /* sub-expression id for RBRA and LBRA */ + struct Reinst *right; /* right child of OR */ + } r; + union { /* regexp relies on these two being in the same union */ + struct Reinst *left; /* left child of OR */ + struct Reinst *next; /* next instruction for CAT & LBRA */ + } l; +} Reinst; + +typedef struct { + bool ignorecase; + bool dotall; +} Reflags; + +/* + * Reprogram definition + */ +typedef struct Reprog +{ + Reinst *startinst; /* start pc */ + Reflags flags; + int nsubids; + Reclass cclass[NCLASS]; /* .data */ + Reinst firstinst[]; /* .text : originally 5 elements? */ +} Reprog; + +/* + * Sub expression matches + */ +typedef cregmatch Resub; + +/* + * substitution list + */ +typedef struct Resublist +{ + Resub m[NSUBEXP]; +} Resublist; + +/* + * Actions and Tokens (Reinst types) + * + * 0x80-0x8F: operators, value => precedence + * 0x90-0xAF: RUNE and char classes. + * 0xB0-0xBF: tokens, i.e. operands for operators + */ +enum { + OPERATOR = 0x80, /* Bitmask of all operators */ + START = 0x80, /* Start, used for marker on stack */ + RBRA , /* Right bracket, ) */ + LBRA , /* Left bracket, ( */ + OR , /* Alternation, | */ + CAT , /* Concatentation, implicit operator */ + STAR , /* Closure, * */ + PLUS , /* a+ == aa* */ + QUEST , /* a? == a|nothing, i.e. 0 or 1 a's */ + RUNE = 0x90, + CLS_d , CLS_D, /* digit, non-digit */ + CLS_s , CLS_S, /* space, non-space */ + CLS_w , CLS_W, /* word, non-word */ + CLS_an , CLS_AN, /* alphanum */ + CLS_al , CLS_AL, /* alpha */ + CLS_bl , CLS_BL, /* blank */ + CLS_pu , CLS_PU, /* punct */ + CLS_ct , CLS_CT, /* ctrl */ + CLS_gr , CLS_GR, /* graphic */ + CLS_lo , CLS_LO, /* lower */ + CLS_pr , CLS_PR, /* print */ + CLS_up , CLS_UP, /* upper */ + CLS_xd , CLS_XD, /* xdigit */ + ANY = 0xB0, /* Any character except newline, . */ + ANYNL , /* Any character including newline, . */ + NOP , /* No operation, internal use only */ + BOL , /* Beginning of line, ^ */ + EOL , /* End of line, $ */ + CCLASS , /* Character class, [] */ + NCCLASS , /* Negated character class, [] */ + WBOUND , /* Non-word boundary, not consuming meta char */ + NWBOUND , /* Word boundary, not consuming meta char */ + END = 0xBF, /* Terminate: match found */ +}; + +/* + * regexec execution lists + */ +#define LISTSIZE 10 +#define BIGLISTSIZE (10*LISTSIZE) + +typedef struct Relist +{ + Reinst* inst; /* Reinstruction of the thread */ + Resublist se; /* matched subexpressions in this thread */ +} Relist; + +typedef struct Reljunk +{ + Relist* relist[2]; + Relist* reliste[2]; + int starttype; + Rune startchar; + const char* starts; + const char* eol; +} Reljunk; + +/* + * utf8 and Rune code + */ + +static int +chartorune(Rune *rune, const char *s) +{ + utf8_decode_t ctx = {UTF8_OK}; + const uint8_t *b = (const uint8_t*)s; + utf8_decode(&ctx, *b++); + switch (ctx.len) { + case 4: utf8_decode(&ctx, *b++); + case 3: utf8_decode(&ctx, *b++); + case 2: utf8_decode(&ctx, *b++); + } + *rune = ctx.codep; + return ctx.len; +} + +static const char* +utfrune(const char *s, Rune c) +{ + Rune r; + + if (c < 128) /* ascii */ + return strchr((char *)s, c); + + for (;;) { + int n = chartorune(&r, s); + if (r == c) return s; + if ((r == 0) | (n == 0)) return NULL; + s += n; + } +} + +static const char* +utfruneicase(const char *s, Rune c) +{ + Rune r; + c = utf8_tolower(c); + for (;;) { + int n = chartorune(&r, s); + if (utf8_tolower(r) == c) return s; + if ((r == 0) | (n == 0)) return NULL; + s += n; + } +} + +/************ + * regaux.c * + ************/ + +/* + * save a new match in mp + */ +static void +_renewmatch(Resub *mp, int ms, Resublist *sp, int nsubids) +{ + int i; + + if (mp==NULL || ms<=0) + return; + if (mp[0].str == NULL || sp->m[0].str < mp[0].str || + (sp->m[0].str == mp[0].str && sp->m[0].len > mp[0].len)) { + for (i=0; im[i]; + } +} + +/* + * Note optimization in _renewthread: + * *lp must be pending when _renewthread called; if *l has been looked + * at already, the optimization is a bug. + */ +static Relist* +_renewthread(Relist *lp, /* _relist to add to */ + Reinst *ip, /* instruction to add */ + int ms, + Resublist *sep) /* pointers to subexpressions */ +{ + Relist *p; + + for (p=lp; p->inst; p++) { + if (p->inst == ip) { + if (sep->m[0].str < p->se.m[0].str) { + if (ms > 1) + p->se = *sep; + else + p->se.m[0] = sep->m[0]; + } + return 0; + } + } + p->inst = ip; + if (ms > 1) + p->se = *sep; + else + p->se.m[0] = sep->m[0]; + (++p)->inst = NULL; + return p; +} + +/* + * same as renewthread, but called with + * initial empty start pointer. + */ +static Relist* +_renewemptythread(Relist *lp, /* _relist to add to */ + Reinst *ip, /* instruction to add */ + int ms, + const char *sp) /* pointers to subexpressions */ +{ + Relist *p; + + for (p=lp; p->inst; p++) { + if (p->inst == ip) { + if (sp < p->se.m[0].str) { + if (ms > 1) + memset(&p->se, 0, sizeof(p->se)); + p->se.m[0].str = sp; + } + return 0; + } + } + p->inst = ip; + if (ms > 1) + memset(&p->se, 0, sizeof(p->se)); + p->se.m[0].str = sp; + (++p)->inst = NULL; + return p; +} + +/* + * Parser Information + */ +typedef struct Node +{ + Reinst* first; + Reinst* last; +} Node; + +#define NSTACK 20 +typedef struct Parser +{ + const char* exprp; /* pointer to next character in source expression */ + Node andstack[NSTACK]; + Node* andp; + short atorstack[NSTACK]; + short* atorp; + short subidstack[NSTACK]; /* parallel to atorstack */ + short* subidp; + short cursubid; /* id of current subexpression */ + int errors; + bool ignorecase; + bool lastwasand; /* Last token was operand */ + bool lexdone; + short nbra; + short nclass; + Rune yyrune; /* last lex'd rune */ + Reclass *yyclassp; /* last lex'd class */ + Reclass* classp; + Reinst* freep; + jmp_buf regkaboom; +} Parser; + +/* predeclared crap */ +static void _operator(Parser *par, int type); +static void pushand(Parser *par, Reinst *first, Reinst *last); +static void pushator(Parser *par, int type); +static void evaluntil(Parser *par, int type); +static int bldcclass(Parser *par); + +static void +rcerror(Parser *par, cregex_error_t err) +{ + par->errors = err; + longjmp(par->regkaboom, 1); +} + +static Reinst* +newinst(Parser *par, int t) +{ + par->freep->type = t; + par->freep->l.left = 0; + par->freep->r.right = 0; + return par->freep++; +} + +static void +operand(Parser *par, int t) +{ + Reinst *i; + + if (par->lastwasand) + _operator(par, CAT); /* catenate is implicit */ + i = newinst(par, t); + + if (t == CCLASS || t == NCCLASS) + i->r.classp = par->yyclassp; + if (t == RUNE) + i->r.rune = par->yyrune; + + pushand(par, i, i); + par->lastwasand = true; +} + +static void +_operator(Parser *par, int t) +{ + if (t==RBRA && --par->nbra<0) + rcerror(par, creg_unmatchedrightparenthesis); + if (t==LBRA) { + if (++par->cursubid >= NSUBEXP) + rcerror(par, creg_toomanysubexpressions); + par->nbra++; + if (par->lastwasand) + _operator(par, CAT); + } else + evaluntil(par, t); + if (t != RBRA) + pushator(par, t); + par->lastwasand = 0; + if (t==STAR || t==QUEST || t==PLUS || t==RBRA) + par->lastwasand = true; /* these look like operands */ +} + +static void +pushand(Parser *par, Reinst *f, Reinst *l) +{ + if (par->andp >= &par->andstack[NSTACK]) + rcerror(par, creg_operandstackoverflow); + par->andp->first = f; + par->andp->last = l; + par->andp++; +} + +static void +pushator(Parser *par, int t) +{ + if (par->atorp >= &par->atorstack[NSTACK]) + rcerror(par, creg_operatorstackoverflow); + *par->atorp++ = t; + *par->subidp++ = par->cursubid; +} + +static Node* +popand(Parser *par, int op) +{ + Reinst *inst; + + if (par->andp <= &par->andstack[0]) { + rcerror(par, creg_missingoperand); + inst = newinst(par, NOP); + pushand(par, inst, inst); + } + return --par->andp; +} + +static int +popator(Parser *par) +{ + if (par->atorp <= &par->atorstack[0]) + rcerror(par, creg_operatorstackunderflow); + --par->subidp; + return *--par->atorp; +} + +static void +evaluntil(Parser *par, int pri) +{ + Node *op1, *op2; + Reinst *inst1, *inst2; + + while (pri==RBRA || par->atorp[-1]>=pri) { + switch (popator(par)) { + default: + rcerror(par, creg_unknownoperator); + break; + case LBRA: /* must have been RBRA */ + op1 = popand(par, '('); + inst2 = newinst(par, RBRA); + inst2->r.subid = *par->subidp; + op1->last->l.next = inst2; + inst1 = newinst(par, LBRA); + inst1->r.subid = *par->subidp; + inst1->l.next = op1->first; + pushand(par, inst1, inst2); + return; + case OR: + op2 = popand(par, '|'); + op1 = popand(par, '|'); + inst2 = newinst(par, NOP); + op2->last->l.next = inst2; + op1->last->l.next = inst2; + inst1 = newinst(par, OR); + inst1->r.right = op1->first; + inst1->l.left = op2->first; + pushand(par, inst1, inst2); + break; + case CAT: + op2 = popand(par, 0); + op1 = popand(par, 0); + op1->last->l.next = op2->first; + pushand(par, op1->first, op2->last); + break; + case STAR: + op2 = popand(par, '*'); + inst1 = newinst(par, OR); + op2->last->l.next = inst1; + inst1->r.right = op2->first; + pushand(par, inst1, inst1); + break; + case PLUS: + op2 = popand(par, '+'); + inst1 = newinst(par, OR); + op2->last->l.next = inst1; + inst1->r.right = op2->first; + pushand(par, op2->first, inst1); + break; + case QUEST: + op2 = popand(par, '?'); + inst1 = newinst(par, OR); + inst2 = newinst(par, NOP); + inst1->l.left = inst2; + inst1->r.right = op2->first; + op2->last->l.next = inst2; + pushand(par, inst1, inst2); + break; + } + } +} + +static Reprog* +optimize(Parser *par, Reprog *pp) +{ + Reinst *inst, *target; + size_t size; + Reprog *npp; + Reclass *cl; + ptrdiff_t diff; + + /* + * get rid of NOOP chains + */ + for (inst = pp->firstinst; inst->type != END; inst++) { + target = inst->l.next; + while (target->type == NOP) + target = target->l.next; + inst->l.next = target; + } + + /* + * The original allocation is for an area larger than + * necessary. Reallocate to the actual space used + * and then relocate the code. + */ + size = sizeof(Reprog) + (par->freep - pp->firstinst)*sizeof(Reinst); + npp = (Reprog *)realloc(pp, size); + if (npp==NULL || npp==pp) + return pp; + diff = (char *)npp - (char *)pp; + par->freep = (Reinst *)((char *)par->freep + diff); + for (inst = npp->firstinst; inst < par->freep; inst++) { + switch (inst->type) { + case OR: + case STAR: + case PLUS: + case QUEST: + inst->r.right = (Reinst *)((char*)inst->r.right + diff); + break; + case CCLASS: + case NCCLASS: + inst->r.right = (Reinst *)((char*)inst->r.right + diff); + cl = inst->r.classp; + cl->end = (Rune *)((char*)cl->end + diff); + break; + } + inst->l.left = (Reinst *)((char*)inst->l.left + diff); + } + npp->startinst = (Reinst *)((char*)npp->startinst + diff); + return npp; +} + +#ifdef DEBUG +static void +dumpstack(Parser *par) { + Node *stk; + int *ip; + + print("operators\n"); + for (ip = par->atorstack; ip < par->atorp; ip++) + print("0%o\n", *ip); + print("operands\n"); + for (stk = par->andstack; stk < par->andp; stk++) + print("0%o\t0%o\n", stk->first->type, stk->last->type); +} + +static void +dump(Reprog *pp) +{ + Reinst *l; + Rune *p; + + l = pp->firstinst; + do { + print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, + l->l.left-pp->firstinst, l->r.right-pp->firstinst); + if (l->type == RUNE) + print("\t%C\n", l->r.rune); + else if (l->type == CCLASS || l->type == NCCLASS) { + print("\t["); + if (l->type == NCCLASS) + print("^"); + for (p = l->r.classp->spans; p < l->r.classp->end; p += 2) + if (p[0] == p[1]) + print("%C", p[0]); + else + print("%C-%C", p[0], p[1]); + print("]\n"); + } else + print("\n"); + } while (l++->type); +} +#endif + +static Reclass* +newclass(Parser *par) +{ + if (par->nclass >= NCLASS) + rcerror(par, creg_toomanycharacterclasses); + return &(par->classp[par->nclass++]); +} + +static int +nextc(Parser *par, Rune *rp) +{ + if (par->lexdone) { + *rp = 0; + return true; + } + par->exprp += chartorune(rp, par->exprp); + if (*rp == '\\') { + par->exprp += chartorune(rp, par->exprp); + switch (*rp) { + case 't': *rp = '\t'; break; + case 'n': *rp = '\n'; break; + case 'r': *rp = '\r'; break; + case 'v': *rp = '\v'; break; + case 'f': *rp = '\f'; break; + case 'd': *rp = CLS_d; break; + case 'D': *rp = CLS_D; break; + case 's': *rp = CLS_s; break; + case 'S': *rp = CLS_S; break; + case 'w': *rp = CLS_w; break; + case 'W': *rp = CLS_W; break; + } + return true; + } + if (*rp == 0) + par->lexdone = true; + return false; +} + +static int +lex(Parser *par, int* dot_type) +{ + int quoted; + start: + quoted = nextc(par, &par->yyrune); + if (quoted) switch (par->yyrune) { + case 0 : return END; + case 'b': return WBOUND; + case 'B': return NWBOUND; + default : return RUNE; + } + + switch (par->yyrune) { + case 0 : return END; + case '*': return STAR; + case '?': return QUEST; + case '+': return PLUS; + case '|': return OR; + case '.': return *dot_type; + case '(': + if (par->exprp[0] == '?') { + for (int k = 1, inv = 0; ; ++k) switch (par->exprp[k]) { + case 0 : par->exprp += k; return END; + case ')': par->exprp += k + 1; goto start; + case '-': inv = 1; break; + case 's': *dot_type = inv ? ANY : ANYNL; break; + case 'i': par->ignorecase = !inv; break; + } + } + return LBRA; + case ')': return RBRA; + case '^': return BOL; + case '$': return EOL; + case '[': return bldcclass(par); + } + return RUNE; +} + +static int +bldcclass(Parser *par) +{ + int type; + Rune r[NCCRUNE]; + Rune *p, *ep, *np; + Rune rune; + int quoted; + + /* we have already seen the '[' */ + type = CCLASS; + par->yyclassp = newclass(par); + + /* look ahead for negation */ + /* SPECIAL CASE!!! negated classes don't match \n */ + ep = r; + quoted = nextc(par, &rune); + if (!quoted && rune == '^') { + type = NCCLASS; + quoted = nextc(par, &rune); + *ep++ = '\n'; + *ep++ = '\n'; + } + + /* parse class into a set of spans */ + for (; ep < &r[NCCRUNE]; quoted = nextc(par, &rune)) { + if (rune == 0) { + rcerror(par, creg_malformedcharacterclass); + return 0; + } + if (!quoted) { + if (rune == ']') + break; + if (rune == '-') { + if (ep != r && *par->exprp != ']') { + quoted = nextc(par, &rune); + if (rune == 0) { + rcerror(par, creg_malformedcharacterclass); + return 0; + } + ep[-1] = rune; + continue; + } + } + if (rune == '[' && *par->exprp == ':') { + static struct { const char* c; int n, r; } cls[] = { + {":alnum:]", 8, CLS_an}, {":alpha:]", 8, CLS_al}, {":blank:]", 8, CLS_bl}, + {":cntrl:]", 8, CLS_ct}, {":digit:]", 8, CLS_d}, {":graph:]", 8, CLS_gr}, + {":lower:]", 8, CLS_lo}, {":print:]", 8, CLS_pr}, {":punct:]", 8, CLS_pu}, + {":space:]", 8, CLS_s}, {":upper:]", 8, CLS_up}, {":xdigit:]", 9, CLS_xd}, + {":word:]", 7, CLS_w}, + }; + for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i) + if (!strncmp(par->exprp, cls[i].c, cls[i].n)) { + rune = cls[i].r; + par->exprp += cls[i].n; + break; + } + } + } + *ep++ = rune; + *ep++ = rune; + } + + /* sort on span start */ + for (p = r; p < ep; p += 2) { + for (np = p; np < ep; np += 2) + if (*np < *p) { + rune = np[0]; + np[0] = p[0]; + p[0] = rune; + rune = np[1]; + np[1] = p[1]; + p[1] = rune; + } + } + + /* merge spans */ + np = par->yyclassp->spans; + p = r; + if (r == ep) + par->yyclassp->end = np; + else { + np[0] = *p++; + np[1] = *p++; + for (; p < ep; p += 2) + if (p[0] <= np[1]) { + if (p[1] > np[1]) + np[1] = p[1]; + } else { + np += 2; + np[0] = p[0]; + np[1] = p[1]; + } + par->yyclassp->end = np+2; + } + + return type; +} + +static Reprog* +regcomp1(Parser *par, const char *s, int dot_type) +{ + int token; + Reprog *volatile pp; + + /* get memory for the program. estimated max usage */ + const int instcap = 5 + 6*strlen(s); + pp = (Reprog *)malloc(sizeof(Reprog) + instcap*sizeof(Reinst)); + if (pp == NULL) { + rcerror(par, creg_outofmemory); + return NULL; + } + pp->flags.ignorecase = false; + pp->flags.dotall = (dot_type == ANYNL); + par->freep = pp->firstinst; + par->classp = pp->cclass; + par->errors = 0; + + if (setjmp(par->regkaboom)) + goto out; + + /* go compile the sucker */ + par->lexdone = false; + par->ignorecase = false; + par->exprp = s; + par->nclass = 0; + par->nbra = 0; + par->atorp = par->atorstack; + par->andp = par->andstack; + par->subidp = par->subidstack; + par->lastwasand = false; + par->cursubid = 0; + + /* Start with a low priority operator to prime parser */ + pushator(par, START-1); + while ((token = lex(par, &dot_type)) != END) { + if ((token & 0xF0) == OPERATOR) + _operator(par, token); + else + operand(par, token); + } + + /* Close with a low priority operator */ + evaluntil(par, START); + + /* Force END */ + operand(par, END); + evaluntil(par, START); +#ifdef DEBUG + dumpstack(par); +#endif + if (par->nbra) + rcerror(par, creg_unmatchedleftparenthesis); + --par->andp; /* points to first and only operand */ + pp->startinst = par->andp->first; +#ifdef DEBUG + dump(pp); +#endif + pp = optimize(par, pp); + pp->flags.ignorecase |= par->ignorecase; + pp->nsubids = par->cursubid; +#ifdef DEBUG + print("start: %d\n", par->andp->first-pp->firstinst); + dump(pp); +#endif +out: + if (par->errors) { + free(pp); + pp = NULL; + } + return pp; +} + + +static int +runematch(Rune s, Rune r, bool icase) +{ + int inv = 0; + switch (s) { + case CLS_D: inv = true; /* fallthrough */ + case CLS_d: return inv ^ (isdigit(r) != 0); + case CLS_S: inv = true; + case CLS_s: return inv ^ (isspace(r) != 0); + case CLS_W: inv = true; + case CLS_w: return inv ^ (utf8_isalnum(r) | (r == '_')); + case CLS_al: return utf8_isalpha(r); + case CLS_bl: return ((r == ' ') | (r == '\t')); + case CLS_ct: return iscntrl(r) != 0; + case CLS_gr: return isgraph(r) != 0; + case CLS_an: return utf8_isalnum(r); + case CLS_pr: return isprint(r) != 0; + case CLS_pu: return ispunct(r) != 0; + case CLS_xd: return isxdigit(r) != 0; + case CLS_lo: return icase ? utf8_isalpha(s) : utf8_islower(r); + case CLS_up: return icase ? utf8_isalpha(s) : utf8_isupper(r); + } + return icase ? utf8_tolower(s) == utf8_tolower(r) : s == r; +} + +/* + * return 0 if no match + * >0 if a match + * <0 if we ran out of _relist space + */ +static int +regexec1(const Reprog *progp, /* program to run */ + const char *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms, /* number of elements at mp */ + Reljunk *j, + int mflags +) +{ + int flag=0; + Reinst *inst; + Relist *tlp; + Relist *tl, *nl; /* This list, next list */ + Relist *tle, *nle; /* Ends of this and next list */ + const char *s, *p; + int i, n, checkstart; + Rune r, *rp, *ep; + int match = 0; + + bool icase = progp->flags.ignorecase || (mflags & creg_caseless); + checkstart = j->starttype; + if (mp) + for (i=0; irelist[0][0].inst = NULL; + j->relist[1][0].inst = NULL; + + /* Execute machine once for each character, including terminal NUL */ + s = j->starts; + do { + /* fast check for first char */ + if (checkstart) { + switch (j->starttype) { + case RUNE: + p = icase ? utfruneicase(s, j->startchar) + : utfrune(s, j->startchar); + if (p == NULL || s == j->eol) + return match; + s = p; + break; + case BOL: + if (s == bol) + break; + p = utfrune(s, '\n'); + if (p == NULL || s == j->eol) + return match; + s = p+1; + break; + } + } + n = chartorune(&r, s); + + /* switch run lists */ + tl = j->relist[flag]; + tle = j->reliste[flag]; + nl = j->relist[flag^=1]; + nle = j->reliste[flag]; + nl->inst = NULL; + + /* Add first instruction to current list */ + if (match == 0) + _renewemptythread(tl, progp->startinst, ms, s); + + /* Execute machine until current list is empty */ + for (tlp=tl; tlp->inst; tlp++) { /* assignment = */ + for (inst = tlp->inst; ; inst = inst->l.next) { + int ok = false; + + switch (inst->type) { + case RUNE: /* regular character */ + ok = runematch(inst->r.rune, r, icase); + break; + case LBRA: + tlp->se.m[inst->r.subid].str = s; + continue; + case RBRA: + tlp->se.m[inst->r.subid].len = s - tlp->se.m[inst->r.subid].str; + continue; + case ANY: + ok = (r != '\n'); + break; + case ANYNL: + ok = true; + break; + case BOL: + if (s == bol || *(s-1) == '\n') + continue; + break; + case EOL: + if (s == j->eol || r == 0 || r == '\n') + continue; + break; + case NWBOUND: + ok = true; /* fallthrough */ + case WBOUND: + if (ok ^ (s == bol || s == j->eol || ((isalnum(s[-1]) || s[-1] == '_') + ^ (isalnum(s[ 0]) || s[ 0] == '_')))) + continue; + break; + case NCCLASS: + ok = true; /* fallthrough */ + case CCLASS: + ep = inst->r.classp->end; + for (rp = inst->r.classp->spans; rp < ep; rp += 2) { + if ((r >= rp[0] && r <= rp[1]) || (rp[0] == rp[1] && runematch(rp[0], r, icase))) + break; + } + ok ^= (rp < ep); + break; + case OR: + /* evaluate right choice later */ + if (_renewthread(tlp, inst->r.right, ms, &tlp->se) == tle) + return -1; + /* efficiency: advance and re-evaluate */ + continue; + case END: /* Match! */ + match = !(mflags & creg_fullmatch) || + ((s == j->eol || r == 0 || r == '\n') && + (tlp->se.m[0].str == bol || tlp->se.m[0].str[-1] == '\n')); + tlp->se.m[0].len = s - tlp->se.m[0].str; + if (mp != NULL) + _renewmatch(mp, ms, &tlp->se, progp->nsubids); + break; + } + + if (ok && _renewthread(nl, inst->l.next, ms, &tlp->se) == nle) + return -1; + break; + } + } + if (s == j->eol) + break; + checkstart = j->starttype && nl->inst==NULL; + s += n; + } while (r); + return match; +} + +static int +regexec2(const Reprog *progp, /* program to run */ + const char *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms, /* number of elements at mp */ + Reljunk *j, + int mflags +) +{ + int rv; + Relist *relists; + + /* mark space */ + relists = (Relist *)malloc(2 * BIGLISTSIZE*sizeof(Relist)); + if (relists == NULL) + return -1; + + j->relist[0] = relists; + j->relist[1] = relists + BIGLISTSIZE; + j->reliste[0] = relists + BIGLISTSIZE - 2; + j->reliste[1] = relists + 2*BIGLISTSIZE - 2; + + rv = regexec1(progp, bol, mp, ms, j, mflags); + free(relists); + return rv; +} + +static int +regexec9(const Reprog *progp, /* program to run */ + const char *bol, /* string to run machine on */ + int ms, /* number of elements at mp */ + Resub mp[], /* subexpression elements */ + int mflags) +{ + Reljunk j; + Relist relist0[LISTSIZE], relist1[LISTSIZE]; + int rv; + + /* + * use user-specified starting/ending location if specified + */ + j.starts = bol; + j.eol = NULL; + + if (mp && mp->str && ms>0) { + if (mflags & creg_startend) + j.starts = mp->str, j.eol = mp->str + mp->len; + else if (mflags & creg_next) + j.starts = mp->str + mp->len; + } + + j.starttype = 0; + j.startchar = 0; + if (progp->startinst->type == RUNE && progp->startinst->r.rune < 128) { + j.starttype = RUNE; + j.startchar = progp->startinst->r.rune; + } + if (progp->startinst->type == BOL) + j.starttype = BOL; + + /* mark space */ + j.relist[0] = relist0; + j.relist[1] = relist1; + j.reliste[0] = relist0 + LISTSIZE - 2; + j.reliste[1] = relist1 + LISTSIZE - 2; + + rv = regexec1(progp, bol, mp, ms, &j, mflags); + if (rv >= 0) + return rv; + rv = regexec2(progp, bol, mp, ms, &j, mflags); + return rv; +} + +/* + * API functions + */ + +/* substitute into one string using the matches from the last regexec() */ +void cregex_replace( + const char *sp, /* source string */ + char *dp, /* destination string */ + int dlen, + int ms, /* number of elements pointed to by mp */ + const cregmatch mp[]) /* subexpression elements */ +{ + const char *ssp, *ep; + int i; + + ep = dp+dlen-1; + while (*sp != '\0') { + if (*sp == '\\') { + switch (*++sp) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + i = *sp - '0'; + if (mp[i].str != NULL && mp != NULL && ms > i) + for (ssp = mp[i].str; ssp < (mp[i].str + mp[i].len); ssp++) + if (dp < ep) + *dp++ = *ssp; + break; + case '\\': + if (dp < ep) + *dp++ = '\\'; + break; + case '\0': + sp--; + break; + default: + if (dp < ep) + *dp++ = *sp; + break; + } + } else if (*sp == '&') { + if (mp[0].str != NULL && mp != NULL && ms > 0) + for (ssp = mp[0].str; ssp < (mp[0].str + mp[0].len); ssp++) + if (dp < ep) + *dp++ = *ssp; + } else { + if (dp < ep) + *dp++ = *sp; + } + sp++; + } + *dp = '\0'; +} + +int cregex_compile(cregex *rx, const char* pattern, int cflags) { + Parser par; + rx->prog = regcomp1(&par, pattern, cflags & creg_dotall ? ANYNL : ANY); + if (rx->prog) { + if (cflags & creg_caseless) + rx->prog->flags.ignorecase = true; + return 1 + rx->prog->nsubids; + } + return par.errors; +} + +int cregex_captures(cregex rx) { + return rx.prog ? 1 + rx.prog->nsubids : 0; +} + +int cregex_find(const cregex *rx, const char* string, + size_t nmatch, cregmatch match[], int mflags) { + int res = regexec9(rx->prog, string, nmatch, match, mflags); + switch (res) { + case 1: return 1 + rx->prog->nsubids; + case 0: return creg_nomatch; + default: return creg_matcherror; + } +} + +void cregex_drop(cregex* self) { + free(self->prog); +} diff --git a/src/cregex_utf8.c b/src/cregex_utf8.c new file mode 100644 index 00000000..a121542c --- /dev/null +++ b/src/cregex_utf8.c @@ -0,0 +1,1165 @@ +#include +#include + +enum { UPPER = 0, LOWER = 1, HT_SIZE = 1997 }; +// based on unicode CaseFolding.txt +static const uint32_t cfold[][2] = { +{0x00041, 0x00061}, // LATIN CAPITAL LETTER A +{0x00042, 0x00062}, // LATIN CAPITAL LETTER B +{0x00043, 0x00063}, // LATIN CAPITAL LETTER C +{0x00044, 0x00064}, // LATIN CAPITAL LETTER D +{0x00045, 0x00065}, // LATIN CAPITAL LETTER E +{0x00046, 0x00066}, // LATIN CAPITAL LETTER F +{0x00047, 0x00067}, // LATIN CAPITAL LETTER G +{0x00048, 0x00068}, // LATIN CAPITAL LETTER H +{0x00049, 0x00069}, // LATIN CAPITAL LETTER I +{0x0004A, 0x0006A}, // LATIN CAPITAL LETTER J +{0x0004B, 0x0006B}, // LATIN CAPITAL LETTER K +{0x0004C, 0x0006C}, // LATIN CAPITAL LETTER L +{0x0004D, 0x0006D}, // LATIN CAPITAL LETTER M +{0x0004E, 0x0006E}, // LATIN CAPITAL LETTER N +{0x0004F, 0x0006F}, // LATIN CAPITAL LETTER O +{0x00050, 0x00070}, // LATIN CAPITAL LETTER P +{0x00051, 0x00071}, // LATIN CAPITAL LETTER Q +{0x00052, 0x00072}, // LATIN CAPITAL LETTER R +{0x00053, 0x00073}, // LATIN CAPITAL LETTER S +{0x00054, 0x00074}, // LATIN CAPITAL LETTER T +{0x00055, 0x00075}, // LATIN CAPITAL LETTER U +{0x00056, 0x00076}, // LATIN CAPITAL LETTER V +{0x00057, 0x00077}, // LATIN CAPITAL LETTER W +{0x00058, 0x00078}, // LATIN CAPITAL LETTER X +{0x00059, 0x00079}, // LATIN CAPITAL LETTER Y +{0x0005A, 0x0007A}, // LATIN CAPITAL LETTER Z +{0x000B5, 0x003BC}, // MICRO SIGN +{0x000C0, 0x000E0}, // LATIN CAPITAL LETTER A WITH GRAVE +{0x000C1, 0x000E1}, // LATIN CAPITAL LETTER A WITH ACUTE +{0x000C2, 0x000E2}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX +{0x000C3, 0x000E3}, // LATIN CAPITAL LETTER A WITH TILDE +{0x000C4, 0x000E4}, // LATIN CAPITAL LETTER A WITH DIAERESIS +{0x000C5, 0x000E5}, // LATIN CAPITAL LETTER A WITH RING ABOVE +{0x000C6, 0x000E6}, // LATIN CAPITAL LETTER AE +{0x000C7, 0x000E7}, // LATIN CAPITAL LETTER C WITH CEDILLA +{0x000C8, 0x000E8}, // LATIN CAPITAL LETTER E WITH GRAVE +{0x000C9, 0x000E9}, // LATIN CAPITAL LETTER E WITH ACUTE +{0x000CA, 0x000EA}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX +{0x000CB, 0x000EB}, // LATIN CAPITAL LETTER E WITH DIAERESIS +{0x000CC, 0x000EC}, // LATIN CAPITAL LETTER I WITH GRAVE +{0x000CD, 0x000ED}, // LATIN CAPITAL LETTER I WITH ACUTE +{0x000CE, 0x000EE}, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX +{0x000CF, 0x000EF}, // LATIN CAPITAL LETTER I WITH DIAERESIS +{0x000D0, 0x000F0}, // LATIN CAPITAL LETTER ETH +{0x000D1, 0x000F1}, // LATIN CAPITAL LETTER N WITH TILDE +{0x000D2, 0x000F2}, // LATIN CAPITAL LETTER O WITH GRAVE +{0x000D3, 0x000F3}, // LATIN CAPITAL LETTER O WITH ACUTE +{0x000D4, 0x000F4}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX +{0x000D5, 0x000F5}, // LATIN CAPITAL LETTER O WITH TILDE +{0x000D6, 0x000F6}, // LATIN CAPITAL LETTER O WITH DIAERESIS +{0x000D8, 0x000F8}, // LATIN CAPITAL LETTER O WITH STROKE +{0x000D9, 0x000F9}, // LATIN CAPITAL LETTER U WITH GRAVE +{0x000DA, 0x000FA}, // LATIN CAPITAL LETTER U WITH ACUTE +{0x000DB, 0x000FB}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX +{0x000DC, 0x000FC}, // LATIN CAPITAL LETTER U WITH DIAERESIS +{0x000DD, 0x000FD}, // LATIN CAPITAL LETTER Y WITH ACUTE +{0x000DE, 0x000FE}, // LATIN CAPITAL LETTER THORN +{0x00100, 0x00101}, // LATIN CAPITAL LETTER A WITH MACRON +{0x00102, 0x00103}, // LATIN CAPITAL LETTER A WITH BREVE +{0x00104, 0x00105}, // LATIN CAPITAL LETTER A WITH OGONEK +{0x00106, 0x00107}, // LATIN CAPITAL LETTER C WITH ACUTE +{0x00108, 0x00109}, // LATIN CAPITAL LETTER C WITH CIRCUMFLEX +{0x0010A, 0x0010B}, // LATIN CAPITAL LETTER C WITH DOT ABOVE +{0x0010C, 0x0010D}, // LATIN CAPITAL LETTER C WITH CARON +{0x0010E, 0x0010F}, // LATIN CAPITAL LETTER D WITH CARON +{0x00110, 0x00111}, // LATIN CAPITAL LETTER D WITH STROKE +{0x00112, 0x00113}, // LATIN CAPITAL LETTER E WITH MACRON +{0x00114, 0x00115}, // LATIN CAPITAL LETTER E WITH BREVE +{0x00116, 0x00117}, // LATIN CAPITAL LETTER E WITH DOT ABOVE +{0x00118, 0x00119}, // LATIN CAPITAL LETTER E WITH OGONEK +{0x0011A, 0x0011B}, // LATIN CAPITAL LETTER E WITH CARON +{0x0011C, 0x0011D}, // LATIN CAPITAL LETTER G WITH CIRCUMFLEX +{0x0011E, 0x0011F}, // LATIN CAPITAL LETTER G WITH BREVE +{0x00120, 0x00121}, // LATIN CAPITAL LETTER G WITH DOT ABOVE +{0x00122, 0x00123}, // LATIN CAPITAL LETTER G WITH CEDILLA +{0x00124, 0x00125}, // LATIN CAPITAL LETTER H WITH CIRCUMFLEX +{0x00126, 0x00127}, // LATIN CAPITAL LETTER H WITH STROKE +{0x00128, 0x00129}, // LATIN CAPITAL LETTER I WITH TILDE +{0x0012A, 0x0012B}, // LATIN CAPITAL LETTER I WITH MACRON +{0x0012C, 0x0012D}, // LATIN CAPITAL LETTER I WITH BREVE +{0x0012E, 0x0012F}, // LATIN CAPITAL LETTER I WITH OGONEK +{0x00132, 0x00133}, // LATIN CAPITAL LIGATURE IJ +{0x00134, 0x00135}, // LATIN CAPITAL LETTER J WITH CIRCUMFLEX +{0x00136, 0x00137}, // LATIN CAPITAL LETTER K WITH CEDILLA +{0x00139, 0x0013A}, // LATIN CAPITAL LETTER L WITH ACUTE +{0x0013B, 0x0013C}, // LATIN CAPITAL LETTER L WITH CEDILLA +{0x0013D, 0x0013E}, // LATIN CAPITAL LETTER L WITH CARON +{0x0013F, 0x00140}, // LATIN CAPITAL LETTER L WITH MIDDLE DOT +{0x00141, 0x00142}, // LATIN CAPITAL LETTER L WITH STROKE +{0x00143, 0x00144}, // LATIN CAPITAL LETTER N WITH ACUTE +{0x00145, 0x00146}, // LATIN CAPITAL LETTER N WITH CEDILLA +{0x00147, 0x00148}, // LATIN CAPITAL LETTER N WITH CARON +{0x0014A, 0x0014B}, // LATIN CAPITAL LETTER ENG +{0x0014C, 0x0014D}, // LATIN CAPITAL LETTER O WITH MACRON +{0x0014E, 0x0014F}, // LATIN CAPITAL LETTER O WITH BREVE +{0x00150, 0x00151}, // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +{0x00152, 0x00153}, // LATIN CAPITAL LIGATURE OE +{0x00154, 0x00155}, // LATIN CAPITAL LETTER R WITH ACUTE +{0x00156, 0x00157}, // LATIN CAPITAL LETTER R WITH CEDILLA +{0x00158, 0x00159}, // LATIN CAPITAL LETTER R WITH CARON +{0x0015A, 0x0015B}, // LATIN CAPITAL LETTER S WITH ACUTE +{0x0015C, 0x0015D}, // LATIN CAPITAL LETTER S WITH CIRCUMFLEX +{0x0015E, 0x0015F}, // LATIN CAPITAL LETTER S WITH CEDILLA +{0x00160, 0x00161}, // LATIN CAPITAL LETTER S WITH CARON +{0x00162, 0x00163}, // LATIN CAPITAL LETTER T WITH CEDILLA +{0x00164, 0x00165}, // LATIN CAPITAL LETTER T WITH CARON +{0x00166, 0x00167}, // LATIN CAPITAL LETTER T WITH STROKE +{0x00168, 0x00169}, // LATIN CAPITAL LETTER U WITH TILDE +{0x0016A, 0x0016B}, // LATIN CAPITAL LETTER U WITH MACRON +{0x0016C, 0x0016D}, // LATIN CAPITAL LETTER U WITH BREVE +{0x0016E, 0x0016F}, // LATIN CAPITAL LETTER U WITH RING ABOVE +{0x00170, 0x00171}, // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +{0x00172, 0x00173}, // LATIN CAPITAL LETTER U WITH OGONEK +{0x00174, 0x00175}, // LATIN CAPITAL LETTER W WITH CIRCUMFLEX +{0x00176, 0x00177}, // LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +{0x00178, 0x000FF}, // LATIN CAPITAL LETTER Y WITH DIAERESIS +{0x00179, 0x0017A}, // LATIN CAPITAL LETTER Z WITH ACUTE +{0x0017B, 0x0017C}, // LATIN CAPITAL LETTER Z WITH DOT ABOVE +{0x0017D, 0x0017E}, // LATIN CAPITAL LETTER Z WITH CARON +{0x0017F, 0x00073}, // LATIN SMALL LETTER LONG S +{0x00181, 0x00253}, // LATIN CAPITAL LETTER B WITH HOOK +{0x00182, 0x00183}, // LATIN CAPITAL LETTER B WITH TOPBAR +{0x00184, 0x00185}, // LATIN CAPITAL LETTER TONE SIX +{0x00186, 0x00254}, // LATIN CAPITAL LETTER OPEN O +{0x00187, 0x00188}, // LATIN CAPITAL LETTER C WITH HOOK +{0x00189, 0x00256}, // LATIN CAPITAL LETTER AFRICAN D +{0x0018A, 0x00257}, // LATIN CAPITAL LETTER D WITH HOOK +{0x0018B, 0x0018C}, // LATIN CAPITAL LETTER D WITH TOPBAR +{0x0018E, 0x001DD}, // LATIN CAPITAL LETTER REVERSED E +{0x0018F, 0x00259}, // LATIN CAPITAL LETTER SCHWA +{0x00190, 0x0025B}, // LATIN CAPITAL LETTER OPEN E +{0x00191, 0x00192}, // LATIN CAPITAL LETTER F WITH HOOK +{0x00193, 0x00260}, // LATIN CAPITAL LETTER G WITH HOOK +{0x00194, 0x00263}, // LATIN CAPITAL LETTER GAMMA +{0x00196, 0x00269}, // LATIN CAPITAL LETTER IOTA +{0x00197, 0x00268}, // LATIN CAPITAL LETTER I WITH STROKE +{0x00198, 0x00199}, // LATIN CAPITAL LETTER K WITH HOOK +{0x0019C, 0x0026F}, // LATIN CAPITAL LETTER TURNED M +{0x0019D, 0x00272}, // LATIN CAPITAL LETTER N WITH LEFT HOOK +{0x0019F, 0x00275}, // LATIN CAPITAL LETTER O WITH MIDDLE TILDE +{0x001A0, 0x001A1}, // LATIN CAPITAL LETTER O WITH HORN +{0x001A2, 0x001A3}, // LATIN CAPITAL LETTER OI +{0x001A4, 0x001A5}, // LATIN CAPITAL LETTER P WITH HOOK +{0x001A6, 0x00280}, // LATIN LETTER YR +{0x001A7, 0x001A8}, // LATIN CAPITAL LETTER TONE TWO +{0x001A9, 0x00283}, // LATIN CAPITAL LETTER ESH +{0x001AC, 0x001AD}, // LATIN CAPITAL LETTER T WITH HOOK +{0x001AE, 0x00288}, // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK +{0x001AF, 0x001B0}, // LATIN CAPITAL LETTER U WITH HORN +{0x001B1, 0x0028A}, // LATIN CAPITAL LETTER UPSILON +{0x001B2, 0x0028B}, // LATIN CAPITAL LETTER V WITH HOOK +{0x001B3, 0x001B4}, // LATIN CAPITAL LETTER Y WITH HOOK +{0x001B5, 0x001B6}, // LATIN CAPITAL LETTER Z WITH STROKE +{0x001B7, 0x00292}, // LATIN CAPITAL LETTER EZH +{0x001B8, 0x001B9}, // LATIN CAPITAL LETTER EZH REVERSED +{0x001BC, 0x001BD}, // LATIN CAPITAL LETTER TONE FIVE +{0x001C4, 0x001C6}, // LATIN CAPITAL LETTER DZ WITH CARON +{0x001C5, 0x001C6}, // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON +{0x001C7, 0x001C9}, // LATIN CAPITAL LETTER LJ +{0x001C8, 0x001C9}, // LATIN CAPITAL LETTER L WITH SMALL LETTER J +{0x001CA, 0x001CC}, // LATIN CAPITAL LETTER NJ +{0x001CB, 0x001CC}, // LATIN CAPITAL LETTER N WITH SMALL LETTER J +{0x001CD, 0x001CE}, // LATIN CAPITAL LETTER A WITH CARON +{0x001CF, 0x001D0}, // LATIN CAPITAL LETTER I WITH CARON +{0x001D1, 0x001D2}, // LATIN CAPITAL LETTER O WITH CARON +{0x001D3, 0x001D4}, // LATIN CAPITAL LETTER U WITH CARON +{0x001D5, 0x001D6}, // LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON +{0x001D7, 0x001D8}, // LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE +{0x001D9, 0x001DA}, // LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON +{0x001DB, 0x001DC}, // LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE +{0x001DE, 0x001DF}, // LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON +{0x001E0, 0x001E1}, // LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON +{0x001E2, 0x001E3}, // LATIN CAPITAL LETTER AE WITH MACRON +{0x001E4, 0x001E5}, // LATIN CAPITAL LETTER G WITH STROKE +{0x001E6, 0x001E7}, // LATIN CAPITAL LETTER G WITH CARON +{0x001E8, 0x001E9}, // LATIN CAPITAL LETTER K WITH CARON +{0x001EA, 0x001EB}, // LATIN CAPITAL LETTER O WITH OGONEK +{0x001EC, 0x001ED}, // LATIN CAPITAL LETTER O WITH OGONEK AND MACRON +{0x001EE, 0x001EF}, // LATIN CAPITAL LETTER EZH WITH CARON +{0x001F1, 0x001F3}, // LATIN CAPITAL LETTER DZ +{0x001F2, 0x001F3}, // LATIN CAPITAL LETTER D WITH SMALL LETTER Z +{0x001F4, 0x001F5}, // LATIN CAPITAL LETTER G WITH ACUTE +{0x001F6, 0x00195}, // LATIN CAPITAL LETTER HWAIR +{0x001F7, 0x001BF}, // LATIN CAPITAL LETTER WYNN +{0x001F8, 0x001F9}, // LATIN CAPITAL LETTER N WITH GRAVE +{0x001FA, 0x001FB}, // LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE +{0x001FC, 0x001FD}, // LATIN CAPITAL LETTER AE WITH ACUTE +{0x001FE, 0x001FF}, // LATIN CAPITAL LETTER O WITH STROKE AND ACUTE +{0x00200, 0x00201}, // LATIN CAPITAL LETTER A WITH DOUBLE GRAVE +{0x00202, 0x00203}, // LATIN CAPITAL LETTER A WITH INVERTED BREVE +{0x00204, 0x00205}, // LATIN CAPITAL LETTER E WITH DOUBLE GRAVE +{0x00206, 0x00207}, // LATIN CAPITAL LETTER E WITH INVERTED BREVE +{0x00208, 0x00209}, // LATIN CAPITAL LETTER I WITH DOUBLE GRAVE +{0x0020A, 0x0020B}, // LATIN CAPITAL LETTER I WITH INVERTED BREVE +{0x0020C, 0x0020D}, // LATIN CAPITAL LETTER O WITH DOUBLE GRAVE +{0x0020E, 0x0020F}, // LATIN CAPITAL LETTER O WITH INVERTED BREVE +{0x00210, 0x00211}, // LATIN CAPITAL LETTER R WITH DOUBLE GRAVE +{0x00212, 0x00213}, // LATIN CAPITAL LETTER R WITH INVERTED BREVE +{0x00214, 0x00215}, // LATIN CAPITAL LETTER U WITH DOUBLE GRAVE +{0x00216, 0x00217}, // LATIN CAPITAL LETTER U WITH INVERTED BREVE +{0x00218, 0x00219}, // LATIN CAPITAL LETTER S WITH COMMA BELOW +{0x0021A, 0x0021B}, // LATIN CAPITAL LETTER T WITH COMMA BELOW +{0x0021C, 0x0021D}, // LATIN CAPITAL LETTER YOGH +{0x0021E, 0x0021F}, // LATIN CAPITAL LETTER H WITH CARON +{0x00220, 0x0019E}, // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG +{0x00222, 0x00223}, // LATIN CAPITAL LETTER OU +{0x00224, 0x00225}, // LATIN CAPITAL LETTER Z WITH HOOK +{0x00226, 0x00227}, // LATIN CAPITAL LETTER A WITH DOT ABOVE +{0x00228, 0x00229}, // LATIN CAPITAL LETTER E WITH CEDILLA +{0x0022A, 0x0022B}, // LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON +{0x0022C, 0x0022D}, // LATIN CAPITAL LETTER O WITH TILDE AND MACRON +{0x0022E, 0x0022F}, // LATIN CAPITAL LETTER O WITH DOT ABOVE +{0x00230, 0x00231}, // LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON +{0x00232, 0x00233}, // LATIN CAPITAL LETTER Y WITH MACRON +{0x0023A, 0x02C65}, // LATIN CAPITAL LETTER A WITH STROKE +{0x0023B, 0x0023C}, // LATIN CAPITAL LETTER C WITH STROKE +{0x0023D, 0x0019A}, // LATIN CAPITAL LETTER L WITH BAR +{0x0023E, 0x02C66}, // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE +{0x00241, 0x00242}, // LATIN CAPITAL LETTER GLOTTAL STOP +{0x00243, 0x00180}, // LATIN CAPITAL LETTER B WITH STROKE +{0x00244, 0x00289}, // LATIN CAPITAL LETTER U BAR +{0x00245, 0x0028C}, // LATIN CAPITAL LETTER TURNED V +{0x00246, 0x00247}, // LATIN CAPITAL LETTER E WITH STROKE +{0x00248, 0x00249}, // LATIN CAPITAL LETTER J WITH STROKE +{0x0024A, 0x0024B}, // LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL +{0x0024C, 0x0024D}, // LATIN CAPITAL LETTER R WITH STROKE +{0x0024E, 0x0024F}, // LATIN CAPITAL LETTER Y WITH STROKE +{0x00345, 0x003B9}, // COMBINING GREEK YPOGEGRAMMENI +{0x00370, 0x00371}, // GREEK CAPITAL LETTER HETA +{0x00372, 0x00373}, // GREEK CAPITAL LETTER ARCHAIC SAMPI +{0x00376, 0x00377}, // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA +{0x0037F, 0x003F3}, // GREEK CAPITAL LETTER YOT +{0x00386, 0x003AC}, // GREEK CAPITAL LETTER ALPHA WITH TONOS +{0x00388, 0x003AD}, // GREEK CAPITAL LETTER EPSILON WITH TONOS +{0x00389, 0x003AE}, // GREEK CAPITAL LETTER ETA WITH TONOS +{0x0038A, 0x003AF}, // GREEK CAPITAL LETTER IOTA WITH TONOS +{0x0038C, 0x003CC}, // GREEK CAPITAL LETTER OMICRON WITH TONOS +{0x0038E, 0x003CD}, // GREEK CAPITAL LETTER UPSILON WITH TONOS +{0x0038F, 0x003CE}, // GREEK CAPITAL LETTER OMEGA WITH TONOS +{0x00391, 0x003B1}, // GREEK CAPITAL LETTER ALPHA +{0x00392, 0x003B2}, // GREEK CAPITAL LETTER BETA +{0x00393, 0x003B3}, // GREEK CAPITAL LETTER GAMMA +{0x00394, 0x003B4}, // GREEK CAPITAL LETTER DELTA +{0x00395, 0x003B5}, // GREEK CAPITAL LETTER EPSILON +{0x00396, 0x003B6}, // GREEK CAPITAL LETTER ZETA +{0x00397, 0x003B7}, // GREEK CAPITAL LETTER ETA +{0x00398, 0x003B8}, // GREEK CAPITAL LETTER THETA +{0x00399, 0x003B9}, // GREEK CAPITAL LETTER IOTA +{0x0039A, 0x003BA}, // GREEK CAPITAL LETTER KAPPA +{0x0039B, 0x003BB}, // GREEK CAPITAL LETTER LAMDA +{0x0039C, 0x003BC}, // GREEK CAPITAL LETTER MU +{0x0039D, 0x003BD}, // GREEK CAPITAL LETTER NU +{0x0039E, 0x003BE}, // GREEK CAPITAL LETTER XI +{0x0039F, 0x003BF}, // GREEK CAPITAL LETTER OMICRON +{0x003A0, 0x003C0}, // GREEK CAPITAL LETTER PI +{0x003A1, 0x003C1}, // GREEK CAPITAL LETTER RHO +{0x003A3, 0x003C3}, // GREEK CAPITAL LETTER SIGMA +{0x003A4, 0x003C4}, // GREEK CAPITAL LETTER TAU +{0x003A5, 0x003C5}, // GREEK CAPITAL LETTER UPSILON +{0x003A6, 0x003C6}, // GREEK CAPITAL LETTER PHI +{0x003A7, 0x003C7}, // GREEK CAPITAL LETTER CHI +{0x003A8, 0x003C8}, // GREEK CAPITAL LETTER PSI +{0x003A9, 0x003C9}, // GREEK CAPITAL LETTER OMEGA +{0x003AA, 0x003CA}, // GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +{0x003AB, 0x003CB}, // GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +{0x003C2, 0x003C3}, // GREEK SMALL LETTER FINAL SIGMA +{0x003CF, 0x003D7}, // GREEK CAPITAL KAI SYMBOL +{0x003D0, 0x003B2}, // GREEK BETA SYMBOL +{0x003D1, 0x003B8}, // GREEK THETA SYMBOL +{0x003D5, 0x003C6}, // GREEK PHI SYMBOL +{0x003D6, 0x003C0}, // GREEK PI SYMBOL +{0x003D8, 0x003D9}, // GREEK LETTER ARCHAIC KOPPA +{0x003DA, 0x003DB}, // GREEK LETTER STIGMA +{0x003DC, 0x003DD}, // GREEK LETTER DIGAMMA +{0x003DE, 0x003DF}, // GREEK LETTER KOPPA +{0x003E0, 0x003E1}, // GREEK LETTER SAMPI +{0x003F0, 0x003BA}, // GREEK KAPPA SYMBOL +{0x003F1, 0x003C1}, // GREEK RHO SYMBOL +{0x003F4, 0x003B8}, // GREEK CAPITAL THETA SYMBOL +{0x003F5, 0x003B5}, // GREEK LUNATE EPSILON SYMBOL +{0x003F7, 0x003F8}, // GREEK CAPITAL LETTER SHO +{0x003F9, 0x003F2}, // GREEK CAPITAL LUNATE SIGMA SYMBOL +{0x003FA, 0x003FB}, // GREEK CAPITAL LETTER SAN +{0x003FD, 0x0037B}, // GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL +{0x003FE, 0x0037C}, // GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL +{0x003FF, 0x0037D}, // GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL +{0x00400, 0x00450}, // CYRILLIC CAPITAL LETTER IE WITH GRAVE +{0x00401, 0x00451}, // CYRILLIC CAPITAL LETTER IO +{0x00402, 0x00452}, // CYRILLIC CAPITAL LETTER DJE +{0x00403, 0x00453}, // CYRILLIC CAPITAL LETTER GJE +{0x00404, 0x00454}, // CYRILLIC CAPITAL LETTER UKRAINIAN IE +{0x00405, 0x00455}, // CYRILLIC CAPITAL LETTER DZE +{0x00406, 0x00456}, // CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +{0x00407, 0x00457}, // CYRILLIC CAPITAL LETTER YI +{0x00408, 0x00458}, // CYRILLIC CAPITAL LETTER JE +{0x00409, 0x00459}, // CYRILLIC CAPITAL LETTER LJE +{0x0040A, 0x0045A}, // CYRILLIC CAPITAL LETTER NJE +{0x0040B, 0x0045B}, // CYRILLIC CAPITAL LETTER TSHE +{0x0040C, 0x0045C}, // CYRILLIC CAPITAL LETTER KJE +{0x0040D, 0x0045D}, // CYRILLIC CAPITAL LETTER I WITH GRAVE +{0x0040E, 0x0045E}, // CYRILLIC CAPITAL LETTER SHORT U +{0x0040F, 0x0045F}, // CYRILLIC CAPITAL LETTER DZHE +{0x00410, 0x00430}, // CYRILLIC CAPITAL LETTER A +{0x00411, 0x00431}, // CYRILLIC CAPITAL LETTER BE +{0x00412, 0x00432}, // CYRILLIC CAPITAL LETTER VE +{0x00413, 0x00433}, // CYRILLIC CAPITAL LETTER GHE +{0x00414, 0x00434}, // CYRILLIC CAPITAL LETTER DE +{0x00415, 0x00435}, // CYRILLIC CAPITAL LETTER IE +{0x00416, 0x00436}, // CYRILLIC CAPITAL LETTER ZHE +{0x00417, 0x00437}, // CYRILLIC CAPITAL LETTER ZE +{0x00418, 0x00438}, // CYRILLIC CAPITAL LETTER I +{0x00419, 0x00439}, // CYRILLIC CAPITAL LETTER SHORT I +{0x0041A, 0x0043A}, // CYRILLIC CAPITAL LETTER KA +{0x0041B, 0x0043B}, // CYRILLIC CAPITAL LETTER EL +{0x0041C, 0x0043C}, // CYRILLIC CAPITAL LETTER EM +{0x0041D, 0x0043D}, // CYRILLIC CAPITAL LETTER EN +{0x0041E, 0x0043E}, // CYRILLIC CAPITAL LETTER O +{0x0041F, 0x0043F}, // CYRILLIC CAPITAL LETTER PE +{0x00420, 0x00440}, // CYRILLIC CAPITAL LETTER ER +{0x00421, 0x00441}, // CYRILLIC CAPITAL LETTER ES +{0x00422, 0x00442}, // CYRILLIC CAPITAL LETTER TE +{0x00423, 0x00443}, // CYRILLIC CAPITAL LETTER U +{0x00424, 0x00444}, // CYRILLIC CAPITAL LETTER EF +{0x00425, 0x00445}, // CYRILLIC CAPITAL LETTER HA +{0x00426, 0x00446}, // CYRILLIC CAPITAL LETTER TSE +{0x00427, 0x00447}, // CYRILLIC CAPITAL LETTER CHE +{0x00428, 0x00448}, // CYRILLIC CAPITAL LETTER SHA +{0x00429, 0x00449}, // CYRILLIC CAPITAL LETTER SHCHA +{0x0042A, 0x0044A}, // CYRILLIC CAPITAL LETTER HARD SIGN +{0x0042B, 0x0044B}, // CYRILLIC CAPITAL LETTER YERU +{0x0042C, 0x0044C}, // CYRILLIC CAPITAL LETTER SOFT SIGN +{0x0042D, 0x0044D}, // CYRILLIC CAPITAL LETTER E +{0x0042E, 0x0044E}, // CYRILLIC CAPITAL LETTER YU +{0x0042F, 0x0044F}, // CYRILLIC CAPITAL LETTER YA +{0x00460, 0x00461}, // CYRILLIC CAPITAL LETTER OMEGA +{0x00462, 0x00463}, // CYRILLIC CAPITAL LETTER YAT +{0x00464, 0x00465}, // CYRILLIC CAPITAL LETTER IOTIFIED E +{0x00466, 0x00467}, // CYRILLIC CAPITAL LETTER LITTLE YUS +{0x00468, 0x00469}, // CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS +{0x0046A, 0x0046B}, // CYRILLIC CAPITAL LETTER BIG YUS +{0x0046C, 0x0046D}, // CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS +{0x0046E, 0x0046F}, // CYRILLIC CAPITAL LETTER KSI +{0x00470, 0x00471}, // CYRILLIC CAPITAL LETTER PSI +{0x00472, 0x00473}, // CYRILLIC CAPITAL LETTER FITA +{0x00474, 0x00475}, // CYRILLIC CAPITAL LETTER IZHITSA +{0x00476, 0x00477}, // CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT +{0x00478, 0x00479}, // CYRILLIC CAPITAL LETTER UK +{0x0047A, 0x0047B}, // CYRILLIC CAPITAL LETTER ROUND OMEGA +{0x0047C, 0x0047D}, // CYRILLIC CAPITAL LETTER OMEGA WITH TITLO +{0x0047E, 0x0047F}, // CYRILLIC CAPITAL LETTER OT +{0x00480, 0x00481}, // CYRILLIC CAPITAL LETTER KOPPA +{0x0048A, 0x0048B}, // CYRILLIC CAPITAL LETTER SHORT I WITH TAIL +{0x0048C, 0x0048D}, // CYRILLIC CAPITAL LETTER SEMISOFT SIGN +{0x0048E, 0x0048F}, // CYRILLIC CAPITAL LETTER ER WITH TICK +{0x00490, 0x00491}, // CYRILLIC CAPITAL LETTER GHE WITH UPTURN +{0x00492, 0x00493}, // CYRILLIC CAPITAL LETTER GHE WITH STROKE +{0x00494, 0x00495}, // CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK +{0x00496, 0x00497}, // CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER +{0x00498, 0x00499}, // CYRILLIC CAPITAL LETTER ZE WITH DESCENDER +{0x0049A, 0x0049B}, // CYRILLIC CAPITAL LETTER KA WITH DESCENDER +{0x0049C, 0x0049D}, // CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE +{0x0049E, 0x0049F}, // CYRILLIC CAPITAL LETTER KA WITH STROKE +{0x004A0, 0x004A1}, // CYRILLIC CAPITAL LETTER BASHKIR KA +{0x004A2, 0x004A3}, // CYRILLIC CAPITAL LETTER EN WITH DESCENDER +{0x004A4, 0x004A5}, // CYRILLIC CAPITAL LIGATURE EN GHE +{0x004A6, 0x004A7}, // CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK +{0x004A8, 0x004A9}, // CYRILLIC CAPITAL LETTER ABKHASIAN HA +{0x004AA, 0x004AB}, // CYRILLIC CAPITAL LETTER ES WITH DESCENDER +{0x004AC, 0x004AD}, // CYRILLIC CAPITAL LETTER TE WITH DESCENDER +{0x004AE, 0x004AF}, // CYRILLIC CAPITAL LETTER STRAIGHT U +{0x004B0, 0x004B1}, // CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE +{0x004B2, 0x004B3}, // CYRILLIC CAPITAL LETTER HA WITH DESCENDER +{0x004B4, 0x004B5}, // CYRILLIC CAPITAL LIGATURE TE TSE +{0x004B6, 0x004B7}, // CYRILLIC CAPITAL LETTER CHE WITH DESCENDER +{0x004B8, 0x004B9}, // CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE +{0x004BA, 0x004BB}, // CYRILLIC CAPITAL LETTER SHHA +{0x004BC, 0x004BD}, // CYRILLIC CAPITAL LETTER ABKHASIAN CHE +{0x004BE, 0x004BF}, // CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER +{0x004C0, 0x004CF}, // CYRILLIC LETTER PALOCHKA +{0x004C1, 0x004C2}, // CYRILLIC CAPITAL LETTER ZHE WITH BREVE +{0x004C3, 0x004C4}, // CYRILLIC CAPITAL LETTER KA WITH HOOK +{0x004C5, 0x004C6}, // CYRILLIC CAPITAL LETTER EL WITH TAIL +{0x004C7, 0x004C8}, // CYRILLIC CAPITAL LETTER EN WITH HOOK +{0x004C9, 0x004CA}, // CYRILLIC CAPITAL LETTER EN WITH TAIL +{0x004CB, 0x004CC}, // CYRILLIC CAPITAL LETTER KHAKASSIAN CHE +{0x004CD, 0x004CE}, // CYRILLIC CAPITAL LETTER EM WITH TAIL +{0x004D0, 0x004D1}, // CYRILLIC CAPITAL LETTER A WITH BREVE +{0x004D2, 0x004D3}, // CYRILLIC CAPITAL LETTER A WITH DIAERESIS +{0x004D4, 0x004D5}, // CYRILLIC CAPITAL LIGATURE A IE +{0x004D6, 0x004D7}, // CYRILLIC CAPITAL LETTER IE WITH BREVE +{0x004D8, 0x004D9}, // CYRILLIC CAPITAL LETTER SCHWA +{0x004DA, 0x004DB}, // CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS +{0x004DC, 0x004DD}, // CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS +{0x004DE, 0x004DF}, // CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS +{0x004E0, 0x004E1}, // CYRILLIC CAPITAL LETTER ABKHASIAN DZE +{0x004E2, 0x004E3}, // CYRILLIC CAPITAL LETTER I WITH MACRON +{0x004E4, 0x004E5}, // CYRILLIC CAPITAL LETTER I WITH DIAERESIS +{0x004E6, 0x004E7}, // CYRILLIC CAPITAL LETTER O WITH DIAERESIS +{0x004E8, 0x004E9}, // CYRILLIC CAPITAL LETTER BARRED O +{0x004EA, 0x004EB}, // CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS +{0x004EC, 0x004ED}, // CYRILLIC CAPITAL LETTER E WITH DIAERESIS +{0x004EE, 0x004EF}, // CYRILLIC CAPITAL LETTER U WITH MACRON +{0x004F0, 0x004F1}, // CYRILLIC CAPITAL LETTER U WITH DIAERESIS +{0x004F2, 0x004F3}, // CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE +{0x004F4, 0x004F5}, // CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS +{0x004F6, 0x004F7}, // CYRILLIC CAPITAL LETTER GHE WITH DESCENDER +{0x004F8, 0x004F9}, // CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS +{0x004FA, 0x004FB}, // CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK +{0x004FC, 0x004FD}, // CYRILLIC CAPITAL LETTER HA WITH HOOK +{0x004FE, 0x004FF}, // CYRILLIC CAPITAL LETTER HA WITH STROKE +{0x00500, 0x00501}, // CYRILLIC CAPITAL LETTER KOMI DE +{0x00502, 0x00503}, // CYRILLIC CAPITAL LETTER KOMI DJE +{0x00504, 0x00505}, // CYRILLIC CAPITAL LETTER KOMI ZJE +{0x00506, 0x00507}, // CYRILLIC CAPITAL LETTER KOMI DZJE +{0x00508, 0x00509}, // CYRILLIC CAPITAL LETTER KOMI LJE +{0x0050A, 0x0050B}, // CYRILLIC CAPITAL LETTER KOMI NJE +{0x0050C, 0x0050D}, // CYRILLIC CAPITAL LETTER KOMI SJE +{0x0050E, 0x0050F}, // CYRILLIC CAPITAL LETTER KOMI TJE +{0x00510, 0x00511}, // CYRILLIC CAPITAL LETTER REVERSED ZE +{0x00512, 0x00513}, // CYRILLIC CAPITAL LETTER EL WITH HOOK +{0x00514, 0x00515}, // CYRILLIC CAPITAL LETTER LHA +{0x00516, 0x00517}, // CYRILLIC CAPITAL LETTER RHA +{0x00518, 0x00519}, // CYRILLIC CAPITAL LETTER YAE +{0x0051A, 0x0051B}, // CYRILLIC CAPITAL LETTER QA +{0x0051C, 0x0051D}, // CYRILLIC CAPITAL LETTER WE +{0x0051E, 0x0051F}, // CYRILLIC CAPITAL LETTER ALEUT KA +{0x00520, 0x00521}, // CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK +{0x00522, 0x00523}, // CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK +{0x00524, 0x00525}, // CYRILLIC CAPITAL LETTER PE WITH DESCENDER +{0x00526, 0x00527}, // CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER +{0x00528, 0x00529}, // CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK +{0x0052A, 0x0052B}, // CYRILLIC CAPITAL LETTER DZZHE +{0x0052C, 0x0052D}, // CYRILLIC CAPITAL LETTER DCHE +{0x0052E, 0x0052F}, // CYRILLIC CAPITAL LETTER EL WITH DESCENDER +// {0x01C80, 0x00432}, // CYRILLIC SMALL LETTER ROUNDED VE +// {0x01C81, 0x00434}, // CYRILLIC SMALL LETTER LONG-LEGGED DE +// {0x01C82, 0x0043E}, // CYRILLIC SMALL LETTER NARROW O +// {0x01C83, 0x00441}, // CYRILLIC SMALL LETTER WIDE ES +// {0x01C84, 0x00442}, // CYRILLIC SMALL LETTER TALL TE +// {0x01C85, 0x00442}, // CYRILLIC SMALL LETTER THREE-LEGGED TE +// {0x01C86, 0x0044A}, // CYRILLIC SMALL LETTER TALL HARD SIGN +// {0x01C87, 0x00463}, // CYRILLIC SMALL LETTER TALL YAT +// {0x01C88, 0x0A64B}, // CYRILLIC SMALL LETTER UNBLENDED UK +{0x01E00, 0x01E01}, // LATIN CAPITAL LETTER A WITH RING BELOW +{0x01E02, 0x01E03}, // LATIN CAPITAL LETTER B WITH DOT ABOVE +{0x01E04, 0x01E05}, // LATIN CAPITAL LETTER B WITH DOT BELOW +{0x01E06, 0x01E07}, // LATIN CAPITAL LETTER B WITH LINE BELOW +{0x01E08, 0x01E09}, // LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE +{0x01E0A, 0x01E0B}, // LATIN CAPITAL LETTER D WITH DOT ABOVE +{0x01E0C, 0x01E0D}, // LATIN CAPITAL LETTER D WITH DOT BELOW +{0x01E0E, 0x01E0F}, // LATIN CAPITAL LETTER D WITH LINE BELOW +{0x01E10, 0x01E11}, // LATIN CAPITAL LETTER D WITH CEDILLA +{0x01E12, 0x01E13}, // LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW +{0x01E14, 0x01E15}, // LATIN CAPITAL LETTER E WITH MACRON AND GRAVE +{0x01E16, 0x01E17}, // LATIN CAPITAL LETTER E WITH MACRON AND ACUTE +{0x01E18, 0x01E19}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW +{0x01E1A, 0x01E1B}, // LATIN CAPITAL LETTER E WITH TILDE BELOW +{0x01E1C, 0x01E1D}, // LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE +{0x01E1E, 0x01E1F}, // LATIN CAPITAL LETTER F WITH DOT ABOVE +{0x01E20, 0x01E21}, // LATIN CAPITAL LETTER G WITH MACRON +{0x01E22, 0x01E23}, // LATIN CAPITAL LETTER H WITH DOT ABOVE +{0x01E24, 0x01E25}, // LATIN CAPITAL LETTER H WITH DOT BELOW +{0x01E26, 0x01E27}, // LATIN CAPITAL LETTER H WITH DIAERESIS +{0x01E28, 0x01E29}, // LATIN CAPITAL LETTER H WITH CEDILLA +{0x01E2A, 0x01E2B}, // LATIN CAPITAL LETTER H WITH BREVE BELOW +{0x01E2C, 0x01E2D}, // LATIN CAPITAL LETTER I WITH TILDE BELOW +{0x01E2E, 0x01E2F}, // LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE +{0x01E30, 0x01E31}, // LATIN CAPITAL LETTER K WITH ACUTE +{0x01E32, 0x01E33}, // LATIN CAPITAL LETTER K WITH DOT BELOW +{0x01E34, 0x01E35}, // LATIN CAPITAL LETTER K WITH LINE BELOW +{0x01E36, 0x01E37}, // LATIN CAPITAL LETTER L WITH DOT BELOW +{0x01E38, 0x01E39}, // LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON +{0x01E3A, 0x01E3B}, // LATIN CAPITAL LETTER L WITH LINE BELOW +{0x01E3C, 0x01E3D}, // LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW +{0x01E3E, 0x01E3F}, // LATIN CAPITAL LETTER M WITH ACUTE +{0x01E40, 0x01E41}, // LATIN CAPITAL LETTER M WITH DOT ABOVE +{0x01E42, 0x01E43}, // LATIN CAPITAL LETTER M WITH DOT BELOW +{0x01E44, 0x01E45}, // LATIN CAPITAL LETTER N WITH DOT ABOVE +{0x01E46, 0x01E47}, // LATIN CAPITAL LETTER N WITH DOT BELOW +{0x01E48, 0x01E49}, // LATIN CAPITAL LETTER N WITH LINE BELOW +{0x01E4A, 0x01E4B}, // LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW +{0x01E4C, 0x01E4D}, // LATIN CAPITAL LETTER O WITH TILDE AND ACUTE +{0x01E4E, 0x01E4F}, // LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS +{0x01E50, 0x01E51}, // LATIN CAPITAL LETTER O WITH MACRON AND GRAVE +{0x01E52, 0x01E53}, // LATIN CAPITAL LETTER O WITH MACRON AND ACUTE +{0x01E54, 0x01E55}, // LATIN CAPITAL LETTER P WITH ACUTE +{0x01E56, 0x01E57}, // LATIN CAPITAL LETTER P WITH DOT ABOVE +{0x01E58, 0x01E59}, // LATIN CAPITAL LETTER R WITH DOT ABOVE +{0x01E5A, 0x01E5B}, // LATIN CAPITAL LETTER R WITH DOT BELOW +{0x01E5C, 0x01E5D}, // LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON +{0x01E5E, 0x01E5F}, // LATIN CAPITAL LETTER R WITH LINE BELOW +{0x01E60, 0x01E61}, // LATIN CAPITAL LETTER S WITH DOT ABOVE +{0x01E62, 0x01E63}, // LATIN CAPITAL LETTER S WITH DOT BELOW +{0x01E64, 0x01E65}, // LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE +{0x01E66, 0x01E67}, // LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE +{0x01E68, 0x01E69}, // LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE +{0x01E6A, 0x01E6B}, // LATIN CAPITAL LETTER T WITH DOT ABOVE +{0x01E6C, 0x01E6D}, // LATIN CAPITAL LETTER T WITH DOT BELOW +{0x01E6E, 0x01E6F}, // LATIN CAPITAL LETTER T WITH LINE BELOW +{0x01E70, 0x01E71}, // LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW +{0x01E72, 0x01E73}, // LATIN CAPITAL LETTER U WITH DIAERESIS BELOW +{0x01E74, 0x01E75}, // LATIN CAPITAL LETTER U WITH TILDE BELOW +{0x01E76, 0x01E77}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW +{0x01E78, 0x01E79}, // LATIN CAPITAL LETTER U WITH TILDE AND ACUTE +{0x01E7A, 0x01E7B}, // LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS +{0x01E7C, 0x01E7D}, // LATIN CAPITAL LETTER V WITH TILDE +{0x01E7E, 0x01E7F}, // LATIN CAPITAL LETTER V WITH DOT BELOW +{0x01E80, 0x01E81}, // LATIN CAPITAL LETTER W WITH GRAVE +{0x01E82, 0x01E83}, // LATIN CAPITAL LETTER W WITH ACUTE +{0x01E84, 0x01E85}, // LATIN CAPITAL LETTER W WITH DIAERESIS +{0x01E86, 0x01E87}, // LATIN CAPITAL LETTER W WITH DOT ABOVE +{0x01E88, 0x01E89}, // LATIN CAPITAL LETTER W WITH DOT BELOW +{0x01E8A, 0x01E8B}, // LATIN CAPITAL LETTER X WITH DOT ABOVE +{0x01E8C, 0x01E8D}, // LATIN CAPITAL LETTER X WITH DIAERESIS +{0x01E8E, 0x01E8F}, // LATIN CAPITAL LETTER Y WITH DOT ABOVE +{0x01E90, 0x01E91}, // LATIN CAPITAL LETTER Z WITH CIRCUMFLEX +{0x01E92, 0x01E93}, // LATIN CAPITAL LETTER Z WITH DOT BELOW +{0x01E94, 0x01E95}, // LATIN CAPITAL LETTER Z WITH LINE BELOW +{0x01E9B, 0x01E61}, // LATIN SMALL LETTER LONG S WITH DOT ABOVE +{0x01E9E, 0x000DF}, // LATIN CAPITAL LETTER SHARP S +{0x01EA0, 0x01EA1}, // LATIN CAPITAL LETTER A WITH DOT BELOW +{0x01EA2, 0x01EA3}, // LATIN CAPITAL LETTER A WITH HOOK ABOVE +{0x01EA4, 0x01EA5}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE +{0x01EA6, 0x01EA7}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE +{0x01EA8, 0x01EA9}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE +{0x01EAA, 0x01EAB}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE +{0x01EAC, 0x01EAD}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW +{0x01EAE, 0x01EAF}, // LATIN CAPITAL LETTER A WITH BREVE AND ACUTE +{0x01EB0, 0x01EB1}, // LATIN CAPITAL LETTER A WITH BREVE AND GRAVE +{0x01EB2, 0x01EB3}, // LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE +{0x01EB4, 0x01EB5}, // LATIN CAPITAL LETTER A WITH BREVE AND TILDE +{0x01EB6, 0x01EB7}, // LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW +{0x01EB8, 0x01EB9}, // LATIN CAPITAL LETTER E WITH DOT BELOW +{0x01EBA, 0x01EBB}, // LATIN CAPITAL LETTER E WITH HOOK ABOVE +{0x01EBC, 0x01EBD}, // LATIN CAPITAL LETTER E WITH TILDE +{0x01EBE, 0x01EBF}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE +{0x01EC0, 0x01EC1}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE +{0x01EC2, 0x01EC3}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE +{0x01EC4, 0x01EC5}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE +{0x01EC6, 0x01EC7}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW +{0x01EC8, 0x01EC9}, // LATIN CAPITAL LETTER I WITH HOOK ABOVE +{0x01ECA, 0x01ECB}, // LATIN CAPITAL LETTER I WITH DOT BELOW +{0x01ECC, 0x01ECD}, // LATIN CAPITAL LETTER O WITH DOT BELOW +{0x01ECE, 0x01ECF}, // LATIN CAPITAL LETTER O WITH HOOK ABOVE +{0x01ED0, 0x01ED1}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE +{0x01ED2, 0x01ED3}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE +{0x01ED4, 0x01ED5}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE +{0x01ED6, 0x01ED7}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE +{0x01ED8, 0x01ED9}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW +{0x01EDA, 0x01EDB}, // LATIN CAPITAL LETTER O WITH HORN AND ACUTE +{0x01EDC, 0x01EDD}, // LATIN CAPITAL LETTER O WITH HORN AND GRAVE +{0x01EDE, 0x01EDF}, // LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE +{0x01EE0, 0x01EE1}, // LATIN CAPITAL LETTER O WITH HORN AND TILDE +{0x01EE2, 0x01EE3}, // LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW +{0x01EE4, 0x01EE5}, // LATIN CAPITAL LETTER U WITH DOT BELOW +{0x01EE6, 0x01EE7}, // LATIN CAPITAL LETTER U WITH HOOK ABOVE +{0x01EE8, 0x01EE9}, // LATIN CAPITAL LETTER U WITH HORN AND ACUTE +{0x01EEA, 0x01EEB}, // LATIN CAPITAL LETTER U WITH HORN AND GRAVE +{0x01EEC, 0x01EED}, // LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE +{0x01EEE, 0x01EEF}, // LATIN CAPITAL LETTER U WITH HORN AND TILDE +{0x01EF0, 0x01EF1}, // LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW +{0x01EF2, 0x01EF3}, // LATIN CAPITAL LETTER Y WITH GRAVE +{0x01EF4, 0x01EF5}, // LATIN CAPITAL LETTER Y WITH DOT BELOW +{0x01EF6, 0x01EF7}, // LATIN CAPITAL LETTER Y WITH HOOK ABOVE +{0x01EF8, 0x01EF9}, // LATIN CAPITAL LETTER Y WITH TILDE +{0x01EFA, 0x01EFB}, // LATIN CAPITAL LETTER MIDDLE-WELSH LL +{0x01EFC, 0x01EFD}, // LATIN CAPITAL LETTER MIDDLE-WELSH V +{0x01EFE, 0x01EFF}, // LATIN CAPITAL LETTER Y WITH LOOP +{0x01F08, 0x01F00}, // GREEK CAPITAL LETTER ALPHA WITH PSILI +{0x01F09, 0x01F01}, // GREEK CAPITAL LETTER ALPHA WITH DASIA +{0x01F0A, 0x01F02}, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA +{0x01F0B, 0x01F03}, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA +{0x01F0C, 0x01F04}, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA +{0x01F0D, 0x01F05}, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA +{0x01F0E, 0x01F06}, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI +{0x01F0F, 0x01F07}, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI +{0x01F18, 0x01F10}, // GREEK CAPITAL LETTER EPSILON WITH PSILI +{0x01F19, 0x01F11}, // GREEK CAPITAL LETTER EPSILON WITH DASIA +{0x01F1A, 0x01F12}, // GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA +{0x01F1B, 0x01F13}, // GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA +{0x01F1C, 0x01F14}, // GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA +{0x01F1D, 0x01F15}, // GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA +{0x01F28, 0x01F20}, // GREEK CAPITAL LETTER ETA WITH PSILI +{0x01F29, 0x01F21}, // GREEK CAPITAL LETTER ETA WITH DASIA +{0x01F2A, 0x01F22}, // GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA +{0x01F2B, 0x01F23}, // GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA +{0x01F2C, 0x01F24}, // GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA +{0x01F2D, 0x01F25}, // GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA +{0x01F2E, 0x01F26}, // GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI +{0x01F2F, 0x01F27}, // GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI +{0x01F38, 0x01F30}, // GREEK CAPITAL LETTER IOTA WITH PSILI +{0x01F39, 0x01F31}, // GREEK CAPITAL LETTER IOTA WITH DASIA +{0x01F3A, 0x01F32}, // GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA +{0x01F3B, 0x01F33}, // GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA +{0x01F3C, 0x01F34}, // GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA +{0x01F3D, 0x01F35}, // GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA +{0x01F3E, 0x01F36}, // GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI +{0x01F3F, 0x01F37}, // GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI +{0x01F48, 0x01F40}, // GREEK CAPITAL LETTER OMICRON WITH PSILI +{0x01F49, 0x01F41}, // GREEK CAPITAL LETTER OMICRON WITH DASIA +{0x01F4A, 0x01F42}, // GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA +{0x01F4B, 0x01F43}, // GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA +{0x01F4C, 0x01F44}, // GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA +{0x01F4D, 0x01F45}, // GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA +{0x01F59, 0x01F51}, // GREEK CAPITAL LETTER UPSILON WITH DASIA +{0x01F5B, 0x01F53}, // GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA +{0x01F5D, 0x01F55}, // GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA +{0x01F5F, 0x01F57}, // GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI +{0x01F68, 0x01F60}, // GREEK CAPITAL LETTER OMEGA WITH PSILI +{0x01F69, 0x01F61}, // GREEK CAPITAL LETTER OMEGA WITH DASIA +{0x01F6A, 0x01F62}, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA +{0x01F6B, 0x01F63}, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA +{0x01F6C, 0x01F64}, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA +{0x01F6D, 0x01F65}, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA +{0x01F6E, 0x01F66}, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI +{0x01F6F, 0x01F67}, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI +{0x01F88, 0x01F80}, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +{0x01F89, 0x01F81}, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +{0x01F8A, 0x01F82}, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +{0x01F8B, 0x01F83}, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +{0x01F8C, 0x01F84}, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +{0x01F8D, 0x01F85}, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +{0x01F8E, 0x01F86}, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +{0x01F8F, 0x01F87}, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +{0x01F98, 0x01F90}, // GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +{0x01F99, 0x01F91}, // GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +{0x01F9A, 0x01F92}, // GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +{0x01F9B, 0x01F93}, // GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +{0x01F9C, 0x01F94}, // GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +{0x01F9D, 0x01F95}, // GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +{0x01F9E, 0x01F96}, // GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +{0x01F9F, 0x01F97}, // GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +{0x01FA8, 0x01FA0}, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +{0x01FA9, 0x01FA1}, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +{0x01FAA, 0x01FA2}, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +{0x01FAB, 0x01FA3}, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +{0x01FAC, 0x01FA4}, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +{0x01FAD, 0x01FA5}, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +{0x01FAE, 0x01FA6}, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +{0x01FAF, 0x01FA7}, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +{0x01FB8, 0x01FB0}, // GREEK CAPITAL LETTER ALPHA WITH VRACHY +{0x01FB9, 0x01FB1}, // GREEK CAPITAL LETTER ALPHA WITH MACRON +{0x01FBA, 0x01F70}, // GREEK CAPITAL LETTER ALPHA WITH VARIA +{0x01FBB, 0x01F71}, // GREEK CAPITAL LETTER ALPHA WITH OXIA +{0x01FBC, 0x01FB3}, // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +{0x01FBE, 0x003B9}, // GREEK PROSGEGRAMMENI +{0x01FC8, 0x01F72}, // GREEK CAPITAL LETTER EPSILON WITH VARIA +{0x01FC9, 0x01F73}, // GREEK CAPITAL LETTER EPSILON WITH OXIA +{0x01FCA, 0x01F74}, // GREEK CAPITAL LETTER ETA WITH VARIA +{0x01FCB, 0x01F75}, // GREEK CAPITAL LETTER ETA WITH OXIA +{0x01FCC, 0x01FC3}, // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +{0x01FD8, 0x01FD0}, // GREEK CAPITAL LETTER IOTA WITH VRACHY +{0x01FD9, 0x01FD1}, // GREEK CAPITAL LETTER IOTA WITH MACRON +{0x01FDA, 0x01F76}, // GREEK CAPITAL LETTER IOTA WITH VARIA +{0x01FDB, 0x01F77}, // GREEK CAPITAL LETTER IOTA WITH OXIA +{0x01FE8, 0x01FE0}, // GREEK CAPITAL LETTER UPSILON WITH VRACHY +{0x01FE9, 0x01FE1}, // GREEK CAPITAL LETTER UPSILON WITH MACRON +{0x01FEA, 0x01F7A}, // GREEK CAPITAL LETTER UPSILON WITH VARIA +{0x01FEB, 0x01F7B}, // GREEK CAPITAL LETTER UPSILON WITH OXIA +{0x01FEC, 0x01FE5}, // GREEK CAPITAL LETTER RHO WITH DASIA +{0x01FF8, 0x01F78}, // GREEK CAPITAL LETTER OMICRON WITH VARIA +{0x01FF9, 0x01F79}, // GREEK CAPITAL LETTER OMICRON WITH OXIA +{0x01FFA, 0x01F7C}, // GREEK CAPITAL LETTER OMEGA WITH VARIA +{0x01FFB, 0x01F7D}, // GREEK CAPITAL LETTER OMEGA WITH OXIA +{0x01FFC, 0x01FF3}, // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +{0x02126, 0x003C9}, // OHM SIGN +{0x0212A, 0x0006B}, // KELVIN SIGN +{0x0212B, 0x000E5}, // ANGSTROM SIGN +{0x02132, 0x0214E}, // TURNED CAPITAL F +{0x02160, 0x02170}, // ROMAN NUMERAL ONE +{0x02161, 0x02171}, // ROMAN NUMERAL TWO +{0x02162, 0x02172}, // ROMAN NUMERAL THREE +{0x02163, 0x02173}, // ROMAN NUMERAL FOUR +{0x02164, 0x02174}, // ROMAN NUMERAL FIVE +{0x02165, 0x02175}, // ROMAN NUMERAL SIX +{0x02166, 0x02176}, // ROMAN NUMERAL SEVEN +{0x02167, 0x02177}, // ROMAN NUMERAL EIGHT +{0x02168, 0x02178}, // ROMAN NUMERAL NINE +{0x02169, 0x02179}, // ROMAN NUMERAL TEN +{0x0216A, 0x0217A}, // ROMAN NUMERAL ELEVEN +{0x0216B, 0x0217B}, // ROMAN NUMERAL TWELVE +{0x0216C, 0x0217C}, // ROMAN NUMERAL FIFTY +{0x0216D, 0x0217D}, // ROMAN NUMERAL ONE HUNDRED +{0x0216E, 0x0217E}, // ROMAN NUMERAL FIVE HUNDRED +{0x0216F, 0x0217F}, // ROMAN NUMERAL ONE THOUSAND +{0x02183, 0x02184}, // ROMAN NUMERAL REVERSED ONE HUNDRED +{0x024B6, 0x024D0}, // CIRCLED LATIN CAPITAL LETTER A +{0x024B7, 0x024D1}, // CIRCLED LATIN CAPITAL LETTER B +{0x024B8, 0x024D2}, // CIRCLED LATIN CAPITAL LETTER C +{0x024B9, 0x024D3}, // CIRCLED LATIN CAPITAL LETTER D +{0x024BA, 0x024D4}, // CIRCLED LATIN CAPITAL LETTER E +{0x024BB, 0x024D5}, // CIRCLED LATIN CAPITAL LETTER F +{0x024BC, 0x024D6}, // CIRCLED LATIN CAPITAL LETTER G +{0x024BD, 0x024D7}, // CIRCLED LATIN CAPITAL LETTER H +{0x024BE, 0x024D8}, // CIRCLED LATIN CAPITAL LETTER I +{0x024BF, 0x024D9}, // CIRCLED LATIN CAPITAL LETTER J +{0x024C0, 0x024DA}, // CIRCLED LATIN CAPITAL LETTER K +{0x024C1, 0x024DB}, // CIRCLED LATIN CAPITAL LETTER L +{0x024C2, 0x024DC}, // CIRCLED LATIN CAPITAL LETTER M +{0x024C3, 0x024DD}, // CIRCLED LATIN CAPITAL LETTER N +{0x024C4, 0x024DE}, // CIRCLED LATIN CAPITAL LETTER O +{0x024C5, 0x024DF}, // CIRCLED LATIN CAPITAL LETTER P +{0x024C6, 0x024E0}, // CIRCLED LATIN CAPITAL LETTER Q +{0x024C7, 0x024E1}, // CIRCLED LATIN CAPITAL LETTER R +{0x024C8, 0x024E2}, // CIRCLED LATIN CAPITAL LETTER S +{0x024C9, 0x024E3}, // CIRCLED LATIN CAPITAL LETTER T +{0x024CA, 0x024E4}, // CIRCLED LATIN CAPITAL LETTER U +{0x024CB, 0x024E5}, // CIRCLED LATIN CAPITAL LETTER V +{0x024CC, 0x024E6}, // CIRCLED LATIN CAPITAL LETTER W +{0x024CD, 0x024E7}, // CIRCLED LATIN CAPITAL LETTER X +{0x024CE, 0x024E8}, // CIRCLED LATIN CAPITAL LETTER Y +{0x024CF, 0x024E9}, // CIRCLED LATIN CAPITAL LETTER Z +{0x02C60, 0x02C61}, // LATIN CAPITAL LETTER L WITH DOUBLE BAR +{0x02C62, 0x0026B}, // LATIN CAPITAL LETTER L WITH MIDDLE TILDE +{0x02C63, 0x01D7D}, // LATIN CAPITAL LETTER P WITH STROKE +{0x02C64, 0x0027D}, // LATIN CAPITAL LETTER R WITH TAIL +{0x02C67, 0x02C68}, // LATIN CAPITAL LETTER H WITH DESCENDER +{0x02C69, 0x02C6A}, // LATIN CAPITAL LETTER K WITH DESCENDER +{0x02C6B, 0x02C6C}, // LATIN CAPITAL LETTER Z WITH DESCENDER +{0x02C6D, 0x00251}, // LATIN CAPITAL LETTER ALPHA +{0x02C6E, 0x00271}, // LATIN CAPITAL LETTER M WITH HOOK +{0x02C6F, 0x00250}, // LATIN CAPITAL LETTER TURNED A +{0x02C70, 0x00252}, // LATIN CAPITAL LETTER TURNED ALPHA +{0x02C72, 0x02C73}, // LATIN CAPITAL LETTER W WITH HOOK +{0x02C75, 0x02C76}, // LATIN CAPITAL LETTER HALF H +{0x02C7E, 0x0023F}, // LATIN CAPITAL LETTER S WITH SWASH TAIL +{0x02C7F, 0x00240}, // LATIN CAPITAL LETTER Z WITH SWASH TAIL +{0x0A640, 0x0A641}, // CYRILLIC CAPITAL LETTER ZEMLYA +{0x0A642, 0x0A643}, // CYRILLIC CAPITAL LETTER DZELO +{0x0A644, 0x0A645}, // CYRILLIC CAPITAL LETTER REVERSED DZE +{0x0A646, 0x0A647}, // CYRILLIC CAPITAL LETTER IOTA +{0x0A648, 0x0A649}, // CYRILLIC CAPITAL LETTER DJERV +{0x0A64A, 0x0A64B}, // CYRILLIC CAPITAL LETTER MONOGRAPH UK +{0x0A64C, 0x0A64D}, // CYRILLIC CAPITAL LETTER BROAD OMEGA +{0x0A64E, 0x0A64F}, // CYRILLIC CAPITAL LETTER NEUTRAL YER +{0x0A650, 0x0A651}, // CYRILLIC CAPITAL LETTER YERU WITH BACK YER +{0x0A652, 0x0A653}, // CYRILLIC CAPITAL LETTER IOTIFIED YAT +{0x0A654, 0x0A655}, // CYRILLIC CAPITAL LETTER REVERSED YU +{0x0A656, 0x0A657}, // CYRILLIC CAPITAL LETTER IOTIFIED A +{0x0A658, 0x0A659}, // CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS +{0x0A65A, 0x0A65B}, // CYRILLIC CAPITAL LETTER BLENDED YUS +{0x0A65C, 0x0A65D}, // CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS +{0x0A65E, 0x0A65F}, // CYRILLIC CAPITAL LETTER YN +{0x0A660, 0x0A661}, // CYRILLIC CAPITAL LETTER REVERSED TSE +{0x0A662, 0x0A663}, // CYRILLIC CAPITAL LETTER SOFT DE +{0x0A664, 0x0A665}, // CYRILLIC CAPITAL LETTER SOFT EL +{0x0A666, 0x0A667}, // CYRILLIC CAPITAL LETTER SOFT EM +{0x0A668, 0x0A669}, // CYRILLIC CAPITAL LETTER MONOCULAR O +{0x0A66A, 0x0A66B}, // CYRILLIC CAPITAL LETTER BINOCULAR O +{0x0A66C, 0x0A66D}, // CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O +{0x0A680, 0x0A681}, // CYRILLIC CAPITAL LETTER DWE +{0x0A682, 0x0A683}, // CYRILLIC CAPITAL LETTER DZWE +{0x0A684, 0x0A685}, // CYRILLIC CAPITAL LETTER ZHWE +{0x0A686, 0x0A687}, // CYRILLIC CAPITAL LETTER CCHE +{0x0A688, 0x0A689}, // CYRILLIC CAPITAL LETTER DZZE +{0x0A68A, 0x0A68B}, // CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK +{0x0A68C, 0x0A68D}, // CYRILLIC CAPITAL LETTER TWE +{0x0A68E, 0x0A68F}, // CYRILLIC CAPITAL LETTER TSWE +{0x0A690, 0x0A691}, // CYRILLIC CAPITAL LETTER TSSE +{0x0A692, 0x0A693}, // CYRILLIC CAPITAL LETTER TCHE +{0x0A694, 0x0A695}, // CYRILLIC CAPITAL LETTER HWE +{0x0A696, 0x0A697}, // CYRILLIC CAPITAL LETTER SHWE +{0x0A698, 0x0A699}, // CYRILLIC CAPITAL LETTER DOUBLE O +{0x0A69A, 0x0A69B}, // CYRILLIC CAPITAL LETTER CROSSED O +{0x0A722, 0x0A723}, // LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF +{0x0A724, 0x0A725}, // LATIN CAPITAL LETTER EGYPTOLOGICAL AIN +{0x0A726, 0x0A727}, // LATIN CAPITAL LETTER HENG +{0x0A728, 0x0A729}, // LATIN CAPITAL LETTER TZ +{0x0A72A, 0x0A72B}, // LATIN CAPITAL LETTER TRESILLO +{0x0A72C, 0x0A72D}, // LATIN CAPITAL LETTER CUATRILLO +{0x0A72E, 0x0A72F}, // LATIN CAPITAL LETTER CUATRILLO WITH COMMA +{0x0A732, 0x0A733}, // LATIN CAPITAL LETTER AA +{0x0A734, 0x0A735}, // LATIN CAPITAL LETTER AO +{0x0A736, 0x0A737}, // LATIN CAPITAL LETTER AU +{0x0A738, 0x0A739}, // LATIN CAPITAL LETTER AV +{0x0A73A, 0x0A73B}, // LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR +{0x0A73C, 0x0A73D}, // LATIN CAPITAL LETTER AY +{0x0A73E, 0x0A73F}, // LATIN CAPITAL LETTER REVERSED C WITH DOT +{0x0A740, 0x0A741}, // LATIN CAPITAL LETTER K WITH STROKE +{0x0A742, 0x0A743}, // LATIN CAPITAL LETTER K WITH DIAGONAL STROKE +{0x0A744, 0x0A745}, // LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE +{0x0A746, 0x0A747}, // LATIN CAPITAL LETTER BROKEN L +{0x0A748, 0x0A749}, // LATIN CAPITAL LETTER L WITH HIGH STROKE +{0x0A74A, 0x0A74B}, // LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY +{0x0A74C, 0x0A74D}, // LATIN CAPITAL LETTER O WITH LOOP +{0x0A74E, 0x0A74F}, // LATIN CAPITAL LETTER OO +{0x0A750, 0x0A751}, // LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER +{0x0A752, 0x0A753}, // LATIN CAPITAL LETTER P WITH FLOURISH +{0x0A754, 0x0A755}, // LATIN CAPITAL LETTER P WITH SQUIRREL TAIL +{0x0A756, 0x0A757}, // LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER +{0x0A758, 0x0A759}, // LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE +{0x0A75A, 0x0A75B}, // LATIN CAPITAL LETTER R ROTUNDA +{0x0A75C, 0x0A75D}, // LATIN CAPITAL LETTER RUM ROTUNDA +{0x0A75E, 0x0A75F}, // LATIN CAPITAL LETTER V WITH DIAGONAL STROKE +{0x0A760, 0x0A761}, // LATIN CAPITAL LETTER VY +{0x0A762, 0x0A763}, // LATIN CAPITAL LETTER VISIGOTHIC Z +{0x0A764, 0x0A765}, // LATIN CAPITAL LETTER THORN WITH STROKE +{0x0A766, 0x0A767}, // LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER +{0x0A768, 0x0A769}, // LATIN CAPITAL LETTER VEND +{0x0A76A, 0x0A76B}, // LATIN CAPITAL LETTER ET +{0x0A76C, 0x0A76D}, // LATIN CAPITAL LETTER IS +{0x0A76E, 0x0A76F}, // LATIN CAPITAL LETTER CON +{0x0A779, 0x0A77A}, // LATIN CAPITAL LETTER INSULAR D +{0x0A77B, 0x0A77C}, // LATIN CAPITAL LETTER INSULAR F +{0x0A77D, 0x01D79}, // LATIN CAPITAL LETTER INSULAR G +{0x0A77E, 0x0A77F}, // LATIN CAPITAL LETTER TURNED INSULAR G +{0x0A780, 0x0A781}, // LATIN CAPITAL LETTER TURNED L +{0x0A782, 0x0A783}, // LATIN CAPITAL LETTER INSULAR R +{0x0A784, 0x0A785}, // LATIN CAPITAL LETTER INSULAR S +{0x0A786, 0x0A787}, // LATIN CAPITAL LETTER INSULAR T +{0x0A78B, 0x0A78C}, // LATIN CAPITAL LETTER SALTILLO +{0x0A78D, 0x00265}, // LATIN CAPITAL LETTER TURNED H +{0x0A790, 0x0A791}, // LATIN CAPITAL LETTER N WITH DESCENDER +{0x0A792, 0x0A793}, // LATIN CAPITAL LETTER C WITH BAR +{0x0A796, 0x0A797}, // LATIN CAPITAL LETTER B WITH FLOURISH +{0x0A798, 0x0A799}, // LATIN CAPITAL LETTER F WITH STROKE +{0x0A79A, 0x0A79B}, // LATIN CAPITAL LETTER VOLAPUK AE +{0x0A79C, 0x0A79D}, // LATIN CAPITAL LETTER VOLAPUK OE +{0x0A79E, 0x0A79F}, // LATIN CAPITAL LETTER VOLAPUK UE +{0x0A7A0, 0x0A7A1}, // LATIN CAPITAL LETTER G WITH OBLIQUE STROKE +{0x0A7A2, 0x0A7A3}, // LATIN CAPITAL LETTER K WITH OBLIQUE STROKE +{0x0A7A4, 0x0A7A5}, // LATIN CAPITAL LETTER N WITH OBLIQUE STROKE +{0x0A7A6, 0x0A7A7}, // LATIN CAPITAL LETTER R WITH OBLIQUE STROKE +{0x0A7A8, 0x0A7A9}, // LATIN CAPITAL LETTER S WITH OBLIQUE STROKE +{0x0A7AA, 0x00266}, // LATIN CAPITAL LETTER H WITH HOOK +{0x0A7AB, 0x0025C}, // LATIN CAPITAL LETTER REVERSED OPEN E +{0x0A7AC, 0x00261}, // LATIN CAPITAL LETTER SCRIPT G +{0x0A7AD, 0x0026C}, // LATIN CAPITAL LETTER L WITH BELT +{0x0A7AE, 0x0026A}, // LATIN CAPITAL LETTER SMALL CAPITAL I +{0x0A7B0, 0x0029E}, // LATIN CAPITAL LETTER TURNED K +{0x0A7B1, 0x00287}, // LATIN CAPITAL LETTER TURNED T +{0x0A7B2, 0x0029D}, // LATIN CAPITAL LETTER J WITH CROSSED-TAIL +{0x0A7B3, 0x0AB53}, // LATIN CAPITAL LETTER CHI +{0x0A7B4, 0x0A7B5}, // LATIN CAPITAL LETTER BETA +{0x0A7B6, 0x0A7B7}, // LATIN CAPITAL LETTER OMEGA +// {0x0A7B8, 0x0A7B9}, // LATIN CAPITAL LETTER U WITH STROKE +// {0x0A7BA, 0x0A7BB}, // LATIN CAPITAL LETTER GLOTTAL A +// {0x0A7BC, 0x0A7BD}, // LATIN CAPITAL LETTER GLOTTAL I +// {0x0A7BE, 0x0A7BF}, // LATIN CAPITAL LETTER GLOTTAL U +// {0x0A7C0, 0x0A7C1}, // LATIN CAPITAL LETTER OLD POLISH O +// {0x0A7C2, 0x0A7C3}, // LATIN CAPITAL LETTER ANGLICANA W +// {0x0A7C4, 0x0A794}, // LATIN CAPITAL LETTER C WITH PALATAL HOOK +// {0x0A7C5, 0x00282}, // LATIN CAPITAL LETTER S WITH HOOK +// {0x0A7C6, 0x01D8E}, // LATIN CAPITAL LETTER Z WITH PALATAL HOOK +// {0x0A7C7, 0x0A7C8}, // LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY +// {0x0A7C9, 0x0A7CA}, // LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY +// {0x0A7D0, 0x0A7D1}, // LATIN CAPITAL LETTER CLOSED INSULAR G +// {0x0A7D6, 0x0A7D7}, // LATIN CAPITAL LETTER MIDDLE SCOTS S +// {0x0A7D8, 0x0A7D9}, // LATIN CAPITAL LETTER SIGMOID S +// {0x0A7F5, 0x0A7F6}, // LATIN CAPITAL LETTER REVERSED HALF H +{0x0FF21, 0x0FF41}, // FULLWIDTH LATIN CAPITAL LETTER A +{0x0FF22, 0x0FF42}, // FULLWIDTH LATIN CAPITAL LETTER B +{0x0FF23, 0x0FF43}, // FULLWIDTH LATIN CAPITAL LETTER C +{0x0FF24, 0x0FF44}, // FULLWIDTH LATIN CAPITAL LETTER D +{0x0FF25, 0x0FF45}, // FULLWIDTH LATIN CAPITAL LETTER E +{0x0FF26, 0x0FF46}, // FULLWIDTH LATIN CAPITAL LETTER F +{0x0FF27, 0x0FF47}, // FULLWIDTH LATIN CAPITAL LETTER G +{0x0FF28, 0x0FF48}, // FULLWIDTH LATIN CAPITAL LETTER H +{0x0FF29, 0x0FF49}, // FULLWIDTH LATIN CAPITAL LETTER I +{0x0FF2A, 0x0FF4A}, // FULLWIDTH LATIN CAPITAL LETTER J +{0x0FF2B, 0x0FF4B}, // FULLWIDTH LATIN CAPITAL LETTER K +{0x0FF2C, 0x0FF4C}, // FULLWIDTH LATIN CAPITAL LETTER L +{0x0FF2D, 0x0FF4D}, // FULLWIDTH LATIN CAPITAL LETTER M +{0x0FF2E, 0x0FF4E}, // FULLWIDTH LATIN CAPITAL LETTER N +{0x0FF2F, 0x0FF4F}, // FULLWIDTH LATIN CAPITAL LETTER O +{0x0FF30, 0x0FF50}, // FULLWIDTH LATIN CAPITAL LETTER P +{0x0FF31, 0x0FF51}, // FULLWIDTH LATIN CAPITAL LETTER Q +{0x0FF32, 0x0FF52}, // FULLWIDTH LATIN CAPITAL LETTER R +{0x0FF33, 0x0FF53}, // FULLWIDTH LATIN CAPITAL LETTER S +{0x0FF34, 0x0FF54}, // FULLWIDTH LATIN CAPITAL LETTER T +{0x0FF35, 0x0FF55}, // FULLWIDTH LATIN CAPITAL LETTER U +{0x0FF36, 0x0FF56}, // FULLWIDTH LATIN CAPITAL LETTER V +{0x0FF37, 0x0FF57}, // FULLWIDTH LATIN CAPITAL LETTER W +{0x0FF38, 0x0FF58}, // FULLWIDTH LATIN CAPITAL LETTER X +{0x0FF39, 0x0FF59}, // FULLWIDTH LATIN CAPITAL LETTER Y +{0x0FF3A, 0x0FF5A}, // FULLWIDTH LATIN CAPITAL LETTER Z +}; + +static short cfold_idx[HT_SIZE] = { + -1, 285, 897, -1, 803, 1334, -1, 1211, -1, 711, 81, 1122, 307, 619, 1658, + -1, 1036, 1569, 545, -1, 337, 950, 462, -1, 856, 237, 1236, 1399, 765, 148, + 1176, 1447, 680, 66, 1701, 1089, 1409, 604, 19, 391, 1003, 485, 1530, 911, 302, + 1285, 1414, 817, 197, 1279, 1331, 725, 1372, 107, 1134, 1482, 647, 1686, 1050, 440, + 1583, 4, -1, 350, 964, -1, -1, 255, 872, 1248, 1319, 778, -1, 1188, 1461, + 694, 94, 1103, -1, 632, 1622, 47, 405, 1017, 530, 1544, 925, 1430, -1, 1352, + 831, 211, 1219, 1400, 739, 123, 1496, 1159, -1, 579, -1, 1064, 1427, 560, 32, + 978, 364, 498, 1588, -1, 433, 886, -1, 792, -1, 173, 1192, 1475, -1, 59, + 1117, 1643, 660, 1640, -1, 419, 1025, 534, 1558, 326, 939, 1300, -1, 845, 225, + 1231, 1377, 753, 137, 1171, 1436, 669, 1679, -1, 265, 1078, 582, -1, 992, 380, + 524, 1521, -1, 289, 900, -1, 806, 186, 1340, 1200, -1, 714, 87, 1128, 1664, + 625, -1, 1039, -1, 548, 1572, -1, 257, 953, 1306, 859, 240, -1, 1242, 1405, + 768, 151, 1183, 1450, 683, 72, 1092, -1, 1611, 610, 25, 394, 1006, 491, 1533, + 914, 305, 1417, -1, 820, 200, 1291, 1337, 728, 113, 1378, 1140, 1485, 653, 1692, + 445, 1053, -1, 10, 967, 353, -1, -1, -1, 258, 875, 1254, 781, -1, 163, + 1464, -1, 697, -1, 1106, -1, 638, 1625, -1, 408, -1, 517, 1547, 312, 928, + 1295, -1, 834, 214, 1225, 1355, 742, 126, 1148, 1314, 585, 1499, 1657, 1067, -1, + 564, 38, 981, 369, 504, 1510, 1592, 276, 889, 1266, 795, 176, -1, 1478, -1, + 703, 65, 299, 1123, 666, 1317, 1028, 422, 1561, 1646, -1, 329, 942, 461, 848, + -1, 228, 1237, 1383, 756, 140, 1177, 1439, 672, 1685, -1, 267, 1081, 588, 3, + 383, 995, 465, 1603, 903, -1, 1274, -1, 809, 189, 1206, 1346, 717, 1356, 93, + 1135, 1670, 631, -1, 1042, 431, 551, 1575, 956, 342, -1, -1, 862, 244, -1, + 1249, -1, 771, 154, 1189, 1453, 686, 78, 1095, 1423, 616, 31, 1614, 397, 1009, + 497, 553, 309, 917, 1420, 1343, 823, 203, 1297, 1384, 731, 115, 1488, 1536, 659, + 1591, 1698, 448, 1056, -1, 16, 970, 356, 482, -1, -1, 263, 878, 1260, 784, + 166, -1, 1193, 1467, 700, 104, 1109, -1, 644, 1628, 1018, 411, 523, 1550, -1, + 316, 931, -1, 837, -1, 217, 1214, 1361, 745, 129, 1154, 1502, 591, 1663, -1, + 1070, 1595, 566, 44, 372, 984, 510, 1513, 892, 280, 454, -1, 798, 179, 1201, + 1324, 706, 1481, 71, 1129, -1, 609, 1650, 1031, 425, 540, 1564, -1, 332, 945, + -1, 851, 232, -1, 1243, 1389, 760, 143, 1166, 1442, 675, 56, 1084, 1631, 594, + 1691, 9, 386, 998, 471, 1525, 906, 294, 1280, 1321, 812, 192, 1212, 1362, 720, + 99, 1408, 1141, 1676, 637, -1, 1045, 1433, 1578, -1, 959, 345, 466, -1, 865, + 248, 1255, -1, 773, 157, -1, 1178, 1456, 689, 84, 1098, -1, 622, 37, 239, + 400, 1012, 503, 1539, 310, 920, 1286, 1349, 826, 206, 1309, 1390, 734, 118, 1149, + 1426, 665, 1491, 1617, 451, 1059, 563, 22, 358, 973, 488, -1, 881, 365, -1, + -1, 787, 168, -1, 1470, -1, -1, 110, 1112, -1, 650, 1632, 1020, 414, 529, + 1553, 1432, 320, 934, -1, 840, 220, -1, 1220, 1367, 748, 132, 1160, 1505, 597, + 1669, 1073, 261, 572, 1598, 50, 375, 987, 514, 1516, 895, 283, -1, 1330, 801, + 182, 1207, -1, 709, 77, 1654, 1118, -1, 615, -1, 1034, -1, 543, 1567, 948, + 335, 571, -1, 854, 235, 1232, 1395, 763, 1697, 146, 1172, 1445, 678, 62, 1087, + 269, 600, 15, 1608, 389, 1001, 481, 535, 298, 429, 909, 1327, 815, 195, 1275, + 1368, 723, 103, 1528, 1641, 643, 1682, -1, 437, 1048, 1581, 0, 962, 348, -1, + -1, 870, 252, 1261, -1, 776, 160, -1, 1184, 1459, 692, 90, 1101, -1, 628, + 43, 1015, 403, 509, 541, 1429, 367, 923, 1542, 829, 209, 1620, 1215, 1396, 737, + 121, 1155, 1494, 575, -1, 1062, 1425, 558, 1586, 28, 362, 976, 494, -1, 884, + 272, -1, -1, 790, 171, 1190, 1473, -1, 55, -1, 1115, -1, 656, 1638, 417, + 1023, 477, 1556, 937, 324, 458, 1296, 843, 223, 1226, 1373, 751, 1508, 135, 1167, + 1675, 603, -1, 1076, -1, 578, 1601, -1, 378, 990, 520, 1519, 286, 898, 1283, + 1336, 804, 184, 1213, -1, 712, 83, 1124, 1660, 621, -1, -1, 1037, -1, 546, + 1570, 951, 338, 1302, -1, 857, 238, 1238, 1401, 766, 149, 1703, 1179, 1448, 681, + 68, 1090, 1637, 606, 21, 1004, 392, 487, 559, 1315, 303, 912, 1287, 818, 1333, + 198, 1281, 1374, 726, 109, 1136, 1415, 649, 1483, 1531, 442, 1051, 1584, 6, 351, + 965, 474, 1019, 873, 1688, 1250, -1, 779, -1, -1, 1462, 695, 96, -1, 1104, + -1, 634, 49, 406, 1623, 513, 531, 926, 1431, 1545, -1, 832, 212, 1221, 1353, + 1402, 740, 124, 1161, 1497, 581, 1653, 1065, 241, 561, 34, 1589, 366, 979, 500, + -1, 887, -1, 1262, -1, 793, 174, 1476, -1, -1, 61, 1119, 295, 662, 1642, + -1, 420, 1026, 536, 1559, 940, 327, -1, -1, 846, 226, 1233, 1379, 754, 138, + 1681, 1173, 1437, 670, -1, 1079, 1635, 584, -1, 993, 381, 526, 1522, -1, 290, + 901, -1, 807, 1342, 187, 1202, -1, 715, 89, 1130, 1666, 627, -1, -1, 428, + 1040, 549, 1573, 340, 954, 1308, -1, 860, 242, 1244, -1, 769, 152, 1185, 1451, + 684, -1, 74, 275, 1093, 612, 27, 1007, 395, 493, 537, 557, 306, 915, 1418, + 821, 201, 1339, 1303, 1380, 729, 231, 1142, 1486, 655, 1534, 1054, 446, 1612, 12, + 1694, 354, 968, 478, -1, 876, 260, 1256, -1, 782, 164, 1191, 1465, 698, 100, + 1107, 1413, 1626, 640, -1, -1, -1, 519, 539, 929, 314, 1548, -1, 835, 215, + 1227, 1357, 743, 127, 1316, 1150, 1500, 587, 1659, 251, 1068, 565, 40, 982, 370, + 506, 1511, 1593, 409, 890, 1268, 796, 1320, 177, 1196, 1479, 704, 67, 1125, 301, + 605, 1648, -1, 423, 1029, 1562, -1, 330, 943, -1, -1, 849, 229, 1239, 1385, + 758, 141, 1162, 1440, 673, 1687, -1, 1082, -1, 590, 5, 996, 384, 467, 1604, + 1406, 292, 904, 1276, 810, 190, 1348, 1208, 1358, 718, 95, 1137, 1672, 633, -1, + 1043, 432, 1576, -1, -1, 343, 957, 464, 863, -1, 245, 1251, -1, 757, 155, + 1454, -1, 687, 80, -1, 277, 1096, 618, 33, 398, 1010, 499, 1537, 918, 1422, + 1615, -1, 824, 204, 1299, 1345, 732, 1386, 116, 1489, 1700, 661, -1, 1057, 449, + 552, 18, -1, 357, 971, 484, -1, 264, 879, 1263, -1, 785, 167, 1468, -1, + 701, 106, 1110, 287, 646, 1629, -1, 412, -1, 525, 1313, 318, 932, 1301, 1551, + 838, 218, 1216, 1363, 746, 130, 1156, 1503, 1665, 593, -1, 259, 1071, 568, 46, + 985, 373, 1514, 1596, -1, 281, 893, 1272, 799, 180, 1326, 1203, -1, 707, 73, + 1131, -1, 611, 1651, 1032, 426, 533, 1565, -1, 333, 946, 567, 852, 1311, 233, + 1245, 1391, 761, 144, 1168, 1443, 676, 58, 1693, 1085, 1606, 596, 11, 387, 999, + 1526, -1, 907, 296, 1282, 1410, 813, 193, 1267, 1323, 721, 1364, 1678, 1143, -1, + 639, -1, 1046, 1435, 1579, -1, -1, 346, 960, 468, 866, 249, 868, 1257, -1, + 774, 158, 1180, 1457, 690, 86, 1099, 279, 624, 1618, 39, 401, 1013, 505, 1540, + 921, 311, 1288, 1351, 827, 207, 1492, 1392, 735, 119, 1411, 1151, -1, 667, -1, + 452, 1060, 463, 24, 974, 360, 490, -1, -1, 268, 882, 1271, 788, 169, -1, + 1197, 1471, -1, 112, 1113, 291, 652, 1634, -1, 415, 1554, 473, -1, 322, 935, + 456, 1292, 841, 221, 1222, 1369, 749, 133, 1163, 1434, 599, 1506, 1671, 1074, 1633, + 574, 1599, 376, 988, 516, 1517, 896, 284, -1, -1, 802, 183, 1209, 1332, 710, + 1656, 79, 1120, -1, 617, -1, 1035, -1, 544, 1568, -1, 336, 949, -1, 855, + 236, -1, 1234, 1397, 764, 147, 1174, 1446, 679, 64, 1088, 1639, 602, 1609, 17, + 390, 1002, 483, 1529, 435, 300, 910, 1329, 816, 196, 1277, 1370, 724, 105, 1684, + 1699, -1, 645, -1, 438, 1049, 1582, 2, 963, 349, 472, -1, -1, 254, 871, + 1246, 777, -1, 161, 1186, 1460, 693, 92, 1102, -1, 630, 45, 1621, 404, 1016, + 511, 555, 924, 1543, -1, -1, 830, 210, 1217, 1398, 738, 122, 1157, 1312, 577, + 1495, -1, 1063, 1421, 1587, 30, 977, 363, 496, -1, -1, 273, 885, -1, 791, + 172, -1, 1474, -1, -1, 57, 1116, -1, 658, -1, 1024, 418, 532, 1557, -1, + 325, 938, 459, 844, 224, 1298, 1228, 1375, 752, 136, 1169, 1509, 668, 1677, 1077, + -1, 580, -1, -1, 379, 991, 522, 1520, 899, 288, -1, 1338, 805, 185, 1198, + -1, 713, 85, 1662, 1126, -1, 623, -1, 1038, -1, 547, 1571, 952, 339, 1304, + -1, 858, 439, 1240, 1403, 767, -1, 150, 1181, 1449, 682, 70, 1091, -1, 608, + 23, 1610, 393, 1005, 489, 1532, 304, 913, 1416, 1335, 819, 199, 1289, 1376, 727, + 111, 1138, 1484, 651, 1690, -1, 444, 1052, 1585, 8, 966, 352, 476, -1, -1, + 256, 874, 1252, 780, 162, -1, 1463, -1, 696, 98, 1105, -1, 636, 51, 1624, + 407, 515, 1546, -1, 927, 1293, -1, 833, -1, 213, 1223, 1404, 741, 125, 1146, + 1498, 583, 1655, -1, 247, 1066, 562, 36, 368, 980, 502, 1590, 888, 274, 1264, + -1, 794, 175, 1194, 1477, 702, 63, -1, 441, 1121, 664, 1644, 1027, 421, 1560, + -1, 941, 328, -1, 460, 847, 227, 1235, 1381, -1, 755, 139, 1175, 1438, 671, + 1683, 1080, -1, 586, 1, 1602, 382, 994, 528, 1523, 902, -1, -1, 1344, 808, + 188, 1204, 1354, 716, 91, 1132, 1668, 629, -1, -1, 430, 1041, 550, 1574, 955, + 341, 1310, -1, 861, 243, 1247, -1, 770, 153, -1, 1187, 1452, 685, 76, 1094, + -1, 614, 29, 1008, 396, 455, 495, 1273, 308, 916, 1419, 822, 1341, 202, 1305, + 1382, 730, 114, 1144, 1487, 657, 1535, 1613, 447, 1055, 1696, 14, 355, 969, 480, + -1, 877, 262, 1258, -1, 783, 165, -1, 1466, 699, -1, 102, 1108, -1, 642, + 1627, -1, 410, 521, 1549, -1, 313, 315, 930, 836, 216, -1, 1229, 1359, 744, + 128, 1152, 1501, 589, 1661, 1069, 253, 1594, -1, 42, 371, 983, 508, 1512, 891, + 278, 1270, 1322, 797, 178, 1199, 1480, 705, 69, -1, 443, 1127, 607, 1649, 424, + 1030, 538, 1563, 944, 331, -1, -1, 850, 230, 1241, 1387, 759, 142, 1689, 1164, + 1441, 674, 54, 1083, 1605, 592, 7, 997, 385, 469, 1524, -1, 293, 905, 1278, + 1350, 811, 191, 1210, 1360, 719, 97, 1139, 1407, 635, 1674, -1, 434, 1044, 1577, + -1, 344, 958, -1, -1, 864, 246, 1253, -1, 772, 156, -1, 1455, 688, -1, + 82, 1097, -1, 620, 35, 1011, 399, 53, 501, 1424, 919, 1284, 1538, 825, 205, + 1347, 1307, 1388, 733, 117, 1147, 1490, 663, 1616, 1058, 450, 554, 1702, 20, 972, + -1, 486, -1, 880, 266, 1265, -1, 786, -1, 1195, 1469, -1, 108, -1, 1111, + 1630, 648, -1, 413, -1, 527, 1552, 933, 317, 319, -1, 839, 219, 1218, 1365, + 747, 1504, 131, 1158, 1667, 595, 52, 1072, -1, 570, 48, 1597, 374, 986, 512, + 1515, 282, 894, -1, 1328, 800, 181, 1205, -1, 708, 75, 1133, 1652, 613, -1, + -1, 427, 1033, 542, 1566, 334, 947, 569, -1, 853, 234, 1230, 1393, 762, 145, + 1170, 1444, 1695, 677, 60, 271, 1086, 598, 13, 1000, 388, 479, 1527, 1412, 297, + 908, 1607, 814, 194, 1325, 1269, 1366, 722, 101, 1145, 1645, 641, 1680, 1047, 436, + 1580, -1, -1, 347, 961, 470, 867, 869, 250, 1259, -1, 775, 159, 1182, 1458, + 691, 88, -1, 1100, -1, 626, 41, 402, 1014, 507, 1541, 922, 1428, 1290, 1619, + 828, 208, -1, 1394, 736, 1493, 120, 1153, -1, 573, -1, 1061, 453, 556, 26, + -1, 359, 361, 492, 975, 270, 883, -1, -1, 789, 170, 1472, -1, -1, 1021, + 1114, 1647, 654, 1636, -1, 416, 1022, 475, 1555, 936, 321, 323, 457, 842, 222, + 1224, 1294, 750, 134, 1318, 1165, 1371, 601, 1507, 1075, 1673, 576, 1600, 989, 377, + 518, 1518, +}; + +static const size_t cfold_len = sizeof cfold / sizeof cfold[0]; +static const uint32_t* cfold_tab = &cfold[0][0]; + +static inline uint32_t hash(uint32_t key, size_t len) { + uint64_t x = key*0xc6a4a7935bd1e99d; + return (uint32_t)((uint32_t)x*len >> 32); +} + +static inline int cfold_lookup(uint32_t codep) { + int idx, i = hash(codep, HT_SIZE); + while ((idx = cfold_idx[i]) != -1 && cfold_tab[idx] != codep) { + if (++i == HT_SIZE) i = 0; + } + return idx; +} + +static inline int utf8_isupper(uint32_t codep) { + if (codep < 128) return (codep >= 'A') & (codep <= 'Z'); + return (cfold_lookup(codep) & 1) == 0; +} + +static inline int utf8_islower(uint32_t codep) { + if (codep <= 'z') return codep >= 'a'; + int idx = cfold_lookup(codep); + return (idx != -1) & (idx & 1); +} + +static inline int utf8_toupper(uint32_t codep) { + int idx = cfold_lookup(codep); + return (idx == -1) | !(idx & 1) ? codep : cfold_tab[idx - 1]; +} + +static inline int utf8_tolower(uint32_t codep) { + int idx = cfold_lookup(codep); + return (idx == -1) | (idx & 1) ? codep : cfold_tab[idx + 1]; +} + +static inline int utf8_isalpha(uint32_t codep) { + int idx = cfold_lookup(codep); + return (idx != -1); +} + +static inline int utf8_isalnum(uint32_t codep) { + int idx = cfold_lookup(codep); + return (idx != -1) | ((codep >= '0') & (codep <= '9')); +} + +// ------------------------------------------------------------ +#if 0 +#include + +int coll = 0, count = 0; + +void maketables() +{ + for (int i=0; i= HT_SIZE) index = 0; + ++ count; + } + cfold_idx[index] = i; + } +} + +void printtables() +{ + printf("static short cfold_idx[%d] = {\n ", HT_SIZE); + for (int i = 0; i < HT_SIZE; ++i) { + printf(" %d,", cfold_idx[i]); + if ((i+1) % 15 == 0) printf("\n "); + } + printf("\n};\n"); +} + +void printtest() +{ + for (size_t i=0; i %d\n", cfold[i][UPPER], utf8_tolower(cfold[i][UPPER])); + printf("toupp %d => %d\n", cfold[i][LOWER], utf8_toupper(cfold[i][LOWER])); + } +} + + +struct Buf { char str[8]; int len; }; + +static int pushchar(struct Buf* buf, uint8_t c) +{ + buf->str[buf->len++] = c; + return 0; +} + +static int utf8_encode(struct Buf* buf, uint32_t c) +{ + if (c < 0x80UL) { + return pushchar(buf, c); + } else if (c < 0x0800UL) { + return !((pushchar(buf, (c >> 6 & 0x1F) | 0xC0) == 0) && + (pushchar(buf, (c >> 0 & 0x3F) | 0x80) == 0)); + } else if (c < 0x010000UL) { + if (c >= 0xd800 && c <= 0xdfff) { + fprintf(stderr, "invalid codepoint %06x", c); + return -1; + } + return !((pushchar(buf, (c >> 12 & 0x0F) | 0xE0) == 0) && + (pushchar(buf, (c >> 6 & 0x3F) | 0x80) == 0) && + (pushchar(buf, (c >> 0 & 0x3F) | 0x80) == 0)); + } else if (c < 0x110000UL) { + return !((pushchar(buf, (c >> 18 & 0x07) | 0xF0) == 0) && + (pushchar(buf, (c >> 12 & 0x3F) | 0x80) == 0) && + (pushchar(buf, (c >> 6 & 0x3F) | 0x80) == 0) && + (pushchar(buf, (c >> 0 & 0x3F) | 0x80) == 0)); + } else { + fprintf(stderr, "unable to encode %06x as UTF-8", c); + return -1; + } +} + +void printchars() +{ + c_forrange (i, int, cfold_len) { + struct Buf b1 = {{0}}, b2 = {{0}}; + utf8_encode(&b1, cfold[i][UPPER]); + utf8_encode(&b2, cfold[i][LOWER]); + printf("%4d: %6u : %s => %s : %d\n", i, cfold[i][UPPER], b1.str, b2.str, cfold[i][LOWER] - cfold[i][UPPER]); + } +} + +int main() +{ + //printchars(); + maketables(); + //printtables(); + printtest(); + printf("\ncoll1 %d, probe1: %d\n", coll, count); + printf("sizes %zu\n", cfold_len*8 + HT_SIZE*2); + + printf("size %zu\n", cfold_len); +} +#endif -- cgit v1.2.3