diff options
Diffstat (limited to 'src/re.c')
| -rw-r--r-- | src/re.c | 792 |
1 files changed, 81 insertions, 711 deletions
@@ -7,16 +7,11 @@ #include "mruby.h" #include <string.h> #include "mruby/string.h" -#include "mruby/khash.h" #include "encoding.h" #include "re.h" -#include "mruby/numeric.h" -#include "mruby/range.h" #include "mruby/array.h" #include "regint.h" #include "mruby/class.h" -#include "mruby/hash.h" -#include "mruby/variable.h" #include "error.h" #ifdef INCLUDE_REGEXP @@ -54,13 +49,10 @@ unsigned long ruby_scan_oct(const char*, size_t, size_t*); unsigned long ruby_scan_hex(const char*, size_t, size_t*); static mrb_value mrb_match_to_a(mrb_state *mrb, mrb_value match); -static mrb_value mrb_reg_preprocess(mrb_state *mrb, const char *p, const char *end, mrb_encoding *enc, - mrb_encoding **fixed_enc, onig_errmsg_buffer err); -static void mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len, - mrb_encoding *enc, mrb_encoding *resenc); +static mrb_value mrb_reg_preprocess(mrb_state *mrb, const char *p, const char *end, onig_errmsg_buffer err); +static void mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len); static char * option_to_str(char str[4], int options); -static mrb_value reg_cache; //static int may_need_recompile; //static int reg_kcode = DEFAULT_KCODE; /* ------------------------------------------------------------------------- */ @@ -94,22 +86,20 @@ mrb_reg_s_new_instance(mrb_state *mrb, /*int argc, mrb_value *argv, */mrb_value re->usecnt = 0; return mrb_funcall_argv(mrb, mrb_obj_value(re), "initialize", argc, argv); } -//#define mrb_enc_mbcput(a,b,c) a + mrb_value mrb_reg_quote(mrb_state *mrb, mrb_value str) { - mrb_encoding *enc = mrb_enc_get(mrb, str); char *s, *send, *t; mrb_value tmp; - int c,clen; - int ascii_only = mrb_enc_str_asciionly_p(mrb, str); + int c; s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); while (s < send) { - c = mrb_enc_ascget(mrb, s, send, &clen, enc); + c = *s; if (c == -1) { - s += mbclen(s, send, enc); + s += send - s; continue; } switch (c) { @@ -121,38 +111,29 @@ mrb_reg_quote(mrb_state *mrb, mrb_value str) case '\t': case '\f': case '\n': case '\r': goto meta_found; } - s += clen; + s++; } //tmp = mrb_str_new3(str); tmp = mrb_str_new(mrb, RSTRING_PTR(str), RSTRING_LEN(str)); - if (ascii_only) { - mrb_enc_associate(mrb, tmp, mrb_usascii_encoding(mrb)); - } return tmp; meta_found: tmp = mrb_str_new(mrb, 0, RSTRING_LEN(str)*2); - if (ascii_only) { - mrb_enc_associate(mrb, tmp, mrb_usascii_encoding(mrb)); - } - else { - mrb_enc_copy(mrb, tmp, str); - } t = RSTRING_PTR(tmp); /* copy upto metacharacter */ memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str)); t += s - RSTRING_PTR(str); while (s < send) { - c = mrb_enc_ascget(mrb, s, send, &clen, enc); + c = *s; if (c == -1) { - int n = mbclen(s, send, enc); + int n = send - s; while (n--) *t++ = *s++; continue; } - s += clen; + s++; switch (c) { case '[': case ']': case '{': case '}': case '(': case ')': case '|': case '-': @@ -263,7 +244,7 @@ mrb_reg_nth_match(mrb_state *mrb, mrb_int nth, mrb_value match) if (start == -1) return mrb_nil_value(); end = m->rmatch->regs.end[nth]; len = end - start; - str = mrb_str_substr(mrb, mrb_obj_value(m->str), start, len); + str = mrb_str_subseq(mrb, mrb_obj_value(m->str), start, len); return str; } @@ -379,75 +360,13 @@ mrb_reg_options(mrb_state *mrb, mrb_value re) return options; } -static void -reg_enc_error(mrb_state *mrb, mrb_value re, mrb_value str) -{ - mrb_raise(mrb, E_ENCODING_ERROR, - "incompatible encoding regexp match (%s regexp with %s string)", - mrb_enc_name(mrb_enc_get(mrb, re)), - mrb_enc_name(mrb_enc_get(mrb, str))); -} - -static int -mrb_reg_fixed_encoding_p(mrb_value re) -{ - /*if (FL_TEST(re, KCODE_FIXED)) - return Qtrue; - else */ - return 0/*Qfalse*/; -} - -static mrb_encoding* -mrb_reg_prepare_enc(mrb_state *mrb, mrb_value re, mrb_value str, int warn) -{ - mrb_encoding *enc = 0; - - if (mrb_enc_str_coderange(mrb, str) == ENC_CODERANGE_BROKEN) { - mrb_raise(mrb, E_ARGUMENT_ERROR, - "invalid byte sequence in %s", - mrb_enc_name(mrb_enc_get(mrb, str))); - } - - mrb_reg_check(mrb, re); - enc = mrb_enc_get(mrb, str); - if (!mrb_enc_str_asciicompat_p(mrb, str)) { - if (RREGEXP(re)->ptr->enc != enc) { - reg_enc_error(mrb, re, str); - } - } - else if (mrb_reg_fixed_encoding_p(re)) { - if (RREGEXP(re)->ptr->enc != enc && - (!mrb_enc_asciicompat(mrb, RREGEXP(re)->ptr->enc) || - mrb_enc_str_coderange(mrb, str) != ENC_CODERANGE_7BIT)) { - reg_enc_error(mrb, re, str); - } - enc = RREGEXP(re)->ptr->enc; - } - if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) && - enc != mrb_ascii8bit_encoding(mrb) && - mrb_enc_str_coderange(mrb, str) != ENC_CODERANGE_7BIT) { - mrb_warn("regexp match /.../n against to %s string", - mrb_enc_name(enc)); - } - return enc; -} - static mrb_value mrb_reg_desc(mrb_state *mrb, const char *s, long len, mrb_value re) { - mrb_encoding *enc = mrb_enc_get(mrb, re); mrb_value str = mrb_str_new_cstr(mrb, "/");//mrb_str_buf_new2("/"); - mrb_encoding *resenc = mrb_default_internal_encoding(mrb); - if (resenc == NULL) resenc = mrb_default_external_encoding(mrb); - if (re.tt && mrb_enc_asciicompat(mrb, enc)) { - mrb_enc_copy(mrb, str, re); - } - else { - mrb_enc_associate(mrb, str, mrb_usascii_encoding(mrb)); - } - mrb_reg_expr_str(mrb, str, s, len, enc, resenc); - mrb_str_buf_cat(mrb, str, "/", strlen("/"));//mrb_str_buf_cat2(str, "/"); + mrb_reg_expr_str(mrb, str, s, len); + mrb_str_buf_cat(mrb, str, "/", strlen("/")); if (re.tt) { char opts[4]; mrb_reg_check(mrb, re); @@ -476,18 +395,14 @@ mrb_reg_prepare_re(mrb_state *mrb, mrb_value re, mrb_value str) OnigErrorInfo einfo; const char *pattern; mrb_value unescaped; - mrb_encoding *fixed_enc = 0; - mrb_encoding *enc = mrb_reg_prepare_enc(mrb, re, str, 1); - - if (reg->enc == enc) return reg; + mrb_encoding *enc = mrb_ascii8bit_encoding(mrb); mrb_reg_check(mrb, re); reg = RREGEXP(re)->ptr; pattern = RREGEXP_SRC_PTR(re); unescaped = mrb_reg_preprocess(mrb, - pattern, pattern + RREGEXP(re)->src->len, enc, - &fixed_enc, err); + pattern, pattern + RREGEXP(re)->src->len, err); if (mrb_nil_p(unescaped)) { mrb_raise(mrb, E_ARGUMENT_ERROR, "regexp preprocess failed: %s", err); @@ -675,18 +590,6 @@ ruby_scan_hex(const char *start, size_t len, size_t *retlen) return retval; } -static int -check_unicode_range(unsigned long code, onig_errmsg_buffer err) -{ - if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */ - 0x10ffff < code) { - //errcpy(err, "invalid Unicode range"); - printf("invalid Unicode range"); - return -1; - } - return 0; -} - #define BYTEWIDTH 8 int @@ -735,59 +638,6 @@ mrb_uv_to_utf8(mrb_state *mrb, char buf[6], unsigned long uv) return 0; } -static int -append_utf8(mrb_state *mrb, unsigned long uv, - mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err) -{ - if (check_unicode_range(uv, err) != 0) - return -1; - if (uv < 0x80) { - char escbuf[5]; - snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv); - mrb_str_buf_cat(mrb, buf, escbuf, 4); - } - else { - int len; - char utf8buf[6]; - len = mrb_uv_to_utf8(mrb, utf8buf, uv); - mrb_str_buf_cat(mrb, buf, utf8buf, len); - - if (*encp == 0) - *encp = mrb_utf8_encoding(mrb); - else if (*encp != mrb_utf8_encoding(mrb)) { - //errcpy(err, "UTF-8 character in non UTF-8 regexp"); - printf("UTF-8 character in non UTF-8 regexp"); - return -1; - } - } - return 0; -} - -static int -unescape_unicode_bmp(mrb_state *mrb, const char **pp, const char *end, - mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err) -{ - const char *p = *pp; - size_t len; - unsigned long code; - - if (end < p+4) { - //errcpy(err, "invalid Unicode escape"); - printf("invalid Unicode escape"); - return -1; - } - code = ruby_scan_hex(p, 4, &len); - if (len != 4) { - //errcpy(err, "invalid Unicode escape"); - printf("invalid Unicode escape"); - return -1; - } - if (append_utf8(mrb, code, buf, encp, err) != 0) - return -1; - *pp = p + 4; - return 0; -} - unsigned long ruby_scan_oct(const char *start, size_t len, size_t *retlen) { @@ -802,400 +652,29 @@ ruby_scan_oct(const char *start, size_t len, size_t *retlen) return retval; } -static int -read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err) -{ - const char *p = *pp; - int code; - int meta_prefix = 0, ctrl_prefix = 0; - size_t len; - - if (p == end || *p++ != '\\') { - //errcpy(err, "too short escaped multibyte character"); - printf("too short escaped multibyte character"); - return -1; - } - -again: - if (p == end) { - //errcpy(err, "too short escape sequence"); - printf("too short escape sequence"); - return -1; - } - switch (*p++) { - case '\\': code = '\\'; break; - case 'n': code = '\n'; break; - case 't': code = '\t'; break; - case 'r': code = '\r'; break; - case 'f': code = '\f'; break; - case 'v': code = '\013'; break; - case 'a': code = '\007'; break; - case 'e': code = '\033'; break; - - /* \OOO */ - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - p--; - code = scan_oct(p, end < p+3 ? end-p : 3, &len); - p += len; - break; - - case 'x': /* \xHH */ - code = scan_hex(p, end < p+2 ? end-p : 2, &len); - if (len < 1) { - //errcpy(err, "invalid hex escape"); - printf("invalid hex escape"); - return -1; - } - p += len; - break; - - case 'M': /* \M-X, \M-\C-X, \M-\cX */ - if (meta_prefix) { - //errcpy(err, "duplicate meta escape"); - printf("duplicate meta escape"); - return -1; - } - meta_prefix = 1; - if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) { - if (*p == '\\') { - p++; - goto again; - } - else { - code = *p++; - break; - } - } - //errcpy(err, "too short meta escape"); - printf("too short meta escape"); - return -1; - - case 'C': /* \C-X, \C-\M-X */ - if (p == end || *p++ != '-') { - //errcpy(err, "too short control escape"); - printf("too short control escape"); - return -1; - } - case 'c': /* \cX, \c\M-X */ - if (ctrl_prefix) { - //errcpy(err, "duplicate control escape"); - printf("duplicate control escape"); - return -1; - } - ctrl_prefix = 1; - if (p < end && (*p & 0x80) == 0) { - if (*p == '\\') { - p++; - goto again; - } - else { - code = *p++; - break; - } - } - //errcpy(err, "too short control escape"); - printf("too short control escape"); - return -1; - - default: - //errcpy(err, "unexpected escape sequence"); - printf("unexpected escape sequence"); - return -1; - } - if (code < 0 || 0xff < code) { - //errcpy(err, "invalid escape code"); - printf("invalid escape code"); - return -1; - } - - if (ctrl_prefix) - code &= 0x1f; - if (meta_prefix) - code |= 0x80; - - *pp = p; - return code; -} - -static int -unescape_escaped_nonascii(mrb_state *mrb, const char **pp, const char *end, mrb_encoding *enc, - mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err) -{ - const char *p = *pp; - int chmaxlen = mrb_enc_mbmaxlen(enc); - //char *chbuf = ALLOCA_N(char, chmaxlen); - char *chbuf = mrb_malloc(mrb, chmaxlen); - int chlen = 0; - int byte; - int l; - - memset(chbuf, 0, chmaxlen); - - byte = read_escaped_byte(&p, end, err); - if (byte == -1) { - return -1; - } - - chbuf[chlen++] = byte; - while (chlen < chmaxlen && - MBCLEN_NEEDMORE_P(mrb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) { - byte = read_escaped_byte(&p, end, err); - if (byte == -1) { - return -1; - } - chbuf[chlen++] = byte; - } - - l = mrb_enc_precise_mbclen(chbuf, chbuf+chlen, enc); - if (MBCLEN_INVALID_P(l)) { - //errcpy(err, "invalid multibyte escape"); - printf("invalid multibyte escape"); - return -1; - } - if (1 < chlen || (chbuf[0] & 0x80)) { - mrb_str_buf_cat(mrb, buf, chbuf, chlen); - - if (*encp == 0) - *encp = enc; - else if (*encp != enc) { - //errcpy(err, "escaped non ASCII character in UTF-8 regexp"); - printf("escaped non ASCII character in UTF-8 regexp"); - return -1; - } - } - else { - char escbuf[5]; - snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff); - mrb_str_buf_cat(mrb, buf, escbuf, 4); - } - *pp = p; - return 0; -} - -static int -unescape_unicode_list(mrb_state *mrb, const char **pp, const char *end, - mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err) -{ - const char *p = *pp; - int has_unicode = 0; - unsigned long code; - size_t len; - - while (p < end && ISSPACE(*p)) p++; - - while (1) { - code = ruby_scan_hex(p, end-p, &len); - if (len == 0) - break; - if (6 < len) { /* max 10FFFF */ - //errcpy(err, "invalid Unicode range"); - printf("invalid Unicode range"); - return -1; - } - p += len; - if (append_utf8(mrb, code, buf, encp, err) != 0) - return -1; - has_unicode = 1; - - while (p < end && ISSPACE(*p)) p++; - } - - if (has_unicode == 0) { - //errcpy(err, "invalid Unicode list"); - printf("invalid Unicode list"); - return -1; - } - - *pp = p; - - return 0; -} - -static int -unescape_nonascii(mrb_state *mrb, const char *p, const char *end, mrb_encoding *enc, - mrb_value buf, mrb_encoding **encp, int *has_property, - onig_errmsg_buffer err) -{ - char c; - char smallbuf[2]; - - while (p < end) { - int chlen = mrb_enc_precise_mbclen(p, end, enc); - if (!MBCLEN_CHARFOUND_P(chlen)) { - //errcpy(err, "invalid multibyte character"); - printf("invalid multibyte character"); - return -1; - } - chlen = MBCLEN_CHARFOUND_LEN(chlen); - if (1 < chlen || (*p & 0x80)) { - mrb_str_buf_cat(mrb, buf, p, chlen); - p += chlen; - if (*encp == 0) - *encp = enc; - else if (*encp != enc) { - //errcpy(err, "non ASCII character in UTF-8 regexp"); - printf("non ASCII character in UTF-8 regexp"); - return -1; - } - continue; - } - - switch (c = *p++) { - case '\\': - if (p == end) { - //errcpy(err, "too short escape sequence"); - printf("too short escape sequence"); - return -1; - } - switch (c = *p++) { - case '1': case '2': case '3': - case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */ - { - size_t octlen; - if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) { - /* backref or 7bit octal. - no need to unescape anyway. - re-escaping may break backref */ - goto escape_asis; - } - } - /* xxx: How about more than 199 subexpressions? */ - - case '0': /* \0, \0O, \0OO */ - - case 'x': /* \xHH */ - case 'c': /* \cX, \c\M-X */ - case 'C': /* \C-X, \C-\M-X */ - case 'M': /* \M-X, \M-\C-X, \M-\cX */ - p = p-2; - if (unescape_escaped_nonascii(mrb, &p, end, enc, buf, encp, err) != 0) - return -1; - break; - - case 'u': - if (p == end) { - //errcpy(err, "too short escape sequence"); - printf("too short escape sequence"); - return -1; - } - if (*p == '{') { - /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */ - p++; - if (unescape_unicode_list(mrb, &p, end, buf, encp, err) != 0) - return -1; - if (p == end || *p++ != '}') { - //errcpy(err, "invalid Unicode list"); - printf("invalid Unicode list"); - return -1; - } - break; - } - else { - /* \uHHHH */ - if (unescape_unicode_bmp(mrb, &p, end, buf, encp, err) != 0) - return -1; - break; - } - - case 'p': /* \p{Hiragana} */ - case 'P': - if (!*encp) { - *has_property = 1; - } - goto escape_asis; - - default: /* \n, \\, \d, \9, etc. */ -escape_asis: - smallbuf[0] = '\\'; - smallbuf[1] = c; - mrb_str_buf_cat(mrb, buf, smallbuf, 2); - break; - } - break; - - default: - mrb_str_buf_cat(mrb, buf, &c, 1); - break; - } - } - - return 0; -} - - static mrb_value -mrb_reg_preprocess(mrb_state *mrb, const char *p, const char *end, mrb_encoding *enc, - mrb_encoding **fixed_enc, onig_errmsg_buffer err) +mrb_reg_preprocess(mrb_state *mrb, const char *p, const char *end, onig_errmsg_buffer err) { - mrb_value buf; - int has_property = 0; - - //buf = mrb_str_buf_new(0); - buf = mrb_str_buf_new(mrb, 0); - - if (mrb_enc_asciicompat(mrb, enc)) - *fixed_enc = 0; - else { - *fixed_enc = enc; - mrb_enc_associate(mrb, buf, enc); - } - - if (unescape_nonascii(mrb, p, end, enc, buf, fixed_enc, &has_property, err) != 0) - return mrb_nil_value(); - - if (has_property && !*fixed_enc) { - *fixed_enc = enc; - } - - if (*fixed_enc) { - mrb_enc_associate(mrb, buf, *fixed_enc); - } - - return buf; + return mrb_nil_value(); } static int -mrb_reg_initialize(mrb_state *mrb, mrb_value obj, const char *s, long len, mrb_encoding *enc, +mrb_reg_initialize(mrb_state *mrb, mrb_value obj, const char *s, long len, int options, onig_errmsg_buffer err, const char *sourcefile, int sourceline) { struct RRegexp *re = RREGEXP(obj); mrb_value unescaped; - mrb_encoding *fixed_enc = 0; - mrb_encoding *a_enc = mrb_ascii8bit_encoding(mrb); + mrb_encoding *enc = mrb_ascii8bit_encoding(mrb); if (re->ptr) mrb_raise(mrb, E_TYPE_ERROR, "already initialized regexp"); re->ptr = 0; - if (mrb_enc_dummy_p(enc)) { - //errcpy(err, "can't make regexp with dummy encoding"); - printf("can't make regexp with dummy encoding"); - return -1; - } - - unescaped = mrb_reg_preprocess(mrb, s, s+len, enc, &fixed_enc, err); + unescaped = mrb_reg_preprocess(mrb, s, s+len, err); if (mrb_nil_p(unescaped)) return -1; - if (fixed_enc) { - if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) || - (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) { - //errcpy(err, "incompatible character encoding"); - printf("incompatible character encoding"); - return -1; - } - if (fixed_enc != a_enc) { - options |= ARG_ENCODING_FIXED; - enc = fixed_enc; - } - } - else if (!(options & ARG_ENCODING_FIXED)) { - enc = mrb_usascii_encoding(mrb); - } - - mrb_enc_associate(mrb, mrb_obj_value(re), enc); - if ((options & ARG_ENCODING_FIXED) || fixed_enc) { + if ((options & ARG_ENCODING_FIXED)) { //re->basic.flags |= KCODE_FIXED; re->flags|= KCODE_FIXED; } @@ -1207,7 +686,7 @@ mrb_reg_initialize(mrb_state *mrb, mrb_value obj, const char *s, long len, mrb_e options & ARG_REG_OPTION_MASK, err, sourcefile, sourceline); if (!re->ptr) return -1; - re->src = mrb_str_ptr(mrb_enc_str_new(mrb, s, len, enc)); + re->src = mrb_str_ptr(mrb_str_new(mrb, s, len)); return 0; } @@ -1217,8 +696,8 @@ mrb_reg_initialize_str(mrb_state *mrb, mrb_value obj, mrb_value str, int options const char *sourcefile, int sourceline) { int ret; - mrb_encoding *enc = mrb_enc_get(mrb, str); +#if 0 if (options & ARG_ENCODING_NONE) { mrb_encoding *ascii8bit = mrb_ascii8bit_encoding(mrb); if (enc != ascii8bit) { @@ -1230,8 +709,9 @@ mrb_reg_initialize_str(mrb_state *mrb, mrb_value obj, mrb_value str, int options enc = ascii8bit; } } +#endif - ret = mrb_reg_initialize(mrb, obj, RSTRING_PTR(str), RSTRING_LEN(str), enc, + ret = mrb_reg_initialize(mrb, obj, RSTRING_PTR(str), RSTRING_LEN(str), options, err, sourcefile, sourceline); return ret; @@ -1267,7 +747,6 @@ mrb_reg_initialize_m(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value se onig_errmsg_buffer err = ""; int flags = 0; mrb_value str; - mrb_encoding *enc; const char *ptr; long len; @@ -1286,10 +765,7 @@ mrb_reg_initialize_m(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value se flags = mrb_reg_options(mrb, re); ptr = RREGEXP_SRC_PTR(re); len = RREGEXP_SRC_LEN(re); - enc = mrb_enc_get(mrb, re); - if (mrb_reg_initialize(mrb, self, ptr, len, enc, flags, err, NULL, 0)) { - /*str = mrb_enc_str_new(mrb, ptr, len, enc); - mrb_reg_raise_str(str, flags, err);*/ + if (mrb_reg_initialize(mrb, self, ptr, len, flags, err, NULL, 0)) { printf("mrb_reg_raise_str(str, flags, err);"); } } @@ -1298,12 +774,10 @@ mrb_reg_initialize_m(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value se if (mrb_type(argv[1]) == MRB_TT_FIXNUM) flags = mrb_fixnum(argv[1]); else if (mrb_test(argv[1])) flags = ONIG_OPTION_IGNORECASE; } - enc = 0; if (argc == 3 && !mrb_nil_p(argv[2])) { //char *kcode = StringValuePtr(argv[2]); char *kcode = mrb_string_value_ptr(mrb, argv[2]); if (kcode[0] == 'n' || kcode[0] == 'N') { - enc = mrb_ascii8bit_encoding(mrb); flags |= ARG_ENCODING_NONE; } else { @@ -1314,9 +788,7 @@ mrb_reg_initialize_m(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value se str = argv[0]; //ptr = StringValuePtr(str); ptr = mrb_string_value_ptr(mrb, str); - if (enc - ? mrb_reg_initialize(mrb, self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0) - : mrb_reg_initialize_str(mrb, self, str, flags, err, NULL, 0)) { + if (mrb_reg_initialize_str(mrb, self, str, flags, err, NULL, 0)) { //mrb_reg_raise_str(str, flags, err); } } @@ -1346,7 +818,7 @@ mrb_reg_init_copy(mrb_state *mrb, mrb_value re/*, mrb_value copy*/) mrb_reg_check(mrb, copy); s = RREGEXP_SRC_PTR(copy); len = RREGEXP_SRC_LEN(copy); - if (mrb_reg_initialize(mrb, re, s, len, mrb_enc_get(mrb, copy), mrb_reg_options(mrb, copy), + if (mrb_reg_initialize(mrb, re, s, len, mrb_reg_options(mrb, copy), err, 0/*NULL*/, 0) != 0) { mrb_reg_raise(mrb, s, len, err, re); } @@ -1628,7 +1100,7 @@ mrb_reg_source(mrb_state *mrb, mrb_value re) mrb_value str; mrb_reg_check(mrb, re); - str = mrb_enc_str_new(mrb, RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), mrb_enc_get(mrb, re)); + str = mrb_str_new(mrb, RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re)); return str; } @@ -1757,23 +1229,12 @@ typedef struct { long char_pos; } pair_t; -static int -pair_byte_cmp(const void *pair1, const void *pair2) -{ - long diff = ((pair_t*)pair1)->byte_pos - ((pair_t*)pair2)->byte_pos; - return diff ? diff > 0 ? 1 : -1 : 0; -} - static void update_char_offset(mrb_state *mrb, mrb_value match) { struct rmatch *rm = RMATCH(match)->rmatch; struct re_registers *regs; - int i, num_regs, num_pos; - long c; - char *s, *p, *q; - mrb_encoding *enc; - pair_t *pairs; + int i, num_regs; if (rm->char_offset_updated) return; @@ -1787,55 +1248,12 @@ update_char_offset(mrb_state *mrb, mrb_value match) rm->char_offset_num_allocated = num_regs; } - enc = mrb_enc_get(mrb, mrb_obj_value(RMATCH(match)->str)); - if (mrb_enc_mbmaxlen(enc) == 1) { - for (i = 0; i < num_regs; i++) { - rm->char_offset[i].beg = BEG(i); - rm->char_offset[i].end = END(i); - } - rm->char_offset_updated = 1; - return; - } - - //pairs = ALLOCA_N(pair_t, num_regs*2); - pairs = mrb_malloc(mrb, sizeof(pair_t)*num_regs*2); - - num_pos = 0; for (i = 0; i < num_regs; i++) { - if (BEG(i) < 0) - continue; - pairs[num_pos++].byte_pos = BEG(i); - pairs[num_pos++].byte_pos = END(i); - } - qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp); - - s = p = RMATCH(match)->str->buf; - c = 0; - for (i = 0; i < num_pos; i++) { - q = s + pairs[i].byte_pos; - c += mrb_enc_strlen(p, q, enc); - pairs[i].char_pos = c; - p = q; - } - - for (i = 0; i < num_regs; i++) { - pair_t key, *found; - if (BEG(i) < 0) { - rm->char_offset[i].beg = -1; - rm->char_offset[i].end = -1; - continue; - } - - key.byte_pos = BEG(i); - found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); - rm->char_offset[i].beg = found->char_pos; - - key.byte_pos = END(i); - found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); - rm->char_offset[i].end = found->char_pos; + rm->char_offset[i].beg = BEG(i); + rm->char_offset[i].end = END(i); } - rm->char_offset_updated = 1; + return; } /* 15.2.16.3.2 */ @@ -2235,49 +1653,36 @@ option_to_str(char str[4], int options) #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ static void -mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len, - mrb_encoding *enc, mrb_encoding *resenc) +mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len) { const char *p, *pend; int need_escape = 0; - int c, clen; + int c; p = s; pend = p + len; - if (mrb_enc_asciicompat(mrb, enc)) { - while (p < pend) { - c = mrb_enc_ascget(mrb, p, pend, &clen, enc); - if (c == -1) { - if (enc == resenc) { - p += mbclen(p, pend, enc); - } - else { - need_escape = 1; - break; - } - } - else if (c != '/' && mrb_enc_isprint(c, enc)) { - p += clen; - } - else { - need_escape = 1; - break; - } + while (p < pend) { + c = *p; + if (c == -1) { + p += pend - p; + } + else if (c != '/' && ISPRINT(c)) { + p++; + } + else { + need_escape = 1; + break; } - } - else { - need_escape = 1; } if (!need_escape) { mrb_str_buf_cat(mrb, str, s, len); } else { - int unicode_p = mrb_enc_unicode_p(enc); p = s; while (p<pend) { - c = mrb_enc_ascget(mrb, p, pend, &clen, enc); - if (c == '\\' && p+clen < pend) { - int n = clen + mbclen(p+clen, pend, enc); + c = *p; + if (c == '\\' && p+1 < pend) { + int n = 1 + pend - (p+1); mrb_str_buf_cat(mrb, str, p, n); p += n; continue; @@ -2285,38 +1690,21 @@ mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len, else if (c == '/') { char c = '\\'; mrb_str_buf_cat(mrb, str, &c, 1); - mrb_str_buf_cat(mrb, str, p, clen); - } - else if (c == -1) { - clen = mrb_enc_precise_mbclen(p, pend, enc); - if (!MBCLEN_CHARFOUND_P(clen)) { - c = (unsigned char)*p; - clen = 1; - goto hex; - } - if (resenc) { - unsigned int c = mrb_enc_mbc_to_codepoint(p, pend, enc); - mrb_str_buf_cat_escaped_char(mrb, str, c, unicode_p); - } - else { - clen = MBCLEN_CHARFOUND_LEN(clen); - mrb_str_buf_cat(mrb, str, p, clen); - } + mrb_str_buf_cat(mrb, str, p, 1); } - else if (mrb_enc_isprint(c, enc)) { - mrb_str_buf_cat(mrb, str, p, clen); + else if (ISPRINT(c)) { + mrb_str_buf_cat(mrb, str, p, 1); } - else if (!mrb_enc_isspace(c, enc)) { + else if (!ISSPACE(c)) { char b[8]; - hex: snprintf(b, sizeof(b), "\\x%02X", c); mrb_str_buf_cat(mrb, str, b, 4); } else { - mrb_str_buf_cat(mrb, str, p, clen); + mrb_str_buf_cat(mrb, str, p, 1); } - p += clen; + p++; } } } @@ -2355,7 +1743,6 @@ mrb_reg_to_s(mrb_state *mrb, mrb_value re) mrb_reg_check(mrb, re); memset(optbuf, 0, 5); - mrb_enc_copy(mrb, str, re); options = RREGEXP(re)->ptr->options; ptr = (UChar*)RREGEXP_SRC_PTR(re); len = RREGEXP_SRC_LEN(re); @@ -2399,7 +1786,7 @@ again: ++ptr; len -= 2; - err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT, + err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, NULL); onig_free(rp); } @@ -2419,9 +1806,8 @@ again: } mrb_str_buf_cat(mrb, str, ":", strlen(":")); - mrb_reg_expr_str(mrb, str, (char*)ptr, len, enc, NULL); + mrb_reg_expr_str(mrb, str, (char*)ptr, len); mrb_str_buf_cat(mrb, str, ")", strlen(")")); - mrb_enc_copy(mrb, str, re); return str; } @@ -2663,8 +2049,6 @@ mrb_init_regexp(mrb_state *mrb) mrb_define_const(mrb, s, "MULTILINE", mrb_fixnum_value(ONIG_OPTION_MULTILINE)); mrb_define_const(mrb, s, "FIXEDENCODING", mrb_fixnum_value(ARG_ENCODING_FIXED)); - //mrb_global_variable(®_cache); - s = mrb_define_class(mrb, "MatchData", mrb->object_class); //mrb_undef_method(CLASS_OF(rb_cMatch), "new"); @@ -2705,27 +2089,23 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers { mrb_value val; char *p, *s, *e; - int no, clen; - mrb_encoding *str_enc = mrb_enc_get(mrb, str); - mrb_encoding *src_enc = mrb_enc_get(mrb, src); - int acompat = mrb_enc_asciicompat(mrb, str_enc); -#define ASCGET(mrb,s,e,cl) (acompat ? (*cl=1,ISASCII(s[0])?s[0]:-1) : mrb_enc_ascget(mrb, s, e, cl, str_enc)) struct RString *ps = mrb_str_ptr(str); + int no; val.tt = 0; p = s = ps->buf; e = s + ps->len; while (s < e) { - int c = ASCGET(mrb, s, e, &clen); + int c = *s; char *ss; if (c == -1) { - s += mbclen(s, e, str_enc); + s += e - s; continue; } ss = s; - s += clen; + s++; if (c != '\\' || s == e) continue; @@ -2733,16 +2113,16 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers if (!val.tt) { val = mrb_str_buf_new(mrb, ss-p); } - mrb_enc_str_buf_cat(mrb, val, p, ss-p, str_enc); + mrb_str_buf_cat(mrb, val, p, ss-p); - c = ASCGET(mrb, s, e, &clen); + c = *s; if (c == -1) { - s += mbclen(s, e, str_enc); - mrb_enc_str_buf_cat(mrb, val, ss, s-ss, str_enc); + s += e - s; + mrb_str_buf_cat(mrb, val, ss, s-ss); p = s; continue; } - s += clen; + s++; p = s; switch (c) { @@ -2757,18 +2137,18 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers break; case 'k': - if (s < e && ASCGET(mrb, s, e, &clen) == '<') { + if (s < e && *s == '<') { char *name, *name_end; - name_end = name = s + clen; + name_end = name = s + 1; while (name_end < e) { - c = ASCGET(mrb, name_end, e, &clen); + c = *name_end; if (c == '>') break; - name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen; + name_end += c == -1 ? e - name_end : 1; } if (name_end < e) { no = name_to_backref_number(mrb, regs, RREGEXP(regexp), name, name_end); - p = s = name_end + clen; + p = s = name_end + 1; break; } else { @@ -2776,7 +2156,7 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers } } - mrb_enc_str_buf_cat(mrb, val, ss, s-ss, str_enc); + mrb_str_buf_cat(mrb, val, ss, s-ss); continue; case '0': @@ -2785,11 +2165,11 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers break; case '`': - mrb_enc_str_buf_cat(mrb, val, RSTRING_PTR(src), BEG(0), src_enc); + mrb_str_buf_cat(mrb, val, RSTRING_PTR(src), BEG(0)); continue; case '\'': - mrb_enc_str_buf_cat(mrb, val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc); + mrb_str_buf_cat(mrb, val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0)); continue; case '+': @@ -2799,31 +2179,29 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers break; case '\\': - mrb_enc_str_buf_cat(mrb, val, s-clen, clen, str_enc); + mrb_str_buf_cat(mrb, val, s-1, 1); continue; default: - mrb_enc_str_buf_cat(mrb, val, ss, s-ss, str_enc); + mrb_str_buf_cat(mrb, val, ss, s-ss); continue; } if (no >= 0) { if (no >= regs->num_regs) continue; if (BEG(no) == -1) continue; - mrb_enc_str_buf_cat(mrb, val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc); + mrb_str_buf_cat(mrb, val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no)); } } /* while (s < e) { */ if (!val.tt) return str; if (p < e) { - mrb_enc_str_buf_cat(mrb, val, p, e-p, str_enc); + mrb_str_buf_cat(mrb, val, p, e-p); } return val; } -//#define NEW_NODE(t,a0,a1,a2) mrb_node_newnode((t),(int)(a0),(int)(a1),(int)(a2)) -//#define NEW_IF(c,t,e) NEW_NODE(NODE_IF,c,t,e) static inline NODE * lfp_svar_place(mrb_state *mrb, /*mrb_thread_t *th,*/ mrb_value *lfp) { @@ -3038,9 +2416,6 @@ mrb_memsearch(mrb_state *mrb, const void *x0, int m, const void *y0, int n, mrb_ } return -1; } - else if (enc == mrb_utf8_encoding(mrb)) { - return mrb_memsearch_qs_utf8(x0, m, y0, n); - } else { return mrb_memsearch_qs(x0, m, y0, n); } @@ -3077,12 +2452,7 @@ mrb_reg_new_str(mrb_state *mrb, mrb_value s, int options) mrb_value mrb_reg_regcomp(mrb_state *mrb, mrb_value str) { - mrb_value save_str = str; - if (reg_cache.tt && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str) - && ENCODING_GET(mrb, reg_cache) == ENCODING_GET(mrb, str) - && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0) - return reg_cache; - return reg_cache = mrb_reg_new_str(mrb, save_str, 0); + return mrb_reg_new_str(mrb, str, 0); } int @@ -3143,7 +2513,7 @@ is_special_global_name(const char *m, const char *e, mrb_encoding *enc) ++m; if (m < e && is_identchar(m, e, enc)) { if (!ISASCII(*m)) mb = 1; - m += mrb_enc_mbclen(m, e, enc); + m += e - m; } break; default: @@ -3228,7 +2598,7 @@ mrb_enc_symname2_p(const char *name, long len, mrb_encoding *enc) id: if (m >= e || (*m != '_' && !mrb_enc_isalpha(*m, enc) && ISASCII(*m))) return FALSE; - while (m < e && is_identchar(m, e, enc)) m += mrb_enc_mbclen(m, e, enc); + while (m < e && is_identchar(m, e, enc)) m += e - m; if (localid) { switch (*m) { case '!': case '?': case '=': ++m; |
