diff options
| author | Paolo Bosetti <[email protected]> | 2012-05-31 18:52:33 -0700 |
|---|---|---|
| committer | Paolo Bosetti <[email protected]> | 2012-05-31 18:52:33 -0700 |
| commit | 9c0bfd343679fcd84090b7611ed582ae31e0e3b9 (patch) | |
| tree | 75e6ac394862821a0e466ccfee361655c40ae749 /src | |
| parent | 6dbba7b799e0cf1a86ec86f347bbc1b40420d372 (diff) | |
| parent | 8180fee1808c56048b9fa18a8dd16014e694e48e (diff) | |
| download | mruby-9c0bfd343679fcd84090b7611ed582ae31e0e3b9.tar.gz mruby-9c0bfd343679fcd84090b7611ed582ae31e0e3b9.zip | |
Merge branch 'master' of git://github.com/mruby/mruby into XCode
Diffstat (limited to 'src')
| -rw-r--r-- | src/array.c | 107 | ||||
| -rw-r--r-- | src/ascii.c | 96 | ||||
| -rw-r--r-- | src/class.c | 53 | ||||
| -rw-r--r-- | src/codegen.c | 25 | ||||
| -rw-r--r-- | src/encoding.c | 1685 | ||||
| -rw-r--r-- | src/encoding.h | 9 | ||||
| -rw-r--r-- | src/gc.c | 30 | ||||
| -rw-r--r-- | src/hash.c | 115 | ||||
| -rw-r--r-- | src/init.c | 2 | ||||
| -rw-r--r-- | src/kernel.c | 8 | ||||
| -rw-r--r-- | src/load.c | 1 | ||||
| -rw-r--r-- | src/object.c | 19 | ||||
| -rw-r--r-- | src/re.c | 793 | ||||
| -rw-r--r-- | src/sprintf.c | 34 | ||||
| -rw-r--r-- | src/string.c | 2544 | ||||
| -rw-r--r-- | src/struct.c | 2 | ||||
| -rw-r--r-- | src/symbol.c | 148 | ||||
| -rw-r--r-- | src/transcode.c | 4386 | ||||
| -rw-r--r-- | src/transcode_data.h | 109 | ||||
| -rw-r--r-- | src/unicode.c | 2607 | ||||
| -rw-r--r-- | src/us_ascii.c | 34 | ||||
| -rw-r--r-- | src/utf_8.c | 460 |
22 files changed, 653 insertions, 12614 deletions
diff --git a/src/array.c b/src/array.c index 7b486430f..187a8404d 100644 --- a/src/array.c +++ b/src/array.c @@ -10,9 +10,6 @@ #include "mruby/string.h" #include "mruby/class.h" -mrb_value mrb_exec_recursive_paired(mrb_state *mrb, mrb_value (*func) (mrb_state *, mrb_value, mrb_value, int), - mrb_value obj, mrb_value paired_obj, void* arg); - //#define ARY_DEFAULT_LEN 16 #define ARY_DEFAULT_LEN 4 #define ARY_SHRINK_RATIO 5 /* must be larger than 2 */ @@ -30,8 +27,8 @@ ary_elt(mrb_value ary, long offset) return RARRAY_PTR(ary)[offset]; } -mrb_value -mrb_ary_new_capa(mrb_state *mrb, size_t capa) +static struct RArray* +ary_new_capa(mrb_state *mrb, size_t capa) { struct RArray *a; size_t blen; @@ -55,6 +52,13 @@ mrb_ary_new_capa(mrb_state *mrb, size_t capa) a->capa = capa; a->len = 0; + return a; +} + +mrb_value +mrb_ary_new_capa(mrb_state *mrb, size_t capa) +{ + struct RArray *a = ary_new_capa(mrb, capa); return mrb_obj_value(a); } @@ -65,7 +69,7 @@ mrb_ary_new(mrb_state *mrb) } mrb_value -mrb_ary_new_from_values(mrb_state *mrb, mrb_value *vals, size_t size) +mrb_ary_new_from_values(mrb_state *mrb, size_t size, mrb_value *vals) { mrb_value ary; struct RArray *a; @@ -84,7 +88,7 @@ mrb_assoc_new(mrb_state *mrb, mrb_value car, mrb_value cdr) mrb_value arv[2]; arv[0] = car; arv[1] = cdr; - return mrb_ary_new_from_values(mrb, arv, 2); + return mrb_ary_new_from_values(mrb, 2, arv); } void @@ -156,7 +160,7 @@ mrb_ary_s_create(mrb_state *mrb, mrb_value self) int len; mrb_get_args(mrb, "*", &vals, &len); - return mrb_ary_new_from_values(mrb, vals, (size_t)len); + return mrb_ary_new_from_values(mrb, (size_t)len, vals); } void @@ -200,25 +204,6 @@ mrb_ary_plus(mrb_state *mrb, mrb_value self) return ary; } -static mrb_value -recursive_cmp(mrb_state *mrb, mrb_value ary1, mrb_value ary2, int recur) -{ - long i, len; - - if (recur) return mrb_undef_value(); /* Subtle! 
*/ - len = RARRAY_LEN(ary1); - if (len > RARRAY_LEN(ary2)) { - len = RARRAY_LEN(ary2); - } - - for (i=0; i<len; i++) { - mrb_value r = mrb_funcall(mrb, ary_elt(ary1, i), "<=>", 1, ary_elt(ary2, i)); - if (mrb_type(r) != MRB_TT_FIXNUM || mrb_fixnum(r) != 0) return r; - } - - return mrb_undef_value(); -} - /* * call-seq: * ary <=> other_ary -> -1, 0, +1 or nil @@ -242,15 +227,23 @@ mrb_ary_cmp(mrb_state *mrb, mrb_value ary1) { mrb_value ary2; struct RArray *a1, *a2; - mrb_value r; - long len; + mrb_value r = mrb_nil_value(); + long i, len; mrb_get_args(mrb, "o", &ary2); if (mrb_type(ary2) != MRB_TT_ARRAY) return mrb_nil_value(); a1 = RARRAY(ary1); a2 = RARRAY(ary2); if (a1->len == a2->len && a1->buf == a2->buf) return mrb_fixnum_value(0); - r = mrb_exec_recursive_paired(mrb, recursive_cmp, ary1, ary2, &ary2); - if (mrb_type(r) != MRB_TT_UNDEF) return r; + else { + len = RARRAY_LEN(ary1); + if (len > RARRAY_LEN(ary2)) { + len = RARRAY_LEN(ary2); + } + for (i=0; i<len; i++) { + r = mrb_funcall(mrb, ary_elt(ary1, i), "<=>", 1, ary_elt(ary2, i)); + if (mrb_type(r) != MRB_TT_FIXNUM || mrb_fixnum(r) != 0) return r; + } + } len = a1->len - a2->len; return mrb_fixnum_value((len == 0)? 0: (len > 0)? 
1: -1); } @@ -569,7 +562,7 @@ mrb_ary_aget(mrb_state *mrb, mrb_value self) if ((len = mrb_fixnum(argv[0])) < 0) return mrb_nil_value(); if (a->len == (size_t)index) return mrb_ary_new(mrb); if ((size_t)len > a->len - index) len = a->len - index; - return mrb_ary_new_from_values(mrb, a->buf + index, len); + return mrb_ary_new_from_values(mrb, len, a->buf + index); default: mrb_raise(mrb, E_ARGUMENT_ERROR, "wrong number of arguments"); @@ -654,7 +647,7 @@ mrb_ary_first(mrb_state *mrb, mrb_value self) /* len == 1 */ size = mrb_fixnum(*vals); if (size > a->len) size = a->len; - return mrb_ary_new_from_values(mrb, a->buf, size); + return mrb_ary_new_from_values(mrb, size, a->buf); } mrb_value @@ -676,7 +669,7 @@ mrb_ary_last(mrb_state *mrb, mrb_value self) /* len == 1 */ size = mrb_fixnum(*vals); if (size > a->len) size = a->len; - return mrb_ary_new_from_values(mrb, a->buf + a->len - size, size); + return mrb_ary_new_from_values(mrb, size, a->buf + a->len - size); } mrb_value @@ -716,7 +709,7 @@ mrb_ary_splat(mrb_state *mrb, mrb_value v) return v; } else { - return mrb_ary_new_from_values(mrb, &v, 1); + return mrb_ary_new_from_values(mrb, 1, &v); } } @@ -923,19 +916,6 @@ mrb_ary_join_m(mrb_state *mrb, mrb_value ary) return mrb_ary_join(mrb, ary, sep); } -static mrb_value -recursive_equal(mrb_state *mrb, mrb_value ary1, mrb_value ary2, int recur) -{ - long i; - - if (recur) return mrb_true_value(); /* Subtle! 
*/ - for (i=0; i<RARRAY_LEN(ary1); i++) { - if (!mrb_equal(mrb, ary_elt(ary1, i), ary_elt(ary2, i))) - return mrb_false_value(); - } - return mrb_true_value(); -} - /* 15.2.12.5.33 (x) */ /* * call-seq: @@ -970,20 +950,15 @@ mrb_ary_equal(mrb_state *mrb, mrb_value ary1) } } if (RARRAY_LEN(ary1) != RARRAY_LEN(ary2)) return mrb_false_value(); - return mrb_exec_recursive_paired(mrb, recursive_equal, ary1, ary2, &ary2); -} - -static mrb_value -recursive_eql(mrb_state *mrb, mrb_value ary1, mrb_value ary2, int recur) -{ - long i; + else { + long i; - if (recur) return mrb_true_value(); /* Subtle! */ - for (i=0; i<RARRAY_LEN(ary1); i++) { - if (!mrb_eql(mrb, ary_elt(ary1, i), ary_elt(ary2, i))) - return mrb_false_value(); + for (i=0; i<RARRAY_LEN(ary1); i++) { + if (!mrb_equal(mrb, ary_elt(ary1, i), ary_elt(ary2, i))) + return mrb_false_value(); + } + return mrb_true_value(); } - return mrb_true_value(); } /* 15.2.12.5.34 (x) */ @@ -1001,10 +976,18 @@ mrb_ary_eql(mrb_state *mrb, mrb_value ary1) mrb_value ary2; mrb_get_args(mrb, "o", &ary2); - if (mrb_obj_equal(mrb, ary1,ary2)) return mrb_true_value(); + if (mrb_obj_equal(mrb, ary1, ary2)) return mrb_true_value(); if (mrb_type(ary2) != MRB_TT_ARRAY) return mrb_false_value(); if (RARRAY_LEN(ary1) != RARRAY_LEN(ary2)) return mrb_false_value(); - return mrb_exec_recursive_paired(mrb, recursive_eql, ary1, ary2, &ary2); + else { + long i; + + for (i=0; i<RARRAY_LEN(ary1); i++) { + if (!mrb_eql(mrb, ary_elt(ary1, i), ary_elt(ary2, i))) + return mrb_false_value(); + } + return mrb_true_value(); + } } void diff --git a/src/ascii.c b/src/ascii.c deleted file mode 100644 index 91bd54073..000000000 --- a/src/ascii.c +++ /dev/null @@ -1,96 +0,0 @@ -/********************************************************************** - ascii.c - Oniguruma (regular expression library) -**********************************************************************/ -/*- - * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> - * All rights 
reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include "mruby.h" -#ifdef INCLUDE_ENCODING -#include "regenc.h" - -OnigEncodingDefine(ascii, ASCII) = { - onigenc_single_byte_mbc_enc_len, - "ASCII-8BIT",/* name */ - 1, /* max byte length */ - 1, /* min byte length */ - onigenc_is_mbc_newline_0x0a, - onigenc_single_byte_mbc_to_code, - onigenc_single_byte_code_to_mbclen, - onigenc_single_byte_code_to_mbc, - onigenc_ascii_mbc_case_fold, - onigenc_ascii_apply_all_case_fold, - onigenc_ascii_get_case_fold_codes_by_str, - onigenc_minimum_property_name_to_ctype, - onigenc_ascii_is_code_ctype, - onigenc_not_support_get_ctype_code_range, - onigenc_single_byte_left_adjust_char_head, - onigenc_always_true_is_allowed_reverse_match -}; -ENC_ALIAS("BINARY", "ASCII-8BIT") -ENC_REPLICATE("IBM437", "ASCII-8BIT") -ENC_ALIAS("CP437", "IBM437") -ENC_REPLICATE("IBM737", "ASCII-8BIT") -ENC_ALIAS("CP737", "IBM737") -ENC_REPLICATE("IBM775", "ASCII-8BIT") -ENC_ALIAS("CP775", "IBM775") -ENC_REPLICATE("CP850", "ASCII-8BIT") -ENC_ALIAS("IBM850", "CP850") -ENC_REPLICATE("IBM852", "ASCII-8BIT") -ENC_REPLICATE("CP852", "IBM852") -ENC_REPLICATE("IBM855", "ASCII-8BIT") -ENC_REPLICATE("CP855", "IBM855") -ENC_REPLICATE("IBM857", "ASCII-8BIT") -ENC_ALIAS("CP857", "IBM857") -ENC_REPLICATE("IBM860", "ASCII-8BIT") -ENC_ALIAS("CP860", "IBM860") -ENC_REPLICATE("IBM861", "ASCII-8BIT") -ENC_ALIAS("CP861", "IBM861") -ENC_REPLICATE("IBM862", "ASCII-8BIT") -ENC_ALIAS("CP862", "IBM862") -ENC_REPLICATE("IBM863", "ASCII-8BIT") -ENC_ALIAS("CP863", "IBM863") -ENC_REPLICATE("IBM864", "ASCII-8BIT") -ENC_ALIAS("CP864", "IBM864") -ENC_REPLICATE("IBM865", "ASCII-8BIT") -ENC_ALIAS("CP865", "IBM865") -ENC_REPLICATE("IBM866", "ASCII-8BIT") -ENC_ALIAS("CP866", "IBM866") -ENC_REPLICATE("IBM869", "ASCII-8BIT") -ENC_ALIAS("CP869", "IBM869") -ENC_REPLICATE("Windows-1258", "ASCII-8BIT") -ENC_ALIAS("CP1258", "Windows-1258") -ENC_REPLICATE("GB1988", "ASCII-8BIT") -ENC_REPLICATE("macCentEuro", "ASCII-8BIT") -ENC_REPLICATE("macCroatian", "ASCII-8BIT") 
-ENC_REPLICATE("macCyrillic", "ASCII-8BIT") -ENC_REPLICATE("macGreek", "ASCII-8BIT") -ENC_REPLICATE("macIceland", "ASCII-8BIT") -ENC_REPLICATE("macRoman", "ASCII-8BIT") -ENC_REPLICATE("macRomania", "ASCII-8BIT") -ENC_REPLICATE("macThai", "ASCII-8BIT") -ENC_REPLICATE("macTurkish", "ASCII-8BIT") -ENC_REPLICATE("macUkraine", "ASCII-8BIT") -#endif //INCLUDE_ENCODING diff --git a/src/class.c b/src/class.c index f96922f4b..b13ab2288 100644 --- a/src/class.c +++ b/src/class.c @@ -17,12 +17,6 @@ #include "mruby/khash.h" -#ifdef INCLUDE_REGEXP - #define mrb_usascii_str_new2 mrb_usascii_str_new_cstr -#else - #define mrb_usascii_str_new2 mrb_str_new_cstr -#endif - KHASH_MAP_INIT_INT(mt, struct RProc*); KHASH_MAP_INIT_INT(iv, mrb_value); @@ -1052,7 +1046,7 @@ mrb_mod_to_s(mrb_state *mrb, mrb_value klass) { //if (FL_TEST(klass, FL_SINGLETON)) { if (mrb_type(klass) == MRB_TT_SCLASS) { - mrb_value s = mrb_usascii_str_new2(mrb, "#<"); + mrb_value s = mrb_str_new_cstr(mrb, "#<"); mrb_value v = mrb_iv_get(mrb, klass, mrb_intern(mrb, "__attached__")); mrb_str_cat2(mrb, s, "Class:"); @@ -1140,6 +1134,48 @@ mrb_mod_undef(mrb_state *mrb, mrb_value mod) return mrb_nil_value(); } +static mrb_sym +mrb_sym_value(mrb_state *mrb, mrb_value val) +{ + if(val.tt == MRB_TT_STRING) { + return mrb_intern(mrb, RSTRING_PTR(val)); + } + else if(val.tt != MRB_TT_SYMBOL) { + mrb_value obj = mrb_funcall(mrb, val, "inspect", 0); + mrb_raise(mrb, E_TYPE_ERROR, "%s is not a symbol", + mrb_string_value_ptr(mrb, obj)); + } + return mrb_symbol(val); +} + +mrb_value +mrb_mod_const_defined(mrb_state *mrb, mrb_value mod) +{ + mrb_value sym; + mrb_get_args(mrb, "o", &sym); + if(mrb_const_defined(mrb, mod, mrb_sym_value(mrb, sym))) { + return mrb_true_value(); + } + return mrb_false_value(); +} + +mrb_value +mrb_mod_const_get(mrb_state *mrb, mrb_value mod) +{ + mrb_value sym; + mrb_get_args(mrb, "o", &sym); + return mrb_const_get(mrb, mod, mrb_sym_value(mrb, sym)); +} + +mrb_value +mrb_mod_const_set(mrb_state *mrb, 
mrb_value mod) +{ + mrb_value sym, value; + mrb_get_args(mrb, "oo", &sym, &value); + mrb_const_set(mrb, mod, mrb_sym_value(mrb, sym), value); + return value; +} + static mrb_value mrb_mod_eqq(mrb_state *mrb, mrb_value mod) @@ -1197,6 +1233,9 @@ mrb_init_class(mrb_state *mrb) mrb_define_method(mrb, mod, "to_s", mrb_mod_to_s, ARGS_NONE()); mrb_define_method(mrb, mod, "alias_method", mrb_mod_alias, ARGS_ANY()); mrb_define_method(mrb, mod, "undef_method", mrb_mod_undef, ARGS_ANY()); + mrb_define_method(mrb, mod, "const_defined?", mrb_mod_const_defined, ARGS_REQ(1)); + mrb_define_method(mrb, mod, "const_get", mrb_mod_const_get, ARGS_REQ(1)); + mrb_define_method(mrb, mod, "const_set", mrb_mod_const_set, ARGS_REQ(2)); mrb_define_method(mrb, mod, "===", mrb_mod_eqq, ARGS_REQ(1)); } diff --git a/src/codegen.c b/src/codegen.c index 11e9eb236..1f4fa818c 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -1225,15 +1225,30 @@ codegen(codegen_scope *s, node *tree, int val) break; case NODE_OP_ASGN: - codegen(s, tree->car, VAL); - codegen(s, tree->cdr->cdr->car, VAL); - genop(s, MKOP_A(OP_LOADNIL, cursp())); - pop(); pop(); { mrb_sym sym = (mrb_sym)tree->cdr->car; const char *name = mrb_sym2name(s->mrb, sym); - int idx = new_msym(s, sym); + int idx; + + codegen(s, tree->car, VAL); + if ((name[0] == '|' && strlen(name) == 2 && name[1] == '|') || + (name[0] == '&' && strlen(name) == 2 && name[1] == '&')) { + int pos; + + pop(); + pos = new_label(s); + genop(s, MKOP_AsBx(name[0] == '|' ? 
OP_JMPIF : OP_JMPNOT, cursp(), 0)); + codegen(s, tree->cdr->cdr->car, VAL); + pop(); + gen_assignment(s, tree->car, cursp(), val); + dispatch(s, pos); + break; + } + codegen(s, tree->cdr->cdr->car, VAL); + genop(s, MKOP_A(OP_LOADNIL, cursp())); + pop(); pop(); + idx = new_msym(s, sym); if (name[0] == '+' && strlen(name) == 1) { genop(s, MKOP_ABC(OP_ADD, cursp(), idx, 1)); } diff --git a/src/encoding.c b/src/encoding.c deleted file mode 100644 index 8e4257829..000000000 --- a/src/encoding.c +++ /dev/null @@ -1,1685 +0,0 @@ -/* -** encoding.c - Encoding class -** -** See Copyright Notice in mruby.h -*/ - -#include "mruby.h" -#ifdef INCLUDE_ENCODING -#include <ctype.h> -#ifndef NO_LOCALE_CHARMAP -#ifdef __CYGWIN__ -#include <windows.h> -#endif -#ifdef HAVE_LANGINFO_H -#include <langinfo.h> -#endif -#endif - -#define USE_UPPER_CASE_TABLE - -#include <ctype.h> -#include <stdio.h> -#include "regenc.h" -#include "regint.h" -#include "encoding.h" -#include "st.h" -#include <string.h> -#include "mruby/numeric.h" -#include "mruby/string.h" -#include "mruby/array.h" -#include "mruby/variable.h" -#include "mruby/hash.h" - -#define pprintf printf -#define mrb_warning printf -#define mrb_bug printf -#ifndef INT_MAX -#define INT_MAX 2147483647 -#endif -#define mrb_isascii(c) ((unsigned long)(c) < 128) -#define OBJ_FREEZE(a) -static mrb_sym id_encoding; -//mrb_value mrb_cEncoding; -static mrb_value mrb_encoding_list; - -struct mrb_encoding_entry { - const char *name; - mrb_encoding *enc; - mrb_encoding *base; -}; - -static struct { - struct mrb_encoding_entry *list; - int count; - int size; - st_table *names; -} enc_table; - -void mrb_enc_init(mrb_state *mrb); - -enum { - ENCINDEX_ASCII, - ENCINDEX_UTF_8, - ENCINDEX_US_ASCII, - ENCINDEX_BUILTIN_MAX -}; -#define ENCODING_COUNT ENCINDEX_BUILTIN_MAX -#define ENCODING_NAMELEN_MAX 63 -#define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX) -#define STRCASECMP(s1, s2) (st_strcasecmp(s1, s2)) - -//#define 
BUILTIN_TYPE(x) (int)(((struct RBasic*)(x))->flags & T_MASK) -#ifndef FALSE -#define FALSE 0 -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - -#ifndef OTHER -#define OTHER 2 -#endif - -#define mrb_usascii_str_new2 mrb_usascii_str_new_cstr - -static const struct mrb_data_type encoding_data_type = { - "encoding", 0, -}; -#define is_data_encoding(obj) (DATA_TYPE(obj) == &encoding_data_type) - -// RUBY_IMMEDIATE_MASK = 0x03, -//#define IMMEDIATE_MASK RUBY_IMMEDIATE_MASK -//#define IMMEDIATE_P(x) ((VALUE)(x) & IMMEDIATE_MASK) -//#define SPECIAL_CONST_P(x) (IMMEDIATE_P(x) || !RTEST(x)) - -static mrb_value -enc_new(mrb_state *mrb, mrb_encoding *encoding) -{ - return mrb_obj_value(Data_Wrap_Struct(mrb, ENCODE_CLASS, &encoding_data_type, encoding)); -} - -#define enc_autoload_p(enc) (!mrb_enc_mbmaxlen(enc)) - -#define UNSPECIFIED_ENCODING INT_MAX - - -static mrb_value -mrb_enc_from_encoding_index(mrb_state *mrb, int idx) -{ - mrb_value list, enc; - - if (mrb_nil_p(list = mrb_encoding_list)) { - mrb_bug("mrb_enc_from_encoding_index(%d): no mrb_encoding_list", idx); - } - enc = mrb_ary_ref(mrb, list, idx);//mrb_ary_entry(list, idx); - if (mrb_nil_p(enc)) { - mrb_bug("mrb_enc_from_encoding_index(%d): not created yet", idx); - } - return enc; -} - -mrb_value -mrb_enc_from_encoding(mrb_state *mrb, mrb_encoding *encoding) -{ - int idx; - if (!encoding) return mrb_nil_value(); - idx = ENC_TO_ENCINDEX(encoding); - return mrb_enc_from_encoding_index(mrb, idx); -} - -static int enc_autoload(mrb_state *mrb, mrb_encoding *enc); -static int -check_encoding(mrb_state *mrb, mrb_encoding *enc) -{ - int index = mrb_enc_to_index(enc); - if (mrb_enc_from_index(mrb, index) != enc) - return -1; - if (enc_autoload_p(enc)) { - index = enc_autoload(mrb, enc); - } - return index; -} - -static int -enc_check_encoding(mrb_state *mrb, mrb_value obj) -{ - if (SPECIAL_CONST_P(obj) || !is_data_encoding(obj)) { - return -1; - } - return check_encoding(mrb, RDATA(obj)->data); -} - -static int 
-must_encoding(mrb_state *mrb, mrb_value enc) -{ - int index = enc_check_encoding(mrb, enc); - if (index < 0) { - mrb_raise(mrb, E_TYPE_ERROR, "wrong argument type %s (expected Encoding)", - mrb_obj_classname(mrb, enc)); - } - return index; -} - -int -mrb_to_encoding_index(mrb_state *mrb, mrb_value enc) -{ - int idx; - - idx = enc_check_encoding(mrb, enc); - if (idx >= 0) { - return idx; - } - else if (mrb_nil_p(enc = mrb_check_string_type(mrb, enc))) { - return -1; - } - if (!mrb_enc_asciicompat(mrb, mrb_enc_get(mrb, enc))) { - return -1; - } - //return mrb_enc_find_index(StringValueCStr(enc)); - return mrb_enc_find_index(mrb, mrb_string_value_cstr(mrb, &enc)); - -} - -static mrb_encoding * -to_encoding(mrb_state *mrb, mrb_value enc) -{ - int idx; - - //StringValue(enc); - mrb_string_value(mrb, &enc); - - if (!mrb_enc_asciicompat(mrb, mrb_enc_get(mrb, enc))) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid name encoding (non ASCII)"); - } - //idx = mrb_enc_find_index(StringValueCStr(enc)); - idx = mrb_enc_find_index(mrb, mrb_string_value_cstr(mrb, &enc)); - if (idx < 0) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "unknown encoding name - %s", RSTRING_PTR(enc)); - } - return mrb_enc_from_index(mrb, idx); -} - -mrb_encoding * -mrb_to_encoding(mrb_state *mrb, mrb_value enc) -{ - if (enc_check_encoding(mrb, enc) >= 0) return RDATA(enc)->data; - return to_encoding(mrb, enc); -} - -static int -enc_table_expand(int newsize) -{ - struct mrb_encoding_entry *ent; - int count = newsize; - - if (enc_table.size >= newsize) return newsize; - newsize = (newsize + 7) / 8 * 8; - ent = realloc(enc_table.list, sizeof(*enc_table.list) * newsize); - if (!ent) return -1; - memset(ent + enc_table.size, 0, sizeof(*ent)*(newsize - enc_table.size)); - enc_table.list = ent; - enc_table.size = newsize; - return count; -} - -static int -enc_register_at(mrb_state *mrb, int index, const char *name, mrb_encoding *encoding) -{ - struct mrb_encoding_entry *ent = &enc_table.list[index]; - mrb_value list; - 
mrb_value ref_ary; - - if (!valid_encoding_name_p(name)) return -1; - if (!ent->name) { - ent->name = name = strdup(name); - } - else if (STRCASECMP(name, ent->name)) { - return -1; - } - if (!ent->enc) { - ent->enc = xmalloc(sizeof(mrb_encoding)); - } - if (encoding) { - *ent->enc = *encoding; - } - else { - memset(ent->enc, 0, sizeof(*ent->enc)); - } - encoding = ent->enc; - encoding->name = name; - encoding->ruby_encoding_index = index; - st_insert(enc_table.names, (st_data_t)name, (st_data_t)index); - list = mrb_encoding_list; - //if (list && mrb_nil_p((mrb_ary_ref(mrb, list, index)))) { - if (list.tt) { - ref_ary = mrb_ary_ref(mrb, list, index); - if mrb_nil_p(ref_ary) { - /* initialize encoding data */ - mrb_ary_set(mrb, list, index, enc_new(mrb, encoding));//rb_ary_store(list, index, enc_new(encoding)); - } - } - return index; -} - - -static int -enc_register(mrb_state *mrb, const char *name, mrb_encoding *encoding) -{ - int index = enc_table.count; - - if ((index = enc_table_expand(index + 1)) < 0) return -1; - enc_table.count = index; - return enc_register_at(mrb, index - 1, name, encoding); -} - -static void set_encoding_const(mrb_state *, const char*, mrb_encoding*); -int mrb_enc_registered(const char*); - -static void -enc_check_duplication(mrb_state *mrb, const char *name) -{ - if (mrb_enc_registered(name) >= 0) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "encoding %s is already registered", name); - } -} -static mrb_encoding* -set_base_encoding(int index, mrb_encoding *base) -{ - mrb_encoding *enc = enc_table.list[index].enc; - - enc_table.list[index].base = base; - if (mrb_enc_dummy_p(base)) ENC_SET_DUMMY(enc); - return enc; -} - -int -mrb_enc_replicate(mrb_state *mrb, const char *name, mrb_encoding *encoding) -{ - int idx; - - enc_check_duplication(mrb, name); - idx = enc_register(mrb, name, encoding); - set_base_encoding(idx, encoding); - set_encoding_const(mrb, name, mrb_enc_from_index(mrb, idx)); - return idx; -} - -/* 15.2.40.2.17 */ -/* - * call-seq: 
- * enc.replicate(name) -> encoding - * - * Returns a replicated encoding of _enc_ whose name is _name_. - * The new encoding should have the same byte structure of _enc_. - * If _name_ is used by another encoding, raise ArgumentError. - * - */ -static mrb_value -enc_replicate(mrb_state *mrb, mrb_value encoding) -{ - mrb_value name; - mrb_get_args(mrb, "o", &name); - return mrb_enc_from_encoding_index(mrb, - //mrb_enc_replicate(mrb, StringValueCStr(name), - mrb_enc_replicate(mrb, mrb_string_value_cstr(mrb, &name), - mrb_to_encoding(mrb, encoding))); -} -static int -enc_replicate_with_index(mrb_state *mrb, const char *name, mrb_encoding *origenc, int idx) -{ - if (idx < 0) { - idx = enc_register(mrb, name, origenc); - } - else { - idx = enc_register_at(mrb, idx, name, origenc); - } - if (idx >= 0) { - set_base_encoding(idx, origenc); - set_encoding_const(mrb, name, mrb_enc_from_index(mrb, idx)); - } - return idx; -} -int -mrb_encdb_replicate(mrb_state *mrb, const char *name, const char *orig) -{ - int origidx = mrb_enc_registered(orig); - int idx = mrb_enc_registered(name); - - if (origidx < 0) { - origidx = enc_register(mrb, orig, 0); - } - return enc_replicate_with_index(mrb, name, mrb_enc_from_index(mrb, origidx), idx); -} -int -mrb_define_dummy_encoding(mrb_state *mrb, const char *name) -{ - int index = mrb_enc_replicate(mrb, name, mrb_ascii8bit_encoding(mrb)); - mrb_encoding *enc = enc_table.list[index].enc; - - ENC_SET_DUMMY(enc); - return index; -} - -int -mrb_encdb_dummy(mrb_state *mrb, const char *name) -{ - int index = enc_replicate_with_index(mrb, name, mrb_ascii8bit_encoding(mrb), - mrb_enc_registered(name)); - mrb_encoding *enc = enc_table.list[index].enc; - - ENC_SET_DUMMY(enc); - return index; -} - -/* 15.2.40.2.13 */ -/* - * call-seq: - * enc.dummy? -> true or false - * - * Returns true for dummy encodings. - * A dummy encoding is an encoding for which character handling is not properly - * implemented. - * It is used for stateful encodings. 
- * - * Encoding::ISO_2022_JP.dummy? #=> true - * Encoding::UTF_8.dummy? #=> false - * - */ -static mrb_value -enc_dummy_p(mrb_state *mrb, mrb_value enc) -{ - return ENC_DUMMY_P(enc_table.list[must_encoding(mrb, enc)].enc) ? mrb_true_value() : mrb_false_value(); -} - -/* 15.2.40.2.12 */ -/* - * call-seq: - * enc.ascii_compatible? -> true or false - * - * Returns whether ASCII-compatible or not. - * - * Encoding::UTF_8.ascii_compatible? #=> true - * Encoding::UTF_16BE.ascii_compatible? #=> false - * - */ -static mrb_value -enc_ascii_compatible_p(mrb_state *mrb, mrb_value enc) -{ - return mrb_enc_asciicompat(mrb, enc_table.list[must_encoding(mrb, enc)].enc) ? mrb_true_value() : mrb_false_value(); -} - -static const char * -enc_alias_internal(const char *alias, int idx) -{ - alias = strdup(alias); - st_insert(enc_table.names, (st_data_t)alias, (st_data_t)idx); - return alias; -} - -/* - * Returns 1 when the encoding is Unicode series other than UTF-7 else 0. - */ -int -mrb_enc_unicode_p(mrb_encoding *enc) -{ - const char *name = mrb_enc_name(enc); - return name[0] == 'U' && name[1] == 'T' && name[2] == 'F' && name[4] != '7'; -} - -extern mrb_encoding OnigEncodingUTF_8; -extern mrb_encoding OnigEncodingUS_ASCII; - -void -mrb_enc_init(mrb_state *mrb) -{ - enc_table_expand(ENCODING_COUNT + 1); - if (!enc_table.names) { - enc_table.names = st_init_strcasetable(); - } -#define ENC_REGISTER(enc) enc_register_at(mrb, ENCINDEX_##enc, mrb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc) - ENC_REGISTER(ASCII); - ENC_REGISTER(UTF_8); - ENC_REGISTER(US_ASCII); -#undef ENC_REGISTER - enc_table.count = ENCINDEX_BUILTIN_MAX; -} - -mrb_encoding * -mrb_enc_from_index(mrb_state *mrb, int index) -{ - if (!enc_table.list) { - mrb_enc_init(mrb); - } - if (index < 0 || enc_table.count <= index) { - return 0; - } - return enc_table.list[index].enc; -} - -int -mrb_enc_registered(const char *name) -{ - st_data_t idx = 0; - - if (!name) return -1; - if (!enc_table.list) return -1; - if 
(st_lookup(enc_table.names, (st_data_t)name, &idx)) { - return (int)idx; - } - return -1; -} - -mrb_value -mrb_require_safe(mrb_value fname, int safe) -{ - mrb_value result = mrb_nil_value(); - return result; -} -static int -load_encoding(const char *name) -{ - mrb_value enclib;// = mrb_sprintf("enc/%s.so", name); - //mrb_value verbose;// = ruby_verbose; - //mrb_value debug;// = ruby_debug; - //mrb_value loaded; - char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3; - int idx; - - while (s < e) { - if (!ISALNUM(*s)) *s = '_'; - else if (ISUPPER(*s)) *s = TOLOWER(*s); - ++s; - } - OBJ_FREEZE(enclib); - //ruby_verbose = mrb_false_value(); - //ruby_debug = mrb_false_value(); - //loaded = mrb_protect(require_enc, enclib, 0); - //ruby_verbose = verbose; - //ruby_debug = debug; - //rb_set_errinfo(mrb_nil_value()); - //if (mrb_nil_p(loaded)) return -1; - if ((idx = mrb_enc_registered(name)) < 0) return -1; - if (enc_autoload_p(enc_table.list[idx].enc)) return -1; - return idx; -} - -static int -enc_autoload(mrb_state *mrb, mrb_encoding *enc) -{ - int i; - mrb_encoding *base = enc_table.list[ENC_TO_ENCINDEX(enc)].base; - - if (base) { - i = 0; - do { - if (i >= enc_table.count) return -1; - } while (enc_table.list[i].enc != base && (++i, 1)); - if (enc_autoload_p(base)) { - if (enc_autoload(mrb, base) < 0) return -1; - } - i = ENC_TO_ENCINDEX(enc); - enc_register_at(mrb, i, mrb_enc_name(enc), base); - } - else { - i = load_encoding(mrb_enc_name(enc)); - } - return i; -} - -int -mrb_enc_find_index(mrb_state *mrb, const char *name) -{ - int i = mrb_enc_registered(name); - mrb_encoding *enc; - - if (i < 0) { - i = load_encoding(name); - } - else if (!(enc = mrb_enc_from_index(mrb, i))) { - if (i != UNSPECIFIED_ENCODING) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "encoding %s is not registered", name); - } - } - else if (enc_autoload_p(enc)) { - if (enc_autoload(mrb, enc) < 0) { - //mrb_warn("failed to load encoding (%s); use ASCII-8BIT instead", - printf("failed to load 
encoding (%s); use ASCII-8BIT instead", - name); - return 0; - } - } - return i; -} - -mrb_encoding * -mrb_enc_find(mrb_state *mrb, const char *name) -{ - int idx = mrb_enc_find_index(mrb, name); - if (idx < 0) idx = 0; - return mrb_enc_from_index(mrb, idx); -} - -static inline int -enc_capable(mrb_value obj) -{ - if (SPECIAL_CONST_P(obj)) return (mrb_type(obj) == MRB_TT_SYMBOL); - switch (mrb_type(obj)/*BUILTIN_TYPE(obj)*/) { - case MRB_TT_STRING: - case MRB_TT_REGEX: - case MRB_TT_FILE: - return TRUE; - case MRB_TT_DATA: - if (is_data_encoding(obj)) return TRUE; - default: - return FALSE; - } -} - -mrb_sym -mrb_id_encoding(mrb_state *mrb) -{ - //CONST_ID(id_encoding, "encoding"); - id_encoding = mrb_intern(mrb, "encoding"); - return id_encoding; -} - -int -mrb_enc_get_index(mrb_state *mrb, mrb_value obj) -{ - int i = -1; - mrb_value tmp; - struct RString *ps; - - if (SPECIAL_CONST_P(obj)) { - if (mrb_type(obj) != MRB_TT_SYMBOL) return -1; - //obj = mrb_id2str(SYM2ID(obj)); - obj = mrb_str_new_cstr(mrb, mrb_sym2name(mrb, SYM2ID(obj))); - } - switch (mrb_type(obj)/*BUILTIN_TYPE(obj)*/) { - as_default: - default: - case MRB_TT_STRING: - case MRB_TT_REGEX: - i = (int)ENCODING_GET_INLINED(obj); - ps = mrb_str_ptr(obj); - if (i == ENCODING_INLINE_MAX) { - mrb_value iv; - - //iv = rb_ivar_get(obj, mrb_id_encoding(mrb)); - iv = mrb_iv_get(mrb, obj, mrb_id_encoding(mrb)); - i = mrb_fixnum(iv); - } - break; - - case MRB_TT_FILE: - tmp = mrb_funcall(mrb, obj, "internal_encoding", 0, 0); - if (mrb_nil_p(tmp)) obj = mrb_funcall(mrb, obj, "external_encoding", 0, 0); - else obj = tmp; - if (mrb_nil_p(obj)) break; - case MRB_TT_DATA: - if (is_data_encoding(obj)) { - i = enc_check_encoding(mrb, obj); - } - else { - goto as_default; - } - break; - } - return i; -} - -void -mrb_enc_set_index(mrb_state *mrb, mrb_value obj, int idx) -{ - if (idx < ENCODING_INLINE_MAX) { - ENCODING_SET_INLINED(obj, idx); - return; - } - ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX); - 
//mrb_ivar_set(obj, mrb_id_encoding(mrb), INT2NUM(idx)); - mrb_iv_set(mrb, obj, mrb_id_encoding(mrb), mrb_fixnum_value(idx)); - return; -} - -mrb_value -mrb_enc_associate_index(mrb_state *mrb, mrb_value obj, int idx) -{ -/* enc_check_capable(obj);*/ - if (mrb_enc_get_index(mrb, obj) == idx) - return obj; - if (SPECIAL_CONST_P(obj)) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "cannot set encoding"); - } - if (!ENC_CODERANGE_ASCIIONLY(obj) || - !mrb_enc_asciicompat(mrb, mrb_enc_from_index(mrb, idx))) { - ENC_CODERANGE_CLEAR(obj); - } - mrb_enc_set_index(mrb, obj, idx); - return obj; -} - -mrb_value -mrb_enc_associate(mrb_state *mrb, mrb_value obj, mrb_encoding *enc) -{ - return mrb_enc_associate_index(mrb, obj, mrb_enc_to_index(enc)); -} - -mrb_encoding* -mrb_enc_get(mrb_state *mrb, mrb_value obj) -{ - return mrb_enc_from_index(mrb, mrb_enc_get_index(mrb, obj)); -} - -mrb_encoding* -mrb_enc_check(mrb_state *mrb, mrb_value str1, mrb_value str2) -{ - mrb_encoding *enc = mrb_enc_compatible(mrb, str1, str2); - if (!enc) - mrb_raise(mrb, E_ENCODING_ERROR, "incompatible character encodings: %s and %s", - mrb_enc_name(mrb_enc_get(mrb, str1)), - mrb_enc_name(mrb_enc_get(mrb, str2))); - return enc; -} - -mrb_encoding* -mrb_enc_compatible(mrb_state *mrb, mrb_value str1, mrb_value str2) -{ - int idx1, idx2; - mrb_encoding *enc1, *enc2; - - idx1 = mrb_enc_get_index(mrb, str1); - idx2 = mrb_enc_get_index(mrb, str2); - - if (idx1 < 0 || idx2 < 0) - return 0; - - if (idx1 == idx2) { - return mrb_enc_from_index(mrb, idx1); - } - enc1 = mrb_enc_from_index(mrb, idx1); - enc2 = mrb_enc_from_index(mrb, idx2); - - if (mrb_type(str2) == MRB_TT_STRING && RSTRING_LEN(str2) == 0) - //return (idx1 == ENCINDEX_US_ASCII && mrb_enc_asciicompat(mrb, enc2)) ? enc2 : enc1; - return enc1; - if (mrb_type(str1) == MRB_TT_STRING && RSTRING_LEN(str1) == 0) - //return (idx2 == ENCINDEX_US_ASCII && mrb_enc_asciicompat(mrb, enc1)) ? 
enc1 : enc2; - return enc2; - if (!mrb_enc_asciicompat(mrb, enc1) || !mrb_enc_asciicompat(mrb, enc2)) { - return 0; - } - - /* objects whose encoding is the same of contents */ - //if (mrb_type(str2)/*BUILTIN_TYPE(str2)*/ != MRB_TT_STRING && idx2 == ENCINDEX_US_ASCII) - //return enc1; - //if (mrb_type(str1)/*BUILTIN_TYPE(str1)*/ != MRB_TT_STRING && idx1 == ENCINDEX_US_ASCII) - //return enc2; - - if (mrb_type(str1)/*BUILTIN_TYPE(str1)*/ != MRB_TT_STRING) { - mrb_value tmp = str1; - int idx0 = idx1; - str1 = str2; - str2 = tmp; - idx1 = idx2; - idx2 = idx0; - } - if (mrb_type(str1)/*BUILTIN_TYPE(str1)*/ == MRB_TT_STRING) { - int cr1, cr2; - - cr1 = mrb_enc_str_coderange(mrb, str1); - if (mrb_type(str2)/*BUILTIN_TYPE(str2)*/ == MRB_TT_STRING) { - cr2 = mrb_enc_str_coderange(mrb, str2); - if (cr1 != cr2) { - /* may need to handle ENC_CODERANGE_BROKEN */ - if (cr1 == ENC_CODERANGE_7BIT) return enc2; - if (cr2 == ENC_CODERANGE_7BIT) return enc1; - } - if (cr2 == ENC_CODERANGE_7BIT) { - if (idx1 == ENCINDEX_ASCII) return enc2; - return enc1; - } - } - if (cr1 == ENC_CODERANGE_7BIT) - return enc2; - } - return 0; -} - -void -mrb_enc_copy(mrb_state *mrb, mrb_value obj1, mrb_value obj2) -{ - mrb_enc_associate_index(mrb, obj1, mrb_enc_get_index(mrb, obj2)); -} - - -/* - * call-seq: - * obj.encoding -> encoding - * - * Returns the Encoding object that represents the encoding of obj. 
- */ - -mrb_value -mrb_obj_encoding(mrb_state *mrb, mrb_value obj) -{ - mrb_encoding *enc = mrb_enc_get(mrb, obj); - if (!enc) { - mrb_raise(mrb, E_TYPE_ERROR, "unknown encoding"); - } - return mrb_enc_from_encoding(mrb, enc); -} - -int -mrb_enc_fast_mbclen(const char *p, const char *e, mrb_encoding *enc) -{ - return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); -} - -int -mrb_enc_mbclen(const char *p, const char *e, mrb_encoding *enc) -{ - int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); - if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p) - return MBCLEN_CHARFOUND_LEN(n); - else { - int min = mrb_enc_mbminlen(enc); - return min <= e-p ? min : (int)(e-p); - } -} - -int -mrb_enc_precise_mbclen(const char *p, const char *e, mrb_encoding *enc) -{ - int n; - if (e <= p) - return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); - n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); - if (e-p < n) - return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p)); - return n; -} - -int -mrb_enc_ascget(mrb_state *mrb, const char *p, const char *e, int *len, mrb_encoding *enc) -{ - unsigned int c, l; - if (e <= p) - return -1; - if (mrb_enc_asciicompat(mrb, enc)) { - c = (unsigned char)*p; - if (!ISASCII(c)) - return -1; - if (len) *len = 1; - return c; - } - l = mrb_enc_precise_mbclen(p, e, enc); - if (!MBCLEN_CHARFOUND_P(l)) - return -1; - c = mrb_enc_mbc_to_codepoint(p, e, enc); - if (!mrb_enc_isascii(c, enc)) - return -1; - if (len) *len = l; - return c; -} - -unsigned int -mrb_enc_codepoint_len(mrb_state *mrb, const char *p, const char *e, int *len_p, mrb_encoding *enc) -{ - int r; - if (e <= p) - mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string"); - r = mrb_enc_precise_mbclen(p, e, enc); - if (MBCLEN_CHARFOUND_P(r)) { - if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r); - return mrb_enc_mbc_to_codepoint(p, e, enc); - } - else - mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid byte sequence in %s", mrb_enc_name(enc)); - return 0; -} - -#undef mrb_enc_codepoint 
-unsigned int -mrb_enc_codepoint(mrb_state *mrb, const char *p, const char *e, mrb_encoding *enc) -{ - return mrb_enc_codepoint_len(mrb, p, e, 0, enc); -} - -int -mrb_enc_codelen(mrb_state *mrb, int c, mrb_encoding *enc) -{ - int n = ONIGENC_CODE_TO_MBCLEN(enc,c); - if (n == 0) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid codepoint 0x%x in %s", c, mrb_enc_name(enc)); - } - return n; -} - -int -mrb_enc_toupper(int c, mrb_encoding *enc) -{ - return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c)); -} - -int -mrb_enc_tolower(int c, mrb_encoding *enc) -{ - return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c)); -} - -/* 15.2.40.2.14 */ -/* - * call-seq: - * enc.inspect -> string - * - * Returns a string which represents the encoding for programmers. - * - * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>" - * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>" - */ -static mrb_value -enc_inspect(mrb_state *mrb, mrb_value self) -{ - mrb_value str; - //mrb_value str = mrb_sprintf("#<%s:%s%s>", mrb_obj_classname(mrb, self), - // mrb_enc_name((mrb_encoding*)(DATA_PTR(self))), - // (mrb_fixnum(enc_dummy_p(mrb, self)) ? " (dummy)" : "")); - char buf[256]; - sprintf(buf, "#<%s:%s%s>", mrb_obj_classname(mrb, self), - mrb_enc_name((mrb_encoding*)(DATA_PTR(self))), - (mrb_enc_dummy_p((mrb_encoding*)(DATA_PTR(self))) ? " (dummy)" : "")); - str = mrb_str_new(mrb, buf, strlen(buf)); - ENCODING_CODERANGE_SET(mrb, str, mrb_usascii_encindex(), ENC_CODERANGE_7BIT); - return str; -} - -/* 15.2.40.2.15 */ -/* 15.2.40.2.18 */ -/* - * call-seq: - * enc.name -> string - * - * Returns the name of the encoding. 
- * - * Encoding::UTF_8.name #=> "UTF-8" - */ -static mrb_value -enc_name(mrb_state *mrb, mrb_value self) -{ - return mrb_usascii_str_new2(mrb, mrb_enc_name((mrb_encoding*)DATA_PTR(self))); -} - -struct fn_arg { - mrb_state *mrb; - enum st_retval (*func)(ANYARGS); - void *a; -}; - -static enum st_retval -fn_i(st_data_t key, st_data_t val, st_data_t arg) { - struct fn_arg *a = (struct fn_arg*)arg; - - return (*a->func)(a->mrb, key, val, a->a); -} - -static int -st_foreachNew(mrb_state *mrb, st_table *tbl, enum st_retval (*func)(ANYARGS), void *a) -{ - struct fn_arg arg = { - mrb, - func, - a, - }; - - return st_foreach(tbl, fn_i, (st_data_t)&arg); -} - -static enum st_retval -enc_names_i(mrb_state *mrb, st_data_t name, st_data_t idx, st_data_t args) -{ - mrb_value *arg = (mrb_value*)args; - int iargs = mrb_fixnum(arg[0]); - //if ((int)idx == (int)arg[0]) { - if ((int)idx == iargs) { - mrb_value str = mrb_usascii_str_new2(mrb, (char*)name); - //OBJ_FREEZE(str); - mrb_ary_push(mrb, arg[1], str); - } - return ST_CONTINUE; -} - -/* 15.2.40.2.16 */ -/* - * call-seq: - * enc.names -> array - * - * Returns the list of name and aliases of the encoding. - * - * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J"] - */ -static mrb_value -enc_names(mrb_state *mrb, mrb_value self) -{ - mrb_value args[2]; - - args[0] = mrb_fixnum_value(mrb_to_encoding_index(mrb, self)); - args[1] = mrb_ary_new_capa(mrb, 0);//mrb_ary_new2(0); - st_foreachNew(mrb, enc_table.names, enc_names_i, args); - return args[1]; -} - -/* 15.2.40.2.8 */ -/* - * call-seq: - * Encoding.list -> [enc1, enc2, ...] - * - * Returns the list of loaded encodings. 
- * - * Encoding.list - * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>, - * #<Encoding:ISO-2022-JP (dummy)>] - * - * Encoding.find("US-ASCII") - * #=> #<Encoding:US-ASCII> - * - * Encoding.list - * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>, - * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>] - * - */ -static mrb_value -enc_list(mrb_state *mrb, mrb_value klass) -{ - struct RArray *ar = (struct RArray*)mrb_encoding_list.value.p; - mrb_value ary = mrb_ary_new_capa(mrb, 0);//mrb_ary_new2(0); - //mrb_ary_replace_m(mrb, ary/*, mmrb_encoding_list*/); - mrb_ary_replace(mrb, mrb_ary_ptr(ary), ar->buf, enc_table.count); - return ary; -} - -/* 15.2.40.2.7 */ -/* - * call-seq: - * Encoding.find(string) -> enc - * Encoding.find(symbol) -> enc - * - * Search the encoding with specified <i>name</i>. - * <i>name</i> should be a string or symbol. - * - * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII> - * Encoding.find(:Shift_JIS) #=> #<Encoding:Shift_JIS> - * - * Names which this method accept are encoding names and aliases - * including following special aliases - * - * "external":: default external encoding - * "internal":: default internal encoding - * "locale":: locale encoding - * "filesystem":: filesystem encoding - * - * An ArgumentError is raised when no encoding with <i>name</i>. - * Only <code>Encoding.find("internal")</code> however returns nil - * when no encoding named "internal", in other words, when Ruby has no - * default internal encoding. - */ -static mrb_value -enc_find(mrb_state *mrb, mrb_value klass) -{ - mrb_value enc; - - mrb_get_args(mrb, "o", &enc); - return mrb_enc_from_encoding(mrb, to_encoding(mrb, enc)); -} - -/* 15.2.40.2.2 */ -/* - * call-seq: - * Encoding.compatible?(str1, str2) -> enc or nil - * - * Checks the compatibility of two strings. - * If they are compatible, means concatenatable, - * returns an encoding which the concatenated string will be. - * If they are not compatible, nil is returned. 
- * - * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b") - * #=> #<Encoding:ISO-8859-1> - * - * Encoding.compatible?( - * "\xa1".force_encoding("iso-8859-1"), - * "\xa1\xa1".force_encoding("euc-jp")) - * #=> nil - * - */ -static mrb_value -enc_compatible_p(mrb_state *mrb, mrb_value klass) -{ - mrb_value str1; - mrb_value str2; - mrb_encoding *enc; - - mrb_get_args(mrb, "oo", &str1, &str2); - if (!enc_capable(str1)) return mrb_nil_value(); - if (!enc_capable(str2)) return mrb_nil_value(); - enc = mrb_enc_compatible(mrb, str1, str2); - if (!enc) return mrb_nil_value(); - return mrb_enc_from_encoding(mrb, enc); -} - -/* 15.2.40.2.19 */ -/* :nodoc: */ -static mrb_value -enc_dump(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value self) -{ - //mrb_scan_args(argc, argv, "01", 0); - return enc_name(mrb, self); -} - -/* 15.2.40.2.11 */ -/* :nodoc: */ -static mrb_value -enc_load(mrb_state *mrb, mrb_value klass) -{ - mrb_value str; - - mrb_get_args(mrb, "o", &str); - return enc_find(mrb, str); -} - -mrb_encoding * -mrb_ascii8bit_encoding(mrb_state *mrb) -{ - if (!enc_table.list) { - mrb_enc_init(mrb); - } - return enc_table.list[ENCINDEX_ASCII].enc; -} - -int -mrb_ascii8bit_encindex(void) -{ - return ENCINDEX_ASCII; -} - -mrb_encoding * -mrb_utf8_encoding(mrb_state *mrb) -{ - if (!enc_table.list) { - mrb_enc_init(mrb); - } - return enc_table.list[ENCINDEX_UTF_8].enc; -} - -int -mrb_utf8_encindex(void) -{ - return ENCINDEX_UTF_8; -} - -mrb_encoding * -mrb_usascii_encoding(mrb_state *mrb) -{ - if (!enc_table.list) { - mrb_enc_init(mrb); - } - return enc_table.list[ENCINDEX_US_ASCII].enc; -} - -int -mrb_usascii_encindex(void) -{ - return ENCINDEX_US_ASCII; -} - -int -mrb_locale_encindex(mrb_state *mrb) -{ - mrb_value charmap = mrb_locale_charmap(mrb, mrb_obj_value(ENCODE_CLASS)); - int idx; - - if (mrb_nil_p(charmap)) - idx = mrb_usascii_encindex(); - //else if ((idx = mrb_enc_find_index(StringValueCStr(charmap))) < 0) - else if ((idx = 
mrb_enc_find_index(mrb, mrb_string_value_cstr(mrb, &charmap))) < 0) - idx = mrb_ascii8bit_encindex(); - - if (mrb_enc_registered("locale") < 0) enc_alias_internal("locale", idx); - - return idx; -} - -mrb_encoding * -mrb_locale_encoding(mrb_state *mrb) -{ - return mrb_enc_from_index(mrb, mrb_locale_encindex(mrb)); -} - -static int -enc_set_filesystem_encoding(mrb_state *mrb) -{ - int idx; -#if defined NO_LOCALE_CHARMAP - idx = mrb_enc_to_index(mrb_default_external_encoding(mrb)); -#elif defined _WIN32 || defined __CYGWIN__ - char cp[sizeof(int) * 8 / 3 + 4]; - //snprintf(cp, sizeof cp, "CP%d", AreFileApisANSI() ? GetACP() : GetOEMCP()); - idx = mrb_enc_find_index(mrb, cp); - if (idx < 0) idx = mrb_ascii8bit_encindex(); -#else - idx = mrb_enc_to_index(mrb_default_external_encoding(mrb)); -#endif - - enc_alias_internal("filesystem", idx); - return idx; -} - -int -mrb_filesystem_encindex(void) -{ - int idx = mrb_enc_registered("filesystem"); - if (idx < 0) - idx = mrb_ascii8bit_encindex(); - return idx; -} - -mrb_encoding * -mrb_filesystem_encoding(mrb_state *mrb) -{ - return mrb_enc_from_index(mrb, mrb_filesystem_encindex()); -} - -struct default_encoding { - int index; /* -2 => not yet set, -1 => nil */ - mrb_encoding *enc; -}; - -static struct default_encoding default_external = {0}; - -static int -enc_set_default_encoding(mrb_state *mrb, struct default_encoding *def, mrb_value encoding, const char *name) -{ - int overridden = FALSE; - - if (def->index != -2) - /* Already set */ - overridden = TRUE; - - if (mrb_nil_p(encoding)) { - def->index = -1; - def->enc = 0; - st_insert(enc_table.names, (st_data_t)strdup(name), - (st_data_t)UNSPECIFIED_ENCODING); - } - else { - def->index = mrb_enc_to_index(mrb_to_encoding(mrb, encoding)); - def->enc = 0; - enc_alias_internal(name, def->index); - } - - if (def == &default_external) - enc_set_filesystem_encoding(mrb); - - return overridden; -} - -mrb_encoding * -mrb_default_external_encoding(mrb_state *mrb) -{ - if 
(default_external.enc) return default_external.enc; - - if (default_external.index >= 0) { - default_external.enc = mrb_enc_from_index(mrb, default_external.index); - return default_external.enc; - } - else { - return mrb_locale_encoding(mrb); - } -} - -mrb_value -mrb_enc_default_external(mrb_state *mrb) -{ - return mrb_enc_from_encoding(mrb, mrb_default_external_encoding(mrb)); -} - -/* 15.2.40.2.3 */ -/* - * call-seq: - * Encoding.default_external -> enc - * - * Returns default external encoding. - * - * It is initialized by the locale or -E option. - */ -static mrb_value -get_default_external(mrb_state *mrb, mrb_value klass) -{ - return mrb_enc_default_external(mrb); -} - -void -mrb_enc_set_default_external(mrb_state *mrb, mrb_value encoding) -{ - if (mrb_nil_p(encoding)) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "default external can not be nil"); - } - enc_set_default_encoding(mrb, &default_external, encoding, - "external"); -} - -/* 15.2.40.2.4 */ -/* - * call-seq: - * Encoding.default_external = enc - * - * Sets default external encoding. - */ -static mrb_value -set_default_external(mrb_state *mrb, mrb_value klass) -{ - mrb_value encoding; - - mrb_get_args(mrb, "o", &encoding); - mrb_warning("setting Encoding.default_external"); - mrb_enc_set_default_external(mrb, encoding); - return encoding; -} - -static struct default_encoding default_internal = {-2}; - -mrb_encoding * -mrb_default_internal_encoding(mrb_state *mrb) -{ - if (!default_internal.enc && default_internal.index >= 0) { - default_internal.enc = mrb_enc_from_index(mrb, default_internal.index); - } - return default_internal.enc; /* can be NULL */ -} - -mrb_value -mrb_enc_default_internal(mrb_state *mrb) -{ - /* Note: These functions cope with default_internal not being set */ - return mrb_enc_from_encoding(mrb, mrb_default_internal_encoding(mrb)); -} - -/* 15.2.40.2.5 */ -/* - * call-seq: - * Encoding.default_internal -> enc - * - * Returns default internal encoding. 
- * - * It is initialized by the source internal_encoding or -E option. - */ -static mrb_value -get_default_internal(mrb_state *mrb, mrb_value klass) -{ - return mrb_enc_default_internal(mrb); -} - -void -mrb_enc_set_default_internal(mrb_state *mrb, mrb_value encoding) -{ - enc_set_default_encoding(mrb, &default_internal, encoding, - "internal"); -} - -/* 15.2.40.2.6 */ -/* - * call-seq: - * Encoding.default_internal = enc or nil - * - * Sets default internal encoding. - * Or removes default internal encoding when passed nil. - */ -static mrb_value -set_default_internal(mrb_state *mrb, mrb_value klass) -{ - mrb_value encoding; - - mrb_get_args(mrb, "o", &encoding); - mrb_warning("setting Encoding.default_internal"); - mrb_enc_set_default_internal(mrb, encoding); - return encoding; -} - -#define digit(x) ((x) >= '0' && (x) <= '9') -#ifndef _MSC_VER -#define strstart(s, n) (strncasecmp(s, n, strlen(n)) == 0) -#else -#define strstart(s, n) (_stricmp(s, n) == 0) -#endif -#define C_CODESET "US-ASCII" /* Return this as the encoding of the - * C/POSIX locale. Could as well one day - * become "UTF-8". 
*/ -#if defined _WIN32 || defined __CYGWIN__ -#define JA_CODESET "Windows-31J" -#else -#define JA_CODESET "EUC-JP" -#endif - -static char buf[16]; - -const char * -nl_langinfo_codeset(void) -{ - const char *l, *p; - int n; - - if (((l = getenv("LC_ALL")) && *l) || - ((l = getenv("LC_CTYPE")) && *l) || - ((l = getenv("LANG")) && *l)) { - /* check standardized locales */ - if (!strcmp(l, "C") || !strcmp(l, "POSIX")) - return C_CODESET; - /* check for encoding name fragment */ - p = strchr(l, '.'); - if (!p++) p = l; - if (strstart(p, "UTF")) - return "UTF-8"; - if ((n = 5, strstart(p, "8859-")) || (n = 9, strstart(p, "ISO-8859-"))) { - if (digit(p[n])) { - p += n; - memcpy(buf, "ISO-8859-\0\0", 12); - buf[9] = *p++; - if (digit(*p)) buf[10] = *p++; - return buf; - } - } - if (strstart(p, "KOI8-R")) return "KOI8-R"; - if (strstart(p, "KOI8-U")) return "KOI8-U"; - if (strstart(p, "620")) return "TIS-620"; - if (strstart(p, "2312")) return "GB2312"; - if (strstart(p, "HKSCS")) return "Big5HKSCS"; /* no MIME charset */ - if (strstart(p, "BIG5")) return "Big5"; - if (strstart(p, "GBK")) return "GBK"; /* no MIME charset */ - if (strstart(p, "18030")) return "GB18030"; /* no MIME charset */ - if (strstart(p, "Shift_JIS") || strstart(p, "SJIS")) return "Windows-31J"; - /* check for conclusive modifier */ - if (strstart(p, "euro")) return "ISO-8859-15"; - /* check for language (and perhaps country) codes */ - if (strstart(l, "zh_TW")) return "Big5"; - if (strstart(l, "zh_HK")) return "Big5HKSCS"; /* no MIME charset */ - if (strstart(l, "zh")) return "GB2312"; - if (strstart(l, "ja")) return JA_CODESET; - if (strstart(l, "ko")) return "EUC-KR"; - if (strstart(l, "ru")) return "KOI8-R"; - if (strstart(l, "uk")) return "KOI8-U"; - if (strstart(l, "pl") || strstart(l, "hr") || - strstart(l, "hu") || strstart(l, "cs") || - strstart(l, "sk") || strstart(l, "sl")) return "ISO-8859-2"; - if (strstart(l, "eo") || strstart(l, "mt")) return "ISO-8859-3"; - if (strstart(l, "el")) return 
"ISO-8859-7"; - if (strstart(l, "he")) return "ISO-8859-8"; - if (strstart(l, "tr")) return "ISO-8859-9"; - if (strstart(l, "th")) return "TIS-620"; /* or ISO-8859-11 */ - if (strstart(l, "lt")) return "ISO-8859-13"; - if (strstart(l, "cy")) return "ISO-8859-14"; - if (strstart(l, "ro")) return "ISO-8859-2"; /* or ISO-8859-16 */ - if (strstart(l, "am") || strstart(l, "vi")) return "UTF-8"; - /* Send me further rules if you like, but don't forget that we are - * *only* interested in locale naming conventions on platforms - * that do not already provide an nl_langinfo(CODESET) implementation. */ - } - return NULL; -} - -/* 15.2.40.2.9 */ -/* - * call-seq: - * Encoding.locale_charmap -> string - * - * Returns the locale charmap name. - * - * Debian GNU/Linux - * LANG=C - * Encoding.locale_charmap #=> "ANSI_X3.4-1968" - * LANG=ja_JP.EUC-JP - * Encoding.locale_charmap #=> "EUC-JP" - * - * SunOS 5 - * LANG=C - * Encoding.locale_charmap #=> "646" - * LANG=ja - * Encoding.locale_charmap #=> "eucJP" - * - * The result is highly platform dependent. - * So Encoding.find(Encoding.locale_charmap) may cause an error. - * If you need some encoding object even for unknown locale, - * Encoding.find("locale") can be used. 
- * - */ -mrb_value -mrb_locale_charmap(mrb_state *mrb, mrb_value klass) -{ -#if defined NO_LOCALE_CHARMAP - return mrb_usascii_str_new2(mrb, "ASCII-8BIT"); -#elif defined _WIN32 || defined __CYGWIN__ - const char *nl_langinfo_codeset(void); - const char *codeset = nl_langinfo_codeset(); - char cp[sizeof(int) * 3 + 4]; - if (!codeset) { - //snprintf(cp, sizeof(cp), "CP%d", GetConsoleCP()); - codeset = cp; - } - return mrb_usascii_str_new2(mrb, codeset); -#elif defined HAVE_LANGINFO_H - char *codeset; - codeset = nl_langinfo(CODESET); - return mrb_usascii_str_new2(mrb, codeset); -#else - return mrb_nil_value(); -#endif -} -static void -set_encoding_const(mrb_state *mrb, const char *name, mrb_encoding *enc) -{ - mrb_value encoding = mrb_enc_from_encoding(mrb, enc); - char *s = (char*)name; - int haslower = 0, hasupper = 0, valid = 0; - - if (ISDIGIT(*s)) return; - if (ISUPPER(*s)) { - hasupper = 1; - while (*++s && (ISALNUM(*s) || *s == '_')) { - if (ISLOWER(*s)) haslower = 1; - } - } - if (!*s) { - if (s - name > ENCODING_NAMELEN_MAX) return; - valid = 1; - //mrb_define_const(mrb_cEncoding, name, encoding); - mrb_define_const(mrb, ENCODE_CLASS, name, encoding); - } - if (!valid || haslower) { - size_t len = s - name; - if (len > ENCODING_NAMELEN_MAX) return; - if (!haslower || !hasupper) { - do { - if (ISLOWER(*s)) haslower = 1; - if (ISUPPER(*s)) hasupper = 1; - } while (*++s && (!haslower || !hasupper)); - len = s - name; - } - len += strlen(s); - if (len++ > ENCODING_NAMELEN_MAX) return; - //MEMCPY(s = ALLOCA_N(char, len), name, char, len); - memcpy(s = mrb_malloc(mrb, len), name, len); - name = s; - if (!valid) { - if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s); - for (; *s; ++s) { - if (!ISALNUM(*s)) *s = '_'; - } - if (hasupper) { - mrb_define_const(mrb, ENCODE_CLASS, name, encoding); - } - } - if (haslower) { - for (s = (char*)name; *s; ++s) { - if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s); - } - mrb_define_const(mrb, 
ENCODE_CLASS, name, encoding); - } - } -} -static enum st_retval -mrb_enc_name_list_i(mrb_state *mrb, st_data_t name, st_data_t idx, mrb_value *arg) -{ - mrb_value ary = *arg; - mrb_value str = mrb_usascii_str_new2(mrb, (char*)name); - //OBJ_FREEZE(str); - mrb_ary_push(mrb, ary, str); - return ST_CONTINUE; -} - -/* 15.2.40.2.10 */ -/* - * call-seq: - * Encoding.name_list -> ["enc1", "enc2", ...] - * - * Returns the list of available encoding names. - * - * Encoding.name_list - * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8", - * "ISO-8859-1", "Shift_JIS", "EUC-JP", - * "Windows-31J", - * "BINARY", "CP932", "eucJP"] - * - */ - -static mrb_value -mrb_enc_name_list(mrb_state *mrb, mrb_value klass) -{ - mrb_value ary = mrb_ary_new_capa(mrb, enc_table.names->num_entries);//mrb_ary_new2(enc_table.names->num_entries); - st_foreachNew(mrb, enc_table.names, mrb_enc_name_list_i, &ary); - return ary; -} - -static enum st_retval -mrb_enc_aliases_enc_i(mrb_state *mrb, st_data_t name, st_data_t orig, st_data_t arg) -{ - mrb_value *p = (mrb_value*)arg; - mrb_value aliases = p[0], ary = p[1]; - int idx = (int)orig; - mrb_value key, str = mrb_ary_ref(mrb, ary, idx);//mrb_ary_entry(ary, idx); - - if (mrb_nil_p(str)) { - mrb_encoding *enc = mrb_enc_from_index(mrb, idx); - - if (!enc) return ST_CONTINUE; - if (STRCASECMP((char*)name, mrb_enc_name(enc)) == 0) { - return ST_CONTINUE; - } - str = mrb_usascii_str_new2(mrb, mrb_enc_name(enc)); - OBJ_FREEZE(str); - mrb_ary_set(mrb, ary, idx, str);//rb_ary_store(ary, idx, str); - } - key = mrb_usascii_str_new2(mrb, (char*)name); - OBJ_FREEZE(key); - mrb_hash_set(mrb, aliases, key, str); - return ST_CONTINUE; -} - -/* 15.2.40.2.1 */ -/* - * call-seq: - * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...} - * - * Returns the hash of available encoding alias and original encoding name. 
- * - * Encoding.aliases - * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII", - * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"} - * - */ - -static mrb_value -mrb_enc_aliases(mrb_state *mrb, mrb_value klass) -{ - mrb_value aliases[2]; - aliases[0] = mrb_hash_new_capa(mrb, 0); - aliases[1] = mrb_ary_new(mrb); - st_foreachNew(mrb, enc_table.names, mrb_enc_aliases_enc_i, aliases); - return aliases[0]; -} - -void -mrb_init_encoding(mrb_state *mrb) -{ -#undef mrb_intern -#define mrb_intern(str) mrb_intern_const(str) - mrb_value list; - int i; - struct RClass *s; - - s = mrb_define_class(mrb, "Encoding", mrb->object_class); - //mrb_undef_alloc_func(mrb_cEncoding); - //mrb_undef_method(CLASS_OF(mrb_cEncoding), "new"); - mrb_define_class_method(mrb, s, "aliases", mrb_enc_aliases, ARGS_NONE()); /* 15.2.40.2.1 */ - mrb_define_class_method(mrb, s, "compatible?", enc_compatible_p, ARGS_REQ(2)); /* 15.2.40.2.2 */ - mrb_define_class_method(mrb, s, "default_external", get_default_external, ARGS_NONE()); /* 15.2.40.2.3 */ - mrb_define_class_method(mrb, s, "default_external=", set_default_external, ARGS_REQ(1)); /* 15.2.40.2.4 */ - mrb_define_class_method(mrb, s, "default_internal", get_default_internal, ARGS_NONE()); /* 15.2.40.2.5 */ - mrb_define_class_method(mrb, s, "default_internal=", set_default_internal, ARGS_REQ(1)); /* 15.2.40.2.6 */ - mrb_define_class_method(mrb, s, "find", enc_find, ARGS_REQ(1)); /* 15.2.40.2.7 */ - mrb_define_class_method(mrb, s, "list", enc_list, ARGS_NONE()); /* 15.2.40.2.8 */ - mrb_define_class_method(mrb, s, "locale_charmap", mrb_locale_charmap, ARGS_NONE()); /* 15.2.40.2.9 */ - mrb_define_class_method(mrb, s, "name_list", mrb_enc_name_list, ARGS_NONE()); /* 15.2.40.2.10 */ - mrb_define_class_method(mrb, s, "_load", enc_load, ARGS_REQ(1)); /* 15.2.40.2.11 */ - mrb_define_method(mrb, s, "ascii_compatible?", enc_ascii_compatible_p, ARGS_NONE()); /* 15.2.40.2.12 */ - mrb_define_method(mrb, s, "dummy?", 
enc_dummy_p, ARGS_NONE()); /* 15.2.40.2.13 */ - mrb_define_method(mrb, s, "inspect", enc_inspect, ARGS_NONE()); /* 15.2.40.2.14 */ - mrb_define_method(mrb, s, "name", enc_name, ARGS_NONE()); /* 15.2.40.2.15 */ - mrb_define_method(mrb, s, "names", enc_names, ARGS_NONE()); /* 15.2.40.2.16 */ - mrb_define_method(mrb, s, "replicate", enc_replicate, ARGS_REQ(1)); /* 15.2.40.2.17 */ - mrb_define_method(mrb, s, "to_s", enc_name, ARGS_NONE()); /* 15.2.40.2.18 */ - mrb_define_method(mrb, s, "_dump", enc_dump, ARGS_ANY()); /* 15.2.40.2.19 */ - -/* add kusuda --> */ - if (!enc_table.list) { - mrb_enc_init(mrb); - } -/* add kusuda --< */ - list = mrb_ary_new_capa(mrb, enc_table.count);//mrb_ary_new2(enc_table.count); - RBASIC(list)->c = 0; - mrb_encoding_list = list; - //mrb_gc_register_mark_object(list); - - for (i = 0; i < enc_table.count; ++i) { - mrb_ary_push(mrb, list, enc_new(mrb, enc_table.list[i].enc)); - } -} - -/* locale insensitive functions */ - -#define ctype_test(c, ctype) \ - (mrb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype)) - -int mrb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); } -int mrb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); } -int mrb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); } -int mrb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); } -int mrb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); } -int mrb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); } -int mrb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); } -int mrb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); } -int mrb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); } -int mrb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); } -int mrb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); } -int mrb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); } - -int -mrb_tolower(int c) -{ - return mrb_isascii(c) ? 
ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) : c; -} - -int -mrb_toupper(int c) -{ - return mrb_isascii(c) ? ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) : c; -} -#endif //INCLUDE_ENCODING diff --git a/src/encoding.h b/src/encoding.h index c6c470644..1312fb947 100644 --- a/src/encoding.h +++ b/src/encoding.h @@ -174,11 +174,7 @@ int mrb_enc_codelen(mrb_state *mrb, int code, mrb_encoding *enc); #endif //INCLUDE_ENCODING /* code,ptr,encoding -> write buf */ -#ifdef INCLUDE_ENCODING -#define mrb_enc_mbcput(c,buf,enc) ONIGENC_CODE_TO_MBC(enc,c,(UChar*)(buf)) -#else -#define mrb_enc_mbcput(c,buf,enc) *(buf) = (char)(c) -#endif //INCLUDE_ENCODING +#define mrb_enc_mbcput(c,buf,enc) ((*(buf) = (char)(c)),1) /* start, ptr, end, encoding -> prev_char */ #define mrb_enc_prev_char(s,p,e,enc) (char *)onigenc_get_prev_char_head(enc,(UChar*)(s),(UChar*)(p),(UChar*)(e)) @@ -232,9 +228,6 @@ mrb_value mrb_enc_default_internal(mrb_state *mrb); void mrb_enc_set_default_external(mrb_state *mrb, mrb_value encoding); void mrb_enc_set_default_internal(mrb_state *mrb, mrb_value encoding); mrb_value mrb_locale_charmap(mrb_state *mrb, mrb_value klass); -#ifdef INCLUDE_ENCODING -int mrb_memsearch(mrb_state *mrb, const void*,int,const void*,int,mrb_encoding*); -#endif //INCLUDE_ENCODING mrb_value mrb_usascii_str_new_cstr(mrb_state *mrb, const char *ptr); int mrb_str_buf_cat_escaped_char(mrb_state *mrb, mrb_value result, unsigned int c, int unicode_p); @@ -242,6 +242,24 @@ mrb_init_heap(mrb_state *mrb) #endif } +static void +gc_protect(mrb_state *mrb, struct RBasic *p) +{ + if (mrb->arena_idx > MRB_ARENA_SIZE) { + /* arena overflow error */ + mrb->arena_idx = MRB_ARENA_SIZE - 4; /* force room in arena */ + mrb_raise(mrb, mrb->eRuntimeError_class, "arena overflow error"); + } + mrb->arena[mrb->arena_idx++] = p; +} + +void +mrb_gc_protect(mrb_state *mrb, mrb_value obj) +{ + if (SPECIAL_CONST_P(obj)) return; + gc_protect(mrb, RBASIC(obj)); +} + struct RBasic* mrb_obj_alloc(mrb_state *mrb, enum mrb_vtype ttype, 
struct RClass *cls) { @@ -264,12 +282,7 @@ mrb_obj_alloc(mrb_state *mrb, enum mrb_vtype ttype, struct RClass *cls) } mrb->live++; - if (mrb->arena_idx > MRB_ARENA_SIZE) { - /* arena overflow error */ - mrb->arena_idx = MRB_ARENA_SIZE - 2; /* force room in arena */ - mrb_raise(mrb, mrb->eRuntimeError_class, "arena overflow error"); - } - mrb->arena[mrb->arena_idx++] = p; + gc_protect(mrb, p); memset(p, 0, sizeof(RVALUE)); p->tt = ttype; p->c = cls; @@ -362,9 +375,8 @@ gc_mark_children(mrb_state *mrb, struct RBasic *obj) { struct RString *s = (struct RString*)obj; - while (s->flags & MRB_STR_SHARED) { - s = s->aux.shared; - if (!s) break; + if (s->flags & MRB_STR_SHARED) { + mrb_gc_mark(mrb, (struct RBasic*)s->aux.shared); } } break; diff --git a/src/hash.c b/src/hash.c index 28e718c0d..a06becd91 100644 --- a/src/hash.c +++ b/src/hash.c @@ -11,34 +11,28 @@ #include "mruby/array.h" #include "mruby/string.h" #include "mruby/variable.h" -#include "st.h" -#include <errno.h> #include <string.h> - - #include <stdio.h> -static khint_t +static inline khint_t mrb_hash_ht_hash_func(mrb_state *mrb, mrb_value key) { - char type = mrb_type(key); - mrb_value s1 = mrb_str_new(mrb, &type, 1); - mrb_value s2 = mrb_inspect(mrb, key); - s1 = mrb_str_cat(mrb, s1, RSTRING_PTR(s2), RSTRING_LEN(s2)); - return kh_str_hash_func(mrb, RSTRING_PTR(s1)); + khint_t h = mrb_type(key) << 24; + mrb_value h2; + + h2 = mrb_funcall(mrb, key, "hash", 0, 0); + h ^= h2.value.i; + return h; } -static khint_t +static inline khint_t mrb_hash_ht_hash_equal(mrb_state *mrb, mrb_value a, mrb_value b) { - return mrb_equal(mrb, a, b); + return mrb_eql(mrb, a, b); } KHASH_INIT(ht, mrb_value, mrb_value, 1, mrb_hash_ht_hash_func, mrb_hash_ht_hash_equal); -mrb_value mrb_exec_recursive_paired(mrb_state *mrb, mrb_value (*func) (mrb_state *, mrb_value, mrb_value, int), - mrb_value obj, mrb_value paired_obj, void* arg); - #ifndef FALSE #define FALSE 0 #endif @@ -61,12 +55,11 @@ mrb_hash_ht_key(mrb_state *mrb, mrb_value 
key) #define KEY(key) mrb_hash_ht_key(mrb, key) void -mrb_gc_mark_ht(mrb_state *mrb, struct RHash *c) +mrb_gc_mark_ht(mrb_state *mrb, struct RHash *hash) { khiter_t k; - khash_t(ht) *h = ((struct RHash*)c)->ht; + khash_t(ht) *h = hash->ht; - if (!h) return; for (k = kh_begin(h); k != kh_end(h); k++) if (kh_exist(h, k)) { mrb_gc_mark_value(mrb, kh_key(h, k)); @@ -75,23 +68,15 @@ mrb_gc_mark_ht(mrb_state *mrb, struct RHash *c) } size_t -mrb_gc_mark_ht_size(mrb_state *mrb, struct RHash *c) +mrb_gc_mark_ht_size(mrb_state *mrb, struct RHash *hash) { - size_t ht_size = 0; - khash_t(ht) *h = c->ht; - - /* ((struct RHash*)c)->ht */ - if (h) ht_size += kh_size(h)*2; - - return ht_size; + return kh_size(hash->ht)*2; } void -mrb_gc_free_ht(mrb_state *mrb, struct RHash *c) +mrb_gc_free_ht(mrb_state *mrb, struct RHash *hash) { - khash_t(ht) *h = c->ht; - - kh_destroy(ht, h); + kh_destroy(ht, hash->ht); } @@ -119,11 +104,9 @@ mrb_hash_get(mrb_state *mrb, mrb_value hash, mrb_value key) /* mrb_hash_aref */ khash_t(ht) *h = RHASH_TBL(hash); khiter_t k; - if (h) { - k = kh_get(ht, h, key); - if (k != kh_end(h)) - return kh_value(h, k); - } + k = kh_get(ht, h, key); + if (k != kh_end(h)) + return kh_value(h, k); /* not found */ if (MRB_RHASH_PROCDEFAULT_P(hash)) { @@ -176,21 +159,6 @@ mrb_hash_freeze(mrb_value hash) } mrb_value -mrb_hash(mrb_state *mrb, mrb_value obj) -{ - mrb_value hval = mrb_funcall(mrb, obj, "Hash", 0); -retry: - switch (mrb_type(hval)) { - case MRB_TT_FIXNUM: - return hval; - - default: - hval = mrb_to_int(mrb, hval); - goto retry; - } -} - -mrb_value mrb_hash_dup(mrb_state *mrb, mrb_value hash) { struct RHash* ret; @@ -675,7 +643,7 @@ mrb_hash_values_at(mrb_state *mrb, int argc, mrb_value *argv, mrb_value hash) long i; for (i=0; i<argc; i++) { - mrb_ary_push(mrb, result, KEY(mrb_hash_get(mrb, hash, argv[i]))); + mrb_ary_push(mrb, result, mrb_hash_get(mrb, hash, argv[i])); } return result; } @@ -1136,28 +1104,6 @@ mrb_hash_has_value(mrb_state *mrb, mrb_value 
hash) } static mrb_value -recursive_eql(mrb_state *mrb, mrb_value hash, mrb_value dt, int recur) -{ - khash_t(ht) *h1 = RHASH_TBL(hash); - khash_t(ht) *h2 = RHASH_TBL(dt); - khiter_t k1, k2; - mrb_value key1; - - for (k1 = kh_begin(h1); k1 != kh_end(h1); k1++) { - if (!kh_exist(h1, k1)) continue; - key1 = kh_key(h1,k1); - k2 = kh_get(ht, h2, key1); - if ( k2 != kh_end(h2)) { - if (mrb_equal(mrb, kh_value(h1,k1), kh_value(h2,k2))) { - continue; /* next key */ - } - } - return mrb_false_value(); - } - return mrb_true_value(); -} - -static mrb_value hash_equal(mrb_state *mrb, mrb_value hash1, mrb_value hash2, int eql) { if (mrb_obj_equal(mrb, hash1, hash2)) return mrb_true_value(); @@ -1171,9 +1117,25 @@ hash_equal(mrb_state *mrb, mrb_value hash1, mrb_value hash2, int eql) return mrb_fixnum_value(mrb_equal(mrb, hash2, hash1)); } if (RHASH_SIZE(hash1) != RHASH_SIZE(hash2)) return mrb_false_value(); - if (!RHASH(hash1)->ht || !RHASH(hash2)->ht) return mrb_true_value(); + else { + khash_t(ht) *h1 = RHASH_TBL(hash1); + khash_t(ht) *h2 = RHASH_TBL(hash2); + khiter_t k1, k2; + mrb_value key; - return mrb_exec_recursive_paired(mrb, recursive_eql, hash1, hash2, (void*)0); + for (k1 = kh_begin(h1); k1 != kh_end(h1); k1++) { + if (!kh_exist(h1, k1)) continue; + key = kh_key(h1,k1); + k2 = kh_get(ht, h2, key); + if (k2 != kh_end(h2)) { + if (mrb_equal(mrb, kh_value(h1,k1), kh_value(h2,k2))) { + continue; /* next key */ + } + } + return mrb_false_value(); + } + } + return mrb_true_value(); } /* 15.2.13.4.1 */ @@ -1319,9 +1281,6 @@ mrb_hash_rassoc(mrb_state *mrb, mrb_value hash) mrb_value key, value, has_key; mrb_get_args(mrb, "o", &key); - if (mrb_nil_p(key)) - mrb_raise(mrb, E_ARGUMENT_ERROR, "wrong number of arguments"); - has_key = mrb_hash_has_keyWithKey(mrb, hash, key); if (mrb_test(has_key)) { value = mrb_hash_get(mrb, hash, key); diff --git a/src/init.c b/src/init.c index 5aab8d6ae..17ce24313 100644 --- a/src/init.c +++ b/src/init.c @@ -20,7 +20,6 @@ void 
mrb_init_proc(mrb_state*); void mrb_init_range(mrb_state*); void mrb_init_string(mrb_state*); void mrb_init_regexp(mrb_state*); -void mrb_init_encoding(mrb_state*); void mrb_init_exception(mrb_state*); void mrb_init_time(mrb_state*); void mrb_init_io(mrb_state*); @@ -54,7 +53,6 @@ mrb_init_core(mrb_state *mrb) mrb_init_gc(mrb); #ifdef INCLUDE_REGEXP mrb_init_regexp(mrb); - mrb_init_encoding(mrb); #endif mrb_init_exception(mrb); mrb_init_print(mrb); diff --git a/src/kernel.c b/src/kernel.c index 17951afc7..9ee9e8d34 100644 --- a/src/kernel.c +++ b/src/kernel.c @@ -274,11 +274,11 @@ mrb_f_send(int argc, mrb_value *argv, mrb_value recv) static mrb_value mrb_f_send_m(mrb_state *mrb, mrb_value self) { - mrb_value *argv; + mrb_value name, block, *argv; int argc; - - mrb_get_args(mrb, "*", &argv, &argc); - return mrb_f_send(argc, argv, self); + + mrb_get_args(mrb, "&o*", &block, &name, &argv, &argc); + return mrb_funcall_with_block(mrb,self, mrb_string_value_ptr(mrb, name), argc, argv, block); } /* 15.3.1.2.1 */ diff --git a/src/load.c b/src/load.c index 28f52433a..c0684f1aa 100644 --- a/src/load.c +++ b/src/load.c @@ -336,7 +336,6 @@ read_rite_irep_record(mrb_state *mrb, unsigned char *src, mrb_irep *irep, uint32 uint16_t crc, tt, pdl, snl, offset, bufsize=MRB_DUMP_DEFAULT_STR_LEN; mrb_int fix_num; mrb_float f; - mrb_value str; int ai = mrb_gc_arena_save(mrb); recordStart = src; diff --git a/src/object.c b/src/object.c index 1d84909ec..81e3867a7 100644 --- a/src/object.c +++ b/src/object.c @@ -11,13 +11,6 @@ #include "mruby/class.h" #include "mruby/numeric.h" -#ifdef INCLUDE_REGEXP - #define mrb_usascii_str_new2 mrb_usascii_str_new_cstr -#else - #define mrb_usascii_str_new2 mrb_str_new_cstr - #define mrb_usascii_str_new mrb_str_new -#endif - #ifndef FALSE #define FALSE 0 #endif @@ -106,7 +99,7 @@ mrb_true(mrb_state *mrb, mrb_value obj) static mrb_value nil_to_s(mrb_state *mrb, mrb_value obj) { - return mrb_usascii_str_new(mrb, 0, 0); + return mrb_str_new(mrb, 0, 0); } 
/*********************************************************************** @@ -166,7 +159,7 @@ true_xor(mrb_state *mrb, mrb_value obj) static mrb_value true_to_s(mrb_state *mrb, mrb_value obj) { - return mrb_usascii_str_new2(mrb, "true"); + return mrb_str_new_cstr(mrb, "true"); } /* 15.2.5.3.4 */ @@ -279,7 +272,7 @@ false_or(mrb_state *mrb, mrb_value obj) static mrb_value false_to_s(mrb_state *mrb, mrb_value obj) { - return mrb_usascii_str_new2(mrb, "false"); + return mrb_str_new_cstr(mrb, "false"); } void @@ -626,12 +619,12 @@ mrb_Float(mrb_state *mrb, mrb_value val) mrb_value mrb_inspect(mrb_state *mrb, mrb_value obj) { - return mrb_obj_as_string(mrb, mrb_funcall(mrb, obj, "inspect", 0, 0)); + return mrb_obj_as_string(mrb, mrb_funcall(mrb, obj, "inspect", 0, 0)); } int mrb_eql(mrb_state *mrb, mrb_value obj1, mrb_value obj2) { - return RTEST(mrb_funcall(mrb, obj1, "eql?", 1, obj2)); + if (mrb_obj_eq(mrb, obj1, obj2)) return TRUE; + return RTEST(mrb_funcall(mrb, obj1, "eql?", 1, obj2)); } - @@ -7,16 +7,11 @@ #include "mruby.h" #include <string.h> #include "mruby/string.h" -#include "mruby/khash.h" #include "encoding.h" #include "re.h" -#include "mruby/numeric.h" -#include "mruby/range.h" #include "mruby/array.h" #include "regint.h" #include "mruby/class.h" -#include "mruby/hash.h" -#include "mruby/variable.h" #include "error.h" #ifdef INCLUDE_REGEXP @@ -54,13 +49,10 @@ unsigned long ruby_scan_oct(const char*, size_t, size_t*); unsigned long ruby_scan_hex(const char*, size_t, size_t*); static mrb_value mrb_match_to_a(mrb_state *mrb, mrb_value match); -static mrb_value mrb_reg_preprocess(mrb_state *mrb, const char *p, const char *end, mrb_encoding *enc, - mrb_encoding **fixed_enc, onig_errmsg_buffer err); -static void mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len, - mrb_encoding *enc, mrb_encoding *resenc); +static mrb_value mrb_reg_preprocess(mrb_state *mrb, const char *p, const char *end, onig_errmsg_buffer err); +static void 
mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len); static char * option_to_str(char str[4], int options); -static mrb_value reg_cache; //static int may_need_recompile; //static int reg_kcode = DEFAULT_KCODE; /* ------------------------------------------------------------------------- */ @@ -94,22 +86,20 @@ mrb_reg_s_new_instance(mrb_state *mrb, /*int argc, mrb_value *argv, */mrb_value re->usecnt = 0; return mrb_funcall_argv(mrb, mrb_obj_value(re), "initialize", argc, argv); } -//#define mrb_enc_mbcput(a,b,c) a + mrb_value mrb_reg_quote(mrb_state *mrb, mrb_value str) { - mrb_encoding *enc = mrb_enc_get(mrb, str); char *s, *send, *t; mrb_value tmp; - int c,clen; - int ascii_only = mrb_enc_str_asciionly_p(mrb, str); + int c; s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); while (s < send) { - c = mrb_enc_ascget(mrb, s, send, &clen, enc); + c = *s; if (c == -1) { - s += mbclen(s, send, enc); + s += send - s; continue; } switch (c) { @@ -121,38 +111,28 @@ mrb_reg_quote(mrb_state *mrb, mrb_value str) case '\t': case '\f': case '\n': case '\r': goto meta_found; } - s += clen; + s++; } - //tmp = mrb_str_new3(str); tmp = mrb_str_new(mrb, RSTRING_PTR(str), RSTRING_LEN(str)); - if (ascii_only) { - mrb_enc_associate(mrb, tmp, mrb_usascii_encoding(mrb)); - } return tmp; meta_found: tmp = mrb_str_new(mrb, 0, RSTRING_LEN(str)*2); - if (ascii_only) { - mrb_enc_associate(mrb, tmp, mrb_usascii_encoding(mrb)); - } - else { - mrb_enc_copy(mrb, tmp, str); - } t = RSTRING_PTR(tmp); /* copy upto metacharacter */ memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str)); t += s - RSTRING_PTR(str); while (s < send) { - c = mrb_enc_ascget(mrb, s, send, &clen, enc); + c = *s; if (c == -1) { - int n = mbclen(s, send, enc); + int n = send - s; while (n--) *t++ = *s++; continue; } - s += clen; + s++; switch (c) { case '[': case ']': case '{': case '}': case '(': case ')': case '|': case '-': @@ -263,7 +243,7 @@ mrb_reg_nth_match(mrb_state *mrb, mrb_int nth, mrb_value match) if 
(start == -1) return mrb_nil_value(); end = m->rmatch->regs.end[nth]; len = end - start; - str = mrb_str_substr(mrb, mrb_obj_value(m->str), start, len); + str = mrb_str_subseq(mrb, mrb_obj_value(m->str), start, len); return str; } @@ -379,75 +359,13 @@ mrb_reg_options(mrb_state *mrb, mrb_value re) return options; } -static void -reg_enc_error(mrb_state *mrb, mrb_value re, mrb_value str) -{ - mrb_raise(mrb, E_ENCODING_ERROR, - "incompatible encoding regexp match (%s regexp with %s string)", - mrb_enc_name(mrb_enc_get(mrb, re)), - mrb_enc_name(mrb_enc_get(mrb, str))); -} - -static int -mrb_reg_fixed_encoding_p(mrb_value re) -{ - /*if (FL_TEST(re, KCODE_FIXED)) - return Qtrue; - else */ - return 0/*Qfalse*/; -} - -static mrb_encoding* -mrb_reg_prepare_enc(mrb_state *mrb, mrb_value re, mrb_value str, int warn) -{ - mrb_encoding *enc = 0; - - if (mrb_enc_str_coderange(mrb, str) == ENC_CODERANGE_BROKEN) { - mrb_raise(mrb, E_ARGUMENT_ERROR, - "invalid byte sequence in %s", - mrb_enc_name(mrb_enc_get(mrb, str))); - } - - mrb_reg_check(mrb, re); - enc = mrb_enc_get(mrb, str); - if (!mrb_enc_str_asciicompat_p(mrb, str)) { - if (RREGEXP(re)->ptr->enc != enc) { - reg_enc_error(mrb, re, str); - } - } - else if (mrb_reg_fixed_encoding_p(re)) { - if (RREGEXP(re)->ptr->enc != enc && - (!mrb_enc_asciicompat(mrb, RREGEXP(re)->ptr->enc) || - mrb_enc_str_coderange(mrb, str) != ENC_CODERANGE_7BIT)) { - reg_enc_error(mrb, re, str); - } - enc = RREGEXP(re)->ptr->enc; - } - if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) && - enc != mrb_ascii8bit_encoding(mrb) && - mrb_enc_str_coderange(mrb, str) != ENC_CODERANGE_7BIT) { - mrb_warn("regexp match /.../n against to %s string", - mrb_enc_name(enc)); - } - return enc; -} - static mrb_value mrb_reg_desc(mrb_state *mrb, const char *s, long len, mrb_value re) { - mrb_encoding *enc = mrb_enc_get(mrb, re); mrb_value str = mrb_str_new_cstr(mrb, "/");//mrb_str_buf_new2("/"); - mrb_encoding *resenc = mrb_default_internal_encoding(mrb); - if 
(resenc == NULL) resenc = mrb_default_external_encoding(mrb); - if (re.tt && mrb_enc_asciicompat(mrb, enc)) { - mrb_enc_copy(mrb, str, re); - } - else { - mrb_enc_associate(mrb, str, mrb_usascii_encoding(mrb)); - } - mrb_reg_expr_str(mrb, str, s, len, enc, resenc); - mrb_str_buf_cat(mrb, str, "/", strlen("/"));//mrb_str_buf_cat2(str, "/"); + mrb_reg_expr_str(mrb, str, s, len); + mrb_str_buf_cat(mrb, str, "/", strlen("/")); if (re.tt) { char opts[4]; mrb_reg_check(mrb, re); @@ -476,18 +394,14 @@ mrb_reg_prepare_re(mrb_state *mrb, mrb_value re, mrb_value str) OnigErrorInfo einfo; const char *pattern; mrb_value unescaped; - mrb_encoding *fixed_enc = 0; - mrb_encoding *enc = mrb_reg_prepare_enc(mrb, re, str, 1); - - if (reg->enc == enc) return reg; + mrb_encoding *enc = mrb_ascii8bit_encoding(mrb); mrb_reg_check(mrb, re); reg = RREGEXP(re)->ptr; pattern = RREGEXP_SRC_PTR(re); unescaped = mrb_reg_preprocess(mrb, - pattern, pattern + RREGEXP(re)->src->len, enc, - &fixed_enc, err); + pattern, pattern + RREGEXP(re)->src->len, err); if (mrb_nil_p(unescaped)) { mrb_raise(mrb, E_ARGUMENT_ERROR, "regexp preprocess failed: %s", err); @@ -675,18 +589,6 @@ ruby_scan_hex(const char *start, size_t len, size_t *retlen) return retval; } -static int -check_unicode_range(unsigned long code, onig_errmsg_buffer err) -{ - if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */ - 0x10ffff < code) { - //errcpy(err, "invalid Unicode range"); - printf("invalid Unicode range"); - return -1; - } - return 0; -} - #define BYTEWIDTH 8 int @@ -735,59 +637,6 @@ mrb_uv_to_utf8(mrb_state *mrb, char buf[6], unsigned long uv) return 0; } -static int -append_utf8(mrb_state *mrb, unsigned long uv, - mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err) -{ - if (check_unicode_range(uv, err) != 0) - return -1; - if (uv < 0x80) { - char escbuf[5]; - snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv); - mrb_str_buf_cat(mrb, buf, escbuf, 4); - } - else { - int len; - char utf8buf[6]; - len = 
mrb_uv_to_utf8(mrb, utf8buf, uv); - mrb_str_buf_cat(mrb, buf, utf8buf, len); - - if (*encp == 0) - *encp = mrb_utf8_encoding(mrb); - else if (*encp != mrb_utf8_encoding(mrb)) { - //errcpy(err, "UTF-8 character in non UTF-8 regexp"); - printf("UTF-8 character in non UTF-8 regexp"); - return -1; - } - } - return 0; -} - -static int -unescape_unicode_bmp(mrb_state *mrb, const char **pp, const char *end, - mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err) -{ - const char *p = *pp; - size_t len; - unsigned long code; - - if (end < p+4) { - //errcpy(err, "invalid Unicode escape"); - printf("invalid Unicode escape"); - return -1; - } - code = ruby_scan_hex(p, 4, &len); - if (len != 4) { - //errcpy(err, "invalid Unicode escape"); - printf("invalid Unicode escape"); - return -1; - } - if (append_utf8(mrb, code, buf, encp, err) != 0) - return -1; - *pp = p + 4; - return 0; -} - unsigned long ruby_scan_oct(const char *start, size_t len, size_t *retlen) { @@ -802,400 +651,29 @@ ruby_scan_oct(const char *start, size_t len, size_t *retlen) return retval; } -static int -read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err) -{ - const char *p = *pp; - int code; - int meta_prefix = 0, ctrl_prefix = 0; - size_t len; - - if (p == end || *p++ != '\\') { - //errcpy(err, "too short escaped multibyte character"); - printf("too short escaped multibyte character"); - return -1; - } - -again: - if (p == end) { - //errcpy(err, "too short escape sequence"); - printf("too short escape sequence"); - return -1; - } - switch (*p++) { - case '\\': code = '\\'; break; - case 'n': code = '\n'; break; - case 't': code = '\t'; break; - case 'r': code = '\r'; break; - case 'f': code = '\f'; break; - case 'v': code = '\013'; break; - case 'a': code = '\007'; break; - case 'e': code = '\033'; break; - - /* \OOO */ - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - p--; - code = scan_oct(p, end < p+3 ? 
end-p : 3, &len); - p += len; - break; - - case 'x': /* \xHH */ - code = scan_hex(p, end < p+2 ? end-p : 2, &len); - if (len < 1) { - //errcpy(err, "invalid hex escape"); - printf("invalid hex escape"); - return -1; - } - p += len; - break; - - case 'M': /* \M-X, \M-\C-X, \M-\cX */ - if (meta_prefix) { - //errcpy(err, "duplicate meta escape"); - printf("duplicate meta escape"); - return -1; - } - meta_prefix = 1; - if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) { - if (*p == '\\') { - p++; - goto again; - } - else { - code = *p++; - break; - } - } - //errcpy(err, "too short meta escape"); - printf("too short meta escape"); - return -1; - - case 'C': /* \C-X, \C-\M-X */ - if (p == end || *p++ != '-') { - //errcpy(err, "too short control escape"); - printf("too short control escape"); - return -1; - } - case 'c': /* \cX, \c\M-X */ - if (ctrl_prefix) { - //errcpy(err, "duplicate control escape"); - printf("duplicate control escape"); - return -1; - } - ctrl_prefix = 1; - if (p < end && (*p & 0x80) == 0) { - if (*p == '\\') { - p++; - goto again; - } - else { - code = *p++; - break; - } - } - //errcpy(err, "too short control escape"); - printf("too short control escape"); - return -1; - - default: - //errcpy(err, "unexpected escape sequence"); - printf("unexpected escape sequence"); - return -1; - } - if (code < 0 || 0xff < code) { - //errcpy(err, "invalid escape code"); - printf("invalid escape code"); - return -1; - } - - if (ctrl_prefix) - code &= 0x1f; - if (meta_prefix) - code |= 0x80; - - *pp = p; - return code; -} - -static int -unescape_escaped_nonascii(mrb_state *mrb, const char **pp, const char *end, mrb_encoding *enc, - mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err) -{ - const char *p = *pp; - int chmaxlen = mrb_enc_mbmaxlen(enc); - //char *chbuf = ALLOCA_N(char, chmaxlen); - char *chbuf = mrb_malloc(mrb, chmaxlen); - int chlen = 0; - int byte; - int l; - - memset(chbuf, 0, chmaxlen); - - byte = read_escaped_byte(&p, end, err); - if (byte == 
-1) { - return -1; - } - - chbuf[chlen++] = byte; - while (chlen < chmaxlen && - MBCLEN_NEEDMORE_P(mrb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) { - byte = read_escaped_byte(&p, end, err); - if (byte == -1) { - return -1; - } - chbuf[chlen++] = byte; - } - - l = mrb_enc_precise_mbclen(chbuf, chbuf+chlen, enc); - if (MBCLEN_INVALID_P(l)) { - //errcpy(err, "invalid multibyte escape"); - printf("invalid multibyte escape"); - return -1; - } - if (1 < chlen || (chbuf[0] & 0x80)) { - mrb_str_buf_cat(mrb, buf, chbuf, chlen); - - if (*encp == 0) - *encp = enc; - else if (*encp != enc) { - //errcpy(err, "escaped non ASCII character in UTF-8 regexp"); - printf("escaped non ASCII character in UTF-8 regexp"); - return -1; - } - } - else { - char escbuf[5]; - snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff); - mrb_str_buf_cat(mrb, buf, escbuf, 4); - } - *pp = p; - return 0; -} - -static int -unescape_unicode_list(mrb_state *mrb, const char **pp, const char *end, - mrb_value buf, mrb_encoding **encp, onig_errmsg_buffer err) -{ - const char *p = *pp; - int has_unicode = 0; - unsigned long code; - size_t len; - - while (p < end && ISSPACE(*p)) p++; - - while (1) { - code = ruby_scan_hex(p, end-p, &len); - if (len == 0) - break; - if (6 < len) { /* max 10FFFF */ - //errcpy(err, "invalid Unicode range"); - printf("invalid Unicode range"); - return -1; - } - p += len; - if (append_utf8(mrb, code, buf, encp, err) != 0) - return -1; - has_unicode = 1; - - while (p < end && ISSPACE(*p)) p++; - } - - if (has_unicode == 0) { - //errcpy(err, "invalid Unicode list"); - printf("invalid Unicode list"); - return -1; - } - - *pp = p; - - return 0; -} - -static int -unescape_nonascii(mrb_state *mrb, const char *p, const char *end, mrb_encoding *enc, - mrb_value buf, mrb_encoding **encp, int *has_property, - onig_errmsg_buffer err) -{ - char c; - char smallbuf[2]; - - while (p < end) { - int chlen = mrb_enc_precise_mbclen(p, end, enc); - if (!MBCLEN_CHARFOUND_P(chlen)) { - 
//errcpy(err, "invalid multibyte character"); - printf("invalid multibyte character"); - return -1; - } - chlen = MBCLEN_CHARFOUND_LEN(chlen); - if (1 < chlen || (*p & 0x80)) { - mrb_str_buf_cat(mrb, buf, p, chlen); - p += chlen; - if (*encp == 0) - *encp = enc; - else if (*encp != enc) { - //errcpy(err, "non ASCII character in UTF-8 regexp"); - printf("non ASCII character in UTF-8 regexp"); - return -1; - } - continue; - } - - switch (c = *p++) { - case '\\': - if (p == end) { - //errcpy(err, "too short escape sequence"); - printf("too short escape sequence"); - return -1; - } - switch (c = *p++) { - case '1': case '2': case '3': - case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */ - { - size_t octlen; - if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) { - /* backref or 7bit octal. - no need to unescape anyway. - re-escaping may break backref */ - goto escape_asis; - } - } - /* xxx: How about more than 199 subexpressions? */ - - case '0': /* \0, \0O, \0OO */ - - case 'x': /* \xHH */ - case 'c': /* \cX, \c\M-X */ - case 'C': /* \C-X, \C-\M-X */ - case 'M': /* \M-X, \M-\C-X, \M-\cX */ - p = p-2; - if (unescape_escaped_nonascii(mrb, &p, end, enc, buf, encp, err) != 0) - return -1; - break; - - case 'u': - if (p == end) { - //errcpy(err, "too short escape sequence"); - printf("too short escape sequence"); - return -1; - } - if (*p == '{') { - /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */ - p++; - if (unescape_unicode_list(mrb, &p, end, buf, encp, err) != 0) - return -1; - if (p == end || *p++ != '}') { - //errcpy(err, "invalid Unicode list"); - printf("invalid Unicode list"); - return -1; - } - break; - } - else { - /* \uHHHH */ - if (unescape_unicode_bmp(mrb, &p, end, buf, encp, err) != 0) - return -1; - break; - } - - case 'p': /* \p{Hiragana} */ - case 'P': - if (!*encp) { - *has_property = 1; - } - goto escape_asis; - - default: /* \n, \\, \d, \9, etc. 
*/ -escape_asis: - smallbuf[0] = '\\'; - smallbuf[1] = c; - mrb_str_buf_cat(mrb, buf, smallbuf, 2); - break; - } - break; - - default: - mrb_str_buf_cat(mrb, buf, &c, 1); - break; - } - } - - return 0; -} - - static mrb_value -mrb_reg_preprocess(mrb_state *mrb, const char *p, const char *end, mrb_encoding *enc, - mrb_encoding **fixed_enc, onig_errmsg_buffer err) +mrb_reg_preprocess(mrb_state *mrb, const char *p, const char *end, onig_errmsg_buffer err) { - mrb_value buf; - int has_property = 0; - - //buf = mrb_str_buf_new(0); - buf = mrb_str_buf_new(mrb, 0); - - if (mrb_enc_asciicompat(mrb, enc)) - *fixed_enc = 0; - else { - *fixed_enc = enc; - mrb_enc_associate(mrb, buf, enc); - } - - if (unescape_nonascii(mrb, p, end, enc, buf, fixed_enc, &has_property, err) != 0) - return mrb_nil_value(); - - if (has_property && !*fixed_enc) { - *fixed_enc = enc; - } - - if (*fixed_enc) { - mrb_enc_associate(mrb, buf, *fixed_enc); - } - - return buf; + return mrb_nil_value(); } static int -mrb_reg_initialize(mrb_state *mrb, mrb_value obj, const char *s, long len, mrb_encoding *enc, +mrb_reg_initialize(mrb_state *mrb, mrb_value obj, const char *s, long len, int options, onig_errmsg_buffer err, const char *sourcefile, int sourceline) { struct RRegexp *re = RREGEXP(obj); mrb_value unescaped; - mrb_encoding *fixed_enc = 0; - mrb_encoding *a_enc = mrb_ascii8bit_encoding(mrb); + mrb_encoding *enc = mrb_ascii8bit_encoding(mrb); if (re->ptr) mrb_raise(mrb, E_TYPE_ERROR, "already initialized regexp"); re->ptr = 0; - if (mrb_enc_dummy_p(enc)) { - //errcpy(err, "can't make regexp with dummy encoding"); - printf("can't make regexp with dummy encoding"); - return -1; - } - - unescaped = mrb_reg_preprocess(mrb, s, s+len, enc, &fixed_enc, err); + unescaped = mrb_reg_preprocess(mrb, s, s+len, err); if (mrb_nil_p(unescaped)) return -1; - if (fixed_enc) { - if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) || - (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) { - //errcpy(err, 
"incompatible character encoding"); - printf("incompatible character encoding"); - return -1; - } - if (fixed_enc != a_enc) { - options |= ARG_ENCODING_FIXED; - enc = fixed_enc; - } - } - else if (!(options & ARG_ENCODING_FIXED)) { - enc = mrb_usascii_encoding(mrb); - } - - mrb_enc_associate(mrb, mrb_obj_value(re), enc); - if ((options & ARG_ENCODING_FIXED) || fixed_enc) { + if ((options & ARG_ENCODING_FIXED)) { //re->basic.flags |= KCODE_FIXED; re->flags|= KCODE_FIXED; } @@ -1207,7 +685,7 @@ mrb_reg_initialize(mrb_state *mrb, mrb_value obj, const char *s, long len, mrb_e options & ARG_REG_OPTION_MASK, err, sourcefile, sourceline); if (!re->ptr) return -1; - re->src = mrb_str_ptr(mrb_enc_str_new(mrb, s, len, enc)); + re->src = mrb_str_ptr(mrb_str_new(mrb, s, len)); return 0; } @@ -1217,8 +695,8 @@ mrb_reg_initialize_str(mrb_state *mrb, mrb_value obj, mrb_value str, int options const char *sourcefile, int sourceline) { int ret; - mrb_encoding *enc = mrb_enc_get(mrb, str); +#if 0 if (options & ARG_ENCODING_NONE) { mrb_encoding *ascii8bit = mrb_ascii8bit_encoding(mrb); if (enc != ascii8bit) { @@ -1230,8 +708,9 @@ mrb_reg_initialize_str(mrb_state *mrb, mrb_value obj, mrb_value str, int options enc = ascii8bit; } } +#endif - ret = mrb_reg_initialize(mrb, obj, RSTRING_PTR(str), RSTRING_LEN(str), enc, + ret = mrb_reg_initialize(mrb, obj, RSTRING_PTR(str), RSTRING_LEN(str), options, err, sourcefile, sourceline); return ret; @@ -1267,7 +746,6 @@ mrb_reg_initialize_m(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value se onig_errmsg_buffer err = ""; int flags = 0; mrb_value str; - mrb_encoding *enc; const char *ptr; long len; @@ -1286,10 +764,7 @@ mrb_reg_initialize_m(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value se flags = mrb_reg_options(mrb, re); ptr = RREGEXP_SRC_PTR(re); len = RREGEXP_SRC_LEN(re); - enc = mrb_enc_get(mrb, re); - if (mrb_reg_initialize(mrb, self, ptr, len, enc, flags, err, NULL, 0)) { - /*str = mrb_enc_str_new(mrb, ptr, len, enc); - 
mrb_reg_raise_str(str, flags, err);*/ + if (mrb_reg_initialize(mrb, self, ptr, len, flags, err, NULL, 0)) { printf("mrb_reg_raise_str(str, flags, err);"); } } @@ -1298,12 +773,10 @@ mrb_reg_initialize_m(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value se if (mrb_type(argv[1]) == MRB_TT_FIXNUM) flags = mrb_fixnum(argv[1]); else if (mrb_test(argv[1])) flags = ONIG_OPTION_IGNORECASE; } - enc = 0; if (argc == 3 && !mrb_nil_p(argv[2])) { //char *kcode = StringValuePtr(argv[2]); char *kcode = mrb_string_value_ptr(mrb, argv[2]); if (kcode[0] == 'n' || kcode[0] == 'N') { - enc = mrb_ascii8bit_encoding(mrb); flags |= ARG_ENCODING_NONE; } else { @@ -1314,9 +787,7 @@ mrb_reg_initialize_m(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value se str = argv[0]; //ptr = StringValuePtr(str); ptr = mrb_string_value_ptr(mrb, str); - if (enc - ? mrb_reg_initialize(mrb, self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0) - : mrb_reg_initialize_str(mrb, self, str, flags, err, NULL, 0)) { + if (mrb_reg_initialize_str(mrb, self, str, flags, err, NULL, 0)) { //mrb_reg_raise_str(str, flags, err); } } @@ -1346,7 +817,7 @@ mrb_reg_init_copy(mrb_state *mrb, mrb_value re/*, mrb_value copy*/) mrb_reg_check(mrb, copy); s = RREGEXP_SRC_PTR(copy); len = RREGEXP_SRC_LEN(copy); - if (mrb_reg_initialize(mrb, re, s, len, mrb_enc_get(mrb, copy), mrb_reg_options(mrb, copy), + if (mrb_reg_initialize(mrb, re, s, len, mrb_reg_options(mrb, copy), err, 0/*NULL*/, 0) != 0) { mrb_reg_raise(mrb, s, len, err, re); } @@ -1628,7 +1099,7 @@ mrb_reg_source(mrb_state *mrb, mrb_value re) mrb_value str; mrb_reg_check(mrb, re); - str = mrb_enc_str_new(mrb, RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), mrb_enc_get(mrb, re)); + str = mrb_str_new(mrb, RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re)); return str; } @@ -1757,23 +1228,12 @@ typedef struct { long char_pos; } pair_t; -static int -pair_byte_cmp(const void *pair1, const void *pair2) -{ - long diff = ((pair_t*)pair1)->byte_pos - ((pair_t*)pair2)->byte_pos; - 
return diff ? diff > 0 ? 1 : -1 : 0; -} - static void update_char_offset(mrb_state *mrb, mrb_value match) { struct rmatch *rm = RMATCH(match)->rmatch; struct re_registers *regs; - int i, num_regs, num_pos; - long c; - char *s, *p, *q; - mrb_encoding *enc; - pair_t *pairs; + int i, num_regs; if (rm->char_offset_updated) return; @@ -1787,55 +1247,12 @@ update_char_offset(mrb_state *mrb, mrb_value match) rm->char_offset_num_allocated = num_regs; } - enc = mrb_enc_get(mrb, mrb_obj_value(RMATCH(match)->str)); - if (mrb_enc_mbmaxlen(enc) == 1) { - for (i = 0; i < num_regs; i++) { - rm->char_offset[i].beg = BEG(i); - rm->char_offset[i].end = END(i); - } - rm->char_offset_updated = 1; - return; - } - - //pairs = ALLOCA_N(pair_t, num_regs*2); - pairs = mrb_malloc(mrb, sizeof(pair_t)*num_regs*2); - - num_pos = 0; for (i = 0; i < num_regs; i++) { - if (BEG(i) < 0) - continue; - pairs[num_pos++].byte_pos = BEG(i); - pairs[num_pos++].byte_pos = END(i); - } - qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp); - - s = p = RMATCH(match)->str->buf; - c = 0; - for (i = 0; i < num_pos; i++) { - q = s + pairs[i].byte_pos; - c += mrb_enc_strlen(p, q, enc); - pairs[i].char_pos = c; - p = q; - } - - for (i = 0; i < num_regs; i++) { - pair_t key, *found; - if (BEG(i) < 0) { - rm->char_offset[i].beg = -1; - rm->char_offset[i].end = -1; - continue; - } - - key.byte_pos = BEG(i); - found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); - rm->char_offset[i].beg = found->char_pos; - - key.byte_pos = END(i); - found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); - rm->char_offset[i].end = found->char_pos; + rm->char_offset[i].beg = BEG(i); + rm->char_offset[i].end = END(i); } - rm->char_offset_updated = 1; + return; } /* 15.2.16.3.2 */ @@ -2235,49 +1652,36 @@ option_to_str(char str[4], int options) #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ static void -mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len, - 
mrb_encoding *enc, mrb_encoding *resenc) +mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len) { const char *p, *pend; int need_escape = 0; - int c, clen; + int c; p = s; pend = p + len; - if (mrb_enc_asciicompat(mrb, enc)) { - while (p < pend) { - c = mrb_enc_ascget(mrb, p, pend, &clen, enc); - if (c == -1) { - if (enc == resenc) { - p += mbclen(p, pend, enc); - } - else { - need_escape = 1; - break; - } - } - else if (c != '/' && mrb_enc_isprint(c, enc)) { - p += clen; - } - else { - need_escape = 1; - break; - } + while (p < pend) { + c = *p; + if (c == -1) { + p += pend - p; + } + else if (c != '/' && ISPRINT(c)) { + p++; + } + else { + need_escape = 1; + break; } - } - else { - need_escape = 1; } if (!need_escape) { mrb_str_buf_cat(mrb, str, s, len); } else { - int unicode_p = mrb_enc_unicode_p(enc); p = s; while (p<pend) { - c = mrb_enc_ascget(mrb, p, pend, &clen, enc); - if (c == '\\' && p+clen < pend) { - int n = clen + mbclen(p+clen, pend, enc); + c = *p; + if (c == '\\' && p+1 < pend) { + int n = 1 + pend - (p+1); mrb_str_buf_cat(mrb, str, p, n); p += n; continue; @@ -2285,38 +1689,21 @@ mrb_reg_expr_str(mrb_state *mrb, mrb_value str, const char *s, long len, else if (c == '/') { char c = '\\'; mrb_str_buf_cat(mrb, str, &c, 1); - mrb_str_buf_cat(mrb, str, p, clen); - } - else if (c == -1) { - clen = mrb_enc_precise_mbclen(p, pend, enc); - if (!MBCLEN_CHARFOUND_P(clen)) { - c = (unsigned char)*p; - clen = 1; - goto hex; - } - if (resenc) { - unsigned int c = mrb_enc_mbc_to_codepoint(p, pend, enc); - mrb_str_buf_cat_escaped_char(mrb, str, c, unicode_p); - } - else { - clen = MBCLEN_CHARFOUND_LEN(clen); - mrb_str_buf_cat(mrb, str, p, clen); - } + mrb_str_buf_cat(mrb, str, p, 1); } - else if (mrb_enc_isprint(c, enc)) { - mrb_str_buf_cat(mrb, str, p, clen); + else if (ISPRINT(c)) { + mrb_str_buf_cat(mrb, str, p, 1); } - else if (!mrb_enc_isspace(c, enc)) { + else if (!ISSPACE(c)) { char b[8]; - hex: snprintf(b, sizeof(b), "\\x%02X", c); 
mrb_str_buf_cat(mrb, str, b, 4); } else { - mrb_str_buf_cat(mrb, str, p, clen); + mrb_str_buf_cat(mrb, str, p, 1); } - p += clen; + p++; } } } @@ -2355,7 +1742,6 @@ mrb_reg_to_s(mrb_state *mrb, mrb_value re) mrb_reg_check(mrb, re); memset(optbuf, 0, 5); - mrb_enc_copy(mrb, str, re); options = RREGEXP(re)->ptr->options; ptr = (UChar*)RREGEXP_SRC_PTR(re); len = RREGEXP_SRC_LEN(re); @@ -2399,7 +1785,7 @@ again: ++ptr; len -= 2; - err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT, + err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, NULL); onig_free(rp); } @@ -2419,9 +1805,8 @@ again: } mrb_str_buf_cat(mrb, str, ":", strlen(":")); - mrb_reg_expr_str(mrb, str, (char*)ptr, len, enc, NULL); + mrb_reg_expr_str(mrb, str, (char*)ptr, len); mrb_str_buf_cat(mrb, str, ")", strlen(")")); - mrb_enc_copy(mrb, str, re); return str; } @@ -2663,8 +2048,6 @@ mrb_init_regexp(mrb_state *mrb) mrb_define_const(mrb, s, "MULTILINE", mrb_fixnum_value(ONIG_OPTION_MULTILINE)); mrb_define_const(mrb, s, "FIXEDENCODING", mrb_fixnum_value(ARG_ENCODING_FIXED)); - //mrb_global_variable(&reg_cache); - s = mrb_define_class(mrb, "MatchData", mrb->object_class); //mrb_undef_method(CLASS_OF(rb_cMatch), "new"); @@ -2705,27 +2088,23 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers { mrb_value val; char *p, *s, *e; - int no, clen; - mrb_encoding *str_enc = mrb_enc_get(mrb, str); - mrb_encoding *src_enc = mrb_enc_get(mrb, src); - int acompat = mrb_enc_asciicompat(mrb, str_enc); -#define ASCGET(mrb,s,e,cl) (acompat ? 
(*cl=1,ISASCII(s[0])?s[0]:-1) : mrb_enc_ascget(mrb, s, e, cl, str_enc)) struct RString *ps = mrb_str_ptr(str); + int no; val.tt = 0; p = s = ps->buf; e = s + ps->len; while (s < e) { - int c = ASCGET(mrb, s, e, &clen); + int c = *s; char *ss; if (c == -1) { - s += mbclen(s, e, str_enc); + s += e - s; continue; } ss = s; - s += clen; + s++; if (c != '\\' || s == e) continue; @@ -2733,16 +2112,16 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers if (!val.tt) { val = mrb_str_buf_new(mrb, ss-p); } - mrb_enc_str_buf_cat(mrb, val, p, ss-p, str_enc); + mrb_str_buf_cat(mrb, val, p, ss-p); - c = ASCGET(mrb, s, e, &clen); + c = *s; if (c == -1) { - s += mbclen(s, e, str_enc); - mrb_enc_str_buf_cat(mrb, val, ss, s-ss, str_enc); + s += e - s; + mrb_str_buf_cat(mrb, val, ss, s-ss); p = s; continue; } - s += clen; + s++; p = s; switch (c) { @@ -2757,18 +2136,18 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers break; case 'k': - if (s < e && ASCGET(mrb, s, e, &clen) == '<') { + if (s < e && *s == '<') { char *name, *name_end; - name_end = name = s + clen; + name_end = name = s + 1; while (name_end < e) { - c = ASCGET(mrb, name_end, e, &clen); + c = *name_end; if (c == '>') break; - name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen; + name_end += c == -1 ? 
e - name_end : 1; } if (name_end < e) { no = name_to_backref_number(mrb, regs, RREGEXP(regexp), name, name_end); - p = s = name_end + clen; + p = s = name_end + 1; break; } else { @@ -2776,7 +2155,7 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers } } - mrb_enc_str_buf_cat(mrb, val, ss, s-ss, str_enc); + mrb_str_buf_cat(mrb, val, ss, s-ss); continue; case '0': @@ -2785,11 +2164,11 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers break; case '`': - mrb_enc_str_buf_cat(mrb, val, RSTRING_PTR(src), BEG(0), src_enc); + mrb_str_buf_cat(mrb, val, RSTRING_PTR(src), BEG(0)); continue; case '\'': - mrb_enc_str_buf_cat(mrb, val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc); + mrb_str_buf_cat(mrb, val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0)); continue; case '+': @@ -2799,31 +2178,29 @@ mrb_reg_regsub(mrb_state *mrb, mrb_value str, mrb_value src, struct re_registers break; case '\\': - mrb_enc_str_buf_cat(mrb, val, s-clen, clen, str_enc); + mrb_str_buf_cat(mrb, val, s-1, 1); continue; default: - mrb_enc_str_buf_cat(mrb, val, ss, s-ss, str_enc); + mrb_str_buf_cat(mrb, val, ss, s-ss); continue; } if (no >= 0) { if (no >= regs->num_regs) continue; if (BEG(no) == -1) continue; - mrb_enc_str_buf_cat(mrb, val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc); + mrb_str_buf_cat(mrb, val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no)); } } /* while (s < e) { */ if (!val.tt) return str; if (p < e) { - mrb_enc_str_buf_cat(mrb, val, p, e-p, str_enc); + mrb_str_buf_cat(mrb, val, p, e-p); } return val; } -//#define NEW_NODE(t,a0,a1,a2) mrb_node_newnode((t),(int)(a0),(int)(a1),(int)(a2)) -//#define NEW_IF(c,t,e) NEW_NODE(NODE_IF,c,t,e) static inline NODE * lfp_svar_place(mrb_state *mrb, /*mrb_thread_t *th,*/ mrb_value *lfp) { @@ -3038,9 +2415,6 @@ mrb_memsearch(mrb_state *mrb, const void *x0, int m, const void *y0, int n, mrb_ } return -1; } - else if (enc == mrb_utf8_encoding(mrb)) { - return 
mrb_memsearch_qs_utf8(x0, m, y0, n); - } else { return mrb_memsearch_qs(x0, m, y0, n); } @@ -3077,12 +2451,7 @@ mrb_reg_new_str(mrb_state *mrb, mrb_value s, int options) mrb_value mrb_reg_regcomp(mrb_state *mrb, mrb_value str) { - mrb_value save_str = str; - if (reg_cache.tt && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str) - && ENCODING_GET(mrb, reg_cache) == ENCODING_GET(mrb, str) - && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0) - return reg_cache; - return reg_cache = mrb_reg_new_str(mrb, save_str, 0); + return mrb_reg_new_str(mrb, str, 0); } int @@ -3143,7 +2512,7 @@ is_special_global_name(const char *m, const char *e, mrb_encoding *enc) ++m; if (m < e && is_identchar(m, e, enc)) { if (!ISASCII(*m)) mb = 1; - m += mrb_enc_mbclen(m, e, enc); + m += e - m; } break; default: @@ -3228,7 +2597,7 @@ mrb_enc_symname2_p(const char *name, long len, mrb_encoding *enc) id: if (m >= e || (*m != '_' && !mrb_enc_isalpha(*m, enc) && ISASCII(*m))) return FALSE; - while (m < e && is_identchar(m, e, enc)) m += mrb_enc_mbclen(m, e, enc); + while (m < e && is_identchar(m, e, enc)) m += e - m; if (localid) { switch (*m) { case '!': case '?': case '=': ++m; diff --git a/src/sprintf.c b/src/sprintf.c index dc9b83dec..79bd101ad 100644 --- a/src/sprintf.c +++ b/src/sprintf.c @@ -520,7 +520,6 @@ mrb_str_format(mrb_state *mrb, int argc, const mrb_value *argv, mrb_value fmt) ++argc; --argv; mrb_string_value(mrb, &fmt); - fmt = mrb_str_new4(mrb, fmt); p = RSTRING_PTR(fmt); end = p + RSTRING_LEN(fmt); blen = 0; @@ -668,44 +667,37 @@ retry: mrb_value tmp; unsigned int c; int n; -#ifdef INCLUDE_ENCODING - mrb_encoding *enc = mrb_enc_get(mrb, fmt); -#endif //INCLUDE_ENCODING tmp = mrb_check_string_type(mrb, val); if (!mrb_nil_p(tmp)) { if (RSTRING_LEN(tmp) != 1 ) { mrb_raise(mrb, E_ARGUMENT_ERROR, "%%c requires a character"); } -#ifdef INCLUDE_ENCODING - c = mrb_enc_codepoint_len(mrb, RSTRING_PTR(tmp), RSTRING_END(tmp), &n, enc); -#else c = RSTRING_PTR(tmp)[0]; 
n = 1; -#endif //INCLUDE_ENCODING } else { c = mrb_fixnum(val); - n = mrb_enc_codelen(mrb, c, enc); + n = 1; } if (n <= 0) { mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid character"); } if (!(flags & FWIDTH)) { CHECK(n); - mrb_enc_mbcput(c, &buf[blen], enc); + buf[blen] = c; blen += n; } else if ((flags & FMINUS)) { CHECK(n); - mrb_enc_mbcput(c, &buf[blen], enc); + buf[blen] = c; blen += n; FILL(' ', width-1); } else { FILL(' ', width-1); CHECK(n); - mrb_enc_mbcput(c, &buf[blen], enc); + buf[blen] = c; blen += n; } } @@ -717,25 +709,18 @@ format_s: { mrb_value arg = GETARG(); long len, slen; -#ifdef INCLUDE_ENCODING - mrb_encoding *enc = mrb_enc_get(mrb, fmt); -#endif //INCLUDE_ENCODING if (*p == 'p') arg = mrb_inspect(mrb, arg); str = mrb_obj_as_string(mrb, arg); len = RSTRING_LEN(str); - mrb_str_set_len(mrb, result, blen); + RSTRING_LEN(result) = blen; if (flags&(FPREC|FWIDTH)) { slen = RSTRING_LEN(str); if (slen < 0) { mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid mbstring sequence"); } if ((flags&FPREC) && (prec < slen)) { -#ifdef INCLUDE_ENCODING - char *p = mrb_enc_nth(mrb, RSTRING_PTR(str), RSTRING_END(str),prec, enc); -#else char *p = RSTRING_PTR(str) + prec; -#endif //INCLUDE_ENCODING slen = prec; len = p - RSTRING_PTR(str); } @@ -757,12 +742,10 @@ format_s: buf[blen++] = ' '; } } - mrb_enc_associate(mrb, result, enc); break; } } PUSH(RSTRING_PTR(str), len); - mrb_enc_associate(mrb, result, enc); } break; @@ -915,15 +898,8 @@ bin_retry: if (*p == 'X') { char *pp = s; int c; -#ifdef INCLUDE_ENCODING - mrb_encoding *enc = mrb_enc_get(mrb, fmt); -#endif //INCLUDE_ENCODING while ((c = (int)(unsigned char)*pp) != 0) { -#ifdef INCLUDE_ENCODING - *pp = mrb_enc_toupper(c, enc); -#else *pp = toupper(c); -#endif //INCLUDE_ENCODING pp++; } } diff --git a/src/string.c b/src/string.c index 83d78ccb9..b6ca9e489 100644 --- a/src/string.c +++ b/src/string.c @@ -9,13 +9,12 @@ #include <stdarg.h> #include <string.h> #include "mruby/string.h" +#include <ctype.h> #include 
"mruby/numeric.h" #include "mruby/range.h" -#include <ctype.h> #include "mruby/array.h" #include "mruby/class.h" #include "mruby/variable.h" -#include "mruby/hash.h" #include <stdio.h> #include "re.h" #ifdef INCLUDE_REGEXP @@ -23,8 +22,6 @@ #include "st.h" #endif //INCLUDE_REGEXP -#define mrb_usascii_str_new2 mrb_usascii_str_new_cstr - #ifndef FALSE #define FALSE 0 #endif @@ -38,33 +35,12 @@ const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz"; #ifdef INCLUDE_REGEXP static mrb_value get_pat(mrb_state *mrb, mrb_value pat, mrb_int quote); #endif //INCLUDE_REGEXP -#ifdef INCLUDE_ENCODING -static void mrb_enc_cr_str_copy_for_substr(mrb_state *mrb, mrb_value dest, mrb_value src); -#else -#define mrb_enc_cr_str_copy_for_substr(mrb, dest, src) -#endif //INCLUDE_ENCODING -static mrb_value str_replace(mrb_state *mrb, mrb_value str, mrb_value str2); -#ifdef INCLUDE_ENCODING -static long str_strlen(mrb_state *mrb, mrb_value str, mrb_encoding *enc); -#endif //INCLUDE_ENCODING -#ifdef INCLUDE_ENCODING -#define is_ascii_string(mrb, str) (mrb_enc_str_coderange(mrb, str) == ENC_CODERANGE_7BIT) -#define is_broken_string(mrb, str) (mrb_enc_str_coderange(mrb, str) == ENC_CODERANGE_BROKEN) -#define STR_ENC_GET(mrb, str) mrb_enc_from_index(mrb, ENCODING_GET(mrb, str)) -#endif //INCLUDE_ENCODING - -void -mrb_str_set_len(mrb_state *mrb, mrb_value str, long len) -{ - mrb_str_modify(mrb, str); - RSTRING_LEN(str) = len; - RSTRING_PTR(str)[len] = '\0'; -} +static mrb_value str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2); +static mrb_value mrb_str_subseq(mrb_state *mrb, mrb_value str, long beg, long len); #define RESIZE_CAPA(str,capacity) do {\ RSTRING(str)->buf = mrb_realloc(mrb, RSTRING(str)->buf, (capacity)+1);\ - if (!MRB_STR_NOCAPA_P(str))\ - RSTRING_CAPA(str) = capacity;\ + RSTRING_CAPA(str) = capacity;\ } while (0) #define STR_SET_LEN(str, n) do { \ @@ -75,86 +51,42 @@ mrb_str_set_len(mrb_state *mrb, mrb_value str, long len) RSTRING(str)->len--;\ } 
while (0) -#ifdef INCLUDE_ENCODING -static mrb_value mrb_enc_cr_str_buf_cat(mrb_state *mrb, mrb_value str, const char *ptr, long len, - int ptr_encindex, int ptr_cr, int *ptr_cr_ret); -#endif //INCLUDE_ENCODING - -#ifdef INCLUDE_ENCODING -mrb_value -mrb_usascii_str_new_cstr(mrb_state *mrb, const char *ptr) -{ - mrb_value str = mrb_str_new_cstr(mrb, ptr);//mrb_str_new2(ptr); - ENCODING_CODERANGE_SET(mrb, str, mrb_usascii_encindex(), ENC_CODERANGE_7BIT); - return str; -} - -mrb_value -mrb_external_str_new_with_enc(mrb_state *mrb, const char *ptr, long len, mrb_encoding *eenc) +static void +str_modify(mrb_state *mrb, mrb_value str) { - mrb_value str; - - str = mrb_str_new(mrb, ptr, len); - if (eenc == mrb_usascii_encoding(mrb) && - mrb_enc_str_coderange(mrb, str) != ENC_CODERANGE_7BIT) { - mrb_enc_associate(mrb, str, mrb_ascii8bit_encoding(mrb)); - return str; - } - mrb_enc_associate(mrb, str, eenc); - return mrb_str_conv_enc(mrb, str, eenc, mrb_default_internal_encoding(mrb)); -} + struct RString *s = mrb_str_ptr(str); -mrb_value -mrb_locale_str_new(mrb_state *mrb, const char *ptr, long len) -{ - return mrb_external_str_new_with_enc(mrb, ptr, len, mrb_locale_encoding(mrb)); -} + if (MRB_STR_SHARED_P(str)) { + char *ptr, *p; + long len; -mrb_value -mrb_str_buf_cat_ascii(mrb_state *mrb, mrb_value str, const char *ptr) -{ - /* ptr must reference NUL terminated ASCII string. 
*/ - int encindex = ENCODING_GET(mrb, str); - mrb_encoding *enc = mrb_enc_from_index(mrb, encindex); - if (mrb_enc_asciicompat(mrb, enc)) { - return mrb_enc_cr_str_buf_cat(mrb, str, ptr, strlen(ptr), - encindex, ENC_CODERANGE_7BIT, 0); - } - else { - //char *buf = ALLOCA_N(char, mrb_enc_mbmaxlen(enc)); - char *buf = mrb_malloc(mrb, mrb_enc_mbmaxlen(enc)); - while (*ptr) { - unsigned int c = (unsigned char)*ptr; - int len = mrb_enc_codelen(mrb, c, enc); - mrb_enc_mbcput(c, buf, enc); - mrb_enc_cr_str_buf_cat(mrb, str, buf, len, - encindex, ENC_CODERANGE_VALID, 0); - ptr++; + p = s->buf; + len = s->len; + ptr = mrb_malloc(mrb, sizeof(char)*(len+1)); + if (p) { + memcpy(ptr, p, len); } - return str; + ptr[len] = 0; + s->buf = ptr; + s->len = len; + s->aux.capa = len; + MRB_STR_UNSET_NOCAPA(str); } } mrb_value -mrb_filesystem_str_new_cstr(mrb_state *mrb, const char *ptr) -{ - return mrb_external_str_new_with_enc(mrb, ptr, strlen(ptr), mrb_filesystem_encoding(mrb)); -} -#endif //INCLUDE_ENCODING - -mrb_value -mrb_str_resize(mrb_state *mrb, mrb_value str, size_t len) +mrb_str_resize(mrb_state *mrb, mrb_value str, int len) { - size_t slen; + int slen; - mrb_str_modify(mrb, str); + str_modify(mrb, str); slen = RSTRING_LEN(str); if (len != slen) { if (slen < len || slen -len > 1024) { RSTRING_PTR(str) = mrb_realloc(mrb, RSTRING_PTR(str), len+1); } if (!MRB_STR_NOCAPA_P(str)) { - RSTRING(str)->aux.capa = len; + RSTRING_CAPA(str) = len; } RSTRING(str)->len = len; RSTRING(str)->buf[len] = '\0'; /* sentinel */ @@ -162,16 +94,6 @@ mrb_str_resize(mrb_state *mrb, mrb_value str, size_t len) return str; } -#ifdef INCLUDE_ENCODING -mrb_value -mrb_usascii_str_new(mrb_state *mrb, const char *ptr, long len) -{ - mrb_value str = mrb_str_new(mrb, ptr, len); - ENCODING_CODERANGE_SET(mrb, str, mrb_usascii_encindex(), ENC_CODERANGE_7BIT); - return str; -} -#endif //INCLUDE_ENCODING - static inline void str_mod_check(mrb_state *mrb, mrb_value str, char *p, mrb_int len) { @@ -182,360 +104,62 
@@ str_mod_check(mrb_state *mrb, mrb_value str, char *p, mrb_int len) } } -#ifdef INCLUDE_ENCODING -static inline int -single_byte_optimizable(mrb_state *mrb, mrb_value str) -{ - mrb_encoding *enc; - /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */ - if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) - return 1; - - enc = STR_ENC_GET(mrb, str); - if (mrb_enc_mbmaxlen(enc) == 1) - return 1; - - /* Conservative. Possibly single byte. - * "\xa1" in Shift_JIS for example. */ - return 0; -} - -static inline const char * -search_nonascii(const char *p, const char *e) -{ - while (p < e) { - if (!ISASCII(*p)) - return p; - p++; - } - return NULL; -} -#endif //INCLUDE_ENCODING - -static inline void -str_modifiable(mrb_value str) -{ - ; -} - -static inline int -str_independent(mrb_value str) -{ - str_modifiable(str); - if (!MRB_STR_SHARED_P(str)) return 1; - return 0; -} - -#ifdef INCLUDE_ENCODING -static inline void -str_enc_copy(mrb_state *mrb, mrb_value str1, mrb_value str2) -{ - mrb_enc_set_index(mrb, str1, ENCODING_GET(mrb, str2)); -} - -static inline long -enc_strlen(const char *p, const char *e, mrb_encoding *enc, int cr) -{ - long c; - const char *q; - - if (mrb_enc_mbmaxlen(enc) == mrb_enc_mbminlen(enc)) { - return (e - p + mrb_enc_mbminlen(enc) - 1) / mrb_enc_mbminlen(enc); - } - else if (mrb_enc_asciicompat(mrb, enc)) { - c = 0; - if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) { - while (p < e) { - if (ISASCII(*p)) { - q = search_nonascii(p, e); - if (!q) - return c + (e - p); - c += q - p; - p = q; - } - p += mrb_enc_fast_mbclen(p, e, enc); - c++; - } - } - else { - while (p < e) { - if (ISASCII(*p)) { - q = search_nonascii(p, e); - if (!q) - return c + (e - p); - c += q - p; - p = q; - } - p += mrb_enc_mbclen(p, e, enc); - c++; - } - } - return c; - } - - for (c=0; p<e; c++) { - p += mrb_enc_mbclen(p, e, enc); - } - return c; -} - -size_t -mrb_str_capacity(mrb_value str) -{ - if (MRB_STR_NOCAPA_P(str)) { - return RSTRING_LEN(str); - } - else { - 
return RSTRING_CAPA(str); - } -} -#endif //INCLUDE_ENCODING - #define mrb_obj_alloc_string(mrb) ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class)) -static inline mrb_value -str_alloc(mrb_state *mrb) +static struct RString* +str_alloc(mrb_state *mrb, struct RClass *c) { struct RString* s; s = mrb_obj_alloc_string(mrb); - //NEWOBJ(str, struct RString); - //OBJSETUP(str, klass, T_STRING); + s->c = c; s->buf = 0; s->len = 0; s->aux.capa = 0; - return mrb_obj_value(s); -} - -#ifdef INCLUDE_ENCODING -long -mrb_enc_strlen(const char *p, const char *e, mrb_encoding *enc) -{ - return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN); -} -#endif //INCLUDE_ENCODING - -static void -str_make_independent(mrb_state *mrb, mrb_value str) -{ - char *ptr; - long len = RSTRING_LEN(str); - - ptr = mrb_malloc(mrb, sizeof(char)*(len+1)); - if (RSTRING_PTR(str)) { - memcpy(ptr, RSTRING_PTR(str), len); - } - ptr[len] = 0; - RSTRING(str)->buf = ptr; - RSTRING(str)->len = len; - RSTRING(str)->aux.capa = len; - MRB_STR_UNSET_NOCAPA(str); -} - -#ifdef INCLUDE_ENCODING -static int -coderange_scan(const char *p, long len, mrb_encoding *enc) -{ - const char *e = p + len; - - if (mrb_enc_to_index(enc) == 0) { - /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ - p = search_nonascii(p, e); - return p ? 
ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT; - } - - if (mrb_enc_asciicompat(mrb, enc)) { - p = search_nonascii(p, e); - if (!p) { - return ENC_CODERANGE_7BIT; - } - while (p < e) { - int ret = mrb_enc_precise_mbclen(p, e, enc); - if (!MBCLEN_CHARFOUND_P(ret)) { - return ENC_CODERANGE_BROKEN; - } - p += MBCLEN_CHARFOUND_LEN(ret); - if (p < e) { - p = search_nonascii(p, e); - if (!p) { - return ENC_CODERANGE_VALID; - } - } - } - if (e < p) { - return ENC_CODERANGE_BROKEN; - } - return ENC_CODERANGE_VALID; - } - - while (p < e) { - int ret = mrb_enc_precise_mbclen(p, e, enc); - - if (!MBCLEN_CHARFOUND_P(ret)) { - return ENC_CODERANGE_BROKEN; - } - p += MBCLEN_CHARFOUND_LEN(ret); - } - if (e < p) { - return ENC_CODERANGE_BROKEN; - } - return ENC_CODERANGE_VALID; -} - -int -mrb_enc_str_coderange(mrb_state *mrb, mrb_value str) -{ - int cr = ENC_CODERANGE(str); - - if (cr == ENC_CODERANGE_UNKNOWN) { - mrb_encoding *enc = STR_ENC_GET(mrb, str); - cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); - ENC_CODERANGE_SET(str, cr); - } - return cr; -} - -char* -mrb_enc_nth(mrb_state *mrb, const char *p, const char *e, long nth, mrb_encoding *enc) -{ - if (mrb_enc_mbmaxlen(enc) == 1) { - p += nth; - } - else if (mrb_enc_mbmaxlen(enc) == mrb_enc_mbminlen(enc)) { - p += nth * mrb_enc_mbmaxlen(enc); - } - else if (mrb_enc_asciicompat(mrb, enc)) { - const char *p2, *e2; - int n; - - while (p < e && 0 < nth) { - e2 = p + nth; - if (e < e2) - return (char*)e; - if (ISASCII(*p)) { - p2 = search_nonascii(p, e2); - if (!p2) - return (char*)e2; - nth -= p2 - p; - p = p2; - } - n = mrb_enc_mbclen(p, e, enc); - p += n; - nth--; - } - if (nth != 0) - return (char*)e; - return (char*)p; - } - else { - while (p<e && nth--) { - p += mrb_enc_mbclen(p, e, enc); - } - } - if (p > e) p = e; - return (char*)p; -} - -static char* -str_nth(mrb_state *mrb, const char *p, const char *e, long nth, mrb_encoding *enc, int singlebyte) -{ - if (singlebyte) - p += nth; - else { - p = mrb_enc_nth(mrb, 
p, e, nth, enc); - } - if (!p) return 0; - if (p > e) p = e; - return (char*)p; + return s; } /* char offset to byte offset */ -static long -str_offset(mrb_state *mrb, const char *p, const char *e, long nth, mrb_encoding *enc, int singlebyte) -{ - const char *pp = str_nth(mrb, p, e, nth, enc, singlebyte); - if (!pp) return e - p; - return pp - p; -} - -long -mrb_str_offset(mrb_state *mrb, mrb_value str, long pos) -{ - return str_offset(mrb, RSTRING_PTR(str), RSTRING_END(str), pos, - STR_ENC_GET(mrb, str), single_byte_optimizable(mrb, str)); -} - -static void -mrb_enc_cr_str_exact_copy(mrb_state *mrb, mrb_value dest, mrb_value src) -{ - str_enc_copy(mrb, dest, src); - ENC_CODERANGE_SET(dest, ENC_CODERANGE(src)); -} -#else -#define mrb_enc_cr_str_exact_copy(mrb, dest, src) -#endif //INCLUDE_ENCODING - -mrb_value -str_new4(mrb_state *mrb, mrb_value str) +int +mrb_str_offset(mrb_state *mrb, mrb_value str, int pos) { - mrb_value str2; - - str2 = mrb_obj_value(mrb_obj_alloc_string(mrb)); - RSTRING(str2)->len = RSTRING_LEN(str); - RSTRING(str2)->buf = RSTRING_PTR(str); - - if (MRB_STR_SHARED_P(str)) { - struct RString *shared = RSTRING_SHARED(str); - FL_SET(str2, MRB_STR_SHARED); - RSTRING_SHARED(str2) = shared; - } - else { - FL_SET(str, MRB_STR_SHARED); - RSTRING_SHARED(str) = mrb_str_ptr(str2); - } - mrb_enc_cr_str_exact_copy(mrb, str2, str); - return str2; + return pos; } -static mrb_value -str_new(mrb_state *mrb, enum mrb_vtype ttype, const char *p, size_t len) +static struct RString* +str_new(mrb_state *mrb, const char *p, int len) { - mrb_value str; + struct RString *s = str_alloc(mrb, mrb->string_class); - //str = str_alloc(mrb); - str = mrb_str_buf_new(mrb, len); -#ifdef INCLUDE_ENCODING - if (len == 0) { - ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); - } -#endif //INCLUDE_ENCODING + s->len = len; + s->aux.capa = len; + s->buf = mrb_malloc(mrb, len+1); if (p) { - memcpy(RSTRING_PTR(str), p, len); + memcpy(s->buf, p, len); } - STR_SET_LEN(str, len); - 
RSTRING_PTR(str)[len] = '\0'; - return str; + s->buf[len] = '\0'; + return s; } -mrb_value -mrb_str_new_with_class(mrb_state *mrb, mrb_value obj, const char *ptr, long len) +void +str_with_class(mrb_state *mrb, struct RString *s, mrb_value obj) { - return str_new(mrb, mrb_type(obj), ptr, len); + s->c = mrb_str_ptr(obj)->c; } -#define mrb_str_new5 mrb_str_new_with_class - static mrb_value -str_new_empty(mrb_state *mrb, mrb_value str) +mrb_str_new_empty(mrb_state *mrb, mrb_value str) { - mrb_value v = mrb_str_new5(mrb, str, 0, 0); - return v; + struct RString *s = str_new(mrb, 0, 0); + + str_with_class(mrb, s, str); + return mrb_obj_value(s); } mrb_value -mrb_str_buf_new(mrb_state *mrb, size_t capa) +mrb_str_buf_new(mrb_state *mrb, int capa) { struct RString *s; @@ -553,14 +177,14 @@ mrb_str_buf_new(mrb_state *mrb, size_t capa) } mrb_value -str_buf_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len) +str_buf_cat(mrb_state *mrb, mrb_value str, const char *ptr, int len) { long capa, total, off = -1; + str_modify(mrb, str); if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) { off = ptr - RSTRING_PTR(str); } - mrb_str_modify(mrb, str); if (len == 0) return mrb_fixnum_value(0); capa = RSTRING_CAPA(str); if (RSTRING_LEN(str) >= LONG_MAX - len) { @@ -588,61 +212,29 @@ str_buf_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len) } mrb_value -mrb_str_buf_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len) +mrb_str_buf_cat(mrb_state *mrb, mrb_value str, const char *ptr, int len) { if (len == 0) return str; return str_buf_cat(mrb, str, ptr, len); } -/* - * call-seq: - * String.new(str="") => new_str - * - * Returns a new string object containing a copy of <i>str</i>. 
- */ - mrb_value -mrb_str_new(mrb_state *mrb, const char *p, size_t len) +mrb_str_new(mrb_state *mrb, const char *p, int len) { - struct RString *s; - - if (len == 0) { - return mrb_str_buf_new(mrb, len); - } - s = mrb_obj_alloc_string(mrb); - s->buf = mrb_malloc(mrb, len+1); - if (p) { - memcpy(s->buf, p, len); - } - s->len = len; - s->aux.capa = len; - s->buf[len] ='\0'; + struct RString *s = str_new(mrb, p, len); return mrb_obj_value(s); } -/* ptr==0 is error */ mrb_value mrb_str_new2(mrb_state *mrb, const char *ptr) { + struct RString *s; if (!ptr) { mrb_raise(mrb, E_ARGUMENT_ERROR, "NULL pointer given"); } -#ifdef INCLUDE_ENCODING - return mrb_usascii_str_new2(mrb, ptr); -#else - return mrb_str_new(mrb, ptr, strlen(ptr)); -#endif //INCLUDE_ENCODING -} - -#ifdef INCLUDE_ENCODING -mrb_value -mrb_enc_str_new(mrb_state *mrb, const char *ptr, long len, mrb_encoding *enc) -{ - mrb_value str = mrb_str_new(mrb, ptr, len); - mrb_enc_associate(mrb, str, enc); - return str; + s = str_new(mrb, ptr, strlen(ptr)); + return mrb_obj_value(s); } -#endif //INCLUDE_ENCODING /* * call-seq: (Caution! 
NULL string) @@ -655,7 +247,7 @@ mrb_value mrb_str_new_cstr(mrb_state *mrb, const char *p) { struct RString *s; - size_t len = strlen(p); + int len = strlen(p); s = mrb_obj_alloc_string(mrb); s->buf = mrb_malloc(mrb, len+1); @@ -667,6 +259,32 @@ mrb_str_new_cstr(mrb_state *mrb, const char *p) return mrb_obj_value(s); } +static struct RString* +str_make_shared(mrb_state *mrb, mrb_value str) +{ + struct RString *orig, *s; + + s = str_new(mrb, 0, 0); + str_with_class(mrb, s, str); + orig = mrb_str_ptr(str); + if (!(orig->flags & MRB_STR_SHARED)) { + struct RString *shared = mrb_obj_alloc_string(mrb); + + shared->buf = orig->buf; + shared->len = orig->len; + shared->aux.capa = orig->aux.capa; + + orig->aux.shared = shared; + orig->flags |= MRB_STR_SHARED; + } + s->buf = orig->buf; + s->len = orig->len; + s->aux.shared = orig->aux.shared; + s->flags |= MRB_STR_SHARED; + + return s; +} + /* * call-seq: (Caution! string literal) * String.new(str="") => new_str @@ -675,11 +293,21 @@ mrb_str_new_cstr(mrb_state *mrb, const char *p) */ mrb_value -mrb_str_literal(mrb_state *mrb, mrb_value lit) +mrb_str_literal(mrb_state *mrb, mrb_value str) { - struct RString *s = mrb_str_ptr(lit); + struct RString *orig, *s; - return mrb_str_new(mrb, s->buf, s->len); + s = str_new(mrb, 0, 0); + orig = mrb_str_ptr(str); + while (orig->flags & MRB_STR_SHARED) { + orig = orig->aux.shared; + } + s->buf = orig->buf; + s->len = orig->len; + s->aux.shared = orig; + s->flags |= MRB_STR_SHARED; + + return mrb_obj_value(s); } /* @@ -707,7 +335,7 @@ void mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other) { struct RString *s1 = mrb_str_ptr(self), *s2; - size_t len; + int len; if (mrb_type(other) != MRB_TT_STRING) { other = mrb_str_to_str(mrb, other); @@ -736,14 +364,12 @@ mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b) struct RString *s = mrb_str_ptr(a); struct RString *s2 = mrb_str_ptr(b); struct RString *t; - mrb_value r; - r = mrb_str_new(mrb, 0, s->len + s2->len); - t = 
mrb_str_ptr(r); + t = str_new(mrb, 0, s->len + s2->len); memcpy(t->buf, s->buf, s->len); memcpy(t->buf + s->len, s2->buf, s2->len); - return r; + return mrb_obj_value(t); } /* 15.2.10.5.2 */ @@ -757,30 +383,7 @@ mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b) static mrb_value mrb_str_plus_m(mrb_state *mrb, mrb_value self) { - mrb_value str3; - mrb_value str2; -#ifdef INCLUDE_ENCODING - mrb_encoding *enc; -#endif //INCLUDE_ENCODING - - //mrb_get_args(mrb, "s", &p, &len); - mrb_get_args(mrb, "o", &str2); - - mrb_string_value(mrb, &str2); -#ifdef INCLUDE_ENCODING - enc = mrb_enc_check(mrb, self, str2); -#endif //INCLUDE_ENCODING - str3 = mrb_str_new(mrb, 0, RSTRING_LEN(self)+RSTRING_LEN(str2)); - memcpy(RSTRING_PTR(str3), RSTRING_PTR(self), RSTRING_LEN(self)); - memcpy(RSTRING_PTR(str3) + RSTRING_LEN(self), - RSTRING_PTR(str2), RSTRING_LEN(str2)); - RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0'; -#ifdef INCLUDE_ENCODING - ENCODING_CODERANGE_SET(mrb, str3, mrb_enc_to_index(enc), - ENC_CODERANGE_AND(ENC_CODERANGE(self), ENC_CODERANGE(str2))); -#endif //INCLUDE_ENCODING - - return str3; + return mrb_nil_value(); } /* @@ -793,7 +396,6 @@ static mrb_value mrb_str_bytesize(mrb_state *mrb, mrb_value self) { struct RString *s = mrb_str_ptr(self); - return mrb_fixnum_value(s->len); } @@ -808,26 +410,11 @@ mrb_str_bytesize(mrb_state *mrb, mrb_value self) mrb_value mrb_str_size(mrb_state *mrb, mrb_value self) { -#ifdef INCLUDE_ENCODING - long len; - - len = str_strlen(mrb, self, STR_ENC_GET(mrb, self)); - return mrb_fixnum_value(len); -#else - return mrb_str_bytesize(mrb, self); -#endif //INCLUDE_ENCODING -} - -void -mrb_str_modify(mrb_state *mrb, mrb_value str) -{ - if (!str_independent(str)) - str_make_independent(mrb, str); + struct RString *s = mrb_str_ptr(self); + return mrb_fixnum_value(s->len); } - /* 15.2.10.5.1 */ - /* * call-seq: * str * integer => new_str @@ -840,12 +427,11 @@ mrb_str_modify(mrb_state *mrb, mrb_value str) static mrb_value mrb_str_times(mrb_state 
*mrb, mrb_value self) { - mrb_value str2; mrb_int n,len,times; - char *ptr2; + struct RString *str2; + char *p; mrb_get_args(mrb, "i", &times); - if (times < 0) { mrb_raise(mrb, E_ARGUMENT_ERROR, "negative argument"); } @@ -853,22 +439,22 @@ mrb_str_times(mrb_state *mrb, mrb_value self) mrb_raise(mrb, E_ARGUMENT_ERROR, "argument too big"); } - str2 = mrb_str_new5(mrb, self, 0, len = RSTRING_LEN(self)*times); - ptr2 = RSTRING_PTR(str2); + len = RSTRING_LEN(self)*times; + str2 = str_new(mrb, 0, len); + str_with_class(mrb, str2, self); + p = str2->buf; if (len > 0) { n = RSTRING_LEN(self); - memcpy(ptr2, RSTRING_PTR(self), n); + memcpy(p, RSTRING_PTR(self), n); while (n <= len/2) { - memcpy(ptr2 + n, ptr2, n); + memcpy(p + n, p, n); n *= 2; } - memcpy(ptr2 + n, ptr2, len-n); + memcpy(p + n, p, len-n); } - ptr2[RSTRING_LEN(str2)] = '\0'; - - mrb_enc_cr_str_copy_for_substr(mrb, str2, self); + p[str2->len] = '\0'; - return str2; + return mrb_obj_value(str2); } /* -------------------------------------------------------------- */ @@ -941,8 +527,7 @@ mrb_str_cmp_m(mrb_state *mrb, mrb_value str1) else if (!mrb_respond_to(mrb, str2, mrb_intern(mrb, "<=>"))) { return mrb_nil_value(); } - else - { + else { mrb_value tmp = mrb_funcall(mrb, str2, "<=>", 1, str1); if (mrb_nil_p(tmp)) return mrb_nil_value(); @@ -958,55 +543,12 @@ mrb_str_cmp_m(mrb_state *mrb, mrb_value str1) return mrb_fixnum_value(result); } -#ifdef INCLUDE_ENCODING -int -mrb_str_comparable(mrb_state *mrb, mrb_value str1, mrb_value str2) -{ - int idx1, idx2; - int rc1, rc2; - - if (RSTRING_LEN(str1) == 0) return TRUE; - if (RSTRING_LEN(str2) == 0) return TRUE; - idx1 = ENCODING_GET(mrb, str1); - idx2 = ENCODING_GET(mrb, str2); - if (idx1 == idx2) return TRUE; - rc1 = mrb_enc_str_coderange(mrb, str1); - rc2 = mrb_enc_str_coderange(mrb, str2); - if (rc1 == ENC_CODERANGE_7BIT) { - if (rc2 == ENC_CODERANGE_7BIT) return TRUE; - if (mrb_enc_asciicompat(mrb, mrb_enc_from_index(mrb, idx2))) - return TRUE; - } - if (rc2 == 
ENC_CODERANGE_7BIT) { - if (mrb_enc_asciicompat(mrb, mrb_enc_from_index(mrb, idx1))) - return TRUE; - } - return FALSE; -} - -int -mrb_str_hash_cmp(mrb_state *mrb, mrb_value str1, mrb_value str2) -{ - long len; - - if (!mrb_str_comparable(mrb, str1, str2)) return 1; - if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) && - memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) { - return 0; - } - return 1; -} -#endif //INCLUDE_ENCODING - static int str_eql(mrb_state *mrb, const mrb_value str1, const mrb_value str2) { const long len = RSTRING_LEN(str1); if (len != RSTRING_LEN(str2)) return FALSE; -#ifdef INCLUDE_ENCODING - if (!mrb_str_comparable(mrb, str1, str2)) return FALSE; -#endif //INCLUDE_ENCODING if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) return TRUE; return FALSE; @@ -1100,202 +642,9 @@ mrb_string_value_ptr(mrb_state *mrb, mrb_value ptr) static mrb_value mrb_str_match(mrb_state *mrb, mrb_value self/* x */) { - mrb_value y; - - mrb_get_args(mrb, "o", &y); - switch (mrb_type(y)) { - case MRB_TT_STRING: - mrb_raise(mrb, E_TYPE_ERROR, "type mismatch: String given"); - case MRB_TT_REGEX: -#ifdef INCLUDE_REGEXP - return mrb_reg_match_str(mrb, y, self); -#else - mrb_raise(mrb, E_TYPE_ERROR, "Regexp Class not supported"); -#endif //INCLUDE_REGEXP - default: - if (mrb_respond_to(mrb, y, mrb_intern(mrb, "=~"))) { - return mrb_funcall(mrb, y, "=~", 1, self); - } - else { - return mrb_nil_value(); - } - } -} -/* ---------------------------------- */ -mrb_value -mrb_str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, int len) -{ -#ifdef INCLUDE_ENCODING - mrb_encoding *enc = STR_ENC_GET(mrb, str); -#endif //INCLUDE_ENCODING - mrb_value str2; -#ifdef INCLUDE_ENCODING - char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str); -#else - char *p, *s = RSTRING_PTR(str); -#endif //INCLUDE_ENCODING - - if (len < 0) return mrb_nil_value(); - if (!RSTRING_LEN(str)) { - len = 0; - } -#ifdef INCLUDE_ENCODING - if (single_byte_optimizable(mrb, str)) { -#endif 
//INCLUDE_ENCODING - if (beg > RSTRING_LEN(str)) return mrb_nil_value(); - if (beg < 0) { - beg += RSTRING_LEN(str); - if (beg < 0) return mrb_nil_value(); - } - if (beg + len > RSTRING_LEN(str)) - len = RSTRING_LEN(str) - beg; - if (len <= 0) { - len = 0; - p = 0; - } - else - p = s + beg; -#ifdef INCLUDE_ENCODING - goto sub; - } - if (beg < 0) { - if (len > -beg) len = -beg; - if (-beg * mrb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) { - beg = -beg; - while (beg-- > len && (e = mrb_enc_prev_char(s, e, e, enc)) != 0); - p = e; - if (!p) return mrb_nil_value(); - while (len-- > 0 && (p = mrb_enc_prev_char(s, p, e, enc)) != 0); - if (!p) return mrb_nil_value(); - len = e - p; - goto sub; - } - else { - beg += str_strlen(mrb, str, enc); - if (beg < 0) return mrb_nil_value(); - } - } - else if (beg > 0 && beg > str_strlen(mrb, str, enc)) { - return mrb_nil_value(); - } - if (len == 0) { - p = 0; - } - else if (mrb_enc_mbmaxlen(enc) == mrb_enc_mbminlen(enc)) { - int char_sz = mrb_enc_mbmaxlen(enc); - - p = s + beg * char_sz; - if (p > e) { - p = e; - len = 0; - } - else if (len * char_sz > e - p) - len = e - p; - else - len *= char_sz; - } - else if ((p = str_nth(mrb, s, e, beg, enc, 0)) == e) { - len = 0; - } - else { - len = str_offset(mrb, p, e, len, enc, 0); - } -sub: -#endif //INCLUDE_ENCODING - if (len > STR_BUF_MIN_SIZE && beg + len == RSTRING_LEN(str)) { -#ifdef INCLUDE_ENCODING - str2 = mrb_str_new4(mrb, str); - str2 = str_new3(mrb, mrb_obj_class(mrb, str2), str2); -#else - str2 = mrb_str_new(mrb, s, RSTRING_LEN(str)); -#endif //INCLUDE_ENCODING - RSTRING(str2)->buf += RSTRING(str2)->len - len; - RSTRING(str2)->len = len; - } - else { - str2 = mrb_str_new5(mrb, str, p, len); - mrb_enc_cr_str_copy_for_substr(mrb, str2, str); - } - - return str2; -} - -#ifdef INCLUDE_REGEXP -static mrb_value -mrb_str_subpat(mrb_state *mrb, mrb_value str, mrb_value re, mrb_int backref) -{ - if (mrb_reg_search(mrb, re, str, 0, 0) >= 0) { - mrb_value match = mrb_backref_get(mrb); - 
int nth = mrb_reg_backref_number(mrb, match, mrb_fixnum_value(backref)); - return mrb_reg_nth_match(mrb, nth, mrb_backref_get(mrb)); - } return mrb_nil_value(); } -#endif //INCLUDE_REGEXP - -/* --- 1-8-7parse.c --> */ - -#ifdef INCLUDE_ENCODING -long -mrb_enc_strlen_cr(mrb_state *mrb, const char *p, const char *e, mrb_encoding *enc, int *cr) -{ - long c; - const char *q; - int ret; - - *cr = 0; - if (mrb_enc_mbmaxlen(enc) == mrb_enc_mbminlen(enc)) { - return (e - p + mrb_enc_mbminlen(enc) - 1) / mrb_enc_mbminlen(enc); - } - else if (mrb_enc_asciicompat(mrb, enc)) { - c = 0; - while (p < e) { - if (ISASCII(*p)) { - q = search_nonascii(p, e); - if (!q) { - if (!*cr) *cr = ENC_CODERANGE_7BIT; - return c + (e - p); - } - c += q - p; - p = q; - } - ret = mrb_enc_precise_mbclen(p, e, enc); - if (MBCLEN_CHARFOUND_P(ret)) { - *cr |= ENC_CODERANGE_VALID; - p += MBCLEN_CHARFOUND_LEN(ret); - } - else { - *cr = ENC_CODERANGE_BROKEN; - p++; - } - c++; - } - if (!*cr) *cr = ENC_CODERANGE_7BIT; - return c; - } - - for (c=0; p<e; c++) { - ret = mrb_enc_precise_mbclen(p, e, enc); - if (MBCLEN_CHARFOUND_P(ret)) { - *cr |= ENC_CODERANGE_VALID; - p += MBCLEN_CHARFOUND_LEN(ret); - } - else { - *cr = ENC_CODERANGE_BROKEN; - if (p + mrb_enc_mbminlen(enc) <= e) - p += mrb_enc_mbminlen(enc); - else - p = e; - } - } - if (!*cr) *cr = ENC_CODERANGE_7BIT; - return c; -} -#endif //INCLUDE_ENCODING -/* --- 1-8-7parse.c --< */ - -#ifndef INCLUDE_ENCODING static inline long mrb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n) { @@ -1308,7 +657,7 @@ mrb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long qstable[i] = m + 1; for (; x < xe; ++x) qstable[*x] = xe - x; - /* Searching */ + /* Searching */ for (; y + m <= ys + n; y += *(qstable + y[m])) { if (*xs == *y && memcmp(xs, y, m) == 0) return y - ys; @@ -1316,7 +665,7 @@ mrb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long return -1; } -int +static int 
mrb_memsearch(const void *x0, int m, const void *y0, int n) { const unsigned char *x = x0, *y = y0; @@ -1328,7 +677,7 @@ mrb_memsearch(const void *x0, int m, const void *y0, int n) else if (m < 1) { return 0; } - else if (m == 1) { + else if (m == 1) { const unsigned char *ys = y, *ye = ys + n; for (; y < ye; ++y) { if (*x == *y) @@ -1338,60 +687,22 @@ mrb_memsearch(const void *x0, int m, const void *y0, int n) } return mrb_memsearch_qs(x0, m, y0, n); } -#endif //INCLUDE_ENCODING - -/* --- 1-8-7parse.c --< */ -#ifdef INCLUDE_ENCODING -static long -str_strlen(mrb_state *mrb, mrb_value str, mrb_encoding *enc) -{ - const char *p, *e; - long n; - int cr; - - if (single_byte_optimizable(mrb, str)) return RSTRING_LEN(str); - if (!enc) enc = STR_ENC_GET(mrb, str); - p = RSTRING_PTR(str); - e = RSTRING_END(str); - cr = ENC_CODERANGE(str); - n = mrb_enc_strlen_cr(mrb, p, e, enc, &cr); - if (cr) { - ENC_CODERANGE_SET(str, cr); - } - return n; -} -#endif //INCLUDE_ENCODING static mrb_int mrb_str_index(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int offset) { mrb_int pos; - char *s, *sptr, *e; + char *s, *sptr; int len, slen; -#ifdef INCLUDE_ENCODING - mrb_encoding *enc; - - enc = mrb_enc_check(mrb, str, sub); - if (is_broken_string(mrb, sub)) { - return -1; - } - len = str_strlen(mrb, str, enc); - slen = str_strlen(mrb, sub, enc); -#else len = RSTRING_LEN(str); slen = RSTRING_LEN(sub); -#endif //INCLUDE_ENCODING if (offset < 0) { offset += len; if (offset < 0) return -1; } if (len - offset < slen) return -1; s = RSTRING_PTR(str); - e = s + RSTRING_LEN(str); if (offset) { -#ifdef INCLUDE_ENCODING - offset = str_offset(mrb, s, RSTRING_END(str), offset, enc, single_byte_optimizable(mrb, str)); -#endif //INCLUDE_ENCODING s += offset; } if (slen == 0) return offset; @@ -1399,39 +710,18 @@ mrb_str_index(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int offset) sptr = RSTRING_PTR(sub); slen = RSTRING_LEN(sub); len = RSTRING_LEN(str) - offset; -#ifdef INCLUDE_ENCODING - for 
(;;) { - char *t; - pos = mrb_memsearch(mrb, sptr, slen, s, len, enc); - if (pos < 0) return pos; - t = mrb_enc_right_char_head(s, s+pos, e, enc); - if (t == s + pos) break; - if ((len -= t - s) <= 0) return -1; - offset += t - s; - s = t; - } -#else - pos = mrb_memsearch(sptr, slen, s+offset, len-offset); + pos = mrb_memsearch(sptr, slen, s, len); if (pos < 0) return pos; -#endif //INCLUDE_ENCODING return pos + offset; } mrb_value mrb_str_dup(mrb_state *mrb, mrb_value str) { + /* should return shared string */ struct RString *s = mrb_str_ptr(str); - struct RString *dup; - dup = mrb_obj_alloc_string(mrb); - dup->buf = mrb_malloc(mrb, s->len+1); - if (s->buf) { - memcpy(dup->buf, s->buf, s->len); - dup->buf[s->len] = 0; - } - dup->len = s->len; - dup->aux.capa = s->len; - return mrb_obj_value(dup); + return mrb_str_new(mrb, s->buf, s->len); } static mrb_value @@ -1467,18 +757,14 @@ num_index: mrb_int beg, len; mrb_value tmp; -#ifdef INCLUDE_ENCODING - len = str_strlen(mrb, str, STR_ENC_GET(mrb, str)); -#else len = RSTRING_LEN(str); -#endif //INCLUDE_ENCODING switch (mrb_range_beg_len(mrb, indx, &beg, &len, len, 0)) { case 0/*FLASE*/: break; case 2/*OTHER*/: return mrb_nil_value(); default: - tmp = mrb_str_substr(mrb, str, beg, len); + tmp = mrb_str_subseq(mrb, str, beg, len); return tmp; } } @@ -1539,12 +825,12 @@ num_index: static mrb_value mrb_str_aref_m(mrb_state *mrb, mrb_value str) { + mrb_value a1, a2; int argc; - mrb_value *argv; - mrb_get_args(mrb, "*", &argv, &argc); + argc = mrb_get_args(mrb, "o|o", &a1, &a2); if (argc == 2) { - if (mrb_type(argv[0]) == MRB_TT_REGEX) { + if (mrb_type(a1) == MRB_TT_REGEX) { #ifdef INCLUDE_REGEXP return mrb_str_subpat(mrb, str, argv[0], mrb_fixnum(argv[1])); #else @@ -1552,37 +838,13 @@ mrb_str_aref_m(mrb_state *mrb, mrb_value str) return mrb_nil_value(); #endif //INCLUDE_REGEXP } - return mrb_str_substr(mrb, str, mrb_fixnum(argv[0]), mrb_fixnum(argv[1])); + return mrb_str_substr(mrb, str, mrb_fixnum(a1), mrb_fixnum(a2)); } 
if (argc != 1) { mrb_raise(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%d for 1)", argc); } - return mrb_str_aref(mrb, str, argv[0]); -} - -#ifdef INCLUDE_ENCODING -/* As mrb_str_modify(), but don't clear coderange */ -static void -str_modify_keep_cr(mrb_state *mrb, mrb_value str) -{ - if (!str_independent(str)) - str_make_independent(mrb, str); - if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN) - /* Force re-scan later */ - ENC_CODERANGE_CLEAR(str); -} - -static void -mrb_str_check_dummy_enc(mrb_state *mrb, mrb_encoding *enc) -{ - if (mrb_enc_dummy_p(enc)) { - mrb_raise(mrb, E_ENCODING_ERROR, "incompatible encoding with this operation: %s", - mrb_enc_name(enc)); - } + return mrb_str_aref(mrb, str, a1); } -#else -#define str_modify_keep_cr(mrb, str) mrb_str_modify((mrb), (str)) -#endif //INCLUDE_ENCODING /* 15.2.10.5.8 */ /* @@ -1600,39 +862,12 @@ mrb_str_check_dummy_enc(mrb_state *mrb, mrb_encoding *enc) static mrb_value mrb_str_capitalize_bang(mrb_state *mrb, mrb_value str) { -#ifdef INCLUDE_ENCODING - mrb_encoding *enc; -#endif //INCLUDE_ENCODING char *s, *send; int modify = 0; -#ifdef INCLUDE_ENCODING - unsigned int c; - int n; -#endif //INCLUDE_ENCODING - str_modify_keep_cr(mrb, str); -#ifdef INCLUDE_ENCODING - enc = STR_ENC_GET(mrb, str); - mrb_str_check_dummy_enc(mrb, enc); -#endif //INCLUDE_ENCODING + str_modify(mrb, str); if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return mrb_nil_value(); s = RSTRING_PTR(str); send = RSTRING_END(str); -#ifdef INCLUDE_ENCODING - c = mrb_enc_codepoint_len(mrb, s, send, &n, enc); - if (mrb_enc_islower(c, enc)) { - mrb_enc_mbcput(mrb_enc_toupper(c, enc), s, enc); - modify = 1; - } - s += n; - while (s < send) { - c = mrb_enc_codepoint_len(mrb, s, send, &n, enc); - if (mrb_enc_isupper(c, enc)) { - mrb_enc_mbcput(mrb_enc_tolower(c, enc), s, enc); - modify = 1; - } - s += n; - } -#else if (ISLOWER(*s)) { *s = toupper(*s); modify = 1; @@ -1643,7 +878,6 @@ mrb_str_capitalize_bang(mrb_state *mrb, mrb_value str) modify = 1; 
} } -#endif //INCLUDE_ENCODING if (modify) return str; return mrb_nil_value(); } @@ -1681,71 +915,34 @@ mrb_str_capitalize(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_chomp_bang(mrb_state *mrb, mrb_value str) { - mrb_value *argv; - int argc; -#ifdef INCLUDE_ENCODING - mrb_encoding *enc; -#endif //INCLUDE_ENCODING mrb_value rs; mrb_int newline; - char *p, *pp, *e; + char *p, *pp; long len, rslen; - str_modify_keep_cr(mrb, str); + str_modify(mrb, str); len = RSTRING_LEN(str); - if (len == 0) return mrb_nil_value(); - p = RSTRING_PTR(str); - e = p + len; - //if (mrb_scan_args(argc, argv, "01", &rs) == 0) { - mrb_get_args(mrb, "*", &argv, &argc); - if (argc == 0) { - rs = mrb_str_new2(mrb, "\n"); -smart_chomp: -#ifdef INCLUDE_ENCODING - enc = mrb_enc_get(mrb, str); - if (mrb_enc_mbminlen(enc) > 1) { - pp = mrb_enc_left_char_head(p, e-mrb_enc_mbminlen(enc), e, enc); - if (mrb_enc_is_newline(pp, e, enc)) { - e = pp; + if (mrb_get_args(mrb, "|S", &rs) == 0) { + if (len == 0) return mrb_nil_value(); + smart_chomp: + if (RSTRING_PTR(str)[len-1] == '\n') { + STR_DEC_LEN(str); + if (RSTRING_LEN(str) > 0 && + RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') { + STR_DEC_LEN(str); } - pp = e - mrb_enc_mbminlen(enc); - if (pp >= p) { - pp = mrb_enc_left_char_head(p, pp, e, enc); - if (mrb_enc_ascget(mrb, pp, e, 0, enc) == '\r') { - e = pp; - } - } - if (e == RSTRING_END(str)) { - return mrb_nil_value(); - } - len = e - RSTRING_PTR(str); - STR_SET_LEN(str, len); + } + else if (RSTRING_PTR(str)[len-1] == '\r') { + STR_DEC_LEN(str); } else { -#endif //INCLUDE_ENCODING - if (RSTRING_PTR(str)[len-1] == '\n') { - STR_DEC_LEN(str); - if (RSTRING_LEN(str) > 0 && - RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') { - STR_DEC_LEN(str); - } - } - else if (RSTRING_PTR(str)[len-1] == '\r') { - STR_DEC_LEN(str); - } - else { - return mrb_nil_value(); - } -#ifdef INCLUDE_ENCODING + return mrb_nil_value(); } -#endif //INCLUDE_ENCODING - RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; return 
str; } - rs = argv[0]; - if (mrb_nil_p(rs)) return mrb_nil_value(); - //StringValue(rs); - mrb_string_value(mrb, &rs); + + if (len == 0 || mrb_nil_p(rs)) return mrb_nil_value(); + p = RSTRING_PTR(str); rslen = RSTRING_LEN(rs); if (rslen == 0) { while (len>0 && p[len-1] == '\n') { @@ -1755,7 +952,7 @@ smart_chomp: } if (len < RSTRING_LEN(str)) { STR_SET_LEN(str, len); - RSTRING_PTR(str)[len] = '\0'; + p[len] = '\0'; return str; } return mrb_nil_value(); @@ -1763,29 +960,16 @@ smart_chomp: if (rslen > len) return mrb_nil_value(); newline = RSTRING_PTR(rs)[rslen-1]; if (rslen == 1 && newline == '\n') + newline = RSTRING_PTR(rs)[rslen-1]; + if (rslen == 1 && newline == '\n') goto smart_chomp; -#ifdef INCLUDE_ENCODING - enc = mrb_enc_check(mrb, str, rs); - if (is_broken_string(mrb, rs)) { - return mrb_nil_value(); - } - pp = e - rslen; -#else pp = p + len - rslen; -#endif //INCLUDE_ENCODING if (p[len-1] == newline && (rslen <= 1 || memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) { -#ifdef INCLUDE_ENCODING - if (mrb_enc_left_char_head(p, pp, e, enc) != pp) - return mrb_nil_value(); - if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { - ENC_CODERANGE_CLEAR(str); - } -#endif //INCLUDE_ENCODING - STR_SET_LEN(str, RSTRING_LEN(str) - rslen); - RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; + STR_SET_LEN(str, len - rslen); + p[len] = '\0'; return str; } return mrb_nil_value(); @@ -1820,26 +1004,6 @@ mrb_str_chomp(mrb_state *mrb, mrb_value self) return str; } -#ifdef INCLUDE_ENCODING -static long -chopped_length(mrb_state *mrb, mrb_value str) -{ - mrb_encoding *enc = STR_ENC_GET(mrb, str); - const char *p, *p2, *beg, *end; - - beg = RSTRING_PTR(str); - end = beg + RSTRING_LEN(str); - if (beg > end) return 0; - p = mrb_enc_prev_char(beg, end, end, enc); - if (!p) return 0; - if (p > beg && mrb_enc_ascget(mrb, p, end, 0, enc) == '\n') { - p2 = mrb_enc_prev_char(beg, p, end, enc); - if (p2 && mrb_enc_ascget(mrb, p2, end, 0, enc) == '\r') p = p2; - } - return p - beg; -} -#endif 
//INCLUDE_ENCODING - /* 15.2.10.5.12 */ /* * call-seq: @@ -1852,13 +1016,9 @@ chopped_length(mrb_state *mrb, mrb_value str) static mrb_value mrb_str_chop_bang(mrb_state *mrb, mrb_value str) { - str_modify_keep_cr(mrb, str); + str_modify(mrb, str); if (RSTRING_LEN(str) > 0) { -#ifdef INCLUDE_ENCODING - long len; - len = chopped_length(mrb, str); -#else - size_t len; + int len; len = RSTRING_LEN(str) - 1; if (RSTRING_PTR(str)[len] == '\n') { if (len > 0 && @@ -1866,14 +1026,8 @@ mrb_str_chop_bang(mrb_state *mrb, mrb_value str) len--; } } -#endif //INCLUDE_ENCODING STR_SET_LEN(str, len); RSTRING_PTR(str)[len] = '\0'; -#ifdef INCLUDE_ENCODING - if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { - ENC_CODERANGE_CLEAR(str); - } -#endif //INCLUDE_ENCODING return str; } return mrb_nil_value(); @@ -1900,13 +1054,8 @@ static mrb_value mrb_str_chop(mrb_state *mrb, mrb_value self) { mrb_value str; -#ifdef INCLUDE_ENCODING - str = mrb_str_new5(mrb, self, RSTRING_PTR(self), chopped_length(mrb, self)); - mrb_enc_cr_str_copy_for_substr(mrb, str, self); -#else str = mrb_str_dup(mrb, self); mrb_str_chop_bang(mrb, str); -#endif //INCLUDE_ENCODING return str; } @@ -1921,62 +1070,20 @@ mrb_str_chop(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_downcase_bang(mrb_state *mrb, mrb_value str) { -#ifdef INCLUDE_ENCODING - mrb_encoding *enc; -#endif //INCLUDE_ENCODING char *s, *send; int modify = 0; - str_modify_keep_cr(mrb, str); -#ifdef INCLUDE_ENCODING - enc = STR_ENC_GET(mrb, str); - mrb_str_check_dummy_enc(mrb, enc); -#endif //INCLUDE_ENCODING - s = RSTRING_PTR(str); send = RSTRING_END(str); -#ifdef INCLUDE_ENCODING - if (single_byte_optimizable(mrb, str)) { -#endif //INCLUDE_ENCODING - while (s < send) { - unsigned int c = *(unsigned char*)s; - -#ifdef INCLUDE_ENCODING - if (mrb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { -#else - if ('A' <= c && c <= 'Z') { -#endif //INCLUDE_ENCODING - *s = 'a' + (c - 'A'); - modify = 1; - } - s++; + str_modify(mrb, str); + s = 
RSTRING_PTR(str); + send = RSTRING_END(str); + while (s < send) { + if (ISUPPER(*s)) { + *s = tolower(*s); + modify = 1; } -#ifdef INCLUDE_ENCODING + s++; } - else { - int ascompat = mrb_enc_asciicompat(mrb, enc); - while (s < send) { - unsigned int c; - int n; - - if (ascompat && (c = *(unsigned char*)s) < 0x80) { - if (mrb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { - *s = 'a' + (c - 'A'); - modify = 1; - } - s++; - } - else { - c = mrb_enc_codepoint_len(mrb, s, send, &n, enc); - if (mrb_enc_isupper(c, enc)) { - /* assuming toupper returns codepoint with same size */ - mrb_enc_mbcput(mrb_enc_tolower(c, enc), s, enc); - modify = 1; - } - s += n; - } - } - } -#endif //INCLUDE_ENCODING if (modify) return str; return mrb_nil_value(); } @@ -2037,62 +1144,7 @@ mrb_str_downcase(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_each_line(mrb_state *mrb, mrb_value str) { - mrb_value rs; - int newline; - struct RString *ps = mrb_str_ptr(str); - char *p = ps->buf, *pend = p + ps->len, *s; - char *ptr = p; - long len = ps->len, rslen; - mrb_value line; - struct RString *prs; - mrb_value *argv, b; - int argc; - - //if (mrb_scan_args(argc, argv, "01", &rs) == 0) { - mrb_get_args(mrb, "*&", &argv, &argc, &b); - if (argc > 0) { - rs = argv[0]; - } else { - rs = mrb_str_new2(mrb, "\n"); - } - /*RETURN_ENUMERATOR(str, argc, argv);*/ - if (mrb_nil_p(rs)) { - mrb_yield(mrb, b, str); - return str; - } - //StringValue(rs); - mrb_string_value(mrb, &rs); - prs = mrb_str_ptr(rs); - rslen = prs->len; - if (rslen == 0) { - newline = '\n'; - } - else { - newline = prs->buf[rslen-1]; - } - - for (s = p, p += rslen; p < pend; p++) { - if (rslen == 0 && *p == '\n') { - if (*++p != '\n') continue; - while (*p == '\n') p++; - } - if (ps->buf < p && p[-1] == newline && - (rslen <= 1 || - memcmp(prs->buf, p-rslen, rslen) == 0)) { - line = mrb_str_new5(mrb, str, s, p - s); - mrb_yield(mrb, b, line); - str_mod_check(mrb, str, ptr, len); - s = p; - } - } - - if (s != pend) { - if (p > pend) 
p = pend; - line = mrb_str_new5(mrb, str, s, p - s); - mrb_yield(mrb, b, line); - } - - return str; + return mrb_nil_value(); } /* 15.2.10.5.16 */ @@ -2106,7 +1158,7 @@ mrb_str_each_line(mrb_state *mrb, mrb_value str) * "".empty? #=> true */ static mrb_value -mrb_str_empty(mrb_state *mrb, mrb_value self) +mrb_str_empty_p(mrb_state *mrb, mrb_value self) { struct RString *s = mrb_str_ptr(self); @@ -2135,308 +1187,48 @@ mrb_str_eql(mrb_state *mrb, mrb_value self) return mrb_false_value(); } -#ifdef INCLUDE_ENCODING -static void -mrb_enc_cr_str_copy_for_substr(mrb_state *mrb, mrb_value dest, mrb_value src) -{ - /* this function is designed for copying encoding and coderange - * from src to new string "dest" which is made from the part of src. - */ - str_enc_copy(mrb, dest, src); - switch (ENC_CODERANGE(src)) { - case ENC_CODERANGE_7BIT: - ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); - break; - case ENC_CODERANGE_VALID: - if (!mrb_enc_asciicompat(mrb, STR_ENC_GET(mrb, src)) || - search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest))) - ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); - else - ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); - break; - default: - if (RSTRING_LEN(dest) == 0) { - if (!mrb_enc_asciicompat(mrb, STR_ENC_GET(mrb, src))) - ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); - else - ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); - } - break; - } -} -#endif //INCLUDE_ENCODING - static mrb_value -str_replace_shared(mrb_state *mrb, mrb_value str2, mrb_value str) -{ - str = mrb_str_new_frozen(mrb, str); - RSTRING(str2)->len = RSTRING_LEN(str); - RSTRING(str2)->buf = RSTRING_PTR(str); - RSTRING_SHARED(str2) = mrb_str_ptr(str); - FL_SET(str2, MRB_STR_SHARED); - mrb_enc_cr_str_exact_copy(mrb, str2, str); - - return str2; -} - -static mrb_value -str_new_shared(mrb_state *mrb, struct RClass* klass, mrb_value str) -{ - return str_replace_shared(mrb, str_alloc(mrb), str); -} - -mrb_value -str_new3(mrb_state *mrb, struct RClass* klass, mrb_value str) -{ - return 
str_new_shared(mrb, klass, str); -} - -mrb_value -mrb_str_new_shared(mrb_state *mrb, mrb_value str) -{ - mrb_value str2 = str_new3(mrb, mrb_obj_class(mrb, str), str); - - return str2; -} - -mrb_value -mrb_str_new_frozen(mrb_state *mrb, mrb_value orig) +mrb_str_subseq(mrb_state *mrb, mrb_value str, long beg, long len) { - struct RClass* klass; - mrb_value str; - - klass = mrb_obj_class(mrb, orig); + struct RString *s; - if (MRB_STR_SHARED_P(orig) && RSTRING_SHARED(orig)) { - long ofs; - ofs = RSTRING_LEN(str) - RSTRING_SHARED(orig)->len; -#ifdef INCLUDE_ENCODING - if ((ofs > 0) || (klass != RBASIC(str)->c) || - ENCODING_GET(mrb, str) != ENCODING_GET(mrb, orig)) { -#else - if ((ofs > 0) || (klass != RBASIC(str)->c)) { -#endif //INCLUDE_ENCODING - str = str_new3(mrb, klass, str); - RSTRING_PTR(str) += ofs; - RSTRING_LEN(str) -= ofs; - mrb_enc_cr_str_exact_copy(mrb, str, orig); - } - } - else { - str = str_new4(mrb, orig); - } - return str; -} + s = str_make_shared(mrb, str); + s->buf += beg; + s->len = len; -mrb_value -mrb_str_drop_bytes(mrb_state *mrb, mrb_value str, long len) -{ - char *ptr = RSTRING_PTR(str); - long olen = RSTRING_LEN(str), nlen; - - str_modifiable(str); - if (len > olen) len = olen; - nlen = olen - len; - if (!MRB_STR_SHARED_P(str)) mrb_str_new4(mrb, str); - ptr = RSTRING(str)->buf += len; - RSTRING(str)->len = nlen; - ptr[nlen] = 0; - //ENC_CODERANGE_CLEAR(str); - return str; + return mrb_obj_value(s); } mrb_value -mrb_str_subseq(mrb_state *mrb, mrb_value str, long beg, long len) +mrb_str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, int len) { mrb_value str2; - if (RSTRING_LEN(str) == beg + len && - STR_BUF_MIN_SIZE < len) { - str2 = mrb_str_new_shared(mrb, mrb_str_new_frozen(mrb, str)); - mrb_str_drop_bytes(mrb, str2, beg); - } - else { - str2 = mrb_str_new5(mrb, str, RSTRING_PTR(str)+beg, len); - } - mrb_enc_cr_str_copy_for_substr(mrb, str2, str); - - return str2; -} - -#ifdef INCLUDE_ENCODING -int -mrb_enc_str_asciionly_p(mrb_state *mrb, 
mrb_value str) -{ - mrb_encoding *enc = STR_ENC_GET(mrb, str); - - if (!mrb_enc_asciicompat(mrb, enc)) - return 0/*FALSE*/; - else if (mrb_enc_str_coderange(mrb, str) == ENC_CODERANGE_7BIT) - return 1/*TRUE*/; - return 0/*FALSE*/; -} - -static mrb_value -mrb_enc_cr_str_buf_cat(mrb_state *mrb, mrb_value str, const char *ptr, long len, - int ptr_encindex, int ptr_cr, int *ptr_cr_ret) -{ - int str_encindex = ENCODING_GET(mrb, str); - int res_encindex; - int str_cr, res_cr; - int str_a8 = ENCODING_IS_ASCII8BIT(str); - int ptr_a8 = ptr_encindex == 0; - - str_cr = ENC_CODERANGE(str); - - if (str_encindex == ptr_encindex) { - if (str_cr == ENC_CODERANGE_UNKNOWN || - (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) { - ptr_cr = ENC_CODERANGE_UNKNOWN; - } - else if (ptr_cr == ENC_CODERANGE_UNKNOWN) { - ptr_cr = coderange_scan(ptr, len, mrb_enc_from_index(mrb, ptr_encindex)); - } - } - else { - mrb_encoding *str_enc = mrb_enc_from_index(mrb, str_encindex); - mrb_encoding *ptr_enc = mrb_enc_from_index(mrb, ptr_encindex); - if (!mrb_enc_asciicompat(mrb, str_enc) || !mrb_enc_asciicompat(mrb, ptr_enc)) { - if (len == 0) - return str; - if (RSTRING_LEN(str) == 0) { - mrb_str_buf_cat(mrb, str, ptr, len); - ENCODING_CODERANGE_SET(mrb, str, ptr_encindex, ptr_cr); - return str; - } - goto incompatible; - } - if (ptr_cr == ENC_CODERANGE_UNKNOWN) { - ptr_cr = coderange_scan(ptr, len, ptr_enc); - } - if (str_cr == ENC_CODERANGE_UNKNOWN) { - if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) { - str_cr = mrb_enc_str_coderange(mrb, str); - } - } - } - if (ptr_cr_ret) - *ptr_cr_ret = ptr_cr; - - if (str_encindex != ptr_encindex && - str_cr != ENC_CODERANGE_7BIT && - ptr_cr != ENC_CODERANGE_7BIT) { -incompatible: - mrb_raise(mrb, E_ENCODING_ERROR, "incompatible character encodings: %s and %s", - mrb_enc_name(mrb_enc_from_index(mrb, str_encindex)), - mrb_enc_name(mrb_enc_from_index(mrb, ptr_encindex))); - } - if (str_cr == ENC_CODERANGE_UNKNOWN) { - res_encindex = str_encindex; - res_cr = 
ENC_CODERANGE_UNKNOWN; - } - else if (str_cr == ENC_CODERANGE_7BIT) { - if (ptr_cr == ENC_CODERANGE_7BIT) { - res_encindex = !str_a8 ? str_encindex : ptr_encindex; - res_cr = ENC_CODERANGE_7BIT; - } - else { - res_encindex = ptr_encindex; - res_cr = ptr_cr; - } - } - else if (str_cr == ENC_CODERANGE_VALID) { - res_encindex = str_encindex; - if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID) - res_cr = str_cr; - else - res_cr = ptr_cr; + if (len < 0) return mrb_nil_value(); + if (!RSTRING_LEN(str)) { + len = 0; } - else { /* str_cr == ENC_CODERANGE_BROKEN */ - res_encindex = str_encindex; - res_cr = str_cr; - if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN; + if (beg > RSTRING_LEN(str)) return mrb_nil_value(); + if (beg < 0) { + beg += RSTRING_LEN(str); + if (beg < 0) return mrb_nil_value(); } - - if (len < 0) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "negative string size (or size too big)"); + if (beg + len > RSTRING_LEN(str)) + len = RSTRING_LEN(str) - beg; + if (len <= 0) { + len = 0; } - str_buf_cat(mrb, str, ptr, len); - ENCODING_CODERANGE_SET(mrb, str, res_encindex, res_cr); - return str; -} + str2 = mrb_str_subseq(mrb, str, beg, len); -mrb_value -mrb_enc_str_buf_cat(mrb_state *mrb, mrb_value str, const char *ptr, long len, mrb_encoding *ptr_enc) -{ - return mrb_enc_cr_str_buf_cat(mrb, str, ptr, len, - mrb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL); + return str2; } mrb_value mrb_str_buf_append(mrb_state *mrb, mrb_value str, mrb_value str2) { - int str2_cr; - - str2_cr = ENC_CODERANGE(str2); - - mrb_enc_cr_str_buf_cat(mrb, str, RSTRING_PTR(str2), RSTRING_LEN(str2), - ENCODING_GET(mrb, str2), str2_cr, &str2_cr); - - ENC_CODERANGE_SET(str2, str2_cr); - - return str; -} -#else -mrb_value -mrb_str_buf_append(mrb_state *mrb, mrb_value str, mrb_value str2) -{ mrb_str_cat(mrb, str, RSTRING_PTR(str2), RSTRING_LEN(str2)); return str; } -#endif //INCLUDE_ENCODING - -static inline void -str_discard(mrb_state *mrb, mrb_value str) -{ - str_modifiable(str); 
- if (!MRB_STR_SHARED_P(str)) { - mrb_free(mrb, RSTRING_PTR(str)); - RSTRING(str)->buf = 0; - RSTRING(str)->len = 0; - } -} - -void -mrb_str_shared_replace(mrb_state *mrb, mrb_value str, mrb_value str2) -{ -#ifdef INCLUDE_ENCODING - mrb_encoding *enc; - int cr; -#endif //INCLUDE_ENCODING - - if (mrb_obj_equal(mrb, str, str2)) return; -#ifdef INCLUDE_ENCODING - enc = STR_ENC_GET(mrb, str2); - cr = ENC_CODERANGE(str2); -#endif //INCLUDE_ENCODING - str_discard(mrb, str); - MRB_STR_UNSET_NOCAPA(str); - RSTRING_PTR(str) = RSTRING_PTR(str2); - RSTRING_LEN(str) = RSTRING_LEN(str2); - if (MRB_STR_NOCAPA_P(str2)) { - FL_SET(str, RBASIC(str2)->flags & MRB_STR_NOCAPA); - RSTRING_SHARED(str) = RSTRING_SHARED(str2); - } - else { - RSTRING_CAPA(str) = RSTRING_CAPA(str2); - } - - MRB_STR_UNSET_NOCAPA(str2); /* abandon str2 */ - RSTRING_PTR(str2)[0] = 0; - RSTRING_LEN(str2) = 0; - mrb_enc_associate(mrb, str, enc); - ENC_CODERANGE_SET(str, cr); -} #ifdef INCLUDE_REGEXP static mrb_value @@ -2450,7 +1242,6 @@ str_gsub(mrb_state *mrb, mrb_value str, mrb_int bang) mrb_int beg0, end0; mrb_int offset, blen, len, last; char *sp, *cp; - mrb_encoding *str_enc; mrb_get_args(mrb, "*", &argv, &argc); switch (argc) { @@ -2478,7 +1269,6 @@ str_gsub(mrb_state *mrb, mrb_value str, mrb_int bang) dest = mrb_str_buf_new(mrb, blen); sp = RSTRING_PTR(str); cp = sp; - str_enc = STR_ENC_GET(mrb, str); do { n++; @@ -2490,7 +1280,7 @@ str_gsub(mrb_state *mrb, mrb_value str, mrb_int bang) len = beg - offset; /* copy pre-match substr */ if (len) { - mrb_enc_str_buf_cat(mrb, dest, cp, len, str_enc); + mrb_str_buf_cat(mrb, dest, cp, len); } mrb_str_buf_append(mrb, dest, val); @@ -2503,8 +1293,8 @@ str_gsub(mrb_state *mrb, mrb_value str, mrb_int bang) * in order to prevent infinite loops. 
*/ if (RSTRING_LEN(str) <= end0) break; - len = mrb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc); - mrb_enc_str_buf_cat(mrb, dest, RSTRING_PTR(str)+end0, len, str_enc); + len = RSTRING_LEN(str)-end0; + mrb_str_buf_cat(mrb, dest, RSTRING_PTR(str)+end0, len); offset = end0 + len; } cp = RSTRING_PTR(str) + offset; @@ -2512,17 +1302,10 @@ str_gsub(mrb_state *mrb, mrb_value str, mrb_int bang) beg = mrb_reg_search(mrb, pat, str, offset, 0); } while (beg >= 0); if (RSTRING_LEN(str) > offset) { - mrb_enc_str_buf_cat(mrb, dest, cp, RSTRING_LEN(str) - offset, str_enc); + mrb_str_buf_cat(mrb, dest, cp, RSTRING_LEN(str) - offset); } mrb_reg_search(mrb, pat, str, last, 0); - if (bang) { - mrb_str_shared_replace(mrb, str, dest); - } - else { - RBASIC(dest)->c = mrb_obj_class(mrb, str); - str = dest; - } - + RBASIC(dest)->c = mrb_obj_class(mrb, str); return str; } @@ -2578,8 +1361,7 @@ mrb_str_gsub(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_gsub_bang(mrb_state *mrb, mrb_value self) { - str_modify_keep_cr(mrb, self); - //return str_gsub(argc, argv, self, 1); + str_modify(mrb, self); return str_gsub(mrb, self, 1); } #endif //INCLUDE_REGEXP @@ -2694,18 +1476,10 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str) } if (pos < 0) { -#ifdef INCLUDE_ENCODING - pos += str_strlen(mrb, str, STR_ENC_GET(mrb, str)); -#else pos += RSTRING_LEN(str); -#endif //INCLUDE_ENCODING if (pos < 0) { if (mrb_type(sub) == MRB_TT_REGEX) { -#ifdef INCLUDE_REGEXP - mrb_backref_set(mrb, mrb_nil_value()); -#else - mrb_raise(mrb, E_TYPE_ERROR, "Regexp Class not supported"); -#endif //INCLUDE_REGEXP + mrb_raise(mrb, E_TYPE_ERROR, "Regexp class not supported"); } return mrb_nil_value(); } @@ -2714,11 +1488,9 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str) switch (mrb_type(sub)) { case MRB_TT_REGEX: #ifdef INCLUDE_REGEXP - if (pos > str_strlen(mrb, str, STR_ENC_GET(mrb, str))) + if (pos > RSTRING_LEN(str)) return mrb_nil_value(); - pos = str_offset(mrb, RSTRING_PTR(str), 
RSTRING_END(str), pos, - mrb_enc_check(mrb, str, sub), single_byte_optimizable(mrb, str)); - + pos = mrb_str_offset(mrb, str, pos); pos = mrb_reg_search(mrb, sub, str, pos, 0); pos = mrb_str_sublen(mrb, str, pos); #else @@ -2750,9 +1522,6 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str) /* fall through */ case MRB_TT_STRING: pos = mrb_str_index(mrb, str, sub, pos); -#ifdef INCLUDE_ENCODING - pos = mrb_str_sublen(mrb, str, pos); -#endif //INCLUDE_ENCODING break; } @@ -2761,24 +1530,15 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str) } static mrb_value -str_replace(mrb_state *mrb, mrb_value str, mrb_value str2) +str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2) { - long len; + int len = s2->len; - len = RSTRING_LEN(str2); - if (MRB_STR_SHARED_P(str2)) { - struct RString *shared = RSTRING_SHARED(str2); - RSTRING_LEN(str) = len; - RSTRING_PTR(str) = shared->buf; - FL_SET(str, MRB_STR_SHARED); - RSTRING_SHARED(str) = shared; - } - else { - str_replace_shared(mrb, str, str2); - } - - mrb_enc_cr_str_exact_copy(mrb, str, str2); - return str; + s1->buf = mrb_realloc(mrb, s1->buf, len); + memcpy(s1->buf, s2->buf, len); + s1->len = s2->len; + s2->aux.capa = s2->len; + return mrb_obj_value(s1); } /* 15.2.10.5.24 */ @@ -2795,14 +1555,8 @@ mrb_str_replace(mrb_state *mrb, mrb_value str) { mrb_value str2; - mrb_get_args(mrb, "o", &str2); - str_modifiable(str); - if (mrb_obj_equal(mrb, str, str2)) return str; - - //StringValue(str2); - mrb_string_value(mrb, &str2); - //str_discard(str); - return str_replace(mrb, str, str2); + mrb_get_args(mrb, "S", &str2); + return str_replace(mrb, mrb_str_ptr(str), mrb_str_ptr(str2)); } /* 15.2.10.5.23 */ @@ -2815,43 +1569,18 @@ mrb_str_replace(mrb_state *mrb, mrb_value str) static mrb_value mrb_str_init(mrb_state *mrb, mrb_value self) { - //mrb_value orig; - mrb_value *argv; - int argc; + mrb_value str2; - mrb_get_args(mrb, "*", &argv, &argc); - if (argc == 1) - mrb_str_replace(mrb, self); + if (mrb_get_args(mrb, "|S", &str2) 
== 1) { + str_replace(mrb, mrb_str_ptr(self), mrb_str_ptr(str2)); + } return self; } -#ifdef INCLUDE_ENCODING -mrb_sym -mrb_intern3(mrb_state *mrb, const char *name, long len, mrb_encoding *enc) -{ - return mrb_intern(mrb, name); -} -#endif //INCLUDE_ENCODING - mrb_sym mrb_intern_str(mrb_state *mrb, mrb_value str) { - mrb_sym id; -#ifdef INCLUDE_ENCODING - mrb_encoding *enc; - - if (mrb_enc_str_coderange(mrb, str) == ENC_CODERANGE_7BIT) { - enc = mrb_usascii_encoding(mrb); - } - else { - enc = mrb_enc_get(mrb, str); - } - id = mrb_intern3(mrb, RSTRING_PTR(str), RSTRING_LEN(str), enc); -#else - id = mrb_intern(mrb, RSTRING_PTR(str)); -#endif //INCLUDE_ENCODING - str = RB_GC_GUARD(str); - return id; + return mrb_intern(mrb, RSTRING_PTR(str)); } /* 15.2.10.5.25 */ @@ -2984,66 +1713,20 @@ mrb_str_match_m(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_reverse(mrb_state *mrb, mrb_value str) { -#ifdef INCLUDE_ENCODING - mrb_encoding *enc; -#endif //INCLUDE_ENCODING - mrb_value rev; + struct RString *s2; char *s, *e, *p; -#ifdef INCLUDE_ENCODING - int single = 1; -#endif //INCLUDE_ENCODING - if (RSTRING_LEN(str) <= 1) return mrb_str_dup(mrb, str); -#ifdef INCLUDE_ENCODING - enc = STR_ENC_GET(mrb, str); -#endif //INCLUDE_ENCODING - rev = mrb_str_new5(mrb, str, 0, RSTRING_LEN(str)); - s = RSTRING_PTR(str); e = RSTRING_END(str); - p = RSTRING_END(rev); + if (RSTRING(str)->len <= 1) return mrb_str_dup(mrb, str); - if (RSTRING_LEN(str) > 1) { -#ifdef INCLUDE_ENCODING - if (single_byte_optimizable(mrb, str)) { -#endif //INCLUDE_ENCODING - while (s < e) { - *--p = *s++; - } -#ifdef INCLUDE_ENCODING - } - else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) { - while (s < e) { - int clen = mrb_enc_fast_mbclen(s, e, enc); - - if (clen > 1 || (*s & 0x80)) single = 0; - p -= clen; - memcpy(p, s, clen); - s += clen; - } - } - else { - while (s < e) { - int clen = mrb_enc_mbclen(s, e, enc); + s2 = str_new(mrb, 0, RSTRING(str)->len); + str_with_class(mrb, s2, str); + s = 
RSTRING_PTR(str); e = RSTRING_END(str) - 1; + p = s2->buf; - if (clen > 1 || (*s & 0x80)) single = 0; - p -= clen; - memcpy(p, s, clen); - s += clen; - } - } - } - STR_SET_LEN(rev, RSTRING_LEN(str)); - if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) { - if (single) { - ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); - } - else { - ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); - } -#endif //INCLUDE_ENCODING + while (e >= s) { + *p++ = *e--; } - mrb_enc_cr_str_copy_for_substr(mrb, rev, str); - - return rev; + return mrb_obj_value(s2); } /* 15.2.10.5.30 */ @@ -3056,29 +1739,19 @@ mrb_str_reverse(mrb_state *mrb, mrb_value str) static mrb_value mrb_str_reverse_bang(mrb_state *mrb, mrb_value str) { -#ifdef INCLUDE_ENCODING - if (RSTRING_LEN(str) > 1) { - if (single_byte_optimizable(mrb, str)) { -#endif //INCLUDE_ENCODING - char *s, *e, c; - str_modify_keep_cr(mrb, str); - s = RSTRING_PTR(str); - e = RSTRING_END(str) - 1; - while (s < e) { - c = *s; - *s++ = *e; - *e-- = c; - } -#ifdef INCLUDE_ENCODING - } - else { - mrb_str_shared_replace(mrb, str, mrb_str_reverse(mrb, str)); + char *s, *e; + char c; + + str_modify(mrb, str); + if (RSTRING(str)->len > 1) { + s = RSTRING(str)->buf; + e = s + RSTRING(str)->len - 1; + while (s < e) { + c = *s; + *s++ = *e; + *e-- = c; } } - else { - str_modify_keep_cr(mrb, str); - } -#endif //INCLUDE_ENCODING return str; } @@ -3132,15 +1805,10 @@ mrb_str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) #ifdef INCLUDE_ENCODING /* byte offset to char offset */ -size_t +int mrb_str_sublen(mrb_state *mrb, mrb_value str, long pos) { - if (single_byte_optimizable(mrb, str) || pos < 0) - return pos; - else { - char *p = RSTRING_PTR(str); - return enc_strlen(p, p + pos, STR_ENC_GET(mrb, str), ENC_CODERANGE(str)); - } + return pos; } #endif //INCLUDE_ENCODING @@ -3170,14 +1838,8 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) int argc; mrb_value sub; mrb_value vpos; -#ifdef INCLUDE_ENCODING - mrb_encoding *enc = STR_ENC_GET(mrb, 
str); - int pos, len = str_strlen(mrb, str, enc); -#else int pos, len = RSTRING_LEN(str); -#endif //INCLUDE_ENCODING - //if (mrb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { mrb_get_args(mrb, "*", &argv, &argc); if (argc == 2) { sub = argv[0]; @@ -3209,9 +1871,7 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) switch (mrb_type(sub)) { case MRB_TT_REGEX: #ifdef INCLUDE_REGEXP - pos = str_offset(mrb, RSTRING_PTR(str), RSTRING_END(str), pos, - STR_ENC_GET(mrb, str), single_byte_optimizable(mrb, str)); - + pos = mrb_str_offset(mrb, str, pos); if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) { pos = mrb_reg_search(mrb, sub, str, pos, 1); pos = mrb_str_sublen(mrb, str, pos); @@ -3269,12 +1929,11 @@ scan_once(mrb_state *mrb, mrb_value str, mrb_value pat, mrb_int *start) pmatch = mrb_match_ptr(match); regs = &pmatch->rmatch->regs; if (regs->beg[0] == regs->end[0]) { - mrb_encoding *enc = STR_ENC_GET(mrb, str); /* * Always consume at least one character of the input string */ if (ps->len > regs->end[0]) - *start = regs->end[0] + mrb_enc_fast_mbclen(RSTRING_PTR(str)+regs->end[0],RSTRING_END(str), enc); + *start = regs->end[0] + RSTRING_LEN(str)-regs->end[0]; else *start = regs->end[0] + 1; } @@ -3426,19 +2085,14 @@ static const char isspacetable[256] = { * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""] */ -//static mrb_value -//mrb_str_split_m(int argc, mrb_value *argv, mrb_value str) static mrb_value mrb_str_split_m(mrb_state *mrb, mrb_value str) { mrb_value *argv; int argc; -#ifdef INCLUDE_ENCODING - mrb_encoding *enc; -#endif //INCLUDE_ENCODING - mrb_value spat; + mrb_value spat = mrb_nil_value(); mrb_value limit; - enum {awk, string, regexp} split_type; + enum {awk, string, regexp} split_type = string; long beg, end, i = 0; int lim = 0; mrb_value result, tmp; @@ -3457,26 +2111,17 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) else if (lim == 1) { if (RSTRING_LEN(str) == 0) return mrb_ary_new_capa(mrb, 0); - return mrb_ary_new_from_values(mrb, 
&str, 1); + return mrb_ary_new_from_values(mrb, 1, &str); } i = 1; } -#ifdef INCLUDE_ENCODING - enc = STR_ENC_GET(mrb, str); -#endif //INCLUDE_ENCODING - //if (mrb_nil_p(spat)) { if (argc == 0) { -// spat = mrb_nil_value(); -// goto fs_set; split_type = awk; } else { //fs_set: if (mrb_type(spat) == MRB_TT_STRING) { -#ifdef INCLUDE_REGEXP - mrb_encoding *enc2 = STR_ENC_GET(mrb, spat); -#endif //INCLUDE_REGEXP split_type = string; #ifdef INCLUDE_REGEXP if (RSTRING_LEN(spat) == 0) { @@ -3484,20 +2129,13 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) spat = mrb_reg_regcomp(mrb, spat); split_type = regexp; } - else if (mrb_enc_asciicompat(mrb, enc2) == 1) { + else { #endif //INCLUDE_REGEXP if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){ split_type = awk; } #ifdef INCLUDE_REGEXP } - else { - int l; - if (mrb_enc_ascget(mrb, RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' && - RSTRING_LEN(spat) == l) { - split_type = awk; - } - } #endif //INCLUDE_REGEXP } else { @@ -3520,89 +2158,28 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) unsigned int c; end = beg; -#ifdef INCLUDE_ENCODING - if (is_ascii_string(mrb, str)) { -#endif //INCLUDE_ENCODING - while (ptr < eptr) { - c = (unsigned char)*ptr++; - if (skip) { - if (ascii_isspace(c)) { - beg = ptr - bptr; - } - else { - end = ptr - bptr; - skip = 0; - if (!mrb_nil_p(limit) && lim <= i) break; - } - } - else if (ascii_isspace(c)) { - mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, beg, end-beg)); - skip = 1; - beg = ptr - bptr; - if (!mrb_nil_p(limit)) ++i; - } - else { - end = ptr - bptr; - } - } -#ifdef INCLUDE_ENCODING - } - else { - while (ptr < eptr) { - int n; - - c = mrb_enc_codepoint_len(mrb, ptr, eptr, &n, enc); - ptr += n; - if (skip) { - if (mrb_isspace(c)) { - beg = ptr - bptr; - } - else { - end = ptr - bptr; - skip = 0; - if (!mrb_nil_p(limit) && lim <= i) break; - } - } - else if (mrb_isspace(c)) { - mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, beg, end-beg)); - skip = 1; - beg = 
ptr - bptr; - if (!mrb_nil_p(limit)) ++i; - } - else { - end = ptr - bptr; - } + while (ptr < eptr) { + c = (unsigned char)*ptr++; + if (skip) { + if (ascii_isspace(c)) { + beg = ptr - bptr; + } + else { + end = ptr - bptr; + skip = 0; + if (!mrb_nil_p(limit) && lim <= i) break; + } + } + else if (ascii_isspace(c)) { + mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, beg, end-beg)); + skip = 1; + beg = ptr - bptr; + if (!mrb_nil_p(limit)) ++i; } - } - } - else if (split_type == string) { - char *ptr = RSTRING_PTR(str); - char *temp = ptr; - char *eptr = RSTRING_END(str); - char *sptr = RSTRING_PTR(spat); - long slen = RSTRING_LEN(spat); - - if (is_broken_string(mrb, str)) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid byte sequence in %s", mrb_enc_name(STR_ENC_GET(mrb, str))); - } - if (is_broken_string(mrb, spat)) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid byte sequence in %s", mrb_enc_name(STR_ENC_GET(mrb, spat))); - } - enc = mrb_enc_check(mrb, str, spat); - while (ptr < eptr && - (end = mrb_memsearch(mrb, sptr, slen, ptr, eptr - ptr, enc)) >= 0) { - /* Check we are at the start of a char */ - char *t = mrb_enc_right_char_head(ptr, ptr + end, eptr, enc); - if (t != ptr + end) { - ptr = t; - continue; + else { + end = ptr - bptr; } - mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, ptr - temp, end)); - ptr += end + slen; - if (!mrb_nil_p(limit) && lim <= ++i) break; } - beg = ptr - temp; -#endif //INCLUDE_ENCODING } else { #ifdef INCLUDE_REGEXP @@ -3617,21 +2194,18 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) regs = RMATCH_REGS(mrb_backref_get(mrb)); if (start == end && BEG(0) == END(0)) { if (!ptr) { - mrb_ary_push(mrb, result, str_new_empty(mrb, str)); + mrb_ary_push(mrb, result, mrb_str_new_empty(mrb, str)); break; } else if (last_null == 1) { - mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, beg, - mrb_enc_fast_mbclen(ptr+beg, - ptr+len, - enc))); + mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, beg, len)); beg = start; } else { if 
(ptr+start == ptr+len) start++; else - start += mrb_enc_fast_mbclen(ptr+start,ptr+len,enc); + start += len; last_null = 1; continue; } @@ -3645,7 +2219,7 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) for (idx=1; idx < regs->num_regs; idx++) { if (BEG(idx) == -1) continue; if (BEG(idx) == END(idx)) - tmp = str_new_empty(mrb, str); + tmp = mrb_str_new_empty(mrb, str); else tmp = mrb_str_subseq(mrb, str, BEG(idx), END(idx)-BEG(idx)); mrb_ary_push(mrb, result, tmp); @@ -3658,7 +2232,7 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) } if (RSTRING_LEN(str) > 0 && (!mrb_nil_p(limit) || RSTRING_LEN(str) > beg || lim < 0)) { if (RSTRING_LEN(str) == beg) - tmp = str_new_empty(mrb, str); + tmp = mrb_str_new_empty(mrb, str); else tmp = mrb_str_subseq(mrb, str, beg, RSTRING_LEN(str)-beg); mrb_ary_push(mrb, result, tmp); @@ -3696,77 +2270,7 @@ mrb_block_given_p() static mrb_value mrb_str_sub_bang(mrb_state *mrb, mrb_value str) { - mrb_value *argv; - int argc; - mrb_value pat, repl; - long plen; - - mrb_get_args(mrb, "*", &argv, &argc); - if (argc == 1 && mrb_block_given_p()) { - /* do nothing */ - } - else if (argc == 2) { - repl = argv[1]; - //StringValue(repl); - mrb_string_value(mrb, &repl); - } - else { - mrb_raise(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%d for 2)", argc); - } - - pat = get_pat(mrb, argv[0], 1); - str_modifiable(str); - if (mrb_reg_search(mrb, pat, str, 0, 0) >= 0) { - mrb_encoding *enc; - int cr = ENC_CODERANGE(str); - mrb_value match = mrb_backref_get(mrb); - struct re_registers *regs = RMATCH_REGS(match); - long beg0 = BEG(0); - long end0 = END(0); - char *p, *rp; - long len, rlen; - - repl = mrb_reg_regsub(mrb, repl, str, regs, pat); - enc = mrb_enc_compatible(mrb, str, repl); - if (!enc) { - mrb_encoding *str_enc = STR_ENC_GET(mrb, str); - p = RSTRING_PTR(str); len = RSTRING_LEN(str); - if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT || - coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) { - mrb_raise(mrb, 
E_ENCODING_ERROR, "incompatible character encodings: %s and %s", - mrb_enc_name(str_enc), - mrb_enc_name(STR_ENC_GET(mrb, repl))); - } - enc = STR_ENC_GET(mrb, repl); - } - mrb_str_modify(mrb, str); - mrb_enc_associate(mrb, str, enc); - if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) { - int cr2 = ENC_CODERANGE(repl); - if (cr2 == ENC_CODERANGE_BROKEN || - (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT)) - cr = ENC_CODERANGE_UNKNOWN; - else - cr = cr2; - } - plen = end0 - beg0; - rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl); - len = RSTRING_LEN(str); - if (rlen > plen) { - RESIZE_CAPA(str, len + rlen - plen); - } - p = RSTRING_PTR(str); - if (rlen != plen) { - memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen); - } - memcpy(p + beg0, rp, rlen); - len += rlen - plen; - STR_SET_LEN(str, len); - RSTRING_PTR(str)[len] = '\0'; - ENC_CODERANGE_SET(str, cr); - - return str; - } + str_modify(mrb, str); return mrb_nil_value(); } #endif //INCLUDE_REGEXP @@ -3987,7 +2491,7 @@ mrb_value mrb_str_to_inum(mrb_state *mrb, mrb_value str, int base, int badcheck) { char *s; - size_t len; + int len; //StringValue(str); mrb_string_value(mrb, &str); @@ -4127,7 +2631,7 @@ double mrb_str_to_dbl(mrb_state *mrb, mrb_value str, int badcheck) { char *s; - size_t len; + int len; //StringValue(str); mrb_string_value(mrb, &str); @@ -4197,66 +2701,20 @@ mrb_str_to_s(mrb_state *mrb, mrb_value self) static mrb_value mrb_str_upcase_bang(mrb_state *mrb, mrb_value str) { -#ifdef INCLUDE_ENCODING - mrb_encoding *enc; -#endif //INCLUDE_ENCODING char *s, *send; int modify = 0; -#ifdef INCLUDE_ENCODING - int n; - - str_modify_keep_cr(mrb, str); - enc = STR_ENC_GET(mrb, str); - mrb_str_check_dummy_enc(mrb, enc); - s = RSTRING_PTR(str); send = RSTRING_END(str); - if (single_byte_optimizable(mrb, str)) { - while (s < send) { - unsigned int c = *(unsigned char*)s; - - if (mrb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { - *s = 'A' + (c - 'a'); - modify = 1; - } - s++; - } 
- } - else { - int ascompat = mrb_enc_asciicompat(mrb, enc); - - while (s < send) { - unsigned int c; - if (ascompat && (c = *(unsigned char*)s) < 0x80) { - if (mrb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { - *s = 'A' + (c - 'a'); - modify = 1; - } - s++; - } - else { - c = mrb_enc_codepoint_len(mrb, s, send, &n, enc); - if (mrb_enc_islower(c, enc)) { - /* assuming toupper returns codepoint with same size */ - mrb_enc_mbcput(mrb_enc_toupper(c, enc), s, enc); - modify = 1; - } - s += n; - } - } - } -#else - mrb_str_modify(mrb, str); - s = RSTRING_PTR(str); send = RSTRING_END(str); + str_modify(mrb, str); + s = RSTRING_PTR(str); + send = RSTRING_END(str); while (s < send) { - unsigned int c = *(unsigned char*)s; - - if ('a' <= c && c <= 'z') { - *s = 'A' + (c - 'a'); + if (ISLOWER(*s)) { + *s = toupper(*s); modify = 1; } s++; } -#endif //INCLUDE_ENCODING + if (modify) return str; return mrb_nil_value(); } @@ -4282,252 +2740,6 @@ mrb_str_upcase(mrb_state *mrb, mrb_value self) return str; } -/* 15.2.10.5.xx */ -/* - * call-seq: - * str.force_encoding(encoding) -> str - * - * Changes the encoding to +encoding+ and returns self. - */ -#ifdef INCLUDE_ENCODING -static mrb_value -mrb_str_force_encoding(mrb_state *mrb, mrb_value self) -{ - mrb_value enc; - - mrb_get_args(mrb, "o", &enc); - str_modifiable(self); - mrb_enc_associate(mrb, self, mrb_to_encoding(mrb, enc)); - ENC_CODERANGE_CLEAR(self); - return self; -} - -long -mrb_str_coderange_scan_restartable(const char *s, const char *e, mrb_encoding *enc, int *cr) -{ - const char *p = s; - - if (*cr == ENC_CODERANGE_BROKEN) - return e - s; - - if (mrb_enc_to_index(enc) == 0) { - /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ - p = search_nonascii(p, e); - *cr = (!p && *cr != ENC_CODERANGE_VALID) ? 
ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; - return e - s; - } - else if (mrb_enc_asciicompat(mrb, enc)) { - p = search_nonascii(p, e); - if (!p) { - if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT; - return e - s; - } - while (p < e) { - int ret = mrb_enc_precise_mbclen(p, e, enc); - if (!MBCLEN_CHARFOUND_P(ret)) { - *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; - return p - s; - } - p += MBCLEN_CHARFOUND_LEN(ret); - if (p < e) { - p = search_nonascii(p, e); - if (!p) { - *cr = ENC_CODERANGE_VALID; - return e - s; - } - } - } - *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; - return p - s; - } - else { - while (p < e) { - int ret = mrb_enc_precise_mbclen(p, e, enc); - if (!MBCLEN_CHARFOUND_P(ret)) { - *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; - return p - s; - } - p += MBCLEN_CHARFOUND_LEN(ret); - } - *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; - return p - s; - } -} - -mrb_value -mrb_str_conv_enc_opts(mrb_state *mrb, mrb_value str, mrb_encoding *from, mrb_encoding *to, int ecflags, mrb_value ecopts) -{ - mrb_econv_t *ec; - mrb_econv_result_t ret; - long len; - mrb_value newstr; - const unsigned char *sp; - unsigned char *dp; - - if (!to) return str; - if (from == to) return str; - if ((mrb_enc_asciicompat(mrb, to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) || - to == mrb_ascii8bit_encoding(mrb)) { - if (STR_ENC_GET(mrb, str) != to) { - str = mrb_str_dup(mrb, str); - mrb_enc_associate(mrb, str, to); - } - return str; - } - - len = RSTRING_LEN(str); - newstr = mrb_str_new(mrb, 0, len); - - retry: - ec = mrb_econv_open_opts(mrb, from->name, to->name, ecflags, ecopts); - if (!ec) return str; - - sp = (unsigned char*)RSTRING_PTR(str); - dp = (unsigned char*)RSTRING_PTR(newstr); - ret = mrb_econv_convert(mrb, ec, &sp, (unsigned char*)RSTRING_END(str), - &dp, (unsigned char*)RSTRING_END(newstr), 0); - mrb_econv_close(ec); - switch (ret) { - case econv_destination_buffer_full: 
- /* destination buffer short */ - len = len < 2 ? 2 : len * 2; - mrb_str_resize(mrb, newstr, len); - goto retry; - - case econv_finished: - len = dp - (unsigned char*)RSTRING_PTR(newstr); - mrb_str_set_len(mrb, newstr, len); - mrb_enc_associate(mrb, newstr, to); - return newstr; - - case econv_invalid_byte_sequence: - case econv_undefined_conversion: - case econv_source_buffer_empty: - case econv_after_output: - case econv_incomplete_input: - /* some error, return original */ - return str; - - default: - mrb_bug("Internal Error: Invalid return value mrb_econv_convert."); - return str; - } -} - -mrb_value -mrb_str_conv_enc(mrb_state *mrb, mrb_value str, mrb_encoding *from, mrb_encoding *to) -{ - return mrb_str_conv_enc_opts(mrb, str, from, to, 0, mrb_nil_value()); -} -#endif //INCLUDE_ENCODING - -#ifndef INCLUDE_ENCODING -#undef SIGN_EXTEND_CHAR -#if __STDC__ -# define SIGN_EXTEND_CHAR(c) ((signed char)(c)) -#else /* not __STDC__ */ -/* As in Harbison and Steele. */ -# define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128) -#endif -#define is_identchar(c) (SIGN_EXTEND_CHAR(c)!=-1&&(ISALNUM(c) || (c) == '_')) - -static int -is_special_global_name(m) - const char *m; -{ - switch (*m) { - case '~': case '*': case '$': case '?': case '!': case '@': - case '/': case '\\': case ';': case ',': case '.': case '=': - case ':': case '<': case '>': case '\"': - case '&': case '`': case '\'': case '+': - case '0': - ++m; - break; - case '-': - ++m; - if (is_identchar(*m)) m += 1; - break; - default: - if (!ISDIGIT(*m)) return 0; - do ++m; while (ISDIGIT(*m)); - } - return !*m; -} - -int -mrb_symname_p(const char *name) -{ - const char *m = name; - int localid = FALSE; - - if (!m) return FALSE; - switch (*m) { - case '\0': - return FALSE; - - case '$': - if (is_special_global_name(++m)) return TRUE; - goto id; - - case '@': - if (*++m == '@') ++m; - goto id; - - case '<': - switch (*++m) { - case '<': ++m; break; - case '=': if (*++m == '>') ++m; break; - default: 
break; - } - break; - - case '>': - switch (*++m) { - case '>': case '=': ++m; break; - } - break; - - case '=': - switch (*++m) { - case '~': ++m; break; - case '=': if (*++m == '=') ++m; break; - default: return FALSE; - } - break; - - case '*': - if (*++m == '*') ++m; - break; - - case '+': case '-': - if (*++m == '@') ++m; - break; - - case '|': case '^': case '&': case '/': case '%': case '~': case '`': - ++m; - break; - - case '[': - if (*++m != ']') return FALSE; - if (*++m == '=') ++m; - break; - - default: - localid = !ISUPPER(*m); -id: - if (*m != '_' && !ISALPHA(*m)) return FALSE; - while (is_identchar(*m)) m += 1; - if (localid) { - switch (*m) { - case '!': case '?': case '=': ++m; - } - } - break; - } - return *m ? FALSE : TRUE; -} -#endif //INCLUDE_ENCODING - /* * call-seq: * str.dump -> new_str @@ -4538,16 +2750,10 @@ id: mrb_value mrb_str_dump(mrb_state *mrb, mrb_value str) { -#ifdef INCLUDE_ENCODING - mrb_encoding *enc = mrb_enc_get(mrb, str); -#endif //INCLUDE_ENCODING long len; const char *p, *pend; - char *q, *qend; - mrb_value result; -#ifdef INCLUDE_ENCODING - int u8 = (enc == mrb_utf8_encoding(mrb)); -#endif //INCLUDE_ENCODING + char *q; + struct RString *result; len = 2; /* "" */ p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); @@ -4570,33 +2776,16 @@ mrb_str_dump(mrb_state *mrb, mrb_value str) len++; } else { -#ifdef INCLUDE_ENCODING - if (u8) { /* \u{NN} */ - int n = mrb_enc_precise_mbclen(p-1, pend, enc); - if (MBCLEN_CHARFOUND_P(n-1)) { - unsigned int cc = mrb_enc_mbc_to_codepoint(p-1, pend, enc); - while (cc >>= 4) len++; - len += 5; - p += MBCLEN_CHARFOUND_LEN(n)-1; - break; - } - } -#endif //INCLUDE_ENCODING len += 4; /* \xNN */ } break; } } -#ifdef INCLUDE_ENCODING - if (!mrb_enc_asciicompat(mrb, enc)) { - len += 19; /* ".force_encoding('')" */ - len += strlen(enc->name); - } -#endif //INCLUDE_ENCODING - result = mrb_str_new5(mrb, str, 0, len); + result = str_new(mrb, 0, len); + str_with_class(mrb, result, str); p = 
RSTRING_PTR(str); pend = p + RSTRING_LEN(str); - q = RSTRING_PTR(result); qend = q + len + 1; + q = result->buf; *q++ = '"'; while (p < pend) { @@ -4647,36 +2836,12 @@ mrb_str_dump(mrb_state *mrb, mrb_value str) } else { *q++ = '\\'; -#ifdef INCLUDE_ENCODING - if (u8) { - int n = mrb_enc_precise_mbclen(p-1, pend, enc) - 1; - if (MBCLEN_CHARFOUND_P(n)) { - int cc = mrb_enc_mbc_to_codepoint(p-1, pend, enc); - p += n; - snprintf(q, qend-q, "u{%x}", cc); - q += strlen(q); - continue; - } - } - snprintf(q, qend-q, "x%02X", c); -#else sprintf(q, "%03o", c&0xff); -#endif //INCLUDE_ENCODING q += 3; } } *q++ = '"'; -#ifdef INCLUDE_ENCODING - *q = '\0'; - if (!mrb_enc_asciicompat(mrb, enc)) { - snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name); - enc = mrb_ascii8bit_encoding(mrb); - } - /* result from dump is ASCII */ - mrb_enc_associate(mrb, result, enc); - ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT); -#endif //INCLUDE_ENCODING - return result; + return mrb_obj_value(result); } mrb_value @@ -4686,8 +2851,6 @@ mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, long len) mrb_raise(mrb, E_ARGUMENT_ERROR, "negative string size (or size too big)"); } if (0/*STR_ASSOC_P(str)*/) { - mrb_str_modify(mrb, str); - //if (STR_EMBED_P(str)) str_make_independent(mrb, str); mrb_realloc(mrb, RSTRING(str)->buf, RSTRING(str)->len+len+1); memcpy(RSTRING(str)->buf + RSTRING(str)->len, ptr, len); RSTRING(str)->len += len; @@ -4701,18 +2864,13 @@ mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, long len) mrb_value mrb_str_cat2(mrb_state *mrb, mrb_value str, const char *ptr) { - return mrb_str_cat(mrb, str, ptr, strlen(ptr)); + return mrb_str_cat(mrb, str, ptr, strlen(ptr)); } -mrb_value +static mrb_value mrb_str_vcatf(mrb_state *mrb, mrb_value str, const char *fmt, va_list ap) { - //mrb_printf_buffer f; - //mrb_value klass; - - //StringValue(str); mrb_string_value(mrb, &str); - mrb_str_modify(mrb, str); mrb_str_resize(mrb, str, (char*)RSTRING_END(str) - 
RSTRING_PTR(str)); return str; @@ -4730,12 +2888,6 @@ mrb_str_catf(mrb_state *mrb, mrb_value str, const char *format, ...) return str; } -void -mrb_lastline_set(mrb_value val) -{ - //vm_svar_set(0, val); -} - mrb_value mrb_str_append(mrb_state *mrb, mrb_value str, mrb_value str2) { @@ -4743,69 +2895,7 @@ mrb_str_append(mrb_state *mrb, mrb_value str, mrb_value str2) return mrb_str_buf_append(mrb, str, str2); } -void -mrb_str_setter(mrb_state *mrb, mrb_value val, mrb_sym id, mrb_value *var) -{ - if (!mrb_nil_p(val) && (mrb_type(val) != MRB_TT_STRING)) { - mrb_raise(mrb, E_TYPE_ERROR, "value of %s must be String", mrb_sym2name(mrb, id)); - } - *var = val; -} - -#ifdef INCLUDE_ENCODING -/* - * call-seq: - * str.ascii_only? -> true or false - * - * Returns true for a string which has only ASCII characters. - * - * "abc".force_encoding("UTF-8").ascii_only? #=> true - * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false - */ - -int -mrb_str_is_ascii_only_p(mrb_state *mrb, mrb_value str) -{ - int cr = mrb_enc_str_coderange(mrb, str); - - return cr == ENC_CODERANGE_7BIT ? 
TRUE : FALSE; -} - -#endif //INCLUDE_ENCODING - #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ -int -mrb_str_buf_cat_escaped_char(mrb_state *mrb, mrb_value result, unsigned int c, int unicode_p) -{ - char buf[CHAR_ESC_LEN + 1]; - int l; - - if (sizeof(c) > 4) { - c &= 0xffffffff; - } - if (unicode_p) { - if (c < 0x7F && ISPRINT(c)) { - snprintf(buf, CHAR_ESC_LEN, "%c", c); - } - else if (c < 0x10000) { - snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c); - } - else { - snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c); - } - } - else { - if (c < 0x100) { - snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c); - } - else { - snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c); - } - } - l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */ - mrb_str_buf_cat(mrb, result, buf, l); - return l; -} /* * call-seq: @@ -4821,24 +2911,9 @@ mrb_str_buf_cat_escaped_char(mrb_state *mrb, mrb_value result, unsigned int c, i mrb_value mrb_str_inspect(mrb_state *mrb, mrb_value str) { -#ifdef INCLUDE_ENCODING - mrb_encoding *enc = STR_ENC_GET(mrb, str); -#endif //INCLUDE_ENCODING const char *p, *pend, *prev; char buf[CHAR_ESC_LEN + 1]; -#ifdef INCLUDE_ENCODING - mrb_value result = mrb_str_buf_new(mrb, 0); - mrb_encoding *resenc = mrb_default_internal_encoding(mrb); - int unicode_p = mrb_enc_unicode_p(enc); - int asciicompat = mrb_enc_asciicompat(mrb, enc); - - if (resenc == NULL) resenc = mrb_default_external_encoding(mrb); - if (!mrb_enc_asciicompat(mrb, resenc)) resenc = mrb_usascii_encoding(mrb); - mrb_enc_associate(mrb, result, resenc); - mrb_str_buf_cat(mrb, result, "\"", strlen("\"")); //str_buf_cat2(result, "\""); -#else - mrb_value result = mrb_str_new_cstr(mrb, "\"");//mrb_str_buf_new2("\""); -#endif //INCLUDE_ENCODING + mrb_value result = mrb_str_new_cstr(mrb, "\""); p = RSTRING_PTR(str); pend = RSTRING_END(str); prev = p; @@ -4846,37 +2921,6 @@ mrb_str_inspect(mrb_state *mrb, mrb_value str) unsigned int c, cc; int n; -#ifdef INCLUDE_ENCODING - n = mrb_enc_precise_mbclen(p, 
pend, enc); - if (!MBCLEN_CHARFOUND_P(n)) { - if (p > prev) mrb_str_buf_cat(mrb, result, prev, p - prev); - n = mrb_enc_mbminlen(enc); - if (pend < p + n) - n = (int)(pend - p); - while (n--) { - snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377); - mrb_str_buf_cat(mrb, result, buf, strlen(buf)); - prev = ++p; - } - continue; - } - n = MBCLEN_CHARFOUND_LEN(n); - c = mrb_enc_mbc_to_codepoint(p, pend, enc); - p += n; - if (c == '"'|| c == '\\' || - (c == '#' && - p < pend && - MBCLEN_CHARFOUND_P(mrb_enc_precise_mbclen(p,pend,enc)) && - (cc = mrb_enc_codepoint(mrb, p, pend, enc), - (cc == '$' || cc == '@' || cc == '{')))) { - if (p - n > prev) mrb_str_buf_cat(mrb, result, prev, p - n - prev); - mrb_str_buf_cat(mrb, result, "\\", strlen("\\")); //str_buf_cat2(result, "\\"); - if (asciicompat || enc == resenc) { - prev = p - n; - continue; - } - } -#else c = *p++; n = 1; if (c == '"'|| c == '\\' || (c == '#' && IS_EVSTR(p, pend))) { @@ -4889,7 +2933,6 @@ mrb_str_inspect(mrb_state *mrb, mrb_value str) mrb_str_buf_cat(mrb, result, buf, 1); continue; } -#endif //INCLUDE_ENCODING switch (c) { case '\n': cc = 'n'; break; case '\r': cc = 'r'; break; @@ -4909,45 +2952,22 @@ mrb_str_inspect(mrb_state *mrb, mrb_value str) prev = p; continue; } -#ifdef INCLUDE_ENCODING - if ((enc == resenc && mrb_enc_isprint(c, enc)) || - (asciicompat && mrb_enc_isascii(c, enc) && ISPRINT(c))) { - continue; - } -#endif //INCLUDE_ENCODING else { if (p - n > prev) mrb_str_buf_cat(mrb, result, prev, p - n - prev); -#ifdef INCLUDE_ENCODING - mrb_str_buf_cat_escaped_char(mrb, result, c, unicode_p); -#else sprintf(buf, "\\%03o", c & 0377); mrb_str_buf_cat(mrb, result, buf, strlen(buf)); -#endif //INCLUDE_ENCODING prev = p; continue; } } +#ifdef INCLUDE_ENCODING if (p > prev) mrb_str_buf_cat(mrb, result, prev, p - prev); - mrb_str_buf_cat(mrb, result, "\"", strlen("\"")); //str_buf_cat2(result, "\""); +#endif + mrb_str_buf_cat(mrb, result, "\"", strlen("\"")); return result; } -#ifdef INCLUDE_ENCODING 
-int -sym_printable(mrb_state *mrb, const char *s, const char *send, mrb_encoding *enc) -{ - while (s < send) { - int n; - int c = mrb_enc_codepoint_len(mrb, s, send, &n, enc); - - if (!mrb_enc_isprint(c, enc)) return FALSE; - s += n; - } - return TRUE; -} -#endif //INCLUDE_ENCODING - /* ---------------------------*/ void mrb_init_string(mrb_state *mrb) @@ -4976,7 +2996,7 @@ mrb_init_string(mrb_state *mrb) mrb_define_method(mrb, s, "downcase", mrb_str_downcase, ARGS_NONE()); /* 15.2.10.5.13 */ mrb_define_method(mrb, s, "downcase!", mrb_str_downcase_bang, ARGS_NONE()); /* 15.2.10.5.14 */ mrb_define_method(mrb, s, "each_line", mrb_str_each_line, ARGS_REQ(1)); /* 15.2.10.5.15 */ - mrb_define_method(mrb, s, "empty?", mrb_str_empty, ARGS_NONE()); /* 15.2.10.5.16 */ + mrb_define_method(mrb, s, "empty?", mrb_str_empty_p, ARGS_NONE()); /* 15.2.10.5.16 */ mrb_define_method(mrb, s, "eql?", mrb_str_eql, ARGS_REQ(1)); /* 15.2.10.5.17 */ #ifdef INCLUDE_REGEXP mrb_define_method(mrb, s, "gsub", mrb_str_gsub, ARGS_REQ(1)); /* 15.2.10.5.18 */ @@ -5011,9 +3031,5 @@ mrb_init_string(mrb_state *mrb) mrb_define_method(mrb, s, "to_sym", mrb_str_intern, ARGS_NONE()); /* 15.2.10.5.41 */ mrb_define_method(mrb, s, "upcase", mrb_str_upcase, ARGS_REQ(1)); /* 15.2.10.5.42 */ mrb_define_method(mrb, s, "upcase!", mrb_str_upcase_bang, ARGS_REQ(1)); /* 15.2.10.5.43 */ -#ifdef INCLUDE_ENCODING - mrb_define_method(mrb, s, "encoding", mrb_obj_encoding, ARGS_NONE()); /* 15.2.10.5.44(x) */ - mrb_define_method(mrb, s, "force_encoding", mrb_str_force_encoding, ARGS_REQ(1)); /* 15.2.10.5.45(x) */ -#endif mrb_define_method(mrb, s, "inspect", mrb_str_inspect, ARGS_NONE()); /* 15.2.10.5.46(x) */ } diff --git a/src/struct.c b/src/struct.c index 699825cff..d06124b50 100644 --- a/src/struct.c +++ b/src/struct.c @@ -379,7 +379,7 @@ mrb_struct_s_def(mrb_state *mrb, mrb_value klass) pargv = &argv[0]; argcnt++; } - rest = mrb_ary_new_from_values(mrb, pargv, argcnt); + rest = mrb_ary_new_from_values(mrb, argcnt, 
pargv); } st = make_struct(mrb, name, rest, struct_class(mrb)); if (!mrb_nil_p(b)) { diff --git a/src/symbol.c b/src/symbol.c index b4ffc19e6..89e81af0e 100644 --- a/src/symbol.c +++ b/src/symbol.c @@ -149,13 +149,7 @@ mrb_sym_to_s(mrb_state *mrb, mrb_value sym) { mrb_sym id = SYM2ID(sym); -#ifdef INCLUDE_REGEXP - //return str_new3(mrb_cString, mrb_id2str(id)); - return str_new3(mrb, mrb_obj_class(mrb, sym), mrb_str_new_cstr(mrb, mrb_sym2name(mrb, id))); -#else - return mrb_str_new_cstr(mrb, mrb_sym2name(mrb, id)); //mrb_str_new2(mrb_id2name(SYM2ID(sym))); -#endif - + return mrb_str_new_cstr(mrb, mrb_sym2name(mrb, id)); } /* 15.2.11.3.4 */ @@ -185,42 +179,113 @@ sym_to_sym(mrb_state *mrb, mrb_value sym) * :fred.inspect #=> ":fred" */ +#if __STDC__ +# define SIGN_EXTEND_CHAR(c) ((signed char)(c)) +#else /* not __STDC__ */ +/* As in Harbison and Steele. */ +# define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128) +#endif +#define is_identchar(c) (SIGN_EXTEND_CHAR(c)!=-1&&(ISALNUM(c) || (c) == '_')) + +static int +is_special_global_name(m) + const char *m; +{ + switch (*m) { + case '~': case '*': case '$': case '?': case '!': case '@': + case '/': case '\\': case ';': case ',': case '.': case '=': + case ':': case '<': case '>': case '\"': + case '&': case '`': case '\'': case '+': + case '0': + ++m; + break; + case '-': + ++m; + if (is_identchar(*m)) m += 1; + break; + default: + if (!ISDIGIT(*m)) return 0; + do ++m; while (ISDIGIT(*m)); + } + return !*m; +} + +static int +symname_p(const char *name) +{ + const char *m = name; + int localid = FALSE; + + if (!m) return FALSE; + switch (*m) { + case '\0': + return FALSE; + + case '$': + if (is_special_global_name(++m)) return TRUE; + goto id; + + case '@': + if (*++m == '@') ++m; + goto id; + + case '<': + switch (*++m) { + case '<': ++m; break; + case '=': if (*++m == '>') ++m; break; + default: break; + } + break; + + case '>': + switch (*++m) { + case '>': case '=': ++m; break; + } + break; + + case '=': + 
switch (*++m) { + case '~': ++m; break; + case '=': if (*++m == '=') ++m; break; + default: return FALSE; + } + break; + + case '*': + if (*++m == '*') ++m; + break; + + case '+': case '-': + if (*++m == '@') ++m; + break; + + case '|': case '^': case '&': case '/': case '%': case '~': case '`': + ++m; + break; + + case '[': + if (*++m != ']') return FALSE; + if (*++m == '=') ++m; + break; + + default: + localid = !ISUPPER(*m); +id: + if (*m != '_' && !ISALPHA(*m)) return FALSE; + while (is_identchar(*m)) m += 1; + if (localid) { + switch (*m) { + case '!': case '?': case '=': ++m; + } + } + break; + } + return *m ? FALSE : TRUE; +} + static mrb_value sym_inspect(mrb_state *mrb, mrb_value sym) { -#ifdef INCLUDE_ENCODING - #define STR_ENC_GET(mrb, str) mrb_enc_from_index(mrb, ENCODING_GET(mrb, str)) - mrb_value str; - mrb_sym id = SYM2ID(sym); - mrb_encoding *enc; - const char *ptr; - long len; - char *dest; - mrb_encoding *resenc = mrb_default_internal_encoding(mrb); - - if (resenc == NULL) resenc = mrb_default_external_encoding(mrb); - sym = mrb_str_new_cstr(mrb, mrb_sym2name(mrb, id));//mrb_id2str(id); - enc = STR_ENC_GET(mrb, sym); - ptr = RSTRING_PTR(sym); - len = RSTRING_LEN(sym); - if ((resenc != enc && !mrb_str_is_ascii_only_p(mrb, sym)) || len != (long)strlen(ptr) || - !mrb_enc_symname_p(ptr, enc) || !sym_printable(mrb, ptr, ptr + len, enc)) { - str = mrb_str_inspect(mrb, sym); - len = RSTRING_LEN(str); - mrb_str_resize(mrb, str, len + 1); - dest = RSTRING_PTR(str); - memmove(dest + 1, dest, len); - dest[0] = ':'; - } - else { - char *dest; - str = mrb_enc_str_new(mrb, 0, len + 1, enc); - dest = RSTRING_PTR(str); - dest[0] = ':'; - memcpy(dest + 1, ptr, len); - } - return str; -#else mrb_value str; const char *name; mrb_sym id = SYM2ID(sym); @@ -229,12 +294,11 @@ sym_inspect(mrb_state *mrb, mrb_value sym) str = mrb_str_new(mrb, 0, strlen(name)+1); RSTRING(str)->buf[0] = ':'; strcpy(RSTRING(str)->buf+1, name); - if (!mrb_symname_p(name)) { + if 
(!symname_p(name)) { str = mrb_str_dump(mrb, str); strncpy(RSTRING(str)->buf, ":\"", 2); } return str; -#endif } diff --git a/src/transcode.c b/src/transcode.c deleted file mode 100644 index d9f0ce896..000000000 --- a/src/transcode.c +++ /dev/null @@ -1,4386 +0,0 @@ -/********************************************************************** - - transcode.c - - - $Author: usa $ - created at: Tue Oct 30 16:10:22 JST 2007 - - Copyright (C) 2007 Martin Duerst - -**********************************************************************/ - -#include "mruby.h" -#ifdef INCLUDE_ENCODING -#include "encoding.h" -#include <sys/types.h> /* for ssize_t */ -#ifdef _MSC_VER -typedef int ssize_t; -#endif -#include "transcode_data.h" -#include <ctype.h> -#include "st.h" -#include "mruby/variable.h" -#include <string.h> -#include "mruby/string.h" -#include "mruby/array.h" -#include "mruby/hash.h" -#include "error.h" -#include "mruby/numeric.h" -//#include "mio.h" -#include <stdio.h> - - -#define TYPE(o) (o).tt//mrb_type(o) - -#define E_CONVERTERNOTFOUND_ERROR (mrb_class_obj_get(mrb, "ConverterNotFoundError")) -#define E_INVALIDBYTESEQUENCE_ERROR (mrb_class_obj_get(mrb, "InvalidByteSequenceError")) -#define E_UNDEFINEDCONVERSION_ERROR (mrb_class_obj_get(mrb, "UndefinedConversionError")) - -/* mrb_value mrb_cEncoding = rb_define_class("Encoding", rb_cObject); */ -mrb_value rb_eUndefinedConversionError; -mrb_value mrb_eInvalidByteSequenceError; -mrb_value rb_eConverterNotFoundError; - -mrb_value mrb_cEncodingConverter; - -static mrb_value sym_invalid, sym_undef, sym_replace, sym_fallback; -static mrb_value sym_xml, sym_text, sym_attr; -static mrb_value sym_universal_newline; -static mrb_value sym_crlf_newline; -static mrb_value sym_cr_newline; -static mrb_value sym_partial_input; - -static mrb_value sym_invalid_byte_sequence; -static mrb_value sym_undefined_conversion; -static mrb_value sym_destination_buffer_full; -static mrb_value sym_source_buffer_empty; -static mrb_value sym_finished; 
-static mrb_value sym_after_output; -static mrb_value sym_incomplete_input; - -static unsigned char * -allocate_converted_string(mrb_state *mrb, - const char *sname, const char *dname, - const unsigned char *str, size_t len, - unsigned char *caller_dst_buf, size_t caller_dst_bufsize, - size_t *dst_len_ptr); - -union mrb_transcoding_state_t { /* opaque data for stateful encoding */ - void *ptr; - char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)]; - double dummy_for_alignment; -}; - -/* dynamic structure, one per conversion (similar to iconv_t) */ -/* may carry conversion state (e.g. for iso-2022-jp) */ -typedef struct mrb_transcoding { - const mrb_transcoder *transcoder; - - int flags; - - int resume_position; - unsigned int next_table; - mrb_value next_info; - unsigned char next_byte; - unsigned int output_index; - - ssize_t recognized_len; /* already interpreted */ - ssize_t readagain_len; /* not yet interpreted */ - union { - unsigned char ary[8]; /* max_input <= sizeof(ary) */ - unsigned char *ptr; /* length: max_input */ - } readbuf; /* recognized_len + readagain_len used */ - - ssize_t writebuf_off; - ssize_t writebuf_len; - union { - unsigned char ary[8]; /* max_output <= sizeof(ary) */ - unsigned char *ptr; /* length: max_output */ - } writebuf; - - union mrb_transcoding_state_t state; -} mrb_transcoding; -#define TRANSCODING_READBUF(tc) \ - ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \ - (tc)->readbuf.ary : \ - (tc)->readbuf.ptr) -#define TRANSCODING_WRITEBUF(tc) \ - ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \ - (tc)->writebuf.ary : \ - (tc)->writebuf.ptr) -#define TRANSCODING_WRITEBUF_SIZE(tc) \ - ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? 
\ - sizeof((tc)->writebuf.ary) : \ - (size_t)(tc)->transcoder->max_output) -#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union mrb_transcoding_state_t)) -#define TRANSCODING_STATE(tc) \ - ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \ - (tc)->state.ary : \ - (tc)->state.ptr) - -typedef struct { - struct mrb_transcoding *tc; - unsigned char *out_buf_start; - unsigned char *out_data_start; - unsigned char *out_data_end; - unsigned char *out_buf_end; - mrb_econv_result_t last_result; -} mrb_econv_elem_t; - -struct mrb_econv_t { - int flags; - const char *source_encoding_name; - const char *destination_encoding_name; - - int started; - - const unsigned char *replacement_str; - size_t replacement_len; - const char *replacement_enc; - int replacement_allocated; - - unsigned char *in_buf_start; - unsigned char *in_data_start; - unsigned char *in_data_end; - unsigned char *in_buf_end; - mrb_econv_elem_t *elems; - int num_allocated; - int num_trans; - int num_finished; - struct mrb_transcoding *last_tc; - - /* last error */ - struct { - mrb_econv_result_t result; - struct mrb_transcoding *error_tc; - const char *source_encoding; - const char *destination_encoding; - const unsigned char *error_bytes_start; - size_t error_bytes_len; - size_t readagain_len; - } last_error; - - /* The following fields are only for Encoding::Converter. - * mrb_econv_open set them NULL. 
*/ - mrb_encoding *source_encoding; - mrb_encoding *destination_encoding; -}; - -/* - * Dispatch data and logic - */ - -#define DECORATOR_P(sname, dname) (*(sname) == '\0') - -typedef struct { - const char *sname; - const char *dname; - const char *lib; /* null means means no need to load a library */ - const mrb_transcoder *transcoder; -} transcoder_entry_t; - -static st_table *transcoder_table; - -static transcoder_entry_t * -make_transcoder_entry(const char *sname, const char *dname) -{ - st_data_t val; - st_table *table2; - - if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { - val = (st_data_t)st_init_strcasetable(); - st_add_direct(transcoder_table, (st_data_t)sname, val); - } - table2 = (st_table*)val; - if (!st_lookup(table2, (st_data_t)dname, &val)) { - transcoder_entry_t *entry = malloc(sizeof(transcoder_entry_t)); - entry->sname = sname; - entry->dname = dname; - entry->lib = NULL; - entry->transcoder = NULL; - val = (st_data_t)entry; - st_add_direct(table2, (st_data_t)dname, val); - } - return (transcoder_entry_t*)val; -} - -static transcoder_entry_t * -get_transcoder_entry(const char *sname, const char *dname) -{ - st_data_t val; - st_table *table2; - - if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { - return NULL; - } - table2 = (st_table*)val; - if (!st_lookup(table2, (st_data_t)dname, &val)) { - return NULL; - } - return (transcoder_entry_t*)val; -} - -void -mrb_register_transcoder(mrb_state *mrb, const mrb_transcoder *tr) -{ - const char *const sname = tr->src_encoding; - const char *const dname = tr->dst_encoding; - - transcoder_entry_t *entry; - - entry = make_transcoder_entry(sname, dname); - if (entry->transcoder) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "transcoder from %s to %s has been already registered", - sname, dname); - } - - entry->transcoder = tr; -} - -static void -declare_transcoder(const char *sname, const char *dname, const char *lib) -{ - transcoder_entry_t *entry; - - entry = make_transcoder_entry(sname, dname); 
- entry->lib = lib; -} - -#define MAX_TRANSCODER_LIBNAME_LEN 64 -static const char transcoder_lib_prefix[] = "enc/trans/"; - -void -mrb_declare_transcoder(mrb_state *mrb, const char *enc1, const char *enc2, const char *lib) -{ - if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid library name - %s", - lib ? lib : "(null)"); - } - declare_transcoder(enc1, enc2, lib); -} - -#define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0) - -typedef struct search_path_queue_tag { - struct search_path_queue_tag *next; - const char *enc; -} search_path_queue_t; - -typedef struct { - st_table *visited; - search_path_queue_t *queue; - search_path_queue_t **queue_last_ptr; - const char *base_enc; -} search_path_bfs_t; - -static enum st_retval -transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg) -{ - const char *dname = (const char*)key; - search_path_bfs_t *bfs = (search_path_bfs_t*)arg; - search_path_queue_t *q; - - if (st_lookup(bfs->visited, (st_data_t)dname, &val)) { - return ST_CONTINUE; - } - - q = malloc(sizeof(search_path_queue_t)); - q->enc = dname; - q->next = NULL; - *bfs->queue_last_ptr = q; - bfs->queue_last_ptr = &q->next; - - st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc); - return ST_CONTINUE; -} - -static int -transcode_search_path(mrb_state *mrb, const char *sname, const char *dname, - void (*callback)(mrb_state *mrb, const char *sname, const char *dname, int depth, void *arg), - void *arg) -{ - search_path_bfs_t bfs; - search_path_queue_t *q; - st_data_t val; - st_table *table2; - int found; - int pathlen = -1; - - if (encoding_equal(sname, dname)) - return -1; - - q = malloc(sizeof(search_path_queue_t));//ALLOC(search_path_queue_t); - q->enc = sname; - q->next = NULL; - bfs.queue_last_ptr = &q->next; - bfs.queue = q; - - bfs.visited = st_init_strcasetable(); - st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL); - - while (bfs.queue) { - q = bfs.queue; - 
bfs.queue = q->next; - if (!bfs.queue) - bfs.queue_last_ptr = &bfs.queue; - - if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) { - xfree(q); - continue; - } - table2 = (st_table*)val; - - if (st_lookup(table2, (st_data_t)dname, &val)) { - st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc); - xfree(q); - found = 1; - goto cleanup; - } - - bfs.base_enc = q->enc; - st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs); - bfs.base_enc = NULL; - - xfree(q); - } - found = 0; - - cleanup: - while (bfs.queue) { - q = bfs.queue; - bfs.queue = q->next; - xfree(q); - } - - if (found) { - const char *enc = dname; - int depth; - pathlen = 0; - while (1) { - st_lookup(bfs.visited, (st_data_t)enc, &val); - if (!val) - break; - pathlen++; - enc = (const char*)val; - } - depth = pathlen; - enc = dname; - while (1) { - st_lookup(bfs.visited, (st_data_t)enc, &val); - if (!val) - break; - callback(mrb, (const char*)val, enc, --depth, arg); - enc = (const char*)val; - } - } - - st_free_table(bfs.visited); - - return pathlen; /* is -1 if not found */ -} - -int -mrb_require(mrb_state *mrb, const char *fname) -{ - //mrb_value fn = mrb_str_new2(mrb, fname); - //OBJ_FREEZE(fn); - //return mrb_require_safe(fn, mrb_safe_level()); - mrb_str_new2(mrb, fname); - return 1/* OK */; -} - -static const mrb_transcoder * -load_transcoder_entry(mrb_state *mrb, transcoder_entry_t *entry) -{ - if (entry->transcoder) - return entry->transcoder; - - if (entry->lib) { - const char *lib = entry->lib; - size_t len = strlen(lib); - char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN]; - - entry->lib = NULL; - - if (len > MAX_TRANSCODER_LIBNAME_LEN) - return NULL; - memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1); - memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1); - if (!mrb_require(mrb, path)) - return NULL; - } - - if (entry->transcoder) - return entry->transcoder; - - return NULL; -} - -static const char* 
-get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr) -{ - if (encoding_equal(encname, "UTF-8")) { - *len_ret = 3; - *repl_encname_ptr = "UTF-8"; - return "\xEF\xBF\xBD"; - } - else { - *len_ret = 1; - *repl_encname_ptr = "US-ASCII"; - return "?"; - } -} - -/* - * Transcoding engine logic - */ - -static const unsigned char * -transcode_char_start(mrb_transcoding *tc, - const unsigned char *in_start, - const unsigned char *inchar_start, - const unsigned char *in_p, - size_t *char_len_ptr) -{ - const unsigned char *ptr; - if (inchar_start - in_start < tc->recognized_len) { - memcpy(TRANSCODING_READBUF(tc) + tc->recognized_len, - inchar_start, in_p - inchar_start); - ptr = TRANSCODING_READBUF(tc); - } - else { - ptr = inchar_start - tc->recognized_len; - } - *char_len_ptr = tc->recognized_len + (in_p - inchar_start); - return ptr; -} - -static mrb_econv_result_t -transcode_restartable0(mrb_state *mrb, - const unsigned char **in_pos, unsigned char **out_pos, - const unsigned char *in_stop, unsigned char *out_stop, - mrb_transcoding *tc, - const int opt) -{ - const mrb_transcoder *tr = tc->transcoder; - int unitlen = tr->input_unit_length; - ssize_t readagain_len = 0; - - const unsigned char *inchar_start; - const unsigned char *in_p; - - unsigned char *out_p; - - in_p = inchar_start = *in_pos; - - out_p = *out_pos; - -#define SUSPEND(ret, num) \ - do { \ - tc->resume_position = (num); \ - if (0 < in_p - inchar_start) \ - memmove(TRANSCODING_READBUF(tc)+tc->recognized_len, \ - inchar_start, in_p - inchar_start); \ - *in_pos = in_p; \ - *out_pos = out_p; \ - tc->recognized_len += in_p - inchar_start; \ - if (readagain_len) { \ - tc->recognized_len -= readagain_len; \ - tc->readagain_len = readagain_len; \ - } \ - return ret; \ - resume_label ## num:; \ - } while (0) -#define SUSPEND_OBUF(num) \ - do { \ - while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \ - } while (0) - -#define 
SUSPEND_AFTER_OUTPUT(num) \ - if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \ - SUSPEND(econv_after_output, num); \ - } - -#define next_table (tc->next_table) -#define next_info (tc->next_info) -#define next_byte (tc->next_byte) -#define writebuf_len (tc->writebuf_len) -#define writebuf_off (tc->writebuf_off) - - switch (tc->resume_position) { - case 0: break; - case 1: goto resume_label1; - case 2: goto resume_label2; - case 3: goto resume_label3; - case 4: goto resume_label4; - case 5: goto resume_label5; - case 6: goto resume_label6; - case 7: goto resume_label7; - case 8: goto resume_label8; - case 9: goto resume_label9; - case 10: goto resume_label10; - case 11: goto resume_label11; - case 12: goto resume_label12; - case 13: goto resume_label13; - case 14: goto resume_label14; - case 15: goto resume_label15; - case 16: goto resume_label16; - case 17: goto resume_label17; - case 18: goto resume_label18; - case 19: goto resume_label19; - case 20: goto resume_label20; - case 21: goto resume_label21; - case 22: goto resume_label22; - case 23: goto resume_label23; - case 24: goto resume_label24; - case 25: goto resume_label25; - case 26: goto resume_label26; - case 27: goto resume_label27; - case 28: goto resume_label28; - case 29: goto resume_label29; - case 30: goto resume_label30; - case 31: goto resume_label31; - case 32: goto resume_label32; - case 33: goto resume_label33; - case 34: goto resume_label34; - default: break; - } - - while (1) { - inchar_start = in_p; - tc->recognized_len = 0; - next_table = tr->conv_tree_start; - - SUSPEND_AFTER_OUTPUT(24); - - if (in_stop <= in_p) { - if (!(opt & ECONV_PARTIAL_INPUT)) - break; - SUSPEND(econv_source_buffer_empty, 7); - continue; - } - -#define BYTE_ADDR(index) (tr->byte_array + (index)) -#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index)) -#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table))) -#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table))) -#define 
BL_MIN_BYTE (BL_BASE[0]) -#define BL_MAX_BYTE (BL_BASE[1]) -#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE]) -#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))]) - - next_byte = (unsigned char)*in_p++; - follow_byte: - if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte) - next_info = mrb_fixnum_value(INVALID); - else { - next_info = mrb_fixnum_value(BL_ACTION(next_byte)); - } - follow_info: - switch (mrb_fixnum(next_info) & 0x1F) { - case NOMAP: - { - { - const unsigned char *p = inchar_start; - writebuf_off = 0; - while (p < in_p) { - TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++; - } - } - writebuf_len = writebuf_off; - writebuf_off = 0; - while (writebuf_off < writebuf_len) { - SUSPEND_OBUF(3); - *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; - } - } - continue; - case 0x00: case 0x04: case 0x08: case 0x0C: - case 0x10: case 0x14: case 0x18: case 0x1C: - SUSPEND_AFTER_OUTPUT(25); - while (in_p >= in_stop) { - if (!(opt & ECONV_PARTIAL_INPUT)) - goto incomplete; - SUSPEND(econv_source_buffer_empty, 5); - } - next_byte = (unsigned char)*in_p++; - next_table = (unsigned int)mrb_fixnum(next_info); - goto follow_byte; - case ZERObt: /* drop input */ - continue; - case ONEbt: - SUSPEND_OBUF(9); *out_p++ = getBT1(mrb_fixnum(next_info)); - continue; - case TWObt: - SUSPEND_OBUF(10); *out_p++ = getBT1(mrb_fixnum(next_info)); - SUSPEND_OBUF(21); *out_p++ = getBT2(mrb_fixnum(next_info)); - continue; - case THREEbt: - SUSPEND_OBUF(11); *out_p++ = getBT1(mrb_fixnum(next_info)); - SUSPEND_OBUF(15); *out_p++ = getBT2(mrb_fixnum(next_info)); - SUSPEND_OBUF(16); *out_p++ = getBT3(mrb_fixnum(next_info)); - continue; - case FOURbt: - SUSPEND_OBUF(12); *out_p++ = getBT0(mrb_fixnum(next_info)); - SUSPEND_OBUF(17); *out_p++ = getBT1(mrb_fixnum(next_info)); - SUSPEND_OBUF(18); *out_p++ = getBT2(mrb_fixnum(next_info)); - SUSPEND_OBUF(19); *out_p++ = getBT3(mrb_fixnum(next_info)); - continue; - case GB4bt: - SUSPEND_OBUF(29); *out_p++ = 
getGB4bt0((unsigned char)mrb_fixnum(next_info)); - SUSPEND_OBUF(30); *out_p++ = getGB4bt1((mrb_fixnum(next_info))); - SUSPEND_OBUF(31); *out_p++ = getGB4bt2((unsigned char)mrb_fixnum(next_info)); - SUSPEND_OBUF(32); *out_p++ = getGB4bt3(mrb_fixnum(next_info)); - continue; - case STR1: - tc->output_index = 0; - while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(mrb_fixnum(next_info))))) { - SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(mrb_fixnum(next_info)))[1+tc->output_index]; - tc->output_index++; - } - continue; - case FUNii: - next_info = (*tr->func_ii)(TRANSCODING_STATE(tc), next_info); - goto follow_info; - case FUNsi: - { - const unsigned char *char_start; - size_t char_len; - char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); - next_info = (*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len); - goto follow_info; - } - case FUNio: - SUSPEND_OBUF(13); - if (tr->max_output <= out_stop - out_p) - out_p += tr->func_io(TRANSCODING_STATE(tc), - next_info, out_p, out_stop - out_p); - else { - writebuf_len = tr->func_io(TRANSCODING_STATE(tc), - next_info, - TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); - writebuf_off = 0; - while (writebuf_off < writebuf_len) { - SUSPEND_OBUF(20); - *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; - } - } - break; - case FUNso: - { - const unsigned char *char_start; - size_t char_len; - SUSPEND_OBUF(14); - if (tr->max_output <= out_stop - out_p) { - char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); - out_p += tr->func_so(TRANSCODING_STATE(tc), - char_start, (size_t)char_len, - out_p, out_stop - out_p); - } - else { - char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); - writebuf_len = tr->func_so(TRANSCODING_STATE(tc), - char_start, (size_t)char_len, - TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); - writebuf_off = 0; - while (writebuf_off < writebuf_len) { - SUSPEND_OBUF(22); - *out_p++ 
= TRANSCODING_WRITEBUF(tc)[writebuf_off++]; - } - } - break; - } - case FUNsio: - { - const unsigned char *char_start; - size_t char_len; - SUSPEND_OBUF(33); - if (tr->max_output <= out_stop - out_p) { - char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); - out_p += tr->func_sio(TRANSCODING_STATE(tc), - char_start, (size_t)char_len, next_info, - out_p, out_stop - out_p); - } - else { - char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); - writebuf_len = tr->func_sio(TRANSCODING_STATE(tc), - char_start, (size_t)char_len, next_info, - TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); - writebuf_off = 0; - while (writebuf_off < writebuf_len) { - SUSPEND_OBUF(34); - *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; - } - } - break; - } - case INVALID: - if (tc->recognized_len + (in_p - inchar_start) <= unitlen) { - if (tc->recognized_len + (in_p - inchar_start) < unitlen) - SUSPEND_AFTER_OUTPUT(26); - while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) { - in_p = in_stop; - SUSPEND(econv_source_buffer_empty, 8); - } - if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) { - in_p = in_stop; - } - else { - in_p = inchar_start + (unitlen - tc->recognized_len); - } - } - else { - ssize_t invalid_len; /* including the last byte which causes invalid */ - ssize_t discard_len; - invalid_len = tc->recognized_len + (in_p - inchar_start); - discard_len = ((invalid_len - 1) / unitlen) * unitlen; - readagain_len = invalid_len - discard_len; - } - goto invalid; - case UNDEF: - goto undef; - default: - mrb_raise(mrb, mrb->eRuntimeError_class, "unknown transcoding instruction"); - } - continue; - - invalid: - SUSPEND(econv_invalid_byte_sequence, 1); - continue; - - incomplete: - SUSPEND(econv_incomplete_input, 27); - continue; - - undef: - SUSPEND(econv_undefined_conversion, 2); - continue; - } - - /* cleanup */ - if (tr->finish_func) { - SUSPEND_OBUF(4); - if 
(tr->max_output <= out_stop - out_p) { - out_p += tr->finish_func(TRANSCODING_STATE(tc), - out_p, out_stop - out_p); - } - else { - writebuf_len = tr->finish_func(TRANSCODING_STATE(tc), - TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); - writebuf_off = 0; - while (writebuf_off < writebuf_len) { - SUSPEND_OBUF(23); - *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; - } - } - } - while (1) - SUSPEND(econv_finished, 6); -#undef SUSPEND -#undef next_table -#undef next_info -#undef next_byte -#undef writebuf_len -#undef writebuf_off -} - -static mrb_econv_result_t -transcode_restartable(mrb_state *mrb, - const unsigned char **in_pos, unsigned char **out_pos, - const unsigned char *in_stop, unsigned char *out_stop, - mrb_transcoding *tc, - const int opt) -{ - if (tc->readagain_len) { - unsigned char *readagain_buf = malloc(tc->readagain_len);//ALLOCA_N(unsigned char, tc->readagain_len); - const unsigned char *readagain_pos = readagain_buf; - const unsigned char *readagain_stop = readagain_buf + tc->readagain_len; - mrb_econv_result_t res; - - memcpy(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len, - tc->readagain_len); - tc->readagain_len = 0; - res = transcode_restartable0(mrb, &readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT); - if (res != econv_source_buffer_empty) { - memcpy(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len, - readagain_pos, readagain_stop - readagain_pos); - tc->readagain_len += readagain_stop - readagain_pos; - return res; - } - } - return transcode_restartable0(mrb, in_pos, out_pos, in_stop, out_stop, tc, opt); -} - -static mrb_transcoding * -mrb_transcoding_open_by_transcoder(const mrb_transcoder *tr, int flags) -{ - mrb_transcoding *tc; - - tc = malloc(sizeof(mrb_transcoding)); - tc->transcoder = tr; - tc->flags = flags; - if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) - tc->state.ptr = xmalloc(tr->state_size); - if (tr->state_init_func) { - 
(tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */ - } - tc->resume_position = 0; - tc->recognized_len = 0; - tc->readagain_len = 0; - tc->writebuf_len = 0; - tc->writebuf_off = 0; - if ((int)sizeof(tc->readbuf.ary) < tr->max_input) { - tc->readbuf.ptr = xmalloc(tr->max_input); - } - if ((int)sizeof(tc->writebuf.ary) < tr->max_output) { - tc->writebuf.ptr = xmalloc(tr->max_output); - } - return tc; -} - -static mrb_econv_result_t -mrb_transcoding_convert(mrb_state *mrb, mrb_transcoding *tc, - const unsigned char **input_ptr, const unsigned char *input_stop, - unsigned char **output_ptr, unsigned char *output_stop, - int flags) -{ - return transcode_restartable(mrb, - input_ptr, output_ptr, - input_stop, output_stop, - tc, flags); -} - -static void -mrb_transcoding_close(mrb_transcoding *tc) -{ - const mrb_transcoder *tr = tc->transcoder; - if (tr->state_fini_func) { - (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */ - } - if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) - xfree(tc->state.ptr); - if ((int)sizeof(tc->readbuf.ary) < tr->max_input) - xfree(tc->readbuf.ptr); - if ((int)sizeof(tc->writebuf.ary) < tr->max_output) - xfree(tc->writebuf.ptr); - xfree(tc); -} - -static size_t -mrb_transcoding_memsize(mrb_transcoding *tc) -{ - size_t size = sizeof(mrb_transcoding); - const mrb_transcoder *tr = tc->transcoder; - - if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) { - size += tr->state_size; - } - if ((int)sizeof(tc->readbuf.ary) < tr->max_input) { - size += tr->max_input; - } - if ((int)sizeof(tc->writebuf.ary) < tr->max_output) { - size += tr->max_output; - } - return size; -} - -static mrb_econv_t * -mrb_econv_alloc(int n_hint) -{ - mrb_econv_t *ec; - - if (n_hint <= 0) - n_hint = 1; - - ec = malloc(sizeof(mrb_econv_t));//ALLOC(mrb_econv_t); - ec->flags = 0; - ec->source_encoding_name = NULL; - ec->destination_encoding_name = NULL; - ec->started = 0; - ec->replacement_str = NULL; - ec->replacement_len = 0; - 
ec->replacement_enc = NULL; - ec->replacement_allocated = 0; - ec->in_buf_start = NULL; - ec->in_data_start = NULL; - ec->in_data_end = NULL; - ec->in_buf_end = NULL; - ec->num_allocated = n_hint; - ec->num_trans = 0; - ec->elems = malloc(sizeof(mrb_econv_elem_t)*ec->num_allocated);//ALLOC_N(mrb_econv_elem_t, ec->num_allocated); - ec->num_finished = 0; - ec->last_tc = NULL; - ec->last_error.result = econv_source_buffer_empty; - ec->last_error.error_tc = NULL; - ec->last_error.source_encoding = NULL; - ec->last_error.destination_encoding = NULL; - ec->last_error.error_bytes_start = NULL; - ec->last_error.error_bytes_len = 0; - ec->last_error.readagain_len = 0; - ec->source_encoding = NULL; - ec->destination_encoding = NULL; - return ec; -} - -static int -mrb_econv_add_transcoder_at(mrb_state *mrb, mrb_econv_t *ec, const mrb_transcoder *tr, int i) -{ - int n, j; - int bufsize = 4096; - unsigned char *p; - - if (ec->num_trans == ec->num_allocated) { - n = ec->num_allocated * 2; - mrb_realloc(mrb, ec->elems, sizeof(mrb_econv_elem_t)*n);//REALLOC_N(ec->elems, mrb_econv_elem_t, n); - ec->num_allocated = n; - } - - p = xmalloc(bufsize); - - memmove(ec->elems+i+1, ec->elems+i, sizeof(mrb_econv_elem_t)*(ec->num_trans-i)); - - ec->elems[i].tc = mrb_transcoding_open_by_transcoder(tr, 0); - ec->elems[i].out_buf_start = p; - ec->elems[i].out_buf_end = p + bufsize; - ec->elems[i].out_data_start = p; - ec->elems[i].out_data_end = p; - ec->elems[i].last_result = econv_source_buffer_empty; - - ec->num_trans++; - - if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding)) - for (j = ec->num_trans-1; i <= j; j--) { - mrb_transcoding *tc = ec->elems[j].tc; - const mrb_transcoder *tr2 = tc->transcoder; - if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) { - ec->last_tc = tc; - break; - } - } - - return 0; -} - -static mrb_econv_t * -mrb_econv_open_by_transcoder_entries(mrb_state *mrb, int n, transcoder_entry_t **entries) -{ - mrb_econv_t *ec; - int i, ret; - - for (i = 0; i < n; i++) 
{ - const mrb_transcoder *tr; - tr = load_transcoder_entry(mrb, entries[i]); - if (!tr) - return NULL; - } - - ec = mrb_econv_alloc(n); - - for (i = 0; i < n; i++) { - const mrb_transcoder *tr = load_transcoder_entry(mrb, entries[i]); - ret = mrb_econv_add_transcoder_at(mrb, ec, tr, ec->num_trans); - if (ret == -1) { - mrb_econv_close(ec); - return NULL; - } - } - - return ec; -} - -struct trans_open_t { - transcoder_entry_t **entries; - int num_additional; -}; - -static void -trans_open_i(mrb_state *mrb, const char *sname, const char *dname, int depth, void *arg) -{ - struct trans_open_t *toarg = arg; - - if (!toarg->entries) { - toarg->entries = malloc(sizeof(transcoder_entry_t*)*depth+1+toarg->num_additional);//ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional); - } - toarg->entries[depth] = get_transcoder_entry(sname, dname); -} - -static mrb_econv_t * -mrb_econv_open0(mrb_state *mrb, const char *sname, const char *dname, int ecflags) -{ - transcoder_entry_t **entries = NULL; - int num_trans; - mrb_econv_t *ec; - - mrb_encoding *senc, *denc; - int sidx, didx; - - senc = NULL; - if (*sname) { - sidx = mrb_enc_find_index(mrb, sname); - if (0 <= sidx) { - senc = mrb_enc_from_index(mrb, sidx); - } - } - - denc = NULL; - if (*dname) { - didx = mrb_enc_find_index(mrb, dname); - if (0 <= didx) { - denc = mrb_enc_from_index(mrb, didx); - } - } - - if (*sname == '\0' && *dname == '\0') { - num_trans = 0; - entries = NULL; - } - else { - struct trans_open_t toarg; - toarg.entries = NULL; - toarg.num_additional = 0; - num_trans = transcode_search_path(mrb, sname, dname, trans_open_i, (void*)&toarg); - entries = toarg.entries; - if (num_trans < 0) { - xfree(entries); - return NULL; - } - } - - ec = mrb_econv_open_by_transcoder_entries(mrb, num_trans, entries); - xfree(entries); - if (!ec) - return NULL; - - ec->flags = ecflags; - ec->source_encoding_name = sname; - ec->destination_encoding_name = dname; - - return ec; -} - -#define MAX_ECFLAGS_DECORATORS 32 - 
-static int -decorator_names(int ecflags, const char **decorators_ret) -{ - int num_decorators; - - if ((ecflags & ECONV_CRLF_NEWLINE_DECORATOR) && - (ecflags & ECONV_CR_NEWLINE_DECORATOR)) - return -1; - - if ((ecflags & (ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR)) && - (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)) - return -1; - - if ((ecflags & ECONV_XML_TEXT_DECORATOR) && - (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)) - return -1; - - num_decorators = 0; - - if (ecflags & ECONV_XML_TEXT_DECORATOR) - decorators_ret[num_decorators++] = "xml_text_escape"; - if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) - decorators_ret[num_decorators++] = "xml_attr_content_escape"; - if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) - decorators_ret[num_decorators++] = "xml_attr_quote"; - - if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) - decorators_ret[num_decorators++] = "crlf_newline"; - if (ecflags & ECONV_CR_NEWLINE_DECORATOR) - decorators_ret[num_decorators++] = "cr_newline"; - if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) - decorators_ret[num_decorators++] = "universal_newline"; - - return num_decorators; -} - -mrb_econv_t * -mrb_econv_open(mrb_state *mrb, const char *sname, const char *dname, int ecflags) -{ - mrb_econv_t *ec; - int num_decorators; - const char *decorators[MAX_ECFLAGS_DECORATORS]; - int i; - - num_decorators = decorator_names(ecflags, decorators); - if (num_decorators == -1) - return NULL; - - ec = mrb_econv_open0(mrb, sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK); - if (!ec) - return NULL; - - for (i = 0; i < num_decorators; i++) - if (mrb_econv_decorate_at_last(mrb, ec, decorators[i]) == -1) { - mrb_econv_close(ec); - return NULL; - } - - ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK; - - return ec; -} - -static int -trans_sweep(mrb_state *mrb, mrb_econv_t *ec, - const unsigned char **input_ptr, const unsigned char *input_stop, - unsigned char **output_ptr, unsigned char *output_stop, - int flags, - int start) -{ - int should_try; - 
int i, f; - - const unsigned char **ipp, *is, *iold; - unsigned char **opp, *os, *oold; - mrb_econv_result_t res; - - should_try = 1; - while (should_try) { - should_try = 0; - for (i = start; i < ec->num_trans; i++) { - mrb_econv_elem_t *te = &ec->elems[i]; - - if (i == 0) { - ipp = input_ptr; - is = input_stop; - } - else { - mrb_econv_elem_t *prev_te = &ec->elems[i-1]; - ipp = (const unsigned char **)&prev_te->out_data_start; - is = prev_te->out_data_end; - } - - if (i == ec->num_trans-1) { - opp = output_ptr; - os = output_stop; - } - else { - if (te->out_buf_start != te->out_data_start) { - ssize_t len = te->out_data_end - te->out_data_start; - ssize_t off = te->out_data_start - te->out_buf_start; - memmove(te->out_buf_start, te->out_data_start, len); - te->out_data_start = te->out_buf_start; - te->out_data_end -= off; - } - opp = &te->out_data_end; - os = te->out_buf_end; - } - - f = flags; - if (ec->num_finished != i) - f |= ECONV_PARTIAL_INPUT; - if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) { - start = 1; - flags &= ~ECONV_AFTER_OUTPUT; - } - if (i != 0) - f &= ~ECONV_AFTER_OUTPUT; - iold = *ipp; - oold = *opp; - te->last_result = res = mrb_transcoding_convert(mrb, te->tc, ipp, is, opp, os, f); - if (iold != *ipp || oold != *opp) - should_try = 1; - - switch (res) { - case econv_invalid_byte_sequence: - case econv_incomplete_input: - case econv_undefined_conversion: - case econv_after_output: - return i; - - case econv_destination_buffer_full: - case econv_source_buffer_empty: - break; - - case econv_finished: - ec->num_finished = i+1; - break; - - default: - mrb_bug("Internal Error: invalid return value from mrb_transcoding_convert()."); - break; - } - } - } - return -1; -} - -static mrb_econv_result_t -mrb_trans_conv(mrb_state *mrb, mrb_econv_t *ec, - const unsigned char **input_ptr, const unsigned char *input_stop, - unsigned char **output_ptr, unsigned char *output_stop, - int flags, - int *result_position_ptr) -{ - int i; - int needreport_index; - int 
sweep_start; - - unsigned char empty_buf; - unsigned char *empty_ptr = &empty_buf; - - if (!input_ptr) { - input_ptr = (const unsigned char **)&empty_ptr; - input_stop = empty_ptr; - } - - if (!output_ptr) { - output_ptr = &empty_ptr; - output_stop = empty_ptr; - } - - if (ec->elems[0].last_result == econv_after_output) - ec->elems[0].last_result = econv_source_buffer_empty; - - needreport_index = -1; - for (i = ec->num_trans-1; 0 <= i; i--) { - switch (ec->elems[i].last_result) { - case econv_invalid_byte_sequence: - case econv_incomplete_input: - case econv_undefined_conversion: - case econv_after_output: - case econv_finished: - sweep_start = i+1; - needreport_index = i; - goto found_needreport; - - case econv_destination_buffer_full: - case econv_source_buffer_empty: - break; - - default: - mrb_bug("unexpected transcode last result"); - } - } - - /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */ - - if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full && - (flags & ECONV_AFTER_OUTPUT)) { - mrb_econv_result_t res; - - res = mrb_trans_conv(mrb, ec, NULL, NULL, output_ptr, output_stop, - (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, - result_position_ptr); - - if (res == econv_source_buffer_empty) - return econv_after_output; - return res; - } - - sweep_start = 0; - - found_needreport: - - do { - needreport_index = trans_sweep(mrb, ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start); - sweep_start = needreport_index + 1; - } while (needreport_index != -1 && needreport_index != ec->num_trans-1); - - for (i = ec->num_trans-1; 0 <= i; i--) { - if (ec->elems[i].last_result != econv_source_buffer_empty) { - mrb_econv_result_t res = ec->elems[i].last_result; - if (res == econv_invalid_byte_sequence || - res == econv_incomplete_input || - res == econv_undefined_conversion || - res == econv_after_output) { - ec->elems[i].last_result = econv_source_buffer_empty; - } - if (result_position_ptr) - *result_position_ptr = i; - 
return res; - } - } - if (result_position_ptr) - *result_position_ptr = -1; - return econv_source_buffer_empty; -} - -static mrb_econv_result_t -mrb_econv_convert0(mrb_state *mrb, mrb_econv_t *ec, - const unsigned char **input_ptr, const unsigned char *input_stop, - unsigned char **output_ptr, unsigned char *output_stop, - int flags) -{ - mrb_econv_result_t res; - int result_position; - int has_output = 0; - - memset(&ec->last_error, 0, sizeof(ec->last_error)); - - if (ec->num_trans == 0) { - size_t len; - if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) { - if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) { - len = output_stop - *output_ptr; - memcpy(*output_ptr, ec->in_data_start, len); - *output_ptr = output_stop; - ec->in_data_start += len; - res = econv_destination_buffer_full; - goto gotresult; - } - len = ec->in_data_end - ec->in_data_start; - memcpy(*output_ptr, ec->in_data_start, len); - *output_ptr += len; - ec->in_data_start = ec->in_data_end = ec->in_buf_start; - if (flags & ECONV_AFTER_OUTPUT) { - res = econv_after_output; - goto gotresult; - } - } - if (output_stop - *output_ptr < input_stop - *input_ptr) { - len = output_stop - *output_ptr; - } - else { - len = input_stop - *input_ptr; - } - if (0 < len && (flags & ECONV_AFTER_OUTPUT)) { - *(*output_ptr)++ = *(*input_ptr)++; - res = econv_after_output; - goto gotresult; - } - memcpy(*output_ptr, *input_ptr, len); - *output_ptr += len; - *input_ptr += len; - if (*input_ptr != input_stop) - res = econv_destination_buffer_full; - else if (flags & ECONV_PARTIAL_INPUT) - res = econv_source_buffer_empty; - else - res = econv_finished; - goto gotresult; - } - - if (ec->elems[ec->num_trans-1].out_data_start) { - unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start; - unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end; - if (data_start != data_end) { - size_t len; - if (output_stop - *output_ptr < data_end - data_start) { - len = output_stop - 
*output_ptr; - memcpy(*output_ptr, data_start, len); - *output_ptr = output_stop; - ec->elems[ec->num_trans-1].out_data_start += len; - res = econv_destination_buffer_full; - goto gotresult; - } - len = data_end - data_start; - memcpy(*output_ptr, data_start, len); - *output_ptr += len; - ec->elems[ec->num_trans-1].out_data_start = - ec->elems[ec->num_trans-1].out_data_end = - ec->elems[ec->num_trans-1].out_buf_start; - has_output = 1; - } - } - - if (ec->in_buf_start && - ec->in_data_start != ec->in_data_end) { - res = mrb_trans_conv(mrb, ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop, - (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position); - if (res != econv_source_buffer_empty) - goto gotresult; - } - - if (has_output && - (flags & ECONV_AFTER_OUTPUT) && - *input_ptr != input_stop) { - input_stop = *input_ptr; - res = mrb_trans_conv(mrb, ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); - if (res == econv_source_buffer_empty) - res = econv_after_output; - } - else if ((flags & ECONV_AFTER_OUTPUT) || - ec->num_trans == 1) { - res = mrb_trans_conv(mrb, ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); - } - else { - flags |= ECONV_AFTER_OUTPUT; - do { - res = mrb_trans_conv(mrb, ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); - } while (res == econv_after_output); - } - - gotresult: - ec->last_error.result = res; - if (res == econv_invalid_byte_sequence || - res == econv_incomplete_input || - res == econv_undefined_conversion) { - mrb_transcoding *error_tc = ec->elems[result_position].tc; - ec->last_error.error_tc = error_tc; - ec->last_error.source_encoding = error_tc->transcoder->src_encoding; - ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding; - ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc); - ec->last_error.error_bytes_len = error_tc->recognized_len; - 
ec->last_error.readagain_len = error_tc->readagain_len; - } - - return res; -} - -static int output_replacement_character(mrb_state *mrb, mrb_econv_t *ec); - -static int -output_hex_charref(mrb_state *mrb, mrb_econv_t *ec) -{ - int ret; - unsigned char utfbuf[1024]; - const unsigned char *utf; - size_t utf_len; - int utf_allocated = 0; - char charef_buf[16]; - const unsigned char *p; - - if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) { - utf = ec->last_error.error_bytes_start; - utf_len = ec->last_error.error_bytes_len; - } - else { - utf = allocate_converted_string(mrb, - ec->last_error.source_encoding, "UTF-32BE", - ec->last_error.error_bytes_start, ec->last_error.error_bytes_len, - utfbuf, sizeof(utfbuf), - &utf_len); - if (!utf) - return -1; - if (utf != utfbuf && utf != ec->last_error.error_bytes_start) - utf_allocated = 1; - } - - if (utf_len % 4 != 0) - goto fail; - - p = utf; - while (4 <= utf_len) { - unsigned int u = 0; - u += p[0] << 24; - u += p[1] << 16; - u += p[2] << 8; - u += p[3]; - snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u); - - ret = mrb_econv_insert_output(mrb, ec, (unsigned char*)charef_buf, strlen(charef_buf), "US-ASCII"); - if (ret == -1) - goto fail; - - p += 4; - utf_len -= 4; - } - - if (utf_allocated) - xfree((void*)utf); - return 0; - - fail: - if (utf_allocated) - xfree((void*)utf); - return -1; -} - -mrb_econv_result_t -mrb_econv_convert(mrb_state *mrb, mrb_econv_t *ec, - const unsigned char **input_ptr, const unsigned char *input_stop, - unsigned char **output_ptr, unsigned char *output_stop, - int flags) -{ - mrb_econv_result_t ret; - - unsigned char empty_buf; - unsigned char *empty_ptr = &empty_buf; - - ec->started = 1; - - if (!input_ptr) { - input_ptr = (const unsigned char **)&empty_ptr; - input_stop = empty_ptr; - } - - if (!output_ptr) { - output_ptr = &empty_ptr; - output_stop = empty_ptr; - } - - resume: - ret = mrb_econv_convert0(mrb, ec, input_ptr, input_stop, output_ptr, output_stop, flags); - 
- if (ret == econv_invalid_byte_sequence || - ret == econv_incomplete_input) { - /* deal with invalid byte sequence */ - /* todo: add more alternative behaviors */ - switch (ec->flags & ECONV_INVALID_MASK) { - case ECONV_INVALID_REPLACE: - if (output_replacement_character(mrb, ec) == 0) - goto resume; - - default: - mrb_bug("Internal error: Unhandled ECONV_INVALID_xxx."); - break; - } - } - - if (ret == econv_undefined_conversion) { - /* valid character in source encoding - * but no related character(s) in destination encoding */ - /* todo: add more alternative behaviors */ - switch (ec->flags & ECONV_UNDEF_MASK) { - case ECONV_UNDEF_REPLACE: - if (output_replacement_character(mrb, ec) == 0) - goto resume; - break; - - case ECONV_UNDEF_HEX_CHARREF: - if (output_hex_charref(mrb, ec) == 0) - goto resume; - break; - - default: - mrb_bug("Internal error: Unhandled ECONV_UNDEF_xxx."); - break; - } - } - - return ret; -} - -const char * -mrb_econv_encoding_to_insert_output(mrb_econv_t *ec) -{ - mrb_transcoding *tc = ec->last_tc; - const mrb_transcoder *tr; - - if (tc == NULL) - return ""; - - tr = tc->transcoder; - - if (tr->asciicompat_type == asciicompat_encoder) - return tr->src_encoding; - return tr->dst_encoding; -} - -static unsigned char * -allocate_converted_string(mrb_state *mrb, - const char *sname, const char *dname, - const unsigned char *str, size_t len, - unsigned char *caller_dst_buf, size_t caller_dst_bufsize, - size_t *dst_len_ptr) -{ - unsigned char *dst_str; - size_t dst_len; - size_t dst_bufsize; - - mrb_econv_t *ec; - mrb_econv_result_t res; - - const unsigned char *sp; - unsigned char *dp; - - if (caller_dst_buf) - dst_bufsize = caller_dst_bufsize; - else if (len == 0) - dst_bufsize = 1; - else - dst_bufsize = len; - - ec = mrb_econv_open(mrb, sname, dname, 0); - if (ec == NULL) - return NULL; - if (caller_dst_buf) - dst_str = caller_dst_buf; - else - dst_str = xmalloc(dst_bufsize); - dst_len = 0; - sp = str; - dp = dst_str+dst_len; - res = 
mrb_econv_convert(mrb, ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0); - dst_len = dp - dst_str; - while (res == econv_destination_buffer_full) { - if (SIZE_MAX/2 < dst_bufsize) { - goto fail; - } - dst_bufsize *= 2; - if (dst_str == caller_dst_buf) { - unsigned char *tmp; - tmp = xmalloc(dst_bufsize); - memcpy(tmp, dst_str, dst_bufsize/2); - dst_str = tmp; - } - else { - dst_str = xrealloc(dst_str, dst_bufsize); - } - dp = dst_str+dst_len; - res = mrb_econv_convert(mrb, ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0); - dst_len = dp - dst_str; - } - if (res != econv_finished) { - goto fail; - } - mrb_econv_close(ec); - *dst_len_ptr = dst_len; - return dst_str; - - fail: - if (dst_str != caller_dst_buf) - xfree(dst_str); - mrb_econv_close(ec); - return NULL; -} - -/* result: 0:success -1:failure */ -int -mrb_econv_insert_output(mrb_state *mrb, mrb_econv_t *ec, - const unsigned char *str, size_t len, const char *str_encoding) -{ - const char *insert_encoding = mrb_econv_encoding_to_insert_output(ec); - unsigned char insert_buf[4096]; - const unsigned char *insert_str = NULL; - size_t insert_len; - - int last_trans_index; - mrb_transcoding *tc; - - unsigned char **buf_start_p; - unsigned char **data_start_p; - unsigned char **data_end_p; - unsigned char **buf_end_p; - - size_t need; - - ec->started = 1; - - if (len == 0) - return 0; - - if (encoding_equal(insert_encoding, str_encoding)) { - insert_str = str; - insert_len = len; - } - else { - insert_str = allocate_converted_string(mrb, str_encoding, insert_encoding, - str, len, insert_buf, sizeof(insert_buf), &insert_len); - if (insert_str == NULL) - return -1; - } - - need = insert_len; - - last_trans_index = ec->num_trans-1; - if (ec->num_trans == 0) { - tc = NULL; - buf_start_p = &ec->in_buf_start; - data_start_p = &ec->in_data_start; - data_end_p = &ec->in_data_end; - buf_end_p = &ec->in_buf_end; - } - else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) { - tc = 
ec->elems[last_trans_index].tc; - need += tc->readagain_len; - if (need < insert_len) - goto fail; - if (last_trans_index == 0) { - buf_start_p = &ec->in_buf_start; - data_start_p = &ec->in_data_start; - data_end_p = &ec->in_data_end; - buf_end_p = &ec->in_buf_end; - } - else { - mrb_econv_elem_t *ee = &ec->elems[last_trans_index-1]; - buf_start_p = &ee->out_buf_start; - data_start_p = &ee->out_data_start; - data_end_p = &ee->out_data_end; - buf_end_p = &ee->out_buf_end; - } - } - else { - mrb_econv_elem_t *ee = &ec->elems[last_trans_index]; - buf_start_p = &ee->out_buf_start; - data_start_p = &ee->out_data_start; - data_end_p = &ee->out_data_end; - buf_end_p = &ee->out_buf_end; - tc = ec->elems[last_trans_index].tc; - } - - if (*buf_start_p == NULL) { - unsigned char *buf = xmalloc(need); - *buf_start_p = buf; - *data_start_p = buf; - *data_end_p = buf; - *buf_end_p = buf+need; - } - else if ((size_t)(*buf_end_p - *data_end_p) < need) { - memmove(*buf_start_p, *data_start_p, *data_end_p - *data_start_p); - *data_end_p = *buf_start_p + (*data_end_p - *data_start_p); - *data_start_p = *buf_start_p; - if ((size_t)(*buf_end_p - *data_end_p) < need) { - unsigned char *buf; - size_t s = (*data_end_p - *buf_start_p) + need; - if (s < need) - goto fail; - buf = xrealloc(*buf_start_p, s); - *data_start_p = buf; - *data_end_p = buf + (*data_end_p - *buf_start_p); - *buf_start_p = buf; - *buf_end_p = buf + s; - } - } - - memcpy(*data_end_p, insert_str, insert_len); - *data_end_p += insert_len; - if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) { - memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len); - *data_end_p += tc->readagain_len; - tc->readagain_len = 0; - } - - if (insert_str != str && insert_str != insert_buf) - xfree((void*)insert_str); - return 0; - - fail: - if (insert_str != str && insert_str != insert_buf) - xfree((void*)insert_str); - return -1; -} - -void -mrb_econv_close(mrb_econv_t *ec) -{ - int i; - - if 
(ec->replacement_allocated) { - xfree((void*)ec->replacement_str); - } - for (i = 0; i < ec->num_trans; i++) { - mrb_transcoding_close(ec->elems[i].tc); - if (ec->elems[i].out_buf_start) - xfree(ec->elems[i].out_buf_start); - } - xfree(ec->in_buf_start); - xfree(ec->elems); - xfree(ec); -} - -size_t -mrb_econv_memsize(mrb_econv_t *ec) -{ - size_t size = sizeof(mrb_econv_t); - int i; - - if (ec->replacement_allocated) { - size += ec->replacement_len; - } - for (i = 0; i < ec->num_trans; i++) { - size += mrb_transcoding_memsize(ec->elems[i].tc); - - if (ec->elems[i].out_buf_start) { - size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start; - } - } - size += ec->in_buf_end - ec->in_buf_start; - size += sizeof(mrb_econv_elem_t) * ec->num_allocated; - - return size; -} - -int -mrb_econv_putbackable(mrb_econv_t *ec) -{ - if (ec->num_trans == 0) - return 0; - if (sizeof(size_t) > sizeof(int)) { - if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX; - } - return (int)ec->elems[0].tc->readagain_len; -} - -void -mrb_econv_putback(mrb_econv_t *ec, unsigned char *p, int n) -{ - mrb_transcoding *tc; - if (ec->num_trans == 0 || n == 0) - return; - tc = ec->elems[0].tc; - memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n); - tc->readagain_len -= n; -} - -struct asciicompat_encoding_t { - const char *ascii_compat_name; - const char *ascii_incompat_name; -}; - -static enum st_retval -asciicompat_encoding_i(mrb_state *mrb, st_data_t key, st_data_t val, st_data_t arg) -{ - struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t*)arg; - transcoder_entry_t *entry = (transcoder_entry_t*)val; - const mrb_transcoder *tr; - - if (DECORATOR_P(entry->sname, entry->dname)) - return ST_CONTINUE; - tr = load_transcoder_entry(mrb, entry); - if (tr && tr->asciicompat_type == asciicompat_decoder) { - data->ascii_compat_name = tr->dst_encoding; - return ST_STOP; - } - return ST_CONTINUE; -} - -const char * 
-mrb_econv_asciicompat_encoding(const char *ascii_incompat_name) -{ - st_data_t v; - st_table *table2; - struct asciicompat_encoding_t data; - - if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) - return NULL; - table2 = (st_table*)v; - - /* - * Assumption: - * There is at most one transcoder for - * converting from ASCII incompatible encoding. - * - * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others. - */ - if (table2->num_entries != 1) - return NULL; - - data.ascii_incompat_name = ascii_incompat_name; - data.ascii_compat_name = NULL; - st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data); - return data.ascii_compat_name; -} - -mrb_value -mrb_econv_substr_append(mrb_state *mrb, mrb_econv_t *ec, mrb_value src, long off, long len, mrb_value dst, int flags) -{ - unsigned const char *ss, *sp, *se; - unsigned char *ds, *dp, *de; - mrb_econv_result_t res; - int max_output; - - if (mrb_nil_p(dst)) { - dst = mrb_str_buf_new(mrb, len); - if (ec->destination_encoding) - mrb_enc_associate(mrb, dst, ec->destination_encoding); - } - - if (ec->last_tc) - max_output = ec->last_tc->transcoder->max_output; - else - max_output = 1; - - res = econv_destination_buffer_full; - while (res == econv_destination_buffer_full) { - long dlen = RSTRING_LEN(dst); - if (mrb_str_capacity(dst) - dlen < (size_t)len + max_output) { - unsigned long new_capa = (unsigned long)dlen + len + max_output; - if (LONG_MAX < new_capa) - mrb_raise(mrb, E_ARGUMENT_ERROR, "too long string"); - mrb_str_resize(mrb, dst, new_capa); - mrb_str_set_len(mrb, dst, dlen); - } - ss = sp = (const unsigned char*)RSTRING_PTR(src) + off; - se = ss + len; - ds = (unsigned char*)RSTRING_PTR(dst); - de = ds + mrb_str_capacity(dst); - dp = ds += dlen; - res = mrb_econv_convert(mrb, ec, &sp, se, &dp, de, flags); - off += sp - ss; - len -= sp - ss; - mrb_str_set_len(mrb, dst, dlen + (dp - ds)); - mrb_econv_check_error(mrb, ec); - } - - return dst; -} - -mrb_value 
-mrb_econv_str_append(mrb_state *mrb, mrb_econv_t *ec, mrb_value src, mrb_value dst, int flags) -{ - return mrb_econv_substr_append(mrb, ec, src, 0, RSTRING_LEN(src), dst, flags); -} - -mrb_value -mrb_econv_substr_convert(mrb_state *mrb, mrb_econv_t *ec, mrb_value src, long byteoff, long bytesize, int flags) -{ - return mrb_econv_substr_append(mrb, ec, src, byteoff, bytesize, mrb_nil_value(), flags); -} - -mrb_value -mrb_econv_str_convert(mrb_state *mrb, mrb_econv_t *ec, mrb_value src, int flags) -{ - return mrb_econv_substr_append(mrb, ec, src, 0, RSTRING_LEN(src), mrb_nil_value(), flags); -} - -static int -mrb_econv_add_converter(mrb_state *mrb, mrb_econv_t *ec, const char *sname, const char *dname, int n) -{ - transcoder_entry_t *entry; - const mrb_transcoder *tr; - - if (ec->started != 0) - return -1; - - entry = get_transcoder_entry(sname, dname); - if (!entry) - return -1; - - tr = load_transcoder_entry(mrb, entry); - - return mrb_econv_add_transcoder_at(mrb, ec, tr, n); -} - -static int -mrb_econv_decorate_at(mrb_state *mrb, mrb_econv_t *ec, const char *decorator_name, int n) -{ - return mrb_econv_add_converter(mrb, ec, "", decorator_name, n); -} - -int -mrb_econv_decorate_at_first(mrb_state *mrb, mrb_econv_t *ec, const char *decorator_name) -{ - const mrb_transcoder *tr; - - if (ec->num_trans == 0) - return mrb_econv_decorate_at(mrb, ec, decorator_name, 0); - - tr = ec->elems[0].tc->transcoder; - - if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && - tr->asciicompat_type == asciicompat_decoder) - return mrb_econv_decorate_at(mrb, ec, decorator_name, 1); - - return mrb_econv_decorate_at(mrb, ec, decorator_name, 0); -} - -int -mrb_econv_decorate_at_last(mrb_state *mrb, mrb_econv_t *ec, const char *decorator_name) -{ - const mrb_transcoder *tr; - - if (ec->num_trans == 0) - return mrb_econv_decorate_at(mrb, ec, decorator_name, 0); - - tr = ec->elems[ec->num_trans-1].tc->transcoder; - - if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && - 
tr->asciicompat_type == asciicompat_encoder) - return mrb_econv_decorate_at(mrb, ec, decorator_name, ec->num_trans-1); - - return mrb_econv_decorate_at(mrb, ec, decorator_name, ec->num_trans); -} - -void -mrb_econv_binmode(mrb_econv_t *ec) -{ - const mrb_transcoder *trs[3]; - int n, i, j; - transcoder_entry_t *entry; - int num_trans; - - n = 0; - if (ec->flags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) { - entry = get_transcoder_entry("", "universal_newline"); - if (entry->transcoder) - trs[n++] = entry->transcoder; - } - if (ec->flags & ECONV_CRLF_NEWLINE_DECORATOR) { - entry = get_transcoder_entry("", "crlf_newline"); - if (entry->transcoder) - trs[n++] = entry->transcoder; - } - if (ec->flags & ECONV_CR_NEWLINE_DECORATOR) { - entry = get_transcoder_entry("", "cr_newline"); - if (entry->transcoder) - trs[n++] = entry->transcoder; - } - - num_trans = ec->num_trans; - j = 0; - for (i = 0; i < num_trans; i++) { - int k; - for (k = 0; k < n; k++) - if (trs[k] == ec->elems[i].tc->transcoder) - break; - if (k == n) { - ec->elems[j] = ec->elems[i]; - j++; - } - else { - mrb_transcoding_close(ec->elems[i].tc); - xfree(ec->elems[i].out_buf_start); - ec->num_trans--; - } - } - - ec->flags &= ~(ECONV_UNIVERSAL_NEWLINE_DECORATOR|ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR); - -} - -static mrb_value -econv_description(mrb_state *mrb, const char *sname, const char *dname, int ecflags, mrb_value mesg) -{ - int has_description = 0; - - if (mrb_nil_p(mesg)) - mesg = mrb_str_new(mrb, NULL, 0); - - if (*sname != '\0' || *dname != '\0') { - if (*sname == '\0') - mrb_str_cat2(mrb, mesg, dname); - else if (*dname == '\0') - mrb_str_cat2(mrb, mesg, sname); - else - mrb_str_catf(mrb, mesg, "%s to %s", sname, dname); - has_description = 1; - } - - if (ecflags & (ECONV_UNIVERSAL_NEWLINE_DECORATOR| - ECONV_CRLF_NEWLINE_DECORATOR| - ECONV_CR_NEWLINE_DECORATOR| - ECONV_XML_TEXT_DECORATOR| - ECONV_XML_ATTR_CONTENT_DECORATOR| - ECONV_XML_ATTR_QUOTE_DECORATOR)) { - const char *pre = ""; - 
if (has_description) - mrb_str_cat2(mrb, mesg, " with "); - if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) { - mrb_str_cat2(mrb, mesg, pre); pre = ","; - mrb_str_cat2(mrb, mesg, "universal_newline"); - } - if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) { - mrb_str_cat2(mrb, mesg, pre); pre = ","; - mrb_str_cat2(mrb, mesg, "crlf_newline"); - } - if (ecflags & ECONV_CR_NEWLINE_DECORATOR) { - mrb_str_cat2(mrb, mesg, pre); pre = ","; - mrb_str_cat2(mrb, mesg, "cr_newline"); - } - if (ecflags & ECONV_XML_TEXT_DECORATOR) { - mrb_str_cat2(mrb, mesg, pre); pre = ","; - mrb_str_cat2(mrb, mesg, "xml_text"); - } - if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) { - mrb_str_cat2(mrb, mesg, pre); pre = ","; - mrb_str_cat2(mrb, mesg, "xml_attr_content"); - } - if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) { - mrb_str_cat2(mrb, mesg, pre); pre = ","; - mrb_str_cat2(mrb, mesg, "xml_attr_quote"); - } - has_description = 1; - } - if (!has_description) { - mrb_str_cat2(mrb, mesg, "no-conversion"); - } - - return mesg; -} - -mrb_value -mrb_econv_open_exc(mrb_state *mrb, const char *sname, const char *dname, int ecflags) -{ - mrb_value mesg, exc; - mesg = mrb_str_new_cstr(mrb, "code converter not found ("); - econv_description(mrb, sname, dname, ecflags, mesg); - mrb_str_cat2(mrb, mesg, ")"); - exc = mrb_exc_new3(mrb, E_CONVERTERNOTFOUND_ERROR, mesg); - return exc; -} - -static mrb_value -make_econv_exception(mrb_state *mrb, mrb_econv_t *ec) -{ - mrb_value mesg, exc; - if (ec->last_error.result == econv_invalid_byte_sequence || - ec->last_error.result == econv_incomplete_input) { - { - const char *err = (const char*)ec->last_error.error_bytes_start; - size_t error_len = ec->last_error.error_bytes_len; - mrb_value bytes = mrb_str_new(mrb, err, error_len); - mrb_value dumped = mrb_str_dump(mrb, bytes); - size_t readagain_len = ec->last_error.readagain_len; - mrb_value bytes2 = mrb_nil_value(); - mrb_value dumped2; - if (ec->last_error.result == econv_incomplete_input) { - mesg = 
mrb_sprintf(mrb, "incomplete %s on %s", - //StringValueCStr(dumped), - mrb_string_value_cstr(mrb, &dumped), - ec->last_error.source_encoding); - } - else if (readagain_len) { - bytes2 = mrb_str_new(mrb, err+error_len, readagain_len); - dumped2 = mrb_str_dump(mrb, bytes2); - mesg = mrb_sprintf(mrb, "%s followed by %s on %s", - //StringValueCStr(dumped), - mrb_string_value_cstr(mrb, &dumped), - //StringValueCStr(dumped2), - mrb_string_value_cstr(mrb, &dumped2), - ec->last_error.source_encoding); - } - else { - mesg = mrb_sprintf(mrb, "%s on %s", - //StringValueCStr(dumped), - mrb_string_value_cstr(mrb, &dumped), - ec->last_error.source_encoding); - } - - exc = mrb_exc_new3(mrb, E_INVALIDBYTESEQUENCE_ERROR, mesg); - mrb_iv_set(mrb, exc, mrb_intern(mrb, "error_bytes"), bytes); - mrb_iv_set(mrb, exc, mrb_intern(mrb, "readagain_bytes"), bytes2); - mrb_iv_set(mrb, exc, mrb_intern(mrb, "incomplete_input"), ec->last_error.result == econv_incomplete_input ? mrb_true_value() : mrb_false_value()); - } - -set_encs: - mrb_iv_set(mrb, exc, mrb_intern(mrb, "source_encoding_name"), mrb_str_new2(mrb, ec->last_error.source_encoding)); - mrb_iv_set(mrb, exc, mrb_intern(mrb, "destination_encoding_name"), mrb_str_new2(mrb, ec->last_error.destination_encoding)); - { - int idx = mrb_enc_find_index(mrb, ec->last_error.source_encoding); - if (0 <= idx) - mrb_iv_set(mrb, exc, mrb_intern(mrb, "source_encoding"), mrb_enc_from_encoding(mrb, mrb_enc_from_index(mrb, idx))); - idx = mrb_enc_find_index(mrb, ec->last_error.destination_encoding); - if (0 <= idx) - mrb_iv_set(mrb, exc, mrb_intern(mrb, "destination_encoding"), mrb_enc_from_encoding(mrb, mrb_enc_from_index(mrb, idx))); - } - return exc; - } - if (ec->last_error.result == econv_undefined_conversion) { - mrb_value bytes = mrb_str_new(mrb, (const char*)ec->last_error.error_bytes_start, - ec->last_error.error_bytes_len); - mrb_value dumped = mrb_nil_value(); - int idx; - if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) { - 
mrb_encoding *utf8 = mrb_utf8_encoding(mrb); - const char *start, *end; - int n; - start = (const char*)ec->last_error.error_bytes_start; - end = start + ec->last_error.error_bytes_len; - n = mrb_enc_precise_mbclen(start, end, utf8); - if (MBCLEN_CHARFOUND_P(n) && - (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) { - unsigned int cc = mrb_enc_mbc_to_codepoint(start, end, utf8); - dumped = mrb_sprintf(mrb, "U+%04X", cc); - } - } - if (mrb_obj_equal(mrb, dumped, mrb_nil_value())) - dumped = mrb_str_dump(mrb, bytes); - if (strcmp(ec->last_error.source_encoding, - ec->source_encoding_name) == 0 && - strcmp(ec->last_error.destination_encoding, - ec->destination_encoding_name) == 0) { - mesg = mrb_sprintf(mrb, "%s from %s to %s", - //StringValueCStr(dumped), - mrb_string_value_cstr(mrb, &dumped), - ec->last_error.source_encoding, - ec->last_error.destination_encoding); - } - else { - int i; - mesg = mrb_sprintf(mrb, "%s to %s in conversion from %s", - //StringValueCStr(dumped), - mrb_string_value_cstr(mrb, &dumped), - ec->last_error.destination_encoding, - ec->source_encoding_name); - for (i = 0; i < ec->num_trans; i++) { - const mrb_transcoder *tr = ec->elems[i].tc->transcoder; - if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding)) - mrb_str_catf(mrb, mesg, " to %s", - ec->elems[i].tc->transcoder->dst_encoding); - } - } - exc = mrb_exc_new3(mrb, E_UNDEFINEDCONVERSION_ERROR, mesg); - idx = mrb_enc_find_index(mrb, ec->last_error.source_encoding); - if (0 <= idx) - mrb_enc_associate_index(mrb, bytes, idx); - mrb_iv_set(mrb, exc, mrb_intern(mrb, "error_char"), bytes); - goto set_encs; - } - return mrb_nil_value(); -} - -static void -more_output_buffer(mrb_state *mrb, - mrb_value destination, - unsigned char *(*resize_destination)(mrb_state *, mrb_value, size_t, size_t), - int max_output, - unsigned char **out_start_ptr, - unsigned char **out_pos, - unsigned char **out_stop_ptr) -{ - size_t len = (*out_pos - *out_start_ptr); - size_t new_len = (len + 
max_output) * 2; - *out_start_ptr = resize_destination(mrb, destination, len, new_len); - *out_pos = *out_start_ptr + len; - *out_stop_ptr = *out_start_ptr + new_len; -} - -static int -make_replacement(mrb_state *mrb, mrb_econv_t *ec) -{ - mrb_transcoding *tc; - const mrb_transcoder *tr; - mrb_encoding *enc; - const unsigned char *replacement; - const char *repl_enc; - const char *ins_enc; - size_t len; - - if (ec->replacement_str) - return 0; - - ins_enc = mrb_econv_encoding_to_insert_output(ec); - - tc = ec->last_tc; - if (*ins_enc) { - tr = tc->transcoder; - enc = mrb_enc_find(mrb, tr->dst_encoding); - replacement = (const unsigned char*)get_replacement_character(ins_enc, &len, &repl_enc); - } - else { - replacement = (unsigned char*)"?"; - len = 1; - repl_enc = ""; - } - - ec->replacement_str = replacement; - ec->replacement_len = len; - ec->replacement_enc = repl_enc; - ec->replacement_allocated = 0; - return 0; -} - -int -mrb_econv_set_replacement(mrb_state *mrb, mrb_econv_t *ec, - const unsigned char *str, size_t len, const char *encname) -{ - unsigned char *str2; - size_t len2; - const char *encname2; - - encname2 = mrb_econv_encoding_to_insert_output(ec); - - if (encoding_equal(encname, encname2)) { - str2 = xmalloc(len); - memcpy(str2, str, len); /* xxx: str may be invalid */ - len2 = len; - encname2 = encname; - } - else { - str2 = allocate_converted_string(mrb, encname, encname2, str, len, NULL, 0, &len2); - if (!str2) - return -1; - } - - if (ec->replacement_allocated) { - xfree((void*)ec->replacement_str); - } - ec->replacement_allocated = 1; - ec->replacement_str = str2; - ec->replacement_len = len2; - ec->replacement_enc = encname2; - return 0; -} - -static int -output_replacement_character(mrb_state *mrb, mrb_econv_t *ec) -{ - int ret; - - if (make_replacement(mrb, ec) == -1) - return -1; - - ret = mrb_econv_insert_output(mrb, ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc); - if (ret == -1) - return -1; - - return 0; -} - 
-static void -transcode_loop(mrb_state *mrb, - const unsigned char **in_pos, unsigned char **out_pos, - const unsigned char *in_stop, unsigned char *out_stop, - mrb_value destination, - unsigned char *(*resize_destination)(mrb_state *, mrb_value, size_t, size_t), - const char *src_encoding, - const char *dst_encoding, - int ecflags, - mrb_value ecopts) -{ - mrb_econv_t *ec; - mrb_transcoding *last_tc; - mrb_econv_result_t ret; - unsigned char *out_start = *out_pos; - int max_output; - mrb_value exc; - mrb_value fallback = mrb_nil_value(); - mrb_value Qundef; - Qundef.tt = 0; - - ec = mrb_econv_open_opts(mrb, src_encoding, dst_encoding, ecflags, ecopts); - if (!ec) - mrb_exc_raise(mrb, mrb_econv_open_exc(mrb, src_encoding, dst_encoding, ecflags)); - - if (!mrb_nil_p(ecopts) && TYPE(ecopts) == MRB_TT_HASH) - fallback = mrb_hash_get(mrb, ecopts, sym_fallback); - last_tc = ec->last_tc; - max_output = last_tc ? last_tc->transcoder->max_output : 1; - - resume: - ret = mrb_econv_convert(mrb, ec, in_pos, in_stop, out_pos, out_stop, 0); - - if (!mrb_nil_p(fallback) && ret == econv_undefined_conversion) { - mrb_value rep = mrb_enc_str_new(mrb, - (const char*)ec->last_error.error_bytes_start, - ec->last_error.error_bytes_len, - mrb_enc_find(mrb, ec->last_error.source_encoding)); - rep = mrb_hash_getWithDef(mrb, fallback, rep, Qundef);//mrb_hash_lookup2(fallback, rep, Qundef); - if (!mrb_obj_equal(mrb, rep, Qundef)) { - //StringValue(rep); - mrb_string_value(mrb, &rep); - ret = mrb_econv_insert_output(mrb, ec, (const unsigned char*)RSTRING_PTR(rep), - RSTRING_LEN(rep), mrb_enc_name(mrb_enc_get(mrb, rep))); - if ((int)ret == -1) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "too big fallback string"); - } - goto resume; - } - } - - if (ret == econv_invalid_byte_sequence || - ret == econv_incomplete_input || - ret == econv_undefined_conversion) { - exc = make_econv_exception(mrb, ec); - mrb_econv_close(ec); - mrb_exc_raise(mrb, exc); - } - - if (ret == econv_destination_buffer_full) { - 
more_output_buffer(mrb, destination, resize_destination, max_output, &out_start, out_pos, &out_stop); - goto resume; - } - - mrb_econv_close(ec); - return; -} - -/* - * String-specific code - */ - -static unsigned char * -str_transcoding_resize(mrb_state *mrb, mrb_value destination, size_t len, size_t new_len) -{ - mrb_str_resize(mrb, destination, new_len); - return (unsigned char*)RSTRING_PTR(destination); -} - -static int -econv_opts(mrb_state *mrb, mrb_value opt) -{ - mrb_value v; - int ecflags = 0; - - v = mrb_hash_get(mrb, opt, sym_invalid); - if (mrb_nil_p(v)) { - } - else if (mrb_obj_equal(mrb, v, sym_replace)) { - ecflags |= ECONV_INVALID_REPLACE; - } - else { - mrb_raise(mrb, E_ARGUMENT_ERROR, "unknown value for invalid character option"); - } - - v = mrb_hash_get(mrb, opt, sym_undef); - if (mrb_nil_p(v)) { - } - else if (mrb_obj_equal(mrb, v, sym_replace)) { - ecflags |= ECONV_UNDEF_REPLACE; - } - else { - mrb_raise(mrb, E_ARGUMENT_ERROR, "unknown value for undefined character option"); - } - - v = mrb_hash_get(mrb, opt, sym_replace); - if (!mrb_nil_p(v) && !(ecflags & ECONV_INVALID_REPLACE)) { - ecflags |= ECONV_UNDEF_REPLACE; - } - - v = mrb_hash_get(mrb, opt, sym_xml); - if (!mrb_nil_p(v)) { - if (mrb_obj_equal(mrb, v, sym_text)) { - ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF; - } - else if (mrb_obj_equal(mrb, v, sym_attr)) { - ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF; - } - else if (TYPE(v) == MRB_TT_SYMBOL) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "unexpected value for xml option: %s", mrb_sym2name(mrb, SYM2ID(v))); - } - else { - mrb_raise(mrb, E_ARGUMENT_ERROR, "unexpected value for xml option"); - } - } - - v = mrb_hash_get(mrb, opt, sym_universal_newline); - if (RTEST(v)) - ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; - - v = mrb_hash_get(mrb, opt, sym_crlf_newline); - if (RTEST(v)) - ecflags |= ECONV_CRLF_NEWLINE_DECORATOR; - - v = mrb_hash_get(mrb, opt, sym_cr_newline); - 
if (RTEST(v)) - ecflags |= ECONV_CR_NEWLINE_DECORATOR; - - return ecflags; -} - -int -mrb_econv_prepare_opts(mrb_state *mrb, mrb_value opthash, mrb_value *opts) -{ - int ecflags; - mrb_value newhash = mrb_nil_value(); - mrb_value v; - - if (mrb_nil_p(opthash)) { - *opts = mrb_nil_value(); - return 0; - } - ecflags = econv_opts(mrb, opthash); - - v = mrb_hash_get(mrb, opthash, sym_replace); - if (!mrb_nil_p(v)) { - //StringValue(v); - mrb_string_value(mrb, &v); - if (mrb_enc_str_coderange(mrb, v) == ENC_CODERANGE_BROKEN) { - mrb_value dumped = mrb_str_dump(mrb, v); - mrb_raise(mrb, E_ARGUMENT_ERROR, "replacement string is broken: %s as %s", - //StringValueCStr(dumped), - mrb_string_value_cstr(mrb, &dumped), - mrb_enc_name(mrb_enc_get(mrb, v))); - } - v = mrb_str_new_frozen(mrb, v); - newhash = mrb_hash_new_capa(mrb, 0); - mrb_hash_set(mrb, newhash, sym_replace, v); - } - - v = mrb_hash_get(mrb, opthash, sym_fallback); - if (!mrb_nil_p(v)) { - v = mrb_convert_type(mrb, v, MRB_TT_HASH, "Hash", "to_hash"); - if (!mrb_nil_p(v)) { - if (mrb_nil_p(newhash)) - newhash = mrb_hash_new_capa(mrb, 0); - mrb_hash_set(mrb, newhash, sym_fallback, v); - } - } - - //if (!mrb_nil_p(newhash)) - // mrb_hash_freeze(newhash); - *opts = newhash; - - return ecflags; -} - -mrb_econv_t * -mrb_econv_open_opts(mrb_state *mrb, const char *source_encoding, const char *destination_encoding, int ecflags, mrb_value opthash) -{ - mrb_econv_t *ec; - mrb_value replacement; - - if (mrb_nil_p(opthash)) { - replacement = mrb_nil_value(); - } - else { - if (TYPE(opthash) != MRB_TT_HASH /*|| !OBJ_FROZEN(opthash)*/) - mrb_bug("mrb_econv_open_opts called with invalid opthash"); - replacement = mrb_hash_get(mrb, opthash, sym_replace); - } - - ec = mrb_econv_open(mrb, source_encoding, destination_encoding, ecflags); - if (!ec) - return ec; - - if (!mrb_nil_p(replacement)) { - int ret; - mrb_encoding *enc = mrb_enc_get(mrb, replacement); - - ret = mrb_econv_set_replacement(mrb, ec, - (const unsigned 
char*)RSTRING_PTR(replacement), - RSTRING_LEN(replacement), - mrb_enc_name(enc)); - if (ret == -1) { - mrb_econv_close(ec); - return NULL; - } - } - return ec; -} - -static int -enc_arg(mrb_state *mrb, mrb_value *arg, const char **name_p, mrb_encoding **enc_p) -{ - mrb_encoding *enc; - const char *n; - int encidx; - mrb_value encval; - - if (((encidx = mrb_to_encoding_index(mrb, encval = *arg)) < 0) || - !(enc = mrb_enc_from_index(mrb, encidx))) { - enc = NULL; - encidx = 0; - //n = StringValueCStr(*arg); - n = mrb_string_value_cstr(mrb, arg); - } - else { - n = mrb_enc_name(enc); - } - - *name_p = n; - *enc_p = enc; - - return encidx; -} - -static int -str_transcode_enc_args(mrb_state *mrb, - mrb_value str, mrb_value *arg1, mrb_value *arg2, - const char **sname_p, mrb_encoding **senc_p, - const char **dname_p, mrb_encoding **denc_p) -{ - mrb_encoding *senc, *denc; - const char *sname, *dname; - int sencidx, dencidx; - - dencidx = enc_arg(mrb, arg1, &dname, &denc); - - if (mrb_nil_p(*arg2)) { - sencidx = mrb_enc_get_index(mrb, str); - senc = mrb_enc_from_index(mrb, sencidx); - sname = mrb_enc_name(senc); - } - else { - sencidx = enc_arg(mrb, arg2, &sname, &senc); - } - - *sname_p = sname; - *senc_p = senc; - *dname_p = dname; - *denc_p = denc; - return dencidx; -} - -mrb_value -mrb_str_tmp_new(mrb_state *mrb, long len) -{ - return mrb_str_new(mrb, 0, len); -} - -static int -str_transcode0(mrb_state *mrb, int argc, mrb_value *argv, mrb_value *self, int ecflags, mrb_value ecopts) -{ - - mrb_value dest; - mrb_value str = *self; - mrb_value arg1, arg2; - long blen, slen; - unsigned char *buf, *bp, *sp; - const unsigned char *fromp; - mrb_encoding *senc, *denc; - const char *sname, *dname; - int dencidx; - - if (argc <0 || argc > 2) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%d for 0..2)", argc); - } - - if (argc == 0) { - arg1 = mrb_enc_default_internal(mrb); - if (mrb_nil_p(arg1)) { - if (!ecflags) return -1; - arg1 = mrb_obj_encoding(mrb, str); 
- } - ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE; - } - else { - arg1 = argv[0]; - } - arg2 = argc<=1 ? mrb_nil_value() : argv[1]; - dencidx = str_transcode_enc_args(mrb, str, &arg1, &arg2, &sname, &senc, &dname, &denc); - - if ((ecflags & (ECONV_UNIVERSAL_NEWLINE_DECORATOR| - ECONV_CRLF_NEWLINE_DECORATOR| - ECONV_CR_NEWLINE_DECORATOR| - ECONV_XML_TEXT_DECORATOR| - ECONV_XML_ATTR_CONTENT_DECORATOR| - ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) { - if (senc && senc == denc) { - return mrb_nil_p(arg2) ? -1 : dencidx; - } - if (senc && denc && mrb_enc_asciicompat(mrb, senc) && mrb_enc_asciicompat(mrb, denc)) { - if (mrb_enc_str_coderange(mrb, str) == ENC_CODERANGE_7BIT) { - return dencidx; - } - } - if (encoding_equal(sname, dname)) { - return mrb_nil_p(arg2) ? -1 : dencidx; - } - } - else { - if (encoding_equal(sname, dname)) { - sname = ""; - dname = ""; - } - } - - fromp = sp = (unsigned char*)RSTRING_PTR(str); - slen = RSTRING_LEN(str); - blen = slen + 30; /* len + margin */ - dest = mrb_str_tmp_new(mrb, blen); - bp = (unsigned char*)RSTRING_PTR(dest); - - transcode_loop(mrb, &fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts); - if (fromp != sp+slen) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "not fully converted, %td bytes left", sp+slen-fromp); - } - buf = (unsigned char*)RSTRING_PTR(dest); - *bp = '\0'; - mrb_str_set_len(mrb, dest, bp - buf); - - /* set encoding */ - if (!denc) { - dencidx = mrb_define_dummy_encoding(mrb, dname); - } - *self = dest; - - return dencidx; -} - -static int -str_transcode(mrb_state *mrb, int argc, mrb_value *argv, mrb_value *self) -{ - mrb_value opt; - int ecflags = 0; - mrb_value ecopts = mrb_nil_value(); - - if (0 < argc) { - opt = mrb_check_convert_type(mrb, argv[argc-1], MRB_TT_HASH, "Hash", "to_hash"); - if (!mrb_nil_p(opt)) { - argc--; - ecflags = mrb_econv_prepare_opts(mrb, opt, &ecopts); - } - } - return str_transcode0(mrb, argc, argv, self, ecflags, ecopts); -} - -static inline 
mrb_value -str_encode_associate(mrb_state *mrb, mrb_value str, int encidx) -{ - int cr = 0; - - mrb_enc_associate_index(mrb, str, encidx); - - /* transcoded string never be broken. */ - if (mrb_enc_asciicompat(mrb, mrb_enc_from_index(mrb, encidx))) { - mrb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr); - } - else { - cr = ENC_CODERANGE_VALID; - } - ENC_CODERANGE_SET(str, cr); - return str; -} - -/* - * call-seq: - * str.encode!(encoding [, options] ) -> str - * str.encode!(dst_encoding, src_encoding [, options] ) -> str - * - * The first form transcodes the contents of <i>str</i> from - * str.encoding to +encoding+. - * The second form transcodes the contents of <i>str</i> from - * src_encoding to dst_encoding. - * The options Hash gives details for conversion. See String#encode - * for details. - * Returns the string even if no changes were made. - */ - -static mrb_value -str_encode_bang(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value str) -{ - mrb_value argv[16]; - int argc; - mrb_value newstr; - int encidx; - - mrb_get_args(mrb, "*", &argv, &argc); - - newstr = str; - encidx = str_transcode(mrb, argc, argv, &newstr); - - if (encidx < 0) return str; - mrb_str_shared_replace(mrb, str, newstr); - return str_encode_associate(mrb, str, encidx); -} - -/* - * call-seq: - * str.encode(encoding [, options] ) -> str - * str.encode(dst_encoding, src_encoding [, options] ) -> str - * str.encode([options]) -> str - * - * The first form returns a copy of <i>str</i> transcoded - * to encoding +encoding+. - * The second form returns a copy of <i>str</i> transcoded - * from src_encoding to dst_encoding. - * The last form returns a copy of <i>str</i> transcoded to - * <code>Encoding.default_internal</code>. 
- * By default, the first and second form raise - * Encoding::UndefinedConversionError for characters that are - * undefined in the destination encoding, and - * Encoding::InvalidByteSequenceError for invalid byte sequences - * in the source encoding. The last form by default does not raise - * exceptions but uses replacement strings. - * The <code>options</code> Hash gives details for conversion. - * - * === options - * The hash <code>options</code> can have the following keys: - * :invalid :: - * If the value is <code>:replace</code>, <code>#encode</code> replaces - * invalid byte sequences in <code>str</code> with the replacement character. - * The default is to raise the exception - * :undef :: - * If the value is <code>:replace</code>, <code>#encode</code> replaces - * characters which are undefined in the destination encoding with - * the replacement character. - * :replace :: - * Sets the replacement string to the value. The default replacement - * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise. - * :fallback :: - * Sets the replacement string by the hash for undefined character. - * Its key is a such undefined character encoded in source encoding - * of current transcoder. Its value can be any encoding until it - * can be converted into the destination encoding of the transcoder. - * :xml :: - * The value must be <code>:text</code> or <code>:attr</code>. - * If the value is <code>:text</code> <code>#encode</code> replaces - * undefined characters with their (upper-case hexadecimal) numeric - * character references. '&', '<', and '>' are converted to "&", - * "<", and ">", respectively. - * If the value is <code>:attr</code>, <code>#encode</code> also quotes - * the replacement result (using '"'), and replaces '"' with """. - * :cr_newline :: - * Replaces LF ("\n") with CR ("\r") if value is true. - * :crlf_newline :: - * Replaces LF ("\n") with CRLF ("\r\n") if value is true. 
- * :universal_newline :: - * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true. - */ - -static mrb_value -str_encode(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value str) -{ - mrb_value argv[16]; - int argc; - mrb_value newstr; - int encidx; - - mrb_get_args(mrb, "*", &argv, &argc); - newstr = str; - encidx = str_transcode(mrb, argc, argv, &newstr); - - if (encidx < 0) return mrb_str_dup(mrb, str); - if (mrb_obj_equal(mrb, newstr, str)) { - newstr = mrb_str_dup(mrb, str); - } - else { - RBASIC(newstr)->c = mrb_obj_class(mrb, str); - } - return str_encode_associate(mrb, newstr, encidx); -} - -mrb_value -mrb_str_encode(mrb_state *mrb, mrb_value str, mrb_value to, int ecflags, mrb_value ecopts) -{ - int argc = 1; - mrb_value *argv = &to; - mrb_value newstr = str; - int encidx = str_transcode0(mrb, argc, argv, &newstr, ecflags, ecopts); - - if (encidx < 0) return mrb_str_dup(mrb, str); - if (mrb_obj_equal(mrb, newstr, str)) { - newstr = mrb_str_dup(mrb, str); - } - else { - RBASIC(newstr)->c = mrb_obj_class(mrb, str); - } - return str_encode_associate(mrb, newstr, encidx); -} - -static void -econv_free(mrb_state *mrb, void *ptr) -{ - mrb_econv_t *ec = ptr; - mrb_econv_close(ec); -} - -static const struct mrb_data_type econv_data_type = { - "econv", econv_free, -}; - -static mrb_encoding * -make_dummy_encoding(mrb_state *mrb, const char *name) -{ - mrb_encoding *enc; - int idx; - idx = mrb_define_dummy_encoding(mrb, name); - enc = mrb_enc_from_index(mrb, idx); - return enc; -} - -static mrb_encoding * -make_encoding(mrb_state *mrb, const char *name) -{ - mrb_encoding *enc; - enc = mrb_enc_find(mrb, name); - if (!enc) - enc = make_dummy_encoding(mrb, name); - return enc; -} - -static mrb_value -make_encobj(mrb_state *mrb, const char *name) -{ - return mrb_enc_from_encoding(mrb, make_encoding(mrb, name)); -} - -/* - * call-seq: - * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil - * 
Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil - * - * Returns the corresponding ASCII compatible encoding. - * - * Returns nil if the argument is an ASCII compatible encoding. - * - * "corresponding ASCII compatible encoding" is a ASCII compatible encoding which - * can represents exactly the same characters as the given ASCII incompatible encoding. - * So, no conversion undefined error occurs when converting between the two encodings. - * - * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP> - * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8> - * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil - * - */ -static mrb_value -econv_s_asciicompat_encoding(mrb_state *mrb, mrb_value klass) -{ - mrb_value arg; - const char *arg_name, *result_name; - mrb_encoding *arg_enc, *result_enc; - - mrb_get_args(mrb, "o", &arg); - enc_arg(mrb, &arg, &arg_name, &arg_enc); - - result_name = mrb_econv_asciicompat_encoding(arg_name); - - if (result_name == NULL) - return mrb_nil_value(); - - result_enc = make_encoding(mrb, result_name); - - return mrb_enc_from_encoding(mrb, result_enc); -} - -static void -econv_args(mrb_state *mrb, - int argc, mrb_value *argv, - mrb_value *snamev_p, mrb_value *dnamev_p, - const char **sname_p, const char **dname_p, - mrb_encoding **senc_p, mrb_encoding **denc_p, - int *ecflags_p, - mrb_value *ecopts_p) -{ - mrb_value opt, opthash, flags_v, ecopts; - int sidx, didx; - const char *sname, *dname; - mrb_encoding *senc, *denc; - int ecflags; - - //mrb_scan_args(argc, argv, "21", snamev_p, dnamev_p, &opt); - *snamev_p = argv[0]; - *dnamev_p = argv[1]; - opt = argv[2]; - - if (argc < 3) {//mrb_nil_p(opt)) { - ecflags = 0; - ecopts = mrb_nil_value(); - } - else if (!mrb_nil_p(flags_v = mrb_check_to_integer(mrb, opt, "to_int"))) { - ecflags = mrb_fixnum(flags_v); - ecopts = mrb_nil_value(); - } - else { - opthash = mrb_convert_type(mrb, opt, MRB_TT_HASH, 
"Hash", "to_hash"); - ecflags = mrb_econv_prepare_opts(mrb, opthash, &ecopts); - } - - senc = NULL; - sidx = mrb_to_encoding_index(mrb, *snamev_p); - if (0 <= sidx) { - senc = mrb_enc_from_index(mrb, sidx); - } - else { - //StringValue(*snamev_p); - mrb_string_value(mrb, snamev_p); - } - - denc = NULL; - didx = mrb_to_encoding_index(mrb, *dnamev_p); - if (0 <= didx) { - denc = mrb_enc_from_index(mrb, didx); - } - else { - //StringValue(*dnamev_p); - mrb_string_value(mrb, dnamev_p); - } - - //sname = senc ? mrb_enc_name(senc) : StringValueCStr(*snamev_p); - sname = senc ? mrb_enc_name(senc) : mrb_string_value_cstr(mrb, snamev_p); - //dname = denc ? mrb_enc_name(denc) : StringValueCStr(*dnamev_p); - dname = denc ? mrb_enc_name(denc) : mrb_string_value_cstr(mrb, dnamev_p); - - *sname_p = sname; - *dname_p = dname; - *senc_p = senc; - *denc_p = denc; - *ecflags_p = ecflags; - *ecopts_p = ecopts; -} - -static int -decorate_convpath(mrb_state *mrb, mrb_value convpath, int ecflags) -{ - int num_decorators; - const char *decorators[MAX_ECFLAGS_DECORATORS]; - int i; - int n, len; - - num_decorators = decorator_names(ecflags, decorators); - if (num_decorators == -1) - return -1; - - len = n = RARRAY_LEN(convpath);//RARRAY_LENINT(convpath); - if (n != 0) { - mrb_value pair = RARRAY_PTR(convpath)[n-1]; - if (TYPE(pair) == MRB_TT_ARRAY) { - const char *sname = mrb_enc_name(mrb_to_encoding(mrb, RARRAY_PTR(pair)[0])); - const char *dname = mrb_enc_name(mrb_to_encoding(mrb, RARRAY_PTR(pair)[1])); - transcoder_entry_t *entry = get_transcoder_entry(sname, dname); - const mrb_transcoder *tr = load_transcoder_entry(mrb, entry); - if (!tr) - return -1; - if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && - tr->asciicompat_type == asciicompat_encoder) { - n--; - mrb_ary_set(mrb, convpath, len + num_decorators - 1, pair); - } - } - else { - mrb_ary_set(mrb, convpath, len + num_decorators - 1, pair); - } - } - - for (i = 0; i < num_decorators; i++) - mrb_ary_set(mrb, convpath, n + i, 
mrb_str_new_cstr(mrb, decorators[i])); - - return 0; -} - -static void -search_convpath_i(mrb_state *mrb, const char *sname, const char *dname, int depth, void *arg) -{ - mrb_value *ary_p = arg; - mrb_value v; - - if (mrb_obj_equal(mrb, *ary_p, mrb_nil_value())) { - *ary_p = mrb_ary_new(mrb); - } - - if (DECORATOR_P(sname, dname)) { - v = mrb_str_new_cstr(mrb, dname); - } - else { - v = mrb_assoc_new(mrb, make_encobj(mrb, sname), make_encobj(mrb, dname)); - } - mrb_ary_set(mrb, *ary_p, depth, v); -} - -/* - * call-seq: - * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary - * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary - * - * Returns a conversion path. - * - * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP") - * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], - * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]] - * - * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true) - * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], - * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], - * # "universal_newline"] - * - * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true) - * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], - * # "universal_newline", - * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]] - */ -static mrb_value -econv_s_search_convpath(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value klass) -{ - mrb_value snamev, dnamev; - const char *sname, *dname; - mrb_encoding *senc, *denc; - int ecflags; - mrb_value ecopts; - mrb_value convpath; - - mrb_value argv[16]; - int argc; - - mrb_get_args(mrb, "*", &argv, &argc); - econv_args(mrb, argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts); - convpath = mrb_nil_value(); - transcode_search_path(mrb, sname, dname, search_convpath_i, &convpath); - - if (mrb_nil_p(convpath)) - mrb_exc_raise(mrb, mrb_econv_open_exc(mrb, sname, dname, ecflags)); - - 
if (decorate_convpath(mrb, convpath, ecflags) == -1) - mrb_exc_raise(mrb, mrb_econv_open_exc(mrb, sname, dname, ecflags)); - - return convpath; -} - -/* - * Check the existence of a conversion path. - * Returns the number of converters in the conversion path. - * result: >=0:success -1:failure - */ -int -mrb_econv_has_convpath_p(mrb_state *mrb, const char* from_encoding, const char* to_encoding) -{ - mrb_value convpath = mrb_nil_value(); - transcode_search_path(mrb, from_encoding, to_encoding, search_convpath_i, - &convpath); - return RTEST(convpath); -} - -struct mrb_econv_init_by_convpath_t { - mrb_econv_t *ec; - int index; - int ret; -}; - -static void -mrb_econv_init_by_convpath_i(mrb_state *mrb, const char *sname, const char *dname, int depth, void *arg) -{ - struct mrb_econv_init_by_convpath_t *a = (struct mrb_econv_init_by_convpath_t*)arg; - int ret; - - if (a->ret == -1) - return; - - ret = mrb_econv_add_converter(mrb, a->ec, sname, dname, a->index); - - a->ret = ret; - return; -} - -static mrb_econv_t * -mrb_econv_init_by_convpath(mrb_state *mrb, mrb_value self, mrb_value convpath, - const char **sname_p, const char **dname_p, - mrb_encoding **senc_p, mrb_encoding**denc_p) -{ - mrb_econv_t *ec; - long i; - int ret, first=1; - mrb_value elt; - mrb_encoding *senc = 0, *denc = 0; - const char *sname, *dname; - - ec = mrb_econv_alloc(RARRAY_LEN/*INT*/(convpath)); - DATA_PTR(self) = ec; - - for (i = 0; i < RARRAY_LEN(convpath); i++) { - mrb_value snamev, dnamev; - mrb_value pair; - elt = mrb_ary_ref(mrb, convpath, i); - if (!mrb_nil_p(pair = mrb_check_array_type(mrb, elt))) { - if (RARRAY_LEN(pair) != 2) - mrb_raise(mrb, E_ARGUMENT_ERROR, "not a 2-element array in convpath"); - snamev = mrb_ary_ref(mrb, pair, 0); - enc_arg(mrb, &snamev, &sname, &senc); - dnamev = mrb_ary_ref(mrb, pair, 1); - enc_arg(mrb, &dnamev, &dname, &denc); - } - else { - sname = ""; - //dname = StringValueCStr(elt); - dname = mrb_string_value_cstr(mrb, &elt); - } - if (DECORATOR_P(sname, 
dname)) { - ret = mrb_econv_add_converter(mrb, ec, sname, dname, ec->num_trans); - if (ret == -1) - mrb_raise(mrb, E_ARGUMENT_ERROR, "decoration failed: %s", dname); - } - else { - int j = ec->num_trans; - struct mrb_econv_init_by_convpath_t arg; - arg.ec = ec; - arg.index = ec->num_trans; - arg.ret = 0; - ret = transcode_search_path(mrb, sname, dname, mrb_econv_init_by_convpath_i, &arg); - if (ret == -1 || arg.ret == -1) - mrb_raise(mrb, E_ARGUMENT_ERROR, "adding conversion failed: %s to %s", sname, dname); - if (first) { - first = 0; - *senc_p = senc; - *sname_p = ec->elems[j].tc->transcoder->src_encoding; - } - *denc_p = denc; - *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding; - } - } - - if (first) { - *senc_p = NULL; - *denc_p = NULL; - *sname_p = ""; - *dname_p = ""; - } - - ec->source_encoding_name = *sname_p; - ec->destination_encoding_name = *dname_p; - - return ec; -} - -/* - * call-seq: - * Encoding::Converter.new(source_encoding, destination_encoding) - * Encoding::Converter.new(source_encoding, destination_encoding, opt) - * Encoding::Converter.new(convpath) - * - * possible options elements: - * hash form: - * :invalid => nil # raise error on invalid byte sequence (default) - * :invalid => :replace # replace invalid byte sequence - * :undef => nil # raise error on undefined conversion (default) - * :undef => :replace # replace undefined conversion - * :replace => string # replacement string ("?" or "\uFFFD" if not specified) - * :universal_newline => true # decorator for converting CRLF and CR to LF - * :crlf_newline => true # decorator for converting LF to CRLF - * :cr_newline => true # decorator for converting LF to CR - * :xml => :text # escape as XML CharData. 
- * :xml => :attr # escape as XML AttValue - * integer form: - * Encoding::Converter::INVALID_REPLACE - * Encoding::Converter::UNDEF_REPLACE - * Encoding::Converter::UNDEF_HEX_CHARREF - * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR - * Encoding::Converter::CRLF_NEWLINE_DECORATOR - * Encoding::Converter::CR_NEWLINE_DECORATOR - * Encoding::Converter::XML_TEXT_DECORATOR - * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR - * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR - * - * Encoding::Converter.new creates an instance of Encoding::Converter. - * - * Source_encoding and destination_encoding should be a string or - * Encoding object. - * - * opt should be nil, a hash or an integer. - * - * convpath should be an array. - * convpath may contain - * - two-element arrays which contain encodings or encoding names, or - * - strings representing decorator names. - * - * Encoding::Converter.new optionally takes an option. - * The option should be a hash or an integer. - * The option hash can contain :invalid => nil, etc. - * The option integer should be logical-or of constants such as - * Encoding::Converter::INVALID_REPLACE, etc. - * - * [:invalid => nil] - * Raise error on invalid byte sequence. This is a default behavior. - * [:invalid => :replace] - * Replace invalid byte sequence by replacement string. - * [:undef => nil] - * Raise an error if a character in source_encoding is not defined in destination_encoding. - * This is a default behavior. - * [:undef => :replace] - * Replace undefined character in destination_encoding with replacement string. - * [:replace => string] - * Specify the replacement string. - * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others. - * [:universal_newline => true] - * Convert CRLF and CR to LF. - * [:crlf_newline => true] - * Convert LF to CRLF. - * [:cr_newline => true] - * Convert LF to CR. - * [:xml => :text] - * Escape as XML CharData. - * This form can be used as a HTML 4.0 #PCDATA. 
- * - '&' -> '&' - * - '<' -> '<' - * - '>' -> '>' - * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH; - * [:xml => :attr] - * Escape as XML AttValue. - * The converted result is quoted as "...". - * This form can be used as a HTML 4.0 attribute value. - * - '&' -> '&' - * - '<' -> '<' - * - '>' -> '>' - * - '"' -> '"' - * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH; - * - * Examples: - * # UTF-16BE to UTF-8 - * ec = Encoding::Converter.new("UTF-16BE", "UTF-8") - * - * # Usually, decorators such as newline conversion are inserted last. - * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true) - * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>], - * # "universal_newline"] - * - * # But, if the last encoding is ASCII incompatible, - * # decorators are inserted before the last conversion. - * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true) - * p ec.convpath #=> ["crlf_newline", - * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] - * - * # Conversion path can be specified directly. 
- * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]]) - * p ec.convpath #=> ["universal_newline", - * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>], - * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] - */ -static mrb_value -econv_init(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value self) -{ - mrb_value ecopts; - mrb_value snamev, dnamev; - const char *sname, *dname; - mrb_encoding *senc, *denc; - mrb_econv_t *ec; - int ecflags; - mrb_value convpath; - mrb_value argv[16]; - int argc; - - mrb_get_args(mrb, "*", &argv, &argc); - if (mrb_check_datatype(mrb, self, &econv_data_type)) { - mrb_raise(mrb, E_TYPE_ERROR, "already initialized"); - } - - if (argc == 1 && !mrb_nil_p(convpath = mrb_check_array_type(mrb, argv[0]))) { - ec = mrb_econv_init_by_convpath(mrb, self, convpath, &sname, &dname, &senc, &denc); - ecflags = 0; - ecopts = mrb_nil_value(); - } - else { - econv_args(mrb, argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts); - ec = mrb_econv_open_opts(mrb, sname, dname, ecflags, ecopts); - } - - if (!ec) { - mrb_exc_raise(mrb, mrb_econv_open_exc(mrb, sname, dname, ecflags)); - } - - if (!DECORATOR_P(sname, dname)) { - if (!senc) - senc = make_dummy_encoding(mrb, sname); - if (!denc) - denc = make_dummy_encoding(mrb, dname); - } - - ec->source_encoding = senc; - ec->destination_encoding = denc; - - DATA_PTR(self) = ec; - - return self; -} - -/* - * call-seq: - * ec.inspect -> string - * - * Returns a printable version of <i>ec</i> - * - * ec = Encoding::Converter.new("iso-8859-1", "utf-8") - * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8> - * - */ -static mrb_value -econv_inspect(mrb_state *mrb, mrb_value self) -{ - const char *cname = mrb_obj_classname(mrb, self); - mrb_econv_t *ec; - - Data_Get_Struct(mrb, self, &econv_data_type, ec); - if (!ec) - return mrb_sprintf(mrb, "#<%s: uninitialized>", cname); - else { - const char *sname = ec->source_encoding_name; - const 
char *dname = ec->destination_encoding_name; - mrb_value str; - str = mrb_sprintf(mrb, "#<%s: ", cname); - econv_description(mrb, sname, dname, ec->flags, str); - mrb_str_cat2(mrb, str, ">"); - return str; - } -} - -static mrb_econv_t * -check_econv(mrb_state *mrb, mrb_value self) -{ - mrb_econv_t *ec; - - Data_Get_Struct(mrb, self, &econv_data_type, ec); - if (!ec) { - mrb_raise(mrb, E_TYPE_ERROR, "uninitialized encoding converter"); - } - return ec; -} - -/* - * call-seq: - * ec.source_encoding -> encoding - * - * Returns the source encoding as an Encoding object. - */ -static mrb_value -econv_source_encoding(mrb_state *mrb, mrb_value self) -{ - mrb_econv_t *ec = check_econv(mrb, self); - if (!ec->source_encoding) - return mrb_nil_value(); - return mrb_enc_from_encoding(mrb, ec->source_encoding); -} - -/* - * call-seq: - * ec.destination_encoding -> encoding - * - * Returns the destination encoding as an Encoding object. - */ -static mrb_value -econv_destination_encoding(mrb_state *mrb, mrb_value self) -{ - mrb_econv_t *ec = check_econv(mrb, self); - if (!ec->destination_encoding) - return mrb_nil_value(); - return mrb_enc_from_encoding(mrb, ec->destination_encoding); -} - -/* - * call-seq: - * ec.convpath -> ary - * - * Returns the conversion path of ec. - * - * The result is an array of conversions. - * - * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true) - * p ec.convpath - * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], - * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], - * # "crlf_newline"] - * - * Each element of the array is a pair of encodings or a string. - * A pair means an encoding conversion. - * A string means a decorator. - * - * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means - * a converter from ISO-8859-1 to UTF-8. - * "crlf_newline" means newline converter from LF to CRLF. 
- */ -static mrb_value -econv_convpath(mrb_state *mrb, mrb_value self) -{ - mrb_econv_t *ec = check_econv(mrb, self); - mrb_value result; - int i; - - result = mrb_ary_new(mrb); - for (i = 0; i < ec->num_trans; i++) { - const mrb_transcoder *tr = ec->elems[i].tc->transcoder; - mrb_value v; - if (DECORATOR_P(tr->src_encoding, tr->dst_encoding)) - v = mrb_str_new_cstr(mrb, tr->dst_encoding); - else - v = mrb_assoc_new(mrb, make_encobj(mrb, tr->src_encoding), make_encobj(mrb, tr->dst_encoding)); - mrb_ary_push(mrb, result, v); - } - return result; -} - -static mrb_value -econv_result_to_symbol(mrb_econv_result_t res) -{ - switch (res) { - case econv_invalid_byte_sequence: return sym_invalid_byte_sequence; - case econv_incomplete_input: return sym_incomplete_input; - case econv_undefined_conversion: return sym_undefined_conversion; - case econv_destination_buffer_full: return sym_destination_buffer_full; - case econv_source_buffer_empty: return sym_source_buffer_empty; - case econv_finished: return sym_finished; - case econv_after_output: return sym_after_output; - default: return mrb_fixnum_value(res); /* should not be reached */ - } -} - -mrb_value econv_primitive_cnvproc(mrb_state *mrb, int argc, mrb_value *argv, mrb_value self) -{ - mrb_value input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v; - mrb_econv_t *ec = check_econv(mrb, self); - mrb_econv_result_t res; - const unsigned char *ip, *is; - unsigned char *op, *os; - long output_byteoffset, output_bytesize; - unsigned long output_byteend; - int flags; - - //mrb_scan_args(argc, argv, "23", &input, &output, &output_byteoffset_v, &output_bytesize_v, &opt); - input = argv[0]; - output = argv[1]; - output_byteoffset_v = argv[2]; - output_bytesize_v = argv[3]; - opt = argv[4]; - - if (argc < 3)//mrb_nil_p(output_byteoffset_v)) - output_byteoffset = 0; /* dummy */ - else - output_byteoffset = mrb_fixnum(output_byteoffset_v); - - if (argc < 4)//mrb_nil_p(output_bytesize_v)) - output_bytesize = 0; /* 
dummy */ - else - output_bytesize = mrb_fixnum(output_bytesize_v); - - if (argc < 5) {//mrb_nil_p(opt)) { - flags = 0; - } - else if (!mrb_nil_p(flags_v = mrb_check_to_integer(mrb, opt, "to_int"))) { - flags = mrb_fixnum(flags_v); - } - else { - mrb_value v; - opt = mrb_convert_type(mrb, opt, MRB_TT_HASH, "Hash", "to_hash"); - flags = 0; - v = mrb_hash_get(mrb, opt, sym_partial_input); - if (RTEST(v)) - flags |= ECONV_PARTIAL_INPUT; - v = mrb_hash_get(mrb, opt, sym_after_output); - if (RTEST(v)) - flags |= ECONV_AFTER_OUTPUT; - } - - //StringValue(output); - mrb_string_value(mrb, &output); - if (!mrb_nil_p(input)) - //StringValue(input); - mrb_string_value(mrb, &input); - mrb_str_modify(mrb, output); - - if (mrb_nil_p(output_bytesize_v)) { - output_bytesize = STR_BUF_MIN_SIZE; - if (!mrb_nil_p(input) && output_bytesize < RSTRING_LEN(input)) - output_bytesize = RSTRING_LEN(input); - } - - retry: - - if (mrb_nil_p(output_byteoffset_v)) - output_byteoffset = RSTRING_LEN(output); - - if (output_byteoffset < 0) - mrb_raise(mrb, E_ARGUMENT_ERROR, "negative output_byteoffset"); - - if (RSTRING_LEN(output) < output_byteoffset) - mrb_raise(mrb, E_ARGUMENT_ERROR, "output_byteoffset too big"); - - if (output_bytesize < 0) - mrb_raise(mrb, E_ARGUMENT_ERROR, "negative output_bytesize"); - - output_byteend = (unsigned long)output_byteoffset + - (unsigned long)output_bytesize; - - if (output_byteend < (unsigned long)output_byteoffset || - LONG_MAX < output_byteend) - mrb_raise(mrb, E_ARGUMENT_ERROR, "output_byteoffset+output_bytesize too big"); - - if (mrb_str_capacity(output) < output_byteend) - mrb_str_resize(mrb, output, output_byteend); - - if (mrb_nil_p(input)) { - ip = is = NULL; - } - else { - ip = (const unsigned char*)RSTRING_PTR(input); - is = ip + RSTRING_LEN(input); - } - - op = (unsigned char*)RSTRING_PTR(output) + output_byteoffset; - os = op + output_bytesize; - - res = mrb_econv_convert(mrb, ec, &ip, is, &op, os, flags); - mrb_str_set_len(mrb, output, op-(unsigned 
char*)RSTRING_PTR(output)); - if (!mrb_nil_p(input)) - mrb_str_drop_bytes(mrb, input, ip - (unsigned char*)RSTRING_PTR(input)); - - if (mrb_nil_p(output_bytesize_v) && res == econv_destination_buffer_full) { - if (LONG_MAX / 2 < output_bytesize) - mrb_raise(mrb, E_ARGUMENT_ERROR, "too long conversion result"); - output_bytesize *= 2; - output_byteoffset_v = mrb_nil_value(); - goto retry; - } - - if (ec->destination_encoding) { - mrb_enc_associate(mrb, output, ec->destination_encoding); - } - - return econv_result_to_symbol(res); -} - -/* - * call-seq: - * ec.primitive_convert(source_buffer, destination_buffer) -> symbol - * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol - * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol - * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol - * - * possible opt elements: - * hash form: - * :partial_input => true # source buffer may be part of larger source - * :after_output => true # stop conversion after output before input - * integer form: - * Encoding::Converter::PARTIAL_INPUT - * Encoding::Converter::AFTER_OUTPUT - * - * possible results: - * :invalid_byte_sequence - * :incomplete_input - * :undefined_conversion - * :after_output - * :destination_buffer_full - * :source_buffer_empty - * :finished - * - * primitive_convert converts source_buffer into destination_buffer. - * - * source_buffer should be a string or nil. - * nil means a empty string. - * - * destination_buffer should be a string. - * - * destination_byteoffset should be an integer or nil. - * nil means the end of destination_buffer. - * If it is omitted, nil is assumed. - * - * destination_bytesize should be an integer or nil. - * nil means unlimited. - * If it is omitted, nil is assumed. - * - * opt should be nil, a hash or an integer. - * nil means no flags. 
- * If it is omitted, nil is assumed. - * - * primitive_convert converts the content of source_buffer from beginning - * and store the result into destination_buffer. - * - * destination_byteoffset and destination_bytesize specify the region which - * the converted result is stored. - * destination_byteoffset specifies the start position in destination_buffer in bytes. - * If destination_byteoffset is nil, - * destination_buffer.bytesize is used for appending the result. - * destination_bytesize specifies maximum number of bytes. - * If destination_bytesize is nil, - * destination size is unlimited. - * After conversion, destination_buffer is resized to - * destination_byteoffset + actually produced number of bytes. - * Also destination_buffer's encoding is set to destination_encoding. - * - * primitive_convert drops the converted part of source_buffer. - * the dropped part is converted in destination_buffer or - * buffered in Encoding::Converter object. - * - * primitive_convert stops conversion when one of following condition met. - * - invalid byte sequence found in source buffer (:invalid_byte_sequence) - * - unexpected end of source buffer (:incomplete_input) - * this occur only when :partial_input is not specified. - * - character not representable in output encoding (:undefined_conversion) - * - after some output is generated, before input is done (:after_output) - * this occur only when :after_output is specified. - * - destination buffer is full (:destination_buffer_full) - * this occur only when destination_bytesize is non-nil. - * - source buffer is empty (:source_buffer_empty) - * this occur only when :partial_input is specified. 
- * - conversion is finished (:finished) - * - * example: - * ec = Encoding::Converter.new("UTF-8", "UTF-16BE") - * ret = ec.primitive_convert(src="pi", dst="", nil, 100) - * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"] - * - * ec = Encoding::Converter.new("UTF-8", "UTF-16BE") - * ret = ec.primitive_convert(src="pi", dst="", nil, 1) - * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"] - * ret = ec.primitive_convert(src, dst="", nil, 1) - * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"] - * ret = ec.primitive_convert(src, dst="", nil, 1) - * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"] - * ret = ec.primitive_convert(src, dst="", nil, 1) - * p [ret, src, dst] #=> [:finished, "", "i"] - * - */ -static mrb_value -econv_primitive_convert(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value self) -{ - mrb_value argv[16]; - int argc; - - mrb_get_args(mrb, "*", &argv, &argc); - return econv_primitive_cnvproc(mrb, argc, argv, self); -} - -/* - * call-seq: - * ec.convert(source_string) -> destination_string - * - * Convert source_string and return destination_string. - * - * source_string is assumed as a part of source. - * i.e. :partial_input=>true is specified internally. - * finish method should be used last. 
- * - * ec = Encoding::Converter.new("utf-8", "euc-jp") - * puts ec.convert("\u3042").dump #=> "\xA4\xA2" - * puts ec.finish.dump #=> "" - * - * ec = Encoding::Converter.new("euc-jp", "utf-8") - * puts ec.convert("\xA4").dump #=> "" - * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82" - * puts ec.finish.dump #=> "" - * - * ec = Encoding::Converter.new("utf-8", "iso-2022-jp") - * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP") - * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP") - * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP") - * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP") - * - * If a conversion error occur, - * Encoding::UndefinedConversionError or - * Encoding::InvalidByteSequenceError is raised. - * Encoding::Converter#convert doesn't supply methods to recover or restart - * from these exceptions. - * When you want to handle these conversion errors, - * use Encoding::Converter#primitive_convert. - * - */ -static mrb_value -econv_convert(mrb_state *mrb, mrb_value self) -{ - mrb_value source_string; - mrb_value ret, dst; - mrb_value av[5]; - int ac; - mrb_econv_t *ec = check_econv(mrb, self); - - mrb_get_args(mrb, "o", &source_string); - //StringValue(source_string); - mrb_string_value(mrb, &source_string); - - dst = mrb_str_new(mrb, NULL, 0); - - av[0] = mrb_str_dup(mrb, source_string); - av[1] = dst; - av[2] = mrb_nil_value(); - av[3] = mrb_nil_value(); - av[4] = mrb_fixnum_value(ECONV_PARTIAL_INPUT); - ac = 5; - - ret = econv_primitive_cnvproc(mrb, ac, av, self); - - if (mrb_obj_equal(mrb, ret, sym_invalid_byte_sequence) || - mrb_obj_equal(mrb, ret, sym_undefined_conversion) || - mrb_obj_equal(mrb, ret, sym_incomplete_input)) { - mrb_value exc = make_econv_exception(mrb, ec); - mrb_exc_raise(mrb, exc); - } - - if (mrb_obj_equal(mrb, ret, sym_finished)) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "converter already finished"); - } - - if (!mrb_obj_equal(mrb, ret, sym_source_buffer_empty)) { 
- mrb_bug("unexpected result of econv_primitive_convert"); - } - - return dst; -} - -/* - * call-seq: - * ec.finish -> string - * - * Finishes the converter. - * It returns the last part of the converted string. - * - * ec = Encoding::Converter.new("utf-8", "iso-2022-jp") - * p ec.convert("\u3042") #=> "\e$B$\"" - * p ec.finish #=> "\e(B" - */ -static mrb_value -econv_finish(mrb_state *mrb, mrb_value self) -{ - mrb_value ret, dst; - mrb_value av[5]; - int ac; - mrb_econv_t *ec = check_econv(mrb, self); - - dst = mrb_str_new(mrb, NULL, 0); - - av[0] = mrb_nil_value(); - av[1] = dst; - av[2] = mrb_nil_value(); - av[3] = mrb_nil_value(); - av[4] = mrb_fixnum_value(0); - ac = 5; - - ret = econv_primitive_cnvproc(mrb, ac, av, self); - - if (mrb_obj_equal(mrb, ret, sym_invalid_byte_sequence) || - mrb_obj_equal(mrb, ret, sym_undefined_conversion) || - mrb_obj_equal(mrb, ret, sym_incomplete_input)) { - mrb_value exc = make_econv_exception(mrb, ec); - mrb_exc_raise(mrb, exc); - } - - if (!mrb_obj_equal(mrb, ret, sym_finished)) { - mrb_bug("unexpected result of econv_primitive_convert"); - } - - return dst; -} - -/* - * call-seq: - * ec.primitive_errinfo -> array - * - * primitive_errinfo returns important information regarding the last error - * as a 5-element array: - * - * [result, enc1, enc2, error_bytes, readagain_bytes] - * - * result is the last result of primitive_convert. - * - * Other elements are only meaningful when result is - * :invalid_byte_sequence, :incomplete_input or :undefined_conversion. - * - * enc1 and enc2 indicate a conversion step as a pair of strings. - * For example, a converter from EUC-JP to ISO-8859-1 converts - * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1. - * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"]. - * - * error_bytes and readagain_bytes indicate the byte sequences which caused the error. - * error_bytes is discarded portion. 
- * readagain_bytes is buffered portion which is read again on next conversion. - * - * Example: - * - * # \xff is invalid as EUC-JP. - * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") - * ec.primitive_convert(src="\xff", dst="", nil, 10) - * p ec.primitive_errinfo - * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""] - * - * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1. - * # Since this error is occur in UTF-8 to ISO-8859-1 conversion, - * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82). - * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") - * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10) - * p ec.primitive_errinfo - * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""] - * - * # partial character is invalid - * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") - * ec.primitive_convert(src="\xa4", dst="", nil, 10) - * p ec.primitive_errinfo - * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""] - * - * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by - * # partial characters. - * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") - * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) - * p ec.primitive_errinfo - * #=> [:source_buffer_empty, nil, nil, nil, nil] - * - * # \xd8\x00\x00@ is invalid as UTF-16BE because - * # no low surrogate after high surrogate (\xd8\x00). - * # It is detected by 3rd byte (\00) which is part of next character. - * # So the high surrogate (\xd8\x00) is discarded and - * # the 3rd byte is read again later. - * # Since the byte is buffered in ec, it is dropped from src. - * ec = Encoding::Converter.new("UTF-16BE", "UTF-8") - * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10) - * p ec.primitive_errinfo - * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"] - * p src - * #=> "@" - * - * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE. 
- * # The problem is detected by 4th byte. - * ec = Encoding::Converter.new("UTF-16LE", "UTF-8") - * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10) - * p ec.primitive_errinfo - * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"] - * p src - * #=> "" - * - */ -static mrb_value -econv_primitive_errinfo(mrb_state *mrb, mrb_value self) -{ - mrb_econv_t *ec = check_econv(mrb, self); - - mrb_value ary; - - ary = mrb_ary_new_capa(mrb, 5);//mrb_ary_new2(5); - - mrb_ary_set(mrb, ary, 0, econv_result_to_symbol(ec->last_error.result));//rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result)); - mrb_ary_set(mrb, ary, 4, mrb_nil_value());//rb_ary_store(ary, 4, mrb_nil_value()); - - if (ec->last_error.source_encoding) - mrb_ary_set(mrb, ary, 1, mrb_str_new2(mrb, ec->last_error.source_encoding));//rb_ary_store(ary, 1, mrb_str_new2(mrb, ec->last_error.source_encoding)); - - if (ec->last_error.destination_encoding) - mrb_ary_set(mrb, ary, 2, mrb_str_new2(mrb, ec->last_error.destination_encoding));//rb_ary_store(ary, 2, mrb_str_new2(mrb, ec->last_error.destination_encoding)); - - if (ec->last_error.error_bytes_start) { - //rb_ary_store(ary, 3, mrb_str_new(mrb, (const char*)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len)); - mrb_ary_set(mrb, ary, 3, mrb_str_new(mrb, (const char*)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len)); - //rb_ary_store(ary, 4, mrb_str_new(mrb, (const char*)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len)); - mrb_ary_set(mrb, ary, 4, mrb_str_new(mrb, (const char*)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len)); - } - - return ary; -} - -/* - * call-seq: - * ec.insert_output(string) -> nil - * - * Inserts string into the encoding converter. - * The string will be converted to the destination encoding and - * output on later conversions. 
- * - * If the destination encoding is stateful, - * string is converted according to the state and the state is updated. - * - * This method should be used only when a conversion error occurs. - * - * ec = Encoding::Converter.new("utf-8", "iso-8859-1") - * src = "HIRAGANA LETTER A is \u{3042}." - * dst = "" - * p ec.primitive_convert(src, dst) #=> :undefined_conversion - * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."] - * ec.insert_output("<err>") - * p ec.primitive_convert(src, dst) #=> :finished - * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""] - * - * ec = Encoding::Converter.new("utf-8", "iso-2022-jp") - * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp - * dst = "" - * p ec.primitive_convert(src, dst) #=> :undefined_conversion - * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"] - * ec.insert_output "?" # state change required to output "?". - * p ec.primitive_convert(src, dst) #=> :finished - * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""] - * - */ -static mrb_value -econv_insert_output(mrb_state *mrb, mrb_value self) -{ - mrb_value string; - const char *insert_enc; - mrb_econv_t *ec; - int ret; - - mrb_get_args(mrb, "o", &string); - ec = check_econv(mrb, self); - - //StringValue(string); - mrb_string_value(mrb, &string); - insert_enc = mrb_econv_encoding_to_insert_output(ec); - string = mrb_str_encode(mrb, string, mrb_enc_from_encoding(mrb, mrb_enc_find(mrb, insert_enc)), 0, mrb_nil_value()); - - ret = mrb_econv_insert_output(mrb, ec, (const unsigned char*)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc); - if (ret == -1) { - mrb_raise(mrb, E_ARGUMENT_ERROR, "too big string"); - } - - return mrb_nil_value(); -} - -/* - * call-seq - * ec.putback -> string - * ec.putback(max_numbytes) -> string - * - * Put back the bytes which will be converted. 
- * - * The bytes are caused by invalid_byte_sequence error. - * When invalid_byte_sequence error, some bytes are discarded and - * some bytes are buffered to be converted later. - * The latter bytes can be put back. - * It can be observed by - * Encoding::InvalidByteSequenceError#readagain_bytes and - * Encoding::Converter#primitive_errinfo. - * - * ec = Encoding::Converter.new("utf-16le", "iso-8859-1") - * src = "\x00\xd8\x61\x00" - * dst = "" - * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence - * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"] - * p ec.putback #=> "a\x00" - * p ec.putback #=> "" # no more bytes to put back - * - */ -static mrb_value -econv_putback(mrb_state *mrb, /*int argc, mrb_value *argv,*/ mrb_value self) -{ - mrb_econv_t *ec = check_econv(mrb, self); - int n; - int putbackable; - mrb_value str, max; - - mrb_value argv[16]; - int argc; - - //mrb_scan_args(argc, argv, "01", &max); - mrb_get_args(mrb, "*", &argv, &argc); - - if (argc == 0)//mrb_nil_p(max)) - n = mrb_econv_putbackable(ec); - else { - max = argv[0]; - n = mrb_fixnum(max); - putbackable = mrb_econv_putbackable(ec); - if (putbackable < n) - n = putbackable; - } - - str = mrb_str_new(mrb, NULL, n); - mrb_econv_putback(ec, (unsigned char*)RSTRING_PTR(str), n); - - if (ec->source_encoding) { - mrb_enc_associate(mrb, str, ec->source_encoding); - } - - return str; -} - -/* - * call-seq: - * ec.last_error -> exception or nil - * - * Returns an exception object for the last conversion. - * Returns nil if the last conversion did not produce an error. - * - * "error" means that - * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for - * Encoding::Converter#convert and - * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for - * Encoding::Converter#primitive_convert. 
- * - * ec = Encoding::Converter.new("utf-8", "iso-8859-1") - * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence - * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8> - * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full - * p ec.last_error #=> nil - * - */ -static mrb_value -econv_last_error(mrb_state *mrb, mrb_value self) -{ - mrb_econv_t *ec = check_econv(mrb, self); - mrb_value exc; - - exc = make_econv_exception(mrb, ec); - if (mrb_nil_p(exc)) - return mrb_nil_value(); - return exc; -} - -/* - * call-seq: - * ec.replacement -> string - * - * Returns the replacement string. - * - * ec = Encoding::Converter.new("euc-jp", "us-ascii") - * p ec.replacement #=> "?" - * - * ec = Encoding::Converter.new("euc-jp", "utf-8") - * p ec.replacement #=> "\uFFFD" - */ -static mrb_value -econv_get_replacement(mrb_state *mrb, mrb_value self) -{ - mrb_econv_t *ec = check_econv(mrb, self); - int ret; - mrb_encoding *enc; - - ret = make_replacement(mrb, ec); - if (ret == -1) { - mrb_raise(mrb, E_UNDEFINEDCONVERSION_ERROR, "replacement character setup failed"); - } - - enc = mrb_enc_find(mrb, ec->replacement_enc); - return mrb_enc_str_new(mrb, (const char*)ec->replacement_str, (long)ec->replacement_len, enc); -} - -/* - * call-seq: - * ec.replacement = string - * - * Sets the replacement string. 
- * - * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace) - * ec.replacement = "<undef>" - * p ec.convert("a \u3042 b") #=> "a <undef> b" - */ -static mrb_value -econv_set_replacement(mrb_state *mrb, mrb_value self) -{ - mrb_value arg; - mrb_econv_t *ec = check_econv(mrb, self); - mrb_value string = arg; - int ret; - mrb_encoding *enc; - mrb_get_args(mrb, "o", &arg); - - //StringValue(string); - mrb_string_value(mrb, &string); - enc = mrb_enc_get(mrb, string); - - ret = mrb_econv_set_replacement(mrb, ec, - (const unsigned char*)RSTRING_PTR(string), - RSTRING_LEN(string), - mrb_enc_name(enc)); - - if (ret == -1) { - /* xxx: mrb_eInvalidByteSequenceError? */ - mrb_raise(mrb, E_UNDEFINEDCONVERSION_ERROR, "replacement character setup failed"); - } - - return arg; -} - -mrb_value -mrb_econv_make_exception(mrb_state *mrb, mrb_econv_t *ec) -{ - return make_econv_exception(mrb, ec); -} - -void -mrb_econv_check_error(mrb_state *mrb, mrb_econv_t *ec) -{ - mrb_value exc; - - exc = make_econv_exception(mrb, ec); - if (mrb_nil_p(exc)) - return; - mrb_exc_raise(mrb, exc); -} - -/* - * call-seq: - * ecerr.source_encoding_name -> string - * - * Returns the source encoding name as a string. - */ -static mrb_value -ecerr_source_encoding_name(mrb_state *mrb, mrb_value self) -{ - return mrb_attr_get(mrb, self, mrb_intern(mrb, "source_encoding_name")); -} - -/* - * call-seq: - * ecerr.source_encoding -> encoding - * - * Returns the source encoding as an encoding object. - * - * Note that the result may not be equal to the source encoding of - * the encoding converter if the conversion has multiple steps. - * - * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP - * begin - * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP. 
- * rescue Encoding::UndefinedConversionError - * p $!.source_encoding #=> #<Encoding:UTF-8> - * p $!.destination_encoding #=> #<Encoding:EUC-JP> - * p $!.source_encoding_name #=> "UTF-8" - * p $!.destination_encoding_name #=> "EUC-JP" - * end - * - */ -static mrb_value -ecerr_source_encoding(mrb_state *mrb, mrb_value self) -{ - return mrb_attr_get(mrb, self, mrb_intern(mrb, "source_encoding")); -} - -/* - * call-seq: - * ecerr.destination_encoding_name -> string - * - * Returns the destination encoding name as a string. - */ -static mrb_value -ecerr_destination_encoding_name(mrb_state *mrb, mrb_value self) -{ - return mrb_attr_get(mrb, self, mrb_intern(mrb, "destination_encoding_name")); -} - -/* - * call-seq: - * ecerr.destination_encoding -> string - * - * Returns the destination encoding as an encoding object. - */ -static mrb_value -ecerr_destination_encoding(mrb_state *mrb, mrb_value self) -{ - return mrb_attr_get(mrb, self, mrb_intern(mrb, "destination_encoding")); -} - -/* - * call-seq: - * ecerr.error_char -> string - * - * Returns the one-character string which cause Encoding::UndefinedConversionError. - * - * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") - * begin - * ec.convert("\xa0") - * rescue Encoding::UndefinedConversionError - * puts $!.error_char.dump #=> "\xC2\xA0" - * p $!.error_char.encoding #=> #<Encoding:UTF-8> - * end - * - */ -static mrb_value -ecerr_error_char(mrb_state *mrb, mrb_value self) -{ - return mrb_attr_get(mrb, self, mrb_intern(mrb, "error_char")); -} - -/* - * call-seq: - * ecerr.error_bytes -> string - * - * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs. - * - * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") - * begin - * ec.convert("abc\xA1\xFFdef") - * rescue Encoding::InvalidByteSequenceError - * p $! 
#=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP> - * puts $!.error_bytes.dump #=> "\xA1" - * puts $!.readagain_bytes.dump #=> "\xFF" - * end - */ -static mrb_value -ecerr_error_bytes(mrb_state *mrb, mrb_value self) -{ - return mrb_attr_get(mrb, self, mrb_intern(mrb, "error_bytes")); -} - -/* - * call-seq: - * ecerr.readagain_bytes -> string - * - * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs. - */ -static mrb_value -ecerr_readagain_bytes(mrb_state *mrb, mrb_value self) -{ - return mrb_attr_get(mrb, self, mrb_intern(mrb, "readagain_bytes")); -} - -/* - * call-seq: - * ecerr.incomplete_input? -> true or false - * - * Returns true if the invalid byte sequence error is caused by - * premature end of string. - * - * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") - * - * begin - * ec.convert("abc\xA1z") - * rescue Encoding::InvalidByteSequenceError - * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP> - * p $!.incomplete_input? #=> false - * end - * - * begin - * ec.convert("abc\xA1") - * ec.finish - * rescue Encoding::InvalidByteSequenceError - * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP> - * p $!.incomplete_input? #=> true - * end - */ -static mrb_value -ecerr_incomplete_input(mrb_state *mrb, mrb_value self) -{ - return mrb_attr_get(mrb, self, mrb_intern(mrb, "incomplete_input")); -} - -extern void Init_newline(void); - -/* - * Document-class: Encoding::UndefinedConversionError - * - * Raised by Encoding and String methods when a transcoding operation - * fails. - */ - -/* - * Document-class: Encoding::InvalidByteSequenceError - * - * Raised by Encoding and String methods when the string being - * transcoded contains a byte invalid for the either the source or - * target encoding. 
- */ - -/* - * Document-class: Encoding::ConverterNotFoundError - * - * Raised by transcoding methods when a named encoding does not - * correspond with a known converter. - */ - -void -mrb_init_transcode(mrb_state *mrb) -{ - struct RClass *s; - struct RClass *c; - struct RClass *u; - struct RClass *i; - struct RClass *eConverterNotFoundError_class; - struct RClass *eInvalidByteSequenceError_class; - struct RClass *eUndefinedConversionError_class; - - eUndefinedConversionError_class = mrb_define_class(mrb, "UndefinedConversionError", E_ENCODING_ERROR); - eInvalidByteSequenceError_class = mrb_define_class(mrb, "InvalidByteSequenceError", E_ENCODING_ERROR); - eConverterNotFoundError_class = mrb_define_class(mrb, "ConverterNotFoundError", E_ENCODING_ERROR); - - transcoder_table = st_init_strcasetable(); - - //sym_invalid = ID2SYM(mrb_intern("invalid")); - //sym_undef = ID2SYM(mrb_intern("undef")); - //sym_replace = ID2SYM(mrb_intern("replace")); - //sym_fallback = ID2SYM(mrb_intern("fallback")); - //sym_xml = ID2SYM(mrb_intern("xml")); - //sym_text = ID2SYM(mrb_intern("text")); - //sym_attr = ID2SYM(mrb_intern("attr")); - - //sym_invalid_byte_sequence = ID2SYM(mrb_intern("invalid_byte_sequence")); - //sym_undefined_conversion = ID2SYM(mrb_intern("undefined_conversion")); - //sym_destination_buffer_full = ID2SYM(mrb_intern("destination_buffer_full")); - //sym_source_buffer_empty = ID2SYM(mrb_intern("source_buffer_empty")); - //sym_finished = ID2SYM(mrb_intern("finished")); - //sym_after_output = ID2SYM(mrb_intern("after_output")); - //sym_incomplete_input = ID2SYM(mrb_intern("incomplete_input")); - //sym_universal_newline = ID2SYM(mrb_intern("universal_newline")); - //sym_crlf_newline = ID2SYM(mrb_intern("crlf_newline")); - //sym_cr_newline = ID2SYM(mrb_intern("cr_newline")); - //sym_partial_input = ID2SYM(mrb_intern("partial_input")); - - s = mrb->string_class; - mrb_define_method(mrb, s, "encode", str_encode, ARGS_ANY()); - mrb_define_method(mrb, s, "encode!", 
str_encode_bang, ARGS_ANY()); - - c = mrb_define_class(mrb, "Converter", ENCODE_CLASS); - //mrb_cEncodingConverter = rb_define_class_under(mrb_cEncoding, "Converter", rb_cData); - //mrb_define_alloc_func(mrb_cEncodingConverter, econv_s_allocate); - mrb_define_class_method(mrb, c, "asciicompat_encoding", econv_s_asciicompat_encoding, ARGS_REQ(1)); /* 1 */ - mrb_define_class_method(mrb, c, "search_convpath", econv_s_search_convpath, ARGS_ANY()); /* 2 */ - mrb_define_method(mrb, s, "initialize", econv_init, ARGS_ANY()); - mrb_define_method(mrb, s, "inspect", econv_inspect, ARGS_NONE()); - mrb_define_method(mrb, s, "convpath", econv_convpath, ARGS_NONE()); - mrb_define_method(mrb, s, "source_encoding", econv_source_encoding, ARGS_NONE()); - mrb_define_method(mrb, s, "destination_encoding", econv_destination_encoding, ARGS_NONE()); - mrb_define_method(mrb, s, "primitive_convert", econv_primitive_convert, ARGS_ANY()); - mrb_define_method(mrb, s, "convert", econv_convert, ARGS_REQ(1)); - mrb_define_method(mrb, s, "finish", econv_finish, ARGS_NONE()); - mrb_define_method(mrb, s, "primitive_errinfo", econv_primitive_errinfo, ARGS_NONE()); - mrb_define_method(mrb, s, "insert_output", econv_insert_output, ARGS_REQ(1)); - mrb_define_method(mrb, s, "putback", econv_putback, ARGS_ANY()); - mrb_define_method(mrb, s, "last_error", econv_last_error, ARGS_NONE()); - mrb_define_method(mrb, s, "replacement", econv_get_replacement, ARGS_NONE()); - mrb_define_method(mrb, s, "replacement=", econv_set_replacement, ARGS_REQ(1)); - - mrb_define_const(mrb, s, "INVALID_MASK", mrb_fixnum_value(ECONV_INVALID_MASK)); - mrb_define_const(mrb, s, "INVALID_REPLACE", mrb_fixnum_value(ECONV_INVALID_REPLACE)); - mrb_define_const(mrb, s, "UNDEF_MASK", mrb_fixnum_value(ECONV_UNDEF_MASK)); - mrb_define_const(mrb, s, "UNDEF_REPLACE", mrb_fixnum_value(ECONV_UNDEF_REPLACE)); - mrb_define_const(mrb, s, "UNDEF_HEX_CHARREF", mrb_fixnum_value(ECONV_UNDEF_HEX_CHARREF)); - mrb_define_const(mrb, s, "PARTIAL_INPUT", 
mrb_fixnum_value(ECONV_PARTIAL_INPUT)); - mrb_define_const(mrb, s, "AFTER_OUTPUT", mrb_fixnum_value(ECONV_AFTER_OUTPUT)); - mrb_define_const(mrb, s, "UNIVERSAL_NEWLINE_DECORATOR", mrb_fixnum_value(ECONV_UNIVERSAL_NEWLINE_DECORATOR)); - mrb_define_const(mrb, s, "CRLF_NEWLINE_DECORATOR", mrb_fixnum_value(ECONV_CRLF_NEWLINE_DECORATOR)); - mrb_define_const(mrb, s, "CR_NEWLINE_DECORATOR", mrb_fixnum_value(ECONV_CR_NEWLINE_DECORATOR)); - mrb_define_const(mrb, s, "XML_TEXT_DECORATOR", mrb_fixnum_value(ECONV_XML_TEXT_DECORATOR)); - mrb_define_const(mrb, s, "XML_ATTR_CONTENT_DECORATOR", mrb_fixnum_value(ECONV_XML_ATTR_CONTENT_DECORATOR)); - mrb_define_const(mrb, s, "XML_ATTR_QUOTE_DECORATOR", mrb_fixnum_value(ECONV_XML_ATTR_QUOTE_DECORATOR)); - - u = E_UNDEFINEDCONVERSION_ERROR; - mrb_define_method(mrb, u, "source_encoding_name", ecerr_source_encoding_name, ARGS_NONE()); - mrb_define_method(mrb, u, "destination_encoding_name", ecerr_destination_encoding_name, ARGS_NONE()); - mrb_define_method(mrb, u, "source_encoding", ecerr_source_encoding, ARGS_NONE()); - mrb_define_method(mrb, u, "destination_encoding", ecerr_destination_encoding, ARGS_NONE()); - mrb_define_method(mrb, u, "error_char", ecerr_error_char, ARGS_NONE()); - - i = E_INVALIDBYTESEQUENCE_ERROR; - mrb_define_method(mrb, i, "source_encoding_name", ecerr_source_encoding_name, ARGS_NONE()); - mrb_define_method(mrb, i, "destination_encoding_name", ecerr_destination_encoding_name, ARGS_NONE()); - mrb_define_method(mrb, i, "source_encoding", ecerr_source_encoding, ARGS_NONE()); - mrb_define_method(mrb, i, "destination_encoding", ecerr_destination_encoding, ARGS_NONE()); - mrb_define_method(mrb, i, "error_bytes", ecerr_error_bytes, ARGS_NONE()); - mrb_define_method(mrb, i, "readagain_bytes", ecerr_readagain_bytes, ARGS_NONE()); - mrb_define_method(mrb, i, "incomplete_input?", ecerr_incomplete_input, ARGS_NONE()); - - //Init_newline(); -} -#endif //INCLUDE_ENCODING diff --git a/src/transcode_data.h b/src/transcode_data.h 
deleted file mode 100644 index 62051701a..000000000 --- a/src/transcode_data.h +++ /dev/null @@ -1,109 +0,0 @@ -/********************************************************************** - - transcode_data.h - - - $Author: duerst $ - created at: Mon 10 Dec 2007 14:01:47 JST 2007 - - Copyright (C) 2007 Martin Duerst - -**********************************************************************/ - -//#include "ruby/ruby.h" - -#ifndef RUBY_TRANSCODE_DATA_H -#define RUBY_TRANSCODE_DATA_H 1 - -#define WORDINDEX_SHIFT_BITS 2 -#define WORDINDEX2INFO(widx) ((widx) << WORDINDEX_SHIFT_BITS) -#define INFO2WORDINDEX(info) ((info) >> WORDINDEX_SHIFT_BITS) -#define BYTE_LOOKUP_BASE(bl) ((bl)[0]) -#define BYTE_LOOKUP_INFO(bl) ((bl)[1]) - -#define PType (unsigned int) - -#define NOMAP (PType 0x01) /* direct map */ -#define ONEbt (0x02) /* one byte payload */ -#define TWObt (0x03) /* two bytes payload */ -#define THREEbt (0x05) /* three bytes payload */ -#define FOURbt (0x06) /* four bytes payload, UTF-8 only, macros start at getBT0 */ -#define INVALID (PType 0x07) /* invalid byte sequence */ -#define UNDEF (PType 0x09) /* legal but undefined */ -#define ZERObt (PType 0x0A) /* zero bytes of payload, i.e. 
remove */ -#define FUNii (PType 0x0B) /* function from info to info */ -#define FUNsi (PType 0x0D) /* function from start to info */ -#define FUNio (PType 0x0E) /* function from info to output */ -#define FUNso (PType 0x0F) /* function from start to output */ -#define STR1 (PType 0x11) /* string 4 <= len <= 259 bytes: 1byte length + content */ -#define GB4bt (PType 0x12) /* GB18030 four bytes payload */ -#define FUNsio (PType 0x13) /* function from start and info to output */ - -#define STR1_LENGTH(byte_addr) (unsigned int)(*(byte_addr) + 4) -#define STR1_BYTEINDEX(w) ((w) >> 6) -#define makeSTR1(bi) (((bi) << 6) | STR1) -#define makeSTR1LEN(len) ((len)-4) - -#define o1(b1) (PType((((unsigned char)(b1))<<8)|ONEbt)) -#define o2(b1,b2) (PType((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt)) -#define o3(b1,b2,b3) (PType(((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned int)(unsigned char)(b3))<<24)|THREEbt)&0xffffffffU)) -#define o4(b0,b1,b2,b3) (PType(((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt)&0xffffffffU)) -#define g4(b0,b1,b2,b3) (PType(((((unsigned char)(b0))<<8)|(((unsigned char)(b2))<<16)|((((unsigned char)(b1))&0x0f)<<24)|((((unsigned int)(unsigned char)(b3))&0x0f)<<28)|GB4bt)&0xffffffffU)) -#define funsio(diff) (PType((((unsigned int)(diff))<<8)|FUNsio)) - -#define getBT1(a) ((unsigned char)((a)>> 8)) -#define getBT2(a) ((unsigned char)((a)>>16)) -#define getBT3(a) ((unsigned char)((a)>>24)) -#define getBT0(a) (((unsigned char)((a)>> 5)&0x07)|0xF0) /* for UTF-8 only!!! */ - -#define getGB4bt0(a) ((unsigned char)((a)>> 8)) -#define getGB4bt1(a) ((((unsigned char)((a)>>24))&0x0F)|0x30) -#define getGB4bt2(a) ((unsigned char)((a)>>16)) -#define getGB4bt3(a) ((((unsigned char)((a)>>28))&0x0F)|0x30) - -#define o2FUNii(b1,b2) (PType((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|FUNii)) - -/* do we need these??? 
maybe not, can be done with simple tables */ -#define ONETRAIL /* legal but undefined if one more trailing UTF-8 */ -#define TWOTRAIL /* legal but undefined if two more trailing UTF-8 */ -#define THREETRAIL /* legal but undefined if three more trailing UTF-8 */ - -typedef enum { - asciicompat_converter, /* ASCII-compatible -> ASCII-compatible */ - asciicompat_decoder, /* ASCII-incompatible -> ASCII-compatible */ - asciicompat_encoder /* ASCII-compatible -> ASCII-incompatible */ - /* ASCII-incompatible -> ASCII-incompatible is intentionally omitted. */ -} mrb_transcoder_asciicompat_type_t; - -typedef struct mrb_transcoder mrb_transcoder; - -/* static structure, one per supported encoding pair */ -struct mrb_transcoder { - const char *src_encoding; - const char *dst_encoding; - unsigned int conv_tree_start; - const unsigned char *byte_array; - unsigned int byte_array_length; - const unsigned int *word_array; - unsigned int word_array_length; - int word_size; - int input_unit_length; - int max_input; - int max_output; - mrb_transcoder_asciicompat_type_t asciicompat_type; - size_t state_size; - int (*state_init_func)(void*); /* ret==0:success ret!=0:failure(errno) */ - int (*state_fini_func)(void*); /* ret==0:success ret!=0:failure(errno) */ - mrb_value (*func_ii)(void*, mrb_value); /* info -> info */ - mrb_value (*func_si)(void*, const unsigned char*, size_t); /* start -> info */ - ssize_t (*func_io)(void*, mrb_value, const unsigned char*, size_t); /* info -> output */ - ssize_t (*func_so)(void*, const unsigned char*, size_t, unsigned char*, size_t); /* start -> output */ - ssize_t (*finish_func)(void*, unsigned char*, size_t); /* -> output */ - ssize_t (*resetsize_func)(void*); /* -> len */ - ssize_t (*resetstate_func)(void*, unsigned char*, size_t); /* -> output */ - ssize_t (*func_sio)(void*, const unsigned char*, size_t, mrb_value, unsigned char*, size_t); /* start -> output */ -}; - -void mrb_declare_transcoder(mrb_state *mrb, const char *enc1, const char *enc2, 
const char *lib); -void mrb_register_transcoder(mrb_state *mrb, const mrb_transcoder *); - -#endif /* RUBY_TRANSCODE_DATA_H */ diff --git a/src/unicode.c b/src/unicode.c deleted file mode 100644 index dec692500..000000000 --- a/src/unicode.c +++ /dev/null @@ -1,2607 +0,0 @@ -/********************************************************************** - unicode.c - Oniguruma (regular expression library) -**********************************************************************/ -/*- - * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include "mruby.h" -#ifdef INCLUDE_ENCODING -#include <string.h> -#include "regint.h" - -#include "encoding.h" //#define TOLOWER(c) - -#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \ - ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0) - -static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = { - 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, - 0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008, - 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, - 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, - 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, - 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, - 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, - 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, - 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2, - 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, - 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, - 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0, - 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2, - 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, - 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, - 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008, - 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008, - 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, - 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, - 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, - 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, - 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0, - 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0, - 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, - 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, - 
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, - 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0, - 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2, - 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, - 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, - 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0, - 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2 -}; - -#include "name2ctype.h" - -typedef struct { - int n; - OnigCodePoint code[3]; -} CodePointList3; - -typedef struct { - OnigCodePoint from; - CodePointList3 to; -} CaseFold_11_Type; - -typedef struct { - OnigCodePoint from; - CodePointList3 to; -} CaseUnfold_11_Type; - -typedef struct { - int n; - OnigCodePoint code[2]; -} CodePointList2; - -typedef struct { - OnigCodePoint from[2]; - CodePointList2 to; -} CaseUnfold_12_Type; - -typedef struct { - OnigCodePoint from[3]; - CodePointList2 to; -} CaseUnfold_13_Type; - -static const CaseFold_11_Type CaseFold[] = { - { 0x0041, {1, {0x0061}}}, - { 0x0042, {1, {0x0062}}}, - { 0x0043, {1, {0x0063}}}, - { 0x0044, {1, {0x0064}}}, - { 0x0045, {1, {0x0065}}}, - { 0x0046, {1, {0x0066}}}, - { 0x0047, {1, {0x0067}}}, - { 0x0048, {1, {0x0068}}}, - { 0x004a, {1, {0x006a}}}, - { 0x004b, {1, {0x006b}}}, - { 0x004c, {1, {0x006c}}}, - { 0x004d, {1, {0x006d}}}, - { 0x004e, {1, {0x006e}}}, - { 0x004f, {1, {0x006f}}}, - { 0x0050, {1, {0x0070}}}, - { 0x0051, {1, {0x0071}}}, - { 0x0052, {1, {0x0072}}}, - { 0x0053, {1, {0x0073}}}, - { 0x0054, {1, {0x0074}}}, - { 0x0055, {1, {0x0075}}}, - { 0x0056, {1, {0x0076}}}, - { 0x0057, {1, {0x0077}}}, - { 0x0058, {1, {0x0078}}}, - { 0x0059, {1, {0x0079}}}, - { 0x005a, {1, {0x007a}}}, - { 0x00b5, {1, {0x03bc}}}, - { 0x00c0, {1, {0x00e0}}}, - { 0x00c1, {1, {0x00e1}}}, - { 0x00c2, {1, {0x00e2}}}, - { 0x00c3, {1, {0x00e3}}}, - { 0x00c4, {1, {0x00e4}}}, - { 0x00c5, {1, {0x00e5}}}, - { 0x00c6, {1, {0x00e6}}}, - { 0x00c7, {1, {0x00e7}}}, - { 0x00c8, {1, 
{0x00e8}}}, - { 0x00c9, {1, {0x00e9}}}, - { 0x00ca, {1, {0x00ea}}}, - { 0x00cb, {1, {0x00eb}}}, - { 0x00cc, {1, {0x00ec}}}, - { 0x00cd, {1, {0x00ed}}}, - { 0x00ce, {1, {0x00ee}}}, - { 0x00cf, {1, {0x00ef}}}, - { 0x00d0, {1, {0x00f0}}}, - { 0x00d1, {1, {0x00f1}}}, - { 0x00d2, {1, {0x00f2}}}, - { 0x00d3, {1, {0x00f3}}}, - { 0x00d4, {1, {0x00f4}}}, - { 0x00d5, {1, {0x00f5}}}, - { 0x00d6, {1, {0x00f6}}}, - { 0x00d8, {1, {0x00f8}}}, - { 0x00d9, {1, {0x00f9}}}, - { 0x00da, {1, {0x00fa}}}, - { 0x00db, {1, {0x00fb}}}, - { 0x00dc, {1, {0x00fc}}}, - { 0x00dd, {1, {0x00fd}}}, - { 0x00de, {1, {0x00fe}}}, - { 0x00df, {2, {0x0073, 0x0073}}}, - { 0x0100, {1, {0x0101}}}, - { 0x0102, {1, {0x0103}}}, - { 0x0104, {1, {0x0105}}}, - { 0x0106, {1, {0x0107}}}, - { 0x0108, {1, {0x0109}}}, - { 0x010a, {1, {0x010b}}}, - { 0x010c, {1, {0x010d}}}, - { 0x010e, {1, {0x010f}}}, - { 0x0110, {1, {0x0111}}}, - { 0x0112, {1, {0x0113}}}, - { 0x0114, {1, {0x0115}}}, - { 0x0116, {1, {0x0117}}}, - { 0x0118, {1, {0x0119}}}, - { 0x011a, {1, {0x011b}}}, - { 0x011c, {1, {0x011d}}}, - { 0x011e, {1, {0x011f}}}, - { 0x0120, {1, {0x0121}}}, - { 0x0122, {1, {0x0123}}}, - { 0x0124, {1, {0x0125}}}, - { 0x0126, {1, {0x0127}}}, - { 0x0128, {1, {0x0129}}}, - { 0x012a, {1, {0x012b}}}, - { 0x012c, {1, {0x012d}}}, - { 0x012e, {1, {0x012f}}}, - { 0x0132, {1, {0x0133}}}, - { 0x0134, {1, {0x0135}}}, - { 0x0136, {1, {0x0137}}}, - { 0x0139, {1, {0x013a}}}, - { 0x013b, {1, {0x013c}}}, - { 0x013d, {1, {0x013e}}}, - { 0x013f, {1, {0x0140}}}, - { 0x0141, {1, {0x0142}}}, - { 0x0143, {1, {0x0144}}}, - { 0x0145, {1, {0x0146}}}, - { 0x0147, {1, {0x0148}}}, - { 0x0149, {2, {0x02bc, 0x006e}}}, - { 0x014a, {1, {0x014b}}}, - { 0x014c, {1, {0x014d}}}, - { 0x014e, {1, {0x014f}}}, - { 0x0150, {1, {0x0151}}}, - { 0x0152, {1, {0x0153}}}, - { 0x0154, {1, {0x0155}}}, - { 0x0156, {1, {0x0157}}}, - { 0x0158, {1, {0x0159}}}, - { 0x015a, {1, {0x015b}}}, - { 0x015c, {1, {0x015d}}}, - { 0x015e, {1, {0x015f}}}, - { 0x0160, {1, {0x0161}}}, - { 0x0162, 
{1, {0x0163}}}, - { 0x0164, {1, {0x0165}}}, - { 0x0166, {1, {0x0167}}}, - { 0x0168, {1, {0x0169}}}, - { 0x016a, {1, {0x016b}}}, - { 0x016c, {1, {0x016d}}}, - { 0x016e, {1, {0x016f}}}, - { 0x0170, {1, {0x0171}}}, - { 0x0172, {1, {0x0173}}}, - { 0x0174, {1, {0x0175}}}, - { 0x0176, {1, {0x0177}}}, - { 0x0178, {1, {0x00ff}}}, - { 0x0179, {1, {0x017a}}}, - { 0x017b, {1, {0x017c}}}, - { 0x017d, {1, {0x017e}}}, - { 0x017f, {1, {0x0073}}}, - { 0x0181, {1, {0x0253}}}, - { 0x0182, {1, {0x0183}}}, - { 0x0184, {1, {0x0185}}}, - { 0x0186, {1, {0x0254}}}, - { 0x0187, {1, {0x0188}}}, - { 0x0189, {1, {0x0256}}}, - { 0x018a, {1, {0x0257}}}, - { 0x018b, {1, {0x018c}}}, - { 0x018e, {1, {0x01dd}}}, - { 0x018f, {1, {0x0259}}}, - { 0x0190, {1, {0x025b}}}, - { 0x0191, {1, {0x0192}}}, - { 0x0193, {1, {0x0260}}}, - { 0x0194, {1, {0x0263}}}, - { 0x0196, {1, {0x0269}}}, - { 0x0197, {1, {0x0268}}}, - { 0x0198, {1, {0x0199}}}, - { 0x019c, {1, {0x026f}}}, - { 0x019d, {1, {0x0272}}}, - { 0x019f, {1, {0x0275}}}, - { 0x01a0, {1, {0x01a1}}}, - { 0x01a2, {1, {0x01a3}}}, - { 0x01a4, {1, {0x01a5}}}, - { 0x01a6, {1, {0x0280}}}, - { 0x01a7, {1, {0x01a8}}}, - { 0x01a9, {1, {0x0283}}}, - { 0x01ac, {1, {0x01ad}}}, - { 0x01ae, {1, {0x0288}}}, - { 0x01af, {1, {0x01b0}}}, - { 0x01b1, {1, {0x028a}}}, - { 0x01b2, {1, {0x028b}}}, - { 0x01b3, {1, {0x01b4}}}, - { 0x01b5, {1, {0x01b6}}}, - { 0x01b7, {1, {0x0292}}}, - { 0x01b8, {1, {0x01b9}}}, - { 0x01bc, {1, {0x01bd}}}, - { 0x01c4, {1, {0x01c6}}}, - { 0x01c5, {1, {0x01c6}}}, - { 0x01c7, {1, {0x01c9}}}, - { 0x01c8, {1, {0x01c9}}}, - { 0x01ca, {1, {0x01cc}}}, - { 0x01cb, {1, {0x01cc}}}, - { 0x01cd, {1, {0x01ce}}}, - { 0x01cf, {1, {0x01d0}}}, - { 0x01d1, {1, {0x01d2}}}, - { 0x01d3, {1, {0x01d4}}}, - { 0x01d5, {1, {0x01d6}}}, - { 0x01d7, {1, {0x01d8}}}, - { 0x01d9, {1, {0x01da}}}, - { 0x01db, {1, {0x01dc}}}, - { 0x01de, {1, {0x01df}}}, - { 0x01e0, {1, {0x01e1}}}, - { 0x01e2, {1, {0x01e3}}}, - { 0x01e4, {1, {0x01e5}}}, - { 0x01e6, {1, {0x01e7}}}, - { 0x01e8, {1, 
{0x01e9}}}, - { 0x01ea, {1, {0x01eb}}}, - { 0x01ec, {1, {0x01ed}}}, - { 0x01ee, {1, {0x01ef}}}, - { 0x01f0, {2, {0x006a, 0x030c}}}, - { 0x01f1, {1, {0x01f3}}}, - { 0x01f2, {1, {0x01f3}}}, - { 0x01f4, {1, {0x01f5}}}, - { 0x01f6, {1, {0x0195}}}, - { 0x01f7, {1, {0x01bf}}}, - { 0x01f8, {1, {0x01f9}}}, - { 0x01fa, {1, {0x01fb}}}, - { 0x01fc, {1, {0x01fd}}}, - { 0x01fe, {1, {0x01ff}}}, - { 0x0200, {1, {0x0201}}}, - { 0x0202, {1, {0x0203}}}, - { 0x0204, {1, {0x0205}}}, - { 0x0206, {1, {0x0207}}}, - { 0x0208, {1, {0x0209}}}, - { 0x020a, {1, {0x020b}}}, - { 0x020c, {1, {0x020d}}}, - { 0x020e, {1, {0x020f}}}, - { 0x0210, {1, {0x0211}}}, - { 0x0212, {1, {0x0213}}}, - { 0x0214, {1, {0x0215}}}, - { 0x0216, {1, {0x0217}}}, - { 0x0218, {1, {0x0219}}}, - { 0x021a, {1, {0x021b}}}, - { 0x021c, {1, {0x021d}}}, - { 0x021e, {1, {0x021f}}}, - { 0x0220, {1, {0x019e}}}, - { 0x0222, {1, {0x0223}}}, - { 0x0224, {1, {0x0225}}}, - { 0x0226, {1, {0x0227}}}, - { 0x0228, {1, {0x0229}}}, - { 0x022a, {1, {0x022b}}}, - { 0x022c, {1, {0x022d}}}, - { 0x022e, {1, {0x022f}}}, - { 0x0230, {1, {0x0231}}}, - { 0x0232, {1, {0x0233}}}, - { 0x023b, {1, {0x023c}}}, - { 0x023d, {1, {0x019a}}}, - { 0x0241, {1, {0x0294}}}, - { 0x0345, {1, {0x03b9}}}, - { 0x0386, {1, {0x03ac}}}, - { 0x0388, {1, {0x03ad}}}, - { 0x0389, {1, {0x03ae}}}, - { 0x038a, {1, {0x03af}}}, - { 0x038c, {1, {0x03cc}}}, - { 0x038e, {1, {0x03cd}}}, - { 0x038f, {1, {0x03ce}}}, - { 0x0390, {3, {0x03b9, 0x0308, 0x0301}}}, - { 0x0391, {1, {0x03b1}}}, - { 0x0392, {1, {0x03b2}}}, - { 0x0393, {1, {0x03b3}}}, - { 0x0394, {1, {0x03b4}}}, - { 0x0395, {1, {0x03b5}}}, - { 0x0396, {1, {0x03b6}}}, - { 0x0397, {1, {0x03b7}}}, - { 0x0398, {1, {0x03b8}}}, - { 0x0399, {1, {0x03b9}}}, - { 0x039a, {1, {0x03ba}}}, - { 0x039b, {1, {0x03bb}}}, - { 0x039c, {1, {0x03bc}}}, - { 0x039d, {1, {0x03bd}}}, - { 0x039e, {1, {0x03be}}}, - { 0x039f, {1, {0x03bf}}}, - { 0x03a0, {1, {0x03c0}}}, - { 0x03a1, {1, {0x03c1}}}, - { 0x03a3, {1, {0x03c3}}}, - { 0x03a4, {1, {0x03c4}}}, - { 
0x03a5, {1, {0x03c5}}}, - { 0x03a6, {1, {0x03c6}}}, - { 0x03a7, {1, {0x03c7}}}, - { 0x03a8, {1, {0x03c8}}}, - { 0x03a9, {1, {0x03c9}}}, - { 0x03aa, {1, {0x03ca}}}, - { 0x03ab, {1, {0x03cb}}}, - { 0x03b0, {3, {0x03c5, 0x0308, 0x0301}}}, - { 0x03c2, {1, {0x03c3}}}, - { 0x03d0, {1, {0x03b2}}}, - { 0x03d1, {1, {0x03b8}}}, - { 0x03d5, {1, {0x03c6}}}, - { 0x03d6, {1, {0x03c0}}}, - { 0x03d8, {1, {0x03d9}}}, - { 0x03da, {1, {0x03db}}}, - { 0x03dc, {1, {0x03dd}}}, - { 0x03de, {1, {0x03df}}}, - { 0x03e0, {1, {0x03e1}}}, - { 0x03e2, {1, {0x03e3}}}, - { 0x03e4, {1, {0x03e5}}}, - { 0x03e6, {1, {0x03e7}}}, - { 0x03e8, {1, {0x03e9}}}, - { 0x03ea, {1, {0x03eb}}}, - { 0x03ec, {1, {0x03ed}}}, - { 0x03ee, {1, {0x03ef}}}, - { 0x03f0, {1, {0x03ba}}}, - { 0x03f1, {1, {0x03c1}}}, - { 0x03f4, {1, {0x03b8}}}, - { 0x03f5, {1, {0x03b5}}}, - { 0x03f7, {1, {0x03f8}}}, - { 0x03f9, {1, {0x03f2}}}, - { 0x03fa, {1, {0x03fb}}}, - { 0x0400, {1, {0x0450}}}, - { 0x0401, {1, {0x0451}}}, - { 0x0402, {1, {0x0452}}}, - { 0x0403, {1, {0x0453}}}, - { 0x0404, {1, {0x0454}}}, - { 0x0405, {1, {0x0455}}}, - { 0x0406, {1, {0x0456}}}, - { 0x0407, {1, {0x0457}}}, - { 0x0408, {1, {0x0458}}}, - { 0x0409, {1, {0x0459}}}, - { 0x040a, {1, {0x045a}}}, - { 0x040b, {1, {0x045b}}}, - { 0x040c, {1, {0x045c}}}, - { 0x040d, {1, {0x045d}}}, - { 0x040e, {1, {0x045e}}}, - { 0x040f, {1, {0x045f}}}, - { 0x0410, {1, {0x0430}}}, - { 0x0411, {1, {0x0431}}}, - { 0x0412, {1, {0x0432}}}, - { 0x0413, {1, {0x0433}}}, - { 0x0414, {1, {0x0434}}}, - { 0x0415, {1, {0x0435}}}, - { 0x0416, {1, {0x0436}}}, - { 0x0417, {1, {0x0437}}}, - { 0x0418, {1, {0x0438}}}, - { 0x0419, {1, {0x0439}}}, - { 0x041a, {1, {0x043a}}}, - { 0x041b, {1, {0x043b}}}, - { 0x041c, {1, {0x043c}}}, - { 0x041d, {1, {0x043d}}}, - { 0x041e, {1, {0x043e}}}, - { 0x041f, {1, {0x043f}}}, - { 0x0420, {1, {0x0440}}}, - { 0x0421, {1, {0x0441}}}, - { 0x0422, {1, {0x0442}}}, - { 0x0423, {1, {0x0443}}}, - { 0x0424, {1, {0x0444}}}, - { 0x0425, {1, {0x0445}}}, - { 0x0426, {1, {0x0446}}}, 
- { 0x0427, {1, {0x0447}}}, - { 0x0428, {1, {0x0448}}}, - { 0x0429, {1, {0x0449}}}, - { 0x042a, {1, {0x044a}}}, - { 0x042b, {1, {0x044b}}}, - { 0x042c, {1, {0x044c}}}, - { 0x042d, {1, {0x044d}}}, - { 0x042e, {1, {0x044e}}}, - { 0x042f, {1, {0x044f}}}, - { 0x0460, {1, {0x0461}}}, - { 0x0462, {1, {0x0463}}}, - { 0x0464, {1, {0x0465}}}, - { 0x0466, {1, {0x0467}}}, - { 0x0468, {1, {0x0469}}}, - { 0x046a, {1, {0x046b}}}, - { 0x046c, {1, {0x046d}}}, - { 0x046e, {1, {0x046f}}}, - { 0x0470, {1, {0x0471}}}, - { 0x0472, {1, {0x0473}}}, - { 0x0474, {1, {0x0475}}}, - { 0x0476, {1, {0x0477}}}, - { 0x0478, {1, {0x0479}}}, - { 0x047a, {1, {0x047b}}}, - { 0x047c, {1, {0x047d}}}, - { 0x047e, {1, {0x047f}}}, - { 0x0480, {1, {0x0481}}}, - { 0x048a, {1, {0x048b}}}, - { 0x048c, {1, {0x048d}}}, - { 0x048e, {1, {0x048f}}}, - { 0x0490, {1, {0x0491}}}, - { 0x0492, {1, {0x0493}}}, - { 0x0494, {1, {0x0495}}}, - { 0x0496, {1, {0x0497}}}, - { 0x0498, {1, {0x0499}}}, - { 0x049a, {1, {0x049b}}}, - { 0x049c, {1, {0x049d}}}, - { 0x049e, {1, {0x049f}}}, - { 0x04a0, {1, {0x04a1}}}, - { 0x04a2, {1, {0x04a3}}}, - { 0x04a4, {1, {0x04a5}}}, - { 0x04a6, {1, {0x04a7}}}, - { 0x04a8, {1, {0x04a9}}}, - { 0x04aa, {1, {0x04ab}}}, - { 0x04ac, {1, {0x04ad}}}, - { 0x04ae, {1, {0x04af}}}, - { 0x04b0, {1, {0x04b1}}}, - { 0x04b2, {1, {0x04b3}}}, - { 0x04b4, {1, {0x04b5}}}, - { 0x04b6, {1, {0x04b7}}}, - { 0x04b8, {1, {0x04b9}}}, - { 0x04ba, {1, {0x04bb}}}, - { 0x04bc, {1, {0x04bd}}}, - { 0x04be, {1, {0x04bf}}}, - { 0x04c1, {1, {0x04c2}}}, - { 0x04c3, {1, {0x04c4}}}, - { 0x04c5, {1, {0x04c6}}}, - { 0x04c7, {1, {0x04c8}}}, - { 0x04c9, {1, {0x04ca}}}, - { 0x04cb, {1, {0x04cc}}}, - { 0x04cd, {1, {0x04ce}}}, - { 0x04d0, {1, {0x04d1}}}, - { 0x04d2, {1, {0x04d3}}}, - { 0x04d4, {1, {0x04d5}}}, - { 0x04d6, {1, {0x04d7}}}, - { 0x04d8, {1, {0x04d9}}}, - { 0x04da, {1, {0x04db}}}, - { 0x04dc, {1, {0x04dd}}}, - { 0x04de, {1, {0x04df}}}, - { 0x04e0, {1, {0x04e1}}}, - { 0x04e2, {1, {0x04e3}}}, - { 0x04e4, {1, {0x04e5}}}, - { 0x04e6, 
{1, {0x04e7}}}, - { 0x04e8, {1, {0x04e9}}}, - { 0x04ea, {1, {0x04eb}}}, - { 0x04ec, {1, {0x04ed}}}, - { 0x04ee, {1, {0x04ef}}}, - { 0x04f0, {1, {0x04f1}}}, - { 0x04f2, {1, {0x04f3}}}, - { 0x04f4, {1, {0x04f5}}}, - { 0x04f6, {1, {0x04f7}}}, - { 0x04f8, {1, {0x04f9}}}, - { 0x0500, {1, {0x0501}}}, - { 0x0502, {1, {0x0503}}}, - { 0x0504, {1, {0x0505}}}, - { 0x0506, {1, {0x0507}}}, - { 0x0508, {1, {0x0509}}}, - { 0x050a, {1, {0x050b}}}, - { 0x050c, {1, {0x050d}}}, - { 0x050e, {1, {0x050f}}}, - { 0x0531, {1, {0x0561}}}, - { 0x0532, {1, {0x0562}}}, - { 0x0533, {1, {0x0563}}}, - { 0x0534, {1, {0x0564}}}, - { 0x0535, {1, {0x0565}}}, - { 0x0536, {1, {0x0566}}}, - { 0x0537, {1, {0x0567}}}, - { 0x0538, {1, {0x0568}}}, - { 0x0539, {1, {0x0569}}}, - { 0x053a, {1, {0x056a}}}, - { 0x053b, {1, {0x056b}}}, - { 0x053c, {1, {0x056c}}}, - { 0x053d, {1, {0x056d}}}, - { 0x053e, {1, {0x056e}}}, - { 0x053f, {1, {0x056f}}}, - { 0x0540, {1, {0x0570}}}, - { 0x0541, {1, {0x0571}}}, - { 0x0542, {1, {0x0572}}}, - { 0x0543, {1, {0x0573}}}, - { 0x0544, {1, {0x0574}}}, - { 0x0545, {1, {0x0575}}}, - { 0x0546, {1, {0x0576}}}, - { 0x0547, {1, {0x0577}}}, - { 0x0548, {1, {0x0578}}}, - { 0x0549, {1, {0x0579}}}, - { 0x054a, {1, {0x057a}}}, - { 0x054b, {1, {0x057b}}}, - { 0x054c, {1, {0x057c}}}, - { 0x054d, {1, {0x057d}}}, - { 0x054e, {1, {0x057e}}}, - { 0x054f, {1, {0x057f}}}, - { 0x0550, {1, {0x0580}}}, - { 0x0551, {1, {0x0581}}}, - { 0x0552, {1, {0x0582}}}, - { 0x0553, {1, {0x0583}}}, - { 0x0554, {1, {0x0584}}}, - { 0x0555, {1, {0x0585}}}, - { 0x0556, {1, {0x0586}}}, - { 0x0587, {2, {0x0565, 0x0582}}}, - { 0x10a0, {1, {0x2d00}}}, - { 0x10a1, {1, {0x2d01}}}, - { 0x10a2, {1, {0x2d02}}}, - { 0x10a3, {1, {0x2d03}}}, - { 0x10a4, {1, {0x2d04}}}, - { 0x10a5, {1, {0x2d05}}}, - { 0x10a6, {1, {0x2d06}}}, - { 0x10a7, {1, {0x2d07}}}, - { 0x10a8, {1, {0x2d08}}}, - { 0x10a9, {1, {0x2d09}}}, - { 0x10aa, {1, {0x2d0a}}}, - { 0x10ab, {1, {0x2d0b}}}, - { 0x10ac, {1, {0x2d0c}}}, - { 0x10ad, {1, {0x2d0d}}}, - { 0x10ae, {1, 
{0x2d0e}}}, - { 0x10af, {1, {0x2d0f}}}, - { 0x10b0, {1, {0x2d10}}}, - { 0x10b1, {1, {0x2d11}}}, - { 0x10b2, {1, {0x2d12}}}, - { 0x10b3, {1, {0x2d13}}}, - { 0x10b4, {1, {0x2d14}}}, - { 0x10b5, {1, {0x2d15}}}, - { 0x10b6, {1, {0x2d16}}}, - { 0x10b7, {1, {0x2d17}}}, - { 0x10b8, {1, {0x2d18}}}, - { 0x10b9, {1, {0x2d19}}}, - { 0x10ba, {1, {0x2d1a}}}, - { 0x10bb, {1, {0x2d1b}}}, - { 0x10bc, {1, {0x2d1c}}}, - { 0x10bd, {1, {0x2d1d}}}, - { 0x10be, {1, {0x2d1e}}}, - { 0x10bf, {1, {0x2d1f}}}, - { 0x10c0, {1, {0x2d20}}}, - { 0x10c1, {1, {0x2d21}}}, - { 0x10c2, {1, {0x2d22}}}, - { 0x10c3, {1, {0x2d23}}}, - { 0x10c4, {1, {0x2d24}}}, - { 0x10c5, {1, {0x2d25}}}, - { 0x1e00, {1, {0x1e01}}}, - { 0x1e02, {1, {0x1e03}}}, - { 0x1e04, {1, {0x1e05}}}, - { 0x1e06, {1, {0x1e07}}}, - { 0x1e08, {1, {0x1e09}}}, - { 0x1e0a, {1, {0x1e0b}}}, - { 0x1e0c, {1, {0x1e0d}}}, - { 0x1e0e, {1, {0x1e0f}}}, - { 0x1e10, {1, {0x1e11}}}, - { 0x1e12, {1, {0x1e13}}}, - { 0x1e14, {1, {0x1e15}}}, - { 0x1e16, {1, {0x1e17}}}, - { 0x1e18, {1, {0x1e19}}}, - { 0x1e1a, {1, {0x1e1b}}}, - { 0x1e1c, {1, {0x1e1d}}}, - { 0x1e1e, {1, {0x1e1f}}}, - { 0x1e20, {1, {0x1e21}}}, - { 0x1e22, {1, {0x1e23}}}, - { 0x1e24, {1, {0x1e25}}}, - { 0x1e26, {1, {0x1e27}}}, - { 0x1e28, {1, {0x1e29}}}, - { 0x1e2a, {1, {0x1e2b}}}, - { 0x1e2c, {1, {0x1e2d}}}, - { 0x1e2e, {1, {0x1e2f}}}, - { 0x1e30, {1, {0x1e31}}}, - { 0x1e32, {1, {0x1e33}}}, - { 0x1e34, {1, {0x1e35}}}, - { 0x1e36, {1, {0x1e37}}}, - { 0x1e38, {1, {0x1e39}}}, - { 0x1e3a, {1, {0x1e3b}}}, - { 0x1e3c, {1, {0x1e3d}}}, - { 0x1e3e, {1, {0x1e3f}}}, - { 0x1e40, {1, {0x1e41}}}, - { 0x1e42, {1, {0x1e43}}}, - { 0x1e44, {1, {0x1e45}}}, - { 0x1e46, {1, {0x1e47}}}, - { 0x1e48, {1, {0x1e49}}}, - { 0x1e4a, {1, {0x1e4b}}}, - { 0x1e4c, {1, {0x1e4d}}}, - { 0x1e4e, {1, {0x1e4f}}}, - { 0x1e50, {1, {0x1e51}}}, - { 0x1e52, {1, {0x1e53}}}, - { 0x1e54, {1, {0x1e55}}}, - { 0x1e56, {1, {0x1e57}}}, - { 0x1e58, {1, {0x1e59}}}, - { 0x1e5a, {1, {0x1e5b}}}, - { 0x1e5c, {1, {0x1e5d}}}, - { 0x1e5e, {1, {0x1e5f}}}, 
- { 0x1e60, {1, {0x1e61}}}, - { 0x1e62, {1, {0x1e63}}}, - { 0x1e64, {1, {0x1e65}}}, - { 0x1e66, {1, {0x1e67}}}, - { 0x1e68, {1, {0x1e69}}}, - { 0x1e6a, {1, {0x1e6b}}}, - { 0x1e6c, {1, {0x1e6d}}}, - { 0x1e6e, {1, {0x1e6f}}}, - { 0x1e70, {1, {0x1e71}}}, - { 0x1e72, {1, {0x1e73}}}, - { 0x1e74, {1, {0x1e75}}}, - { 0x1e76, {1, {0x1e77}}}, - { 0x1e78, {1, {0x1e79}}}, - { 0x1e7a, {1, {0x1e7b}}}, - { 0x1e7c, {1, {0x1e7d}}}, - { 0x1e7e, {1, {0x1e7f}}}, - { 0x1e80, {1, {0x1e81}}}, - { 0x1e82, {1, {0x1e83}}}, - { 0x1e84, {1, {0x1e85}}}, - { 0x1e86, {1, {0x1e87}}}, - { 0x1e88, {1, {0x1e89}}}, - { 0x1e8a, {1, {0x1e8b}}}, - { 0x1e8c, {1, {0x1e8d}}}, - { 0x1e8e, {1, {0x1e8f}}}, - { 0x1e90, {1, {0x1e91}}}, - { 0x1e92, {1, {0x1e93}}}, - { 0x1e94, {1, {0x1e95}}}, - { 0x1e96, {2, {0x0068, 0x0331}}}, - { 0x1e97, {2, {0x0074, 0x0308}}}, - { 0x1e98, {2, {0x0077, 0x030a}}}, - { 0x1e99, {2, {0x0079, 0x030a}}}, - { 0x1e9a, {2, {0x0061, 0x02be}}}, - { 0x1e9b, {1, {0x1e61}}}, - { 0x1ea0, {1, {0x1ea1}}}, - { 0x1ea2, {1, {0x1ea3}}}, - { 0x1ea4, {1, {0x1ea5}}}, - { 0x1ea6, {1, {0x1ea7}}}, - { 0x1ea8, {1, {0x1ea9}}}, - { 0x1eaa, {1, {0x1eab}}}, - { 0x1eac, {1, {0x1ead}}}, - { 0x1eae, {1, {0x1eaf}}}, - { 0x1eb0, {1, {0x1eb1}}}, - { 0x1eb2, {1, {0x1eb3}}}, - { 0x1eb4, {1, {0x1eb5}}}, - { 0x1eb6, {1, {0x1eb7}}}, - { 0x1eb8, {1, {0x1eb9}}}, - { 0x1eba, {1, {0x1ebb}}}, - { 0x1ebc, {1, {0x1ebd}}}, - { 0x1ebe, {1, {0x1ebf}}}, - { 0x1ec0, {1, {0x1ec1}}}, - { 0x1ec2, {1, {0x1ec3}}}, - { 0x1ec4, {1, {0x1ec5}}}, - { 0x1ec6, {1, {0x1ec7}}}, - { 0x1ec8, {1, {0x1ec9}}}, - { 0x1eca, {1, {0x1ecb}}}, - { 0x1ecc, {1, {0x1ecd}}}, - { 0x1ece, {1, {0x1ecf}}}, - { 0x1ed0, {1, {0x1ed1}}}, - { 0x1ed2, {1, {0x1ed3}}}, - { 0x1ed4, {1, {0x1ed5}}}, - { 0x1ed6, {1, {0x1ed7}}}, - { 0x1ed8, {1, {0x1ed9}}}, - { 0x1eda, {1, {0x1edb}}}, - { 0x1edc, {1, {0x1edd}}}, - { 0x1ede, {1, {0x1edf}}}, - { 0x1ee0, {1, {0x1ee1}}}, - { 0x1ee2, {1, {0x1ee3}}}, - { 0x1ee4, {1, {0x1ee5}}}, - { 0x1ee6, {1, {0x1ee7}}}, - { 0x1ee8, {1, {0x1ee9}}}, 
- { 0x1eea, {1, {0x1eeb}}}, - { 0x1eec, {1, {0x1eed}}}, - { 0x1eee, {1, {0x1eef}}}, - { 0x1ef0, {1, {0x1ef1}}}, - { 0x1ef2, {1, {0x1ef3}}}, - { 0x1ef4, {1, {0x1ef5}}}, - { 0x1ef6, {1, {0x1ef7}}}, - { 0x1ef8, {1, {0x1ef9}}}, - { 0x1f08, {1, {0x1f00}}}, - { 0x1f09, {1, {0x1f01}}}, - { 0x1f0a, {1, {0x1f02}}}, - { 0x1f0b, {1, {0x1f03}}}, - { 0x1f0c, {1, {0x1f04}}}, - { 0x1f0d, {1, {0x1f05}}}, - { 0x1f0e, {1, {0x1f06}}}, - { 0x1f0f, {1, {0x1f07}}}, - { 0x1f18, {1, {0x1f10}}}, - { 0x1f19, {1, {0x1f11}}}, - { 0x1f1a, {1, {0x1f12}}}, - { 0x1f1b, {1, {0x1f13}}}, - { 0x1f1c, {1, {0x1f14}}}, - { 0x1f1d, {1, {0x1f15}}}, - { 0x1f28, {1, {0x1f20}}}, - { 0x1f29, {1, {0x1f21}}}, - { 0x1f2a, {1, {0x1f22}}}, - { 0x1f2b, {1, {0x1f23}}}, - { 0x1f2c, {1, {0x1f24}}}, - { 0x1f2d, {1, {0x1f25}}}, - { 0x1f2e, {1, {0x1f26}}}, - { 0x1f2f, {1, {0x1f27}}}, - { 0x1f38, {1, {0x1f30}}}, - { 0x1f39, {1, {0x1f31}}}, - { 0x1f3a, {1, {0x1f32}}}, - { 0x1f3b, {1, {0x1f33}}}, - { 0x1f3c, {1, {0x1f34}}}, - { 0x1f3d, {1, {0x1f35}}}, - { 0x1f3e, {1, {0x1f36}}}, - { 0x1f3f, {1, {0x1f37}}}, - { 0x1f48, {1, {0x1f40}}}, - { 0x1f49, {1, {0x1f41}}}, - { 0x1f4a, {1, {0x1f42}}}, - { 0x1f4b, {1, {0x1f43}}}, - { 0x1f4c, {1, {0x1f44}}}, - { 0x1f4d, {1, {0x1f45}}}, - { 0x1f50, {2, {0x03c5, 0x0313}}}, - { 0x1f52, {3, {0x03c5, 0x0313, 0x0300}}}, - { 0x1f54, {3, {0x03c5, 0x0313, 0x0301}}}, - { 0x1f56, {3, {0x03c5, 0x0313, 0x0342}}}, - { 0x1f59, {1, {0x1f51}}}, - { 0x1f5b, {1, {0x1f53}}}, - { 0x1f5d, {1, {0x1f55}}}, - { 0x1f5f, {1, {0x1f57}}}, - { 0x1f68, {1, {0x1f60}}}, - { 0x1f69, {1, {0x1f61}}}, - { 0x1f6a, {1, {0x1f62}}}, - { 0x1f6b, {1, {0x1f63}}}, - { 0x1f6c, {1, {0x1f64}}}, - { 0x1f6d, {1, {0x1f65}}}, - { 0x1f6e, {1, {0x1f66}}}, - { 0x1f6f, {1, {0x1f67}}}, - { 0x1f80, {2, {0x1f00, 0x03b9}}}, - { 0x1f81, {2, {0x1f01, 0x03b9}}}, - { 0x1f82, {2, {0x1f02, 0x03b9}}}, - { 0x1f83, {2, {0x1f03, 0x03b9}}}, - { 0x1f84, {2, {0x1f04, 0x03b9}}}, - { 0x1f85, {2, {0x1f05, 0x03b9}}}, - { 0x1f86, {2, {0x1f06, 0x03b9}}}, - { 0x1f87, 
{2, {0x1f07, 0x03b9}}}, - { 0x1f88, {2, {0x1f00, 0x03b9}}}, - { 0x1f89, {2, {0x1f01, 0x03b9}}}, - { 0x1f8a, {2, {0x1f02, 0x03b9}}}, - { 0x1f8b, {2, {0x1f03, 0x03b9}}}, - { 0x1f8c, {2, {0x1f04, 0x03b9}}}, - { 0x1f8d, {2, {0x1f05, 0x03b9}}}, - { 0x1f8e, {2, {0x1f06, 0x03b9}}}, - { 0x1f8f, {2, {0x1f07, 0x03b9}}}, - { 0x1f90, {2, {0x1f20, 0x03b9}}}, - { 0x1f91, {2, {0x1f21, 0x03b9}}}, - { 0x1f92, {2, {0x1f22, 0x03b9}}}, - { 0x1f93, {2, {0x1f23, 0x03b9}}}, - { 0x1f94, {2, {0x1f24, 0x03b9}}}, - { 0x1f95, {2, {0x1f25, 0x03b9}}}, - { 0x1f96, {2, {0x1f26, 0x03b9}}}, - { 0x1f97, {2, {0x1f27, 0x03b9}}}, - { 0x1f98, {2, {0x1f20, 0x03b9}}}, - { 0x1f99, {2, {0x1f21, 0x03b9}}}, - { 0x1f9a, {2, {0x1f22, 0x03b9}}}, - { 0x1f9b, {2, {0x1f23, 0x03b9}}}, - { 0x1f9c, {2, {0x1f24, 0x03b9}}}, - { 0x1f9d, {2, {0x1f25, 0x03b9}}}, - { 0x1f9e, {2, {0x1f26, 0x03b9}}}, - { 0x1f9f, {2, {0x1f27, 0x03b9}}}, - { 0x1fa0, {2, {0x1f60, 0x03b9}}}, - { 0x1fa1, {2, {0x1f61, 0x03b9}}}, - { 0x1fa2, {2, {0x1f62, 0x03b9}}}, - { 0x1fa3, {2, {0x1f63, 0x03b9}}}, - { 0x1fa4, {2, {0x1f64, 0x03b9}}}, - { 0x1fa5, {2, {0x1f65, 0x03b9}}}, - { 0x1fa6, {2, {0x1f66, 0x03b9}}}, - { 0x1fa7, {2, {0x1f67, 0x03b9}}}, - { 0x1fa8, {2, {0x1f60, 0x03b9}}}, - { 0x1fa9, {2, {0x1f61, 0x03b9}}}, - { 0x1faa, {2, {0x1f62, 0x03b9}}}, - { 0x1fab, {2, {0x1f63, 0x03b9}}}, - { 0x1fac, {2, {0x1f64, 0x03b9}}}, - { 0x1fad, {2, {0x1f65, 0x03b9}}}, - { 0x1fae, {2, {0x1f66, 0x03b9}}}, - { 0x1faf, {2, {0x1f67, 0x03b9}}}, - { 0x1fb2, {2, {0x1f70, 0x03b9}}}, - { 0x1fb3, {2, {0x03b1, 0x03b9}}}, - { 0x1fb4, {2, {0x03ac, 0x03b9}}}, - { 0x1fb6, {2, {0x03b1, 0x0342}}}, - { 0x1fb7, {3, {0x03b1, 0x0342, 0x03b9}}}, - { 0x1fb8, {1, {0x1fb0}}}, - { 0x1fb9, {1, {0x1fb1}}}, - { 0x1fba, {1, {0x1f70}}}, - { 0x1fbb, {1, {0x1f71}}}, - { 0x1fbc, {2, {0x03b1, 0x03b9}}}, - { 0x1fbe, {1, {0x03b9}}}, - { 0x1fc2, {2, {0x1f74, 0x03b9}}}, - { 0x1fc3, {2, {0x03b7, 0x03b9}}}, - { 0x1fc4, {2, {0x03ae, 0x03b9}}}, - { 0x1fc6, {2, {0x03b7, 0x0342}}}, - { 0x1fc7, {3, {0x03b7, 
0x0342, 0x03b9}}}, - { 0x1fc8, {1, {0x1f72}}}, - { 0x1fc9, {1, {0x1f73}}}, - { 0x1fca, {1, {0x1f74}}}, - { 0x1fcb, {1, {0x1f75}}}, - { 0x1fcc, {2, {0x03b7, 0x03b9}}}, - { 0x1fd2, {3, {0x03b9, 0x0308, 0x0300}}}, - { 0x1fd3, {3, {0x03b9, 0x0308, 0x0301}}}, - { 0x1fd6, {2, {0x03b9, 0x0342}}}, - { 0x1fd7, {3, {0x03b9, 0x0308, 0x0342}}}, - { 0x1fd8, {1, {0x1fd0}}}, - { 0x1fd9, {1, {0x1fd1}}}, - { 0x1fda, {1, {0x1f76}}}, - { 0x1fdb, {1, {0x1f77}}}, - { 0x1fe2, {3, {0x03c5, 0x0308, 0x0300}}}, - { 0x1fe3, {3, {0x03c5, 0x0308, 0x0301}}}, - { 0x1fe4, {2, {0x03c1, 0x0313}}}, - { 0x1fe6, {2, {0x03c5, 0x0342}}}, - { 0x1fe7, {3, {0x03c5, 0x0308, 0x0342}}}, - { 0x1fe8, {1, {0x1fe0}}}, - { 0x1fe9, {1, {0x1fe1}}}, - { 0x1fea, {1, {0x1f7a}}}, - { 0x1feb, {1, {0x1f7b}}}, - { 0x1fec, {1, {0x1fe5}}}, - { 0x1ff2, {2, {0x1f7c, 0x03b9}}}, - { 0x1ff3, {2, {0x03c9, 0x03b9}}}, - { 0x1ff4, {2, {0x03ce, 0x03b9}}}, - { 0x1ff6, {2, {0x03c9, 0x0342}}}, - { 0x1ff7, {3, {0x03c9, 0x0342, 0x03b9}}}, - { 0x1ff8, {1, {0x1f78}}}, - { 0x1ff9, {1, {0x1f79}}}, - { 0x1ffa, {1, {0x1f7c}}}, - { 0x1ffb, {1, {0x1f7d}}}, - { 0x1ffc, {2, {0x03c9, 0x03b9}}}, - { 0x2126, {1, {0x03c9}}}, - { 0x212a, {1, {0x006b}}}, - { 0x212b, {1, {0x00e5}}}, - { 0x2160, {1, {0x2170}}}, - { 0x2161, {1, {0x2171}}}, - { 0x2162, {1, {0x2172}}}, - { 0x2163, {1, {0x2173}}}, - { 0x2164, {1, {0x2174}}}, - { 0x2165, {1, {0x2175}}}, - { 0x2166, {1, {0x2176}}}, - { 0x2167, {1, {0x2177}}}, - { 0x2168, {1, {0x2178}}}, - { 0x2169, {1, {0x2179}}}, - { 0x216a, {1, {0x217a}}}, - { 0x216b, {1, {0x217b}}}, - { 0x216c, {1, {0x217c}}}, - { 0x216d, {1, {0x217d}}}, - { 0x216e, {1, {0x217e}}}, - { 0x216f, {1, {0x217f}}}, - { 0x24b6, {1, {0x24d0}}}, - { 0x24b7, {1, {0x24d1}}}, - { 0x24b8, {1, {0x24d2}}}, - { 0x24b9, {1, {0x24d3}}}, - { 0x24ba, {1, {0x24d4}}}, - { 0x24bb, {1, {0x24d5}}}, - { 0x24bc, {1, {0x24d6}}}, - { 0x24bd, {1, {0x24d7}}}, - { 0x24be, {1, {0x24d8}}}, - { 0x24bf, {1, {0x24d9}}}, - { 0x24c0, {1, {0x24da}}}, - { 0x24c1, {1, {0x24db}}}, - { 
0x24c2, {1, {0x24dc}}}, - { 0x24c3, {1, {0x24dd}}}, - { 0x24c4, {1, {0x24de}}}, - { 0x24c5, {1, {0x24df}}}, - { 0x24c6, {1, {0x24e0}}}, - { 0x24c7, {1, {0x24e1}}}, - { 0x24c8, {1, {0x24e2}}}, - { 0x24c9, {1, {0x24e3}}}, - { 0x24ca, {1, {0x24e4}}}, - { 0x24cb, {1, {0x24e5}}}, - { 0x24cc, {1, {0x24e6}}}, - { 0x24cd, {1, {0x24e7}}}, - { 0x24ce, {1, {0x24e8}}}, - { 0x24cf, {1, {0x24e9}}}, - { 0x2c00, {1, {0x2c30}}}, - { 0x2c01, {1, {0x2c31}}}, - { 0x2c02, {1, {0x2c32}}}, - { 0x2c03, {1, {0x2c33}}}, - { 0x2c04, {1, {0x2c34}}}, - { 0x2c05, {1, {0x2c35}}}, - { 0x2c06, {1, {0x2c36}}}, - { 0x2c07, {1, {0x2c37}}}, - { 0x2c08, {1, {0x2c38}}}, - { 0x2c09, {1, {0x2c39}}}, - { 0x2c0a, {1, {0x2c3a}}}, - { 0x2c0b, {1, {0x2c3b}}}, - { 0x2c0c, {1, {0x2c3c}}}, - { 0x2c0d, {1, {0x2c3d}}}, - { 0x2c0e, {1, {0x2c3e}}}, - { 0x2c0f, {1, {0x2c3f}}}, - { 0x2c10, {1, {0x2c40}}}, - { 0x2c11, {1, {0x2c41}}}, - { 0x2c12, {1, {0x2c42}}}, - { 0x2c13, {1, {0x2c43}}}, - { 0x2c14, {1, {0x2c44}}}, - { 0x2c15, {1, {0x2c45}}}, - { 0x2c16, {1, {0x2c46}}}, - { 0x2c17, {1, {0x2c47}}}, - { 0x2c18, {1, {0x2c48}}}, - { 0x2c19, {1, {0x2c49}}}, - { 0x2c1a, {1, {0x2c4a}}}, - { 0x2c1b, {1, {0x2c4b}}}, - { 0x2c1c, {1, {0x2c4c}}}, - { 0x2c1d, {1, {0x2c4d}}}, - { 0x2c1e, {1, {0x2c4e}}}, - { 0x2c1f, {1, {0x2c4f}}}, - { 0x2c20, {1, {0x2c50}}}, - { 0x2c21, {1, {0x2c51}}}, - { 0x2c22, {1, {0x2c52}}}, - { 0x2c23, {1, {0x2c53}}}, - { 0x2c24, {1, {0x2c54}}}, - { 0x2c25, {1, {0x2c55}}}, - { 0x2c26, {1, {0x2c56}}}, - { 0x2c27, {1, {0x2c57}}}, - { 0x2c28, {1, {0x2c58}}}, - { 0x2c29, {1, {0x2c59}}}, - { 0x2c2a, {1, {0x2c5a}}}, - { 0x2c2b, {1, {0x2c5b}}}, - { 0x2c2c, {1, {0x2c5c}}}, - { 0x2c2d, {1, {0x2c5d}}}, - { 0x2c2e, {1, {0x2c5e}}}, - { 0x2c80, {1, {0x2c81}}}, - { 0x2c82, {1, {0x2c83}}}, - { 0x2c84, {1, {0x2c85}}}, - { 0x2c86, {1, {0x2c87}}}, - { 0x2c88, {1, {0x2c89}}}, - { 0x2c8a, {1, {0x2c8b}}}, - { 0x2c8c, {1, {0x2c8d}}}, - { 0x2c8e, {1, {0x2c8f}}}, - { 0x2c90, {1, {0x2c91}}}, - { 0x2c92, {1, {0x2c93}}}, - { 0x2c94, {1, 
{0x2c95}}}, - { 0x2c96, {1, {0x2c97}}}, - { 0x2c98, {1, {0x2c99}}}, - { 0x2c9a, {1, {0x2c9b}}}, - { 0x2c9c, {1, {0x2c9d}}}, - { 0x2c9e, {1, {0x2c9f}}}, - { 0x2ca0, {1, {0x2ca1}}}, - { 0x2ca2, {1, {0x2ca3}}}, - { 0x2ca4, {1, {0x2ca5}}}, - { 0x2ca6, {1, {0x2ca7}}}, - { 0x2ca8, {1, {0x2ca9}}}, - { 0x2caa, {1, {0x2cab}}}, - { 0x2cac, {1, {0x2cad}}}, - { 0x2cae, {1, {0x2caf}}}, - { 0x2cb0, {1, {0x2cb1}}}, - { 0x2cb2, {1, {0x2cb3}}}, - { 0x2cb4, {1, {0x2cb5}}}, - { 0x2cb6, {1, {0x2cb7}}}, - { 0x2cb8, {1, {0x2cb9}}}, - { 0x2cba, {1, {0x2cbb}}}, - { 0x2cbc, {1, {0x2cbd}}}, - { 0x2cbe, {1, {0x2cbf}}}, - { 0x2cc0, {1, {0x2cc1}}}, - { 0x2cc2, {1, {0x2cc3}}}, - { 0x2cc4, {1, {0x2cc5}}}, - { 0x2cc6, {1, {0x2cc7}}}, - { 0x2cc8, {1, {0x2cc9}}}, - { 0x2cca, {1, {0x2ccb}}}, - { 0x2ccc, {1, {0x2ccd}}}, - { 0x2cce, {1, {0x2ccf}}}, - { 0x2cd0, {1, {0x2cd1}}}, - { 0x2cd2, {1, {0x2cd3}}}, - { 0x2cd4, {1, {0x2cd5}}}, - { 0x2cd6, {1, {0x2cd7}}}, - { 0x2cd8, {1, {0x2cd9}}}, - { 0x2cda, {1, {0x2cdb}}}, - { 0x2cdc, {1, {0x2cdd}}}, - { 0x2cde, {1, {0x2cdf}}}, - { 0x2ce0, {1, {0x2ce1}}}, - { 0x2ce2, {1, {0x2ce3}}}, - { 0xfb00, {2, {0x0066, 0x0066}}}, - { 0xfb01, {2, {0x0066, 0x0069}}}, - { 0xfb02, {2, {0x0066, 0x006c}}}, - { 0xfb03, {3, {0x0066, 0x0066, 0x0069}}}, - { 0xfb04, {3, {0x0066, 0x0066, 0x006c}}}, - { 0xfb05, {2, {0x0073, 0x0074}}}, - { 0xfb06, {2, {0x0073, 0x0074}}}, - { 0xfb13, {2, {0x0574, 0x0576}}}, - { 0xfb14, {2, {0x0574, 0x0565}}}, - { 0xfb15, {2, {0x0574, 0x056b}}}, - { 0xfb16, {2, {0x057e, 0x0576}}}, - { 0xfb17, {2, {0x0574, 0x056d}}}, - { 0xff21, {1, {0xff41}}}, - { 0xff22, {1, {0xff42}}}, - { 0xff23, {1, {0xff43}}}, - { 0xff24, {1, {0xff44}}}, - { 0xff25, {1, {0xff45}}}, - { 0xff26, {1, {0xff46}}}, - { 0xff27, {1, {0xff47}}}, - { 0xff28, {1, {0xff48}}}, - { 0xff29, {1, {0xff49}}}, - { 0xff2a, {1, {0xff4a}}}, - { 0xff2b, {1, {0xff4b}}}, - { 0xff2c, {1, {0xff4c}}}, - { 0xff2d, {1, {0xff4d}}}, - { 0xff2e, {1, {0xff4e}}}, - { 0xff2f, {1, {0xff4f}}}, - { 0xff30, {1, {0xff50}}}, 
- { 0xff31, {1, {0xff51}}}, - { 0xff32, {1, {0xff52}}}, - { 0xff33, {1, {0xff53}}}, - { 0xff34, {1, {0xff54}}}, - { 0xff35, {1, {0xff55}}}, - { 0xff36, {1, {0xff56}}}, - { 0xff37, {1, {0xff57}}}, - { 0xff38, {1, {0xff58}}}, - { 0xff39, {1, {0xff59}}}, - { 0xff3a, {1, {0xff5a}}}, - { 0x10400, {1, {0x10428}}}, - { 0x10401, {1, {0x10429}}}, - { 0x10402, {1, {0x1042a}}}, - { 0x10403, {1, {0x1042b}}}, - { 0x10404, {1, {0x1042c}}}, - { 0x10405, {1, {0x1042d}}}, - { 0x10406, {1, {0x1042e}}}, - { 0x10407, {1, {0x1042f}}}, - { 0x10408, {1, {0x10430}}}, - { 0x10409, {1, {0x10431}}}, - { 0x1040a, {1, {0x10432}}}, - { 0x1040b, {1, {0x10433}}}, - { 0x1040c, {1, {0x10434}}}, - { 0x1040d, {1, {0x10435}}}, - { 0x1040e, {1, {0x10436}}}, - { 0x1040f, {1, {0x10437}}}, - { 0x10410, {1, {0x10438}}}, - { 0x10411, {1, {0x10439}}}, - { 0x10412, {1, {0x1043a}}}, - { 0x10413, {1, {0x1043b}}}, - { 0x10414, {1, {0x1043c}}}, - { 0x10415, {1, {0x1043d}}}, - { 0x10416, {1, {0x1043e}}}, - { 0x10417, {1, {0x1043f}}}, - { 0x10418, {1, {0x10440}}}, - { 0x10419, {1, {0x10441}}}, - { 0x1041a, {1, {0x10442}}}, - { 0x1041b, {1, {0x10443}}}, - { 0x1041c, {1, {0x10444}}}, - { 0x1041d, {1, {0x10445}}}, - { 0x1041e, {1, {0x10446}}}, - { 0x1041f, {1, {0x10447}}}, - { 0x10420, {1, {0x10448}}}, - { 0x10421, {1, {0x10449}}}, - { 0x10422, {1, {0x1044a}}}, - { 0x10423, {1, {0x1044b}}}, - { 0x10424, {1, {0x1044c}}}, - { 0x10425, {1, {0x1044d}}}, - { 0x10426, {1, {0x1044e}}}, - { 0x10427, {1, {0x1044f}}} -}; - -static const CaseFold_11_Type CaseFold_Locale[] = { - { 0x0049, {1, {0x0069}}}, - { 0x0130, {2, {0x0069, 0x0307}}} -}; - -static const CaseUnfold_11_Type CaseUnfold_11[] = { - { 0x0061, {1, {0x0041 }}}, - { 0x0062, {1, {0x0042 }}}, - { 0x0063, {1, {0x0043 }}}, - { 0x0064, {1, {0x0044 }}}, - { 0x0065, {1, {0x0045 }}}, - { 0x0066, {1, {0x0046 }}}, - { 0x0067, {1, {0x0047 }}}, - { 0x0068, {1, {0x0048 }}}, - { 0x006a, {1, {0x004a }}}, - { 0x006b, {2, {0x212a, 0x004b }}}, - { 0x006c, {1, {0x004c }}}, - { 0x006d, 
{1, {0x004d }}}, - { 0x006e, {1, {0x004e }}}, - { 0x006f, {1, {0x004f }}}, - { 0x0070, {1, {0x0050 }}}, - { 0x0071, {1, {0x0051 }}}, - { 0x0072, {1, {0x0052 }}}, - { 0x0073, {2, {0x0053, 0x017f }}}, - { 0x0074, {1, {0x0054 }}}, - { 0x0075, {1, {0x0055 }}}, - { 0x0076, {1, {0x0056 }}}, - { 0x0077, {1, {0x0057 }}}, - { 0x0078, {1, {0x0058 }}}, - { 0x0079, {1, {0x0059 }}}, - { 0x007a, {1, {0x005a }}}, - { 0x00e0, {1, {0x00c0 }}}, - { 0x00e1, {1, {0x00c1 }}}, - { 0x00e2, {1, {0x00c2 }}}, - { 0x00e3, {1, {0x00c3 }}}, - { 0x00e4, {1, {0x00c4 }}}, - { 0x00e5, {2, {0x212b, 0x00c5 }}}, - { 0x00e6, {1, {0x00c6 }}}, - { 0x00e7, {1, {0x00c7 }}}, - { 0x00e8, {1, {0x00c8 }}}, - { 0x00e9, {1, {0x00c9 }}}, - { 0x00ea, {1, {0x00ca }}}, - { 0x00eb, {1, {0x00cb }}}, - { 0x00ec, {1, {0x00cc }}}, - { 0x00ed, {1, {0x00cd }}}, - { 0x00ee, {1, {0x00ce }}}, - { 0x00ef, {1, {0x00cf }}}, - { 0x00f0, {1, {0x00d0 }}}, - { 0x00f1, {1, {0x00d1 }}}, - { 0x00f2, {1, {0x00d2 }}}, - { 0x00f3, {1, {0x00d3 }}}, - { 0x00f4, {1, {0x00d4 }}}, - { 0x00f5, {1, {0x00d5 }}}, - { 0x00f6, {1, {0x00d6 }}}, - { 0x00f8, {1, {0x00d8 }}}, - { 0x00f9, {1, {0x00d9 }}}, - { 0x00fa, {1, {0x00da }}}, - { 0x00fb, {1, {0x00db }}}, - { 0x00fc, {1, {0x00dc }}}, - { 0x00fd, {1, {0x00dd }}}, - { 0x00fe, {1, {0x00de }}}, - { 0x00ff, {1, {0x0178 }}}, - { 0x0101, {1, {0x0100 }}}, - { 0x0103, {1, {0x0102 }}}, - { 0x0105, {1, {0x0104 }}}, - { 0x0107, {1, {0x0106 }}}, - { 0x0109, {1, {0x0108 }}}, - { 0x010b, {1, {0x010a }}}, - { 0x010d, {1, {0x010c }}}, - { 0x010f, {1, {0x010e }}}, - { 0x0111, {1, {0x0110 }}}, - { 0x0113, {1, {0x0112 }}}, - { 0x0115, {1, {0x0114 }}}, - { 0x0117, {1, {0x0116 }}}, - { 0x0119, {1, {0x0118 }}}, - { 0x011b, {1, {0x011a }}}, - { 0x011d, {1, {0x011c }}}, - { 0x011f, {1, {0x011e }}}, - { 0x0121, {1, {0x0120 }}}, - { 0x0123, {1, {0x0122 }}}, - { 0x0125, {1, {0x0124 }}}, - { 0x0127, {1, {0x0126 }}}, - { 0x0129, {1, {0x0128 }}}, - { 0x012b, {1, {0x012a }}}, - { 0x012d, {1, {0x012c }}}, - { 0x012f, {1, {0x012e 
}}}, - { 0x0133, {1, {0x0132 }}}, - { 0x0135, {1, {0x0134 }}}, - { 0x0137, {1, {0x0136 }}}, - { 0x013a, {1, {0x0139 }}}, - { 0x013c, {1, {0x013b }}}, - { 0x013e, {1, {0x013d }}}, - { 0x0140, {1, {0x013f }}}, - { 0x0142, {1, {0x0141 }}}, - { 0x0144, {1, {0x0143 }}}, - { 0x0146, {1, {0x0145 }}}, - { 0x0148, {1, {0x0147 }}}, - { 0x014b, {1, {0x014a }}}, - { 0x014d, {1, {0x014c }}}, - { 0x014f, {1, {0x014e }}}, - { 0x0151, {1, {0x0150 }}}, - { 0x0153, {1, {0x0152 }}}, - { 0x0155, {1, {0x0154 }}}, - { 0x0157, {1, {0x0156 }}}, - { 0x0159, {1, {0x0158 }}}, - { 0x015b, {1, {0x015a }}}, - { 0x015d, {1, {0x015c }}}, - { 0x015f, {1, {0x015e }}}, - { 0x0161, {1, {0x0160 }}}, - { 0x0163, {1, {0x0162 }}}, - { 0x0165, {1, {0x0164 }}}, - { 0x0167, {1, {0x0166 }}}, - { 0x0169, {1, {0x0168 }}}, - { 0x016b, {1, {0x016a }}}, - { 0x016d, {1, {0x016c }}}, - { 0x016f, {1, {0x016e }}}, - { 0x0171, {1, {0x0170 }}}, - { 0x0173, {1, {0x0172 }}}, - { 0x0175, {1, {0x0174 }}}, - { 0x0177, {1, {0x0176 }}}, - { 0x017a, {1, {0x0179 }}}, - { 0x017c, {1, {0x017b }}}, - { 0x017e, {1, {0x017d }}}, - { 0x0183, {1, {0x0182 }}}, - { 0x0185, {1, {0x0184 }}}, - { 0x0188, {1, {0x0187 }}}, - { 0x018c, {1, {0x018b }}}, - { 0x0192, {1, {0x0191 }}}, - { 0x0195, {1, {0x01f6 }}}, - { 0x0199, {1, {0x0198 }}}, - { 0x019a, {1, {0x023d }}}, - { 0x019e, {1, {0x0220 }}}, - { 0x01a1, {1, {0x01a0 }}}, - { 0x01a3, {1, {0x01a2 }}}, - { 0x01a5, {1, {0x01a4 }}}, - { 0x01a8, {1, {0x01a7 }}}, - { 0x01ad, {1, {0x01ac }}}, - { 0x01b0, {1, {0x01af }}}, - { 0x01b4, {1, {0x01b3 }}}, - { 0x01b6, {1, {0x01b5 }}}, - { 0x01b9, {1, {0x01b8 }}}, - { 0x01bd, {1, {0x01bc }}}, - { 0x01bf, {1, {0x01f7 }}}, - { 0x01c6, {2, {0x01c4, 0x01c5 }}}, - { 0x01c9, {2, {0x01c7, 0x01c8 }}}, - { 0x01cc, {2, {0x01ca, 0x01cb }}}, - { 0x01ce, {1, {0x01cd }}}, - { 0x01d0, {1, {0x01cf }}}, - { 0x01d2, {1, {0x01d1 }}}, - { 0x01d4, {1, {0x01d3 }}}, - { 0x01d6, {1, {0x01d5 }}}, - { 0x01d8, {1, {0x01d7 }}}, - { 0x01da, {1, {0x01d9 }}}, - { 0x01dc, {1, {0x01db 
}}}, - { 0x01dd, {1, {0x018e }}}, - { 0x01df, {1, {0x01de }}}, - { 0x01e1, {1, {0x01e0 }}}, - { 0x01e3, {1, {0x01e2 }}}, - { 0x01e5, {1, {0x01e4 }}}, - { 0x01e7, {1, {0x01e6 }}}, - { 0x01e9, {1, {0x01e8 }}}, - { 0x01eb, {1, {0x01ea }}}, - { 0x01ed, {1, {0x01ec }}}, - { 0x01ef, {1, {0x01ee }}}, - { 0x01f3, {2, {0x01f1, 0x01f2 }}}, - { 0x01f5, {1, {0x01f4 }}}, - { 0x01f9, {1, {0x01f8 }}}, - { 0x01fb, {1, {0x01fa }}}, - { 0x01fd, {1, {0x01fc }}}, - { 0x01ff, {1, {0x01fe }}}, - { 0x0201, {1, {0x0200 }}}, - { 0x0203, {1, {0x0202 }}}, - { 0x0205, {1, {0x0204 }}}, - { 0x0207, {1, {0x0206 }}}, - { 0x0209, {1, {0x0208 }}}, - { 0x020b, {1, {0x020a }}}, - { 0x020d, {1, {0x020c }}}, - { 0x020f, {1, {0x020e }}}, - { 0x0211, {1, {0x0210 }}}, - { 0x0213, {1, {0x0212 }}}, - { 0x0215, {1, {0x0214 }}}, - { 0x0217, {1, {0x0216 }}}, - { 0x0219, {1, {0x0218 }}}, - { 0x021b, {1, {0x021a }}}, - { 0x021d, {1, {0x021c }}}, - { 0x021f, {1, {0x021e }}}, - { 0x0223, {1, {0x0222 }}}, - { 0x0225, {1, {0x0224 }}}, - { 0x0227, {1, {0x0226 }}}, - { 0x0229, {1, {0x0228 }}}, - { 0x022b, {1, {0x022a }}}, - { 0x022d, {1, {0x022c }}}, - { 0x022f, {1, {0x022e }}}, - { 0x0231, {1, {0x0230 }}}, - { 0x0233, {1, {0x0232 }}}, - { 0x023c, {1, {0x023b }}}, - { 0x0253, {1, {0x0181 }}}, - { 0x0254, {1, {0x0186 }}}, - { 0x0256, {1, {0x0189 }}}, - { 0x0257, {1, {0x018a }}}, - { 0x0259, {1, {0x018f }}}, - { 0x025b, {1, {0x0190 }}}, - { 0x0260, {1, {0x0193 }}}, - { 0x0263, {1, {0x0194 }}}, - { 0x0268, {1, {0x0197 }}}, - { 0x0269, {1, {0x0196 }}}, - { 0x026f, {1, {0x019c }}}, - { 0x0272, {1, {0x019d }}}, - { 0x0275, {1, {0x019f }}}, - { 0x0280, {1, {0x01a6 }}}, - { 0x0283, {1, {0x01a9 }}}, - { 0x0288, {1, {0x01ae }}}, - { 0x028a, {1, {0x01b1 }}}, - { 0x028b, {1, {0x01b2 }}}, - { 0x0292, {1, {0x01b7 }}}, - { 0x0294, {1, {0x0241 }}}, - { 0x03ac, {1, {0x0386 }}}, - { 0x03ad, {1, {0x0388 }}}, - { 0x03ae, {1, {0x0389 }}}, - { 0x03af, {1, {0x038a }}}, - { 0x03b1, {1, {0x0391 }}}, - { 0x03b2, {2, {0x0392, 0x03d0 }}}, - { 
0x03b3, {1, {0x0393 }}}, - { 0x03b4, {1, {0x0394 }}}, - { 0x03b5, {2, {0x03f5, 0x0395 }}}, - { 0x03b6, {1, {0x0396 }}}, - { 0x03b7, {1, {0x0397 }}}, - { 0x03b8, {3, {0x03f4, 0x0398, 0x03d1 }}}, - { 0x03b9, {3, {0x1fbe, 0x0399, 0x0345 }}}, - { 0x03ba, {2, {0x03f0, 0x039a }}}, - { 0x03bb, {1, {0x039b }}}, - { 0x03bc, {2, {0x00b5, 0x039c }}}, - { 0x03bd, {1, {0x039d }}}, - { 0x03be, {1, {0x039e }}}, - { 0x03bf, {1, {0x039f }}}, - { 0x03c0, {2, {0x03a0, 0x03d6 }}}, - { 0x03c1, {2, {0x03f1, 0x03a1 }}}, - { 0x03c3, {2, {0x03a3, 0x03c2 }}}, - { 0x03c4, {1, {0x03a4 }}}, - { 0x03c5, {1, {0x03a5 }}}, - { 0x03c6, {2, {0x03a6, 0x03d5 }}}, - { 0x03c7, {1, {0x03a7 }}}, - { 0x03c8, {1, {0x03a8 }}}, - { 0x03c9, {2, {0x03a9, 0x2126 }}}, - { 0x03ca, {1, {0x03aa }}}, - { 0x03cb, {1, {0x03ab }}}, - { 0x03cc, {1, {0x038c }}}, - { 0x03cd, {1, {0x038e }}}, - { 0x03ce, {1, {0x038f }}}, - { 0x03d9, {1, {0x03d8 }}}, - { 0x03db, {1, {0x03da }}}, - { 0x03dd, {1, {0x03dc }}}, - { 0x03df, {1, {0x03de }}}, - { 0x03e1, {1, {0x03e0 }}}, - { 0x03e3, {1, {0x03e2 }}}, - { 0x03e5, {1, {0x03e4 }}}, - { 0x03e7, {1, {0x03e6 }}}, - { 0x03e9, {1, {0x03e8 }}}, - { 0x03eb, {1, {0x03ea }}}, - { 0x03ed, {1, {0x03ec }}}, - { 0x03ef, {1, {0x03ee }}}, - { 0x03f2, {1, {0x03f9 }}}, - { 0x03f8, {1, {0x03f7 }}}, - { 0x03fb, {1, {0x03fa }}}, - { 0x0430, {1, {0x0410 }}}, - { 0x0431, {1, {0x0411 }}}, - { 0x0432, {1, {0x0412 }}}, - { 0x0433, {1, {0x0413 }}}, - { 0x0434, {1, {0x0414 }}}, - { 0x0435, {1, {0x0415 }}}, - { 0x0436, {1, {0x0416 }}}, - { 0x0437, {1, {0x0417 }}}, - { 0x0438, {1, {0x0418 }}}, - { 0x0439, {1, {0x0419 }}}, - { 0x043a, {1, {0x041a }}}, - { 0x043b, {1, {0x041b }}}, - { 0x043c, {1, {0x041c }}}, - { 0x043d, {1, {0x041d }}}, - { 0x043e, {1, {0x041e }}}, - { 0x043f, {1, {0x041f }}}, - { 0x0440, {1, {0x0420 }}}, - { 0x0441, {1, {0x0421 }}}, - { 0x0442, {1, {0x0422 }}}, - { 0x0443, {1, {0x0423 }}}, - { 0x0444, {1, {0x0424 }}}, - { 0x0445, {1, {0x0425 }}}, - { 0x0446, {1, {0x0426 }}}, - { 0x0447, {1, 
{0x0427 }}}, - { 0x0448, {1, {0x0428 }}}, - { 0x0449, {1, {0x0429 }}}, - { 0x044a, {1, {0x042a }}}, - { 0x044b, {1, {0x042b }}}, - { 0x044c, {1, {0x042c }}}, - { 0x044d, {1, {0x042d }}}, - { 0x044e, {1, {0x042e }}}, - { 0x044f, {1, {0x042f }}}, - { 0x0450, {1, {0x0400 }}}, - { 0x0451, {1, {0x0401 }}}, - { 0x0452, {1, {0x0402 }}}, - { 0x0453, {1, {0x0403 }}}, - { 0x0454, {1, {0x0404 }}}, - { 0x0455, {1, {0x0405 }}}, - { 0x0456, {1, {0x0406 }}}, - { 0x0457, {1, {0x0407 }}}, - { 0x0458, {1, {0x0408 }}}, - { 0x0459, {1, {0x0409 }}}, - { 0x045a, {1, {0x040a }}}, - { 0x045b, {1, {0x040b }}}, - { 0x045c, {1, {0x040c }}}, - { 0x045d, {1, {0x040d }}}, - { 0x045e, {1, {0x040e }}}, - { 0x045f, {1, {0x040f }}}, - { 0x0461, {1, {0x0460 }}}, - { 0x0463, {1, {0x0462 }}}, - { 0x0465, {1, {0x0464 }}}, - { 0x0467, {1, {0x0466 }}}, - { 0x0469, {1, {0x0468 }}}, - { 0x046b, {1, {0x046a }}}, - { 0x046d, {1, {0x046c }}}, - { 0x046f, {1, {0x046e }}}, - { 0x0471, {1, {0x0470 }}}, - { 0x0473, {1, {0x0472 }}}, - { 0x0475, {1, {0x0474 }}}, - { 0x0477, {1, {0x0476 }}}, - { 0x0479, {1, {0x0478 }}}, - { 0x047b, {1, {0x047a }}}, - { 0x047d, {1, {0x047c }}}, - { 0x047f, {1, {0x047e }}}, - { 0x0481, {1, {0x0480 }}}, - { 0x048b, {1, {0x048a }}}, - { 0x048d, {1, {0x048c }}}, - { 0x048f, {1, {0x048e }}}, - { 0x0491, {1, {0x0490 }}}, - { 0x0493, {1, {0x0492 }}}, - { 0x0495, {1, {0x0494 }}}, - { 0x0497, {1, {0x0496 }}}, - { 0x0499, {1, {0x0498 }}}, - { 0x049b, {1, {0x049a }}}, - { 0x049d, {1, {0x049c }}}, - { 0x049f, {1, {0x049e }}}, - { 0x04a1, {1, {0x04a0 }}}, - { 0x04a3, {1, {0x04a2 }}}, - { 0x04a5, {1, {0x04a4 }}}, - { 0x04a7, {1, {0x04a6 }}}, - { 0x04a9, {1, {0x04a8 }}}, - { 0x04ab, {1, {0x04aa }}}, - { 0x04ad, {1, {0x04ac }}}, - { 0x04af, {1, {0x04ae }}}, - { 0x04b1, {1, {0x04b0 }}}, - { 0x04b3, {1, {0x04b2 }}}, - { 0x04b5, {1, {0x04b4 }}}, - { 0x04b7, {1, {0x04b6 }}}, - { 0x04b9, {1, {0x04b8 }}}, - { 0x04bb, {1, {0x04ba }}}, - { 0x04bd, {1, {0x04bc }}}, - { 0x04bf, {1, {0x04be }}}, - { 0x04c2, 
{1, {0x04c1 }}}, - { 0x04c4, {1, {0x04c3 }}}, - { 0x04c6, {1, {0x04c5 }}}, - { 0x04c8, {1, {0x04c7 }}}, - { 0x04ca, {1, {0x04c9 }}}, - { 0x04cc, {1, {0x04cb }}}, - { 0x04ce, {1, {0x04cd }}}, - { 0x04d1, {1, {0x04d0 }}}, - { 0x04d3, {1, {0x04d2 }}}, - { 0x04d5, {1, {0x04d4 }}}, - { 0x04d7, {1, {0x04d6 }}}, - { 0x04d9, {1, {0x04d8 }}}, - { 0x04db, {1, {0x04da }}}, - { 0x04dd, {1, {0x04dc }}}, - { 0x04df, {1, {0x04de }}}, - { 0x04e1, {1, {0x04e0 }}}, - { 0x04e3, {1, {0x04e2 }}}, - { 0x04e5, {1, {0x04e4 }}}, - { 0x04e7, {1, {0x04e6 }}}, - { 0x04e9, {1, {0x04e8 }}}, - { 0x04eb, {1, {0x04ea }}}, - { 0x04ed, {1, {0x04ec }}}, - { 0x04ef, {1, {0x04ee }}}, - { 0x04f1, {1, {0x04f0 }}}, - { 0x04f3, {1, {0x04f2 }}}, - { 0x04f5, {1, {0x04f4 }}}, - { 0x04f7, {1, {0x04f6 }}}, - { 0x04f9, {1, {0x04f8 }}}, - { 0x0501, {1, {0x0500 }}}, - { 0x0503, {1, {0x0502 }}}, - { 0x0505, {1, {0x0504 }}}, - { 0x0507, {1, {0x0506 }}}, - { 0x0509, {1, {0x0508 }}}, - { 0x050b, {1, {0x050a }}}, - { 0x050d, {1, {0x050c }}}, - { 0x050f, {1, {0x050e }}}, - { 0x0561, {1, {0x0531 }}}, - { 0x0562, {1, {0x0532 }}}, - { 0x0563, {1, {0x0533 }}}, - { 0x0564, {1, {0x0534 }}}, - { 0x0565, {1, {0x0535 }}}, - { 0x0566, {1, {0x0536 }}}, - { 0x0567, {1, {0x0537 }}}, - { 0x0568, {1, {0x0538 }}}, - { 0x0569, {1, {0x0539 }}}, - { 0x056a, {1, {0x053a }}}, - { 0x056b, {1, {0x053b }}}, - { 0x056c, {1, {0x053c }}}, - { 0x056d, {1, {0x053d }}}, - { 0x056e, {1, {0x053e }}}, - { 0x056f, {1, {0x053f }}}, - { 0x0570, {1, {0x0540 }}}, - { 0x0571, {1, {0x0541 }}}, - { 0x0572, {1, {0x0542 }}}, - { 0x0573, {1, {0x0543 }}}, - { 0x0574, {1, {0x0544 }}}, - { 0x0575, {1, {0x0545 }}}, - { 0x0576, {1, {0x0546 }}}, - { 0x0577, {1, {0x0547 }}}, - { 0x0578, {1, {0x0548 }}}, - { 0x0579, {1, {0x0549 }}}, - { 0x057a, {1, {0x054a }}}, - { 0x057b, {1, {0x054b }}}, - { 0x057c, {1, {0x054c }}}, - { 0x057d, {1, {0x054d }}}, - { 0x057e, {1, {0x054e }}}, - { 0x057f, {1, {0x054f }}}, - { 0x0580, {1, {0x0550 }}}, - { 0x0581, {1, {0x0551 }}}, - { 
0x0582, {1, {0x0552 }}}, - { 0x0583, {1, {0x0553 }}}, - { 0x0584, {1, {0x0554 }}}, - { 0x0585, {1, {0x0555 }}}, - { 0x0586, {1, {0x0556 }}}, - { 0x1e01, {1, {0x1e00 }}}, - { 0x1e03, {1, {0x1e02 }}}, - { 0x1e05, {1, {0x1e04 }}}, - { 0x1e07, {1, {0x1e06 }}}, - { 0x1e09, {1, {0x1e08 }}}, - { 0x1e0b, {1, {0x1e0a }}}, - { 0x1e0d, {1, {0x1e0c }}}, - { 0x1e0f, {1, {0x1e0e }}}, - { 0x1e11, {1, {0x1e10 }}}, - { 0x1e13, {1, {0x1e12 }}}, - { 0x1e15, {1, {0x1e14 }}}, - { 0x1e17, {1, {0x1e16 }}}, - { 0x1e19, {1, {0x1e18 }}}, - { 0x1e1b, {1, {0x1e1a }}}, - { 0x1e1d, {1, {0x1e1c }}}, - { 0x1e1f, {1, {0x1e1e }}}, - { 0x1e21, {1, {0x1e20 }}}, - { 0x1e23, {1, {0x1e22 }}}, - { 0x1e25, {1, {0x1e24 }}}, - { 0x1e27, {1, {0x1e26 }}}, - { 0x1e29, {1, {0x1e28 }}}, - { 0x1e2b, {1, {0x1e2a }}}, - { 0x1e2d, {1, {0x1e2c }}}, - { 0x1e2f, {1, {0x1e2e }}}, - { 0x1e31, {1, {0x1e30 }}}, - { 0x1e33, {1, {0x1e32 }}}, - { 0x1e35, {1, {0x1e34 }}}, - { 0x1e37, {1, {0x1e36 }}}, - { 0x1e39, {1, {0x1e38 }}}, - { 0x1e3b, {1, {0x1e3a }}}, - { 0x1e3d, {1, {0x1e3c }}}, - { 0x1e3f, {1, {0x1e3e }}}, - { 0x1e41, {1, {0x1e40 }}}, - { 0x1e43, {1, {0x1e42 }}}, - { 0x1e45, {1, {0x1e44 }}}, - { 0x1e47, {1, {0x1e46 }}}, - { 0x1e49, {1, {0x1e48 }}}, - { 0x1e4b, {1, {0x1e4a }}}, - { 0x1e4d, {1, {0x1e4c }}}, - { 0x1e4f, {1, {0x1e4e }}}, - { 0x1e51, {1, {0x1e50 }}}, - { 0x1e53, {1, {0x1e52 }}}, - { 0x1e55, {1, {0x1e54 }}}, - { 0x1e57, {1, {0x1e56 }}}, - { 0x1e59, {1, {0x1e58 }}}, - { 0x1e5b, {1, {0x1e5a }}}, - { 0x1e5d, {1, {0x1e5c }}}, - { 0x1e5f, {1, {0x1e5e }}}, - { 0x1e61, {2, {0x1e9b, 0x1e60 }}}, - { 0x1e63, {1, {0x1e62 }}}, - { 0x1e65, {1, {0x1e64 }}}, - { 0x1e67, {1, {0x1e66 }}}, - { 0x1e69, {1, {0x1e68 }}}, - { 0x1e6b, {1, {0x1e6a }}}, - { 0x1e6d, {1, {0x1e6c }}}, - { 0x1e6f, {1, {0x1e6e }}}, - { 0x1e71, {1, {0x1e70 }}}, - { 0x1e73, {1, {0x1e72 }}}, - { 0x1e75, {1, {0x1e74 }}}, - { 0x1e77, {1, {0x1e76 }}}, - { 0x1e79, {1, {0x1e78 }}}, - { 0x1e7b, {1, {0x1e7a }}}, - { 0x1e7d, {1, {0x1e7c }}}, - { 0x1e7f, {1, {0x1e7e 
}}}, - { 0x1e81, {1, {0x1e80 }}}, - { 0x1e83, {1, {0x1e82 }}}, - { 0x1e85, {1, {0x1e84 }}}, - { 0x1e87, {1, {0x1e86 }}}, - { 0x1e89, {1, {0x1e88 }}}, - { 0x1e8b, {1, {0x1e8a }}}, - { 0x1e8d, {1, {0x1e8c }}}, - { 0x1e8f, {1, {0x1e8e }}}, - { 0x1e91, {1, {0x1e90 }}}, - { 0x1e93, {1, {0x1e92 }}}, - { 0x1e95, {1, {0x1e94 }}}, - { 0x1ea1, {1, {0x1ea0 }}}, - { 0x1ea3, {1, {0x1ea2 }}}, - { 0x1ea5, {1, {0x1ea4 }}}, - { 0x1ea7, {1, {0x1ea6 }}}, - { 0x1ea9, {1, {0x1ea8 }}}, - { 0x1eab, {1, {0x1eaa }}}, - { 0x1ead, {1, {0x1eac }}}, - { 0x1eaf, {1, {0x1eae }}}, - { 0x1eb1, {1, {0x1eb0 }}}, - { 0x1eb3, {1, {0x1eb2 }}}, - { 0x1eb5, {1, {0x1eb4 }}}, - { 0x1eb7, {1, {0x1eb6 }}}, - { 0x1eb9, {1, {0x1eb8 }}}, - { 0x1ebb, {1, {0x1eba }}}, - { 0x1ebd, {1, {0x1ebc }}}, - { 0x1ebf, {1, {0x1ebe }}}, - { 0x1ec1, {1, {0x1ec0 }}}, - { 0x1ec3, {1, {0x1ec2 }}}, - { 0x1ec5, {1, {0x1ec4 }}}, - { 0x1ec7, {1, {0x1ec6 }}}, - { 0x1ec9, {1, {0x1ec8 }}}, - { 0x1ecb, {1, {0x1eca }}}, - { 0x1ecd, {1, {0x1ecc }}}, - { 0x1ecf, {1, {0x1ece }}}, - { 0x1ed1, {1, {0x1ed0 }}}, - { 0x1ed3, {1, {0x1ed2 }}}, - { 0x1ed5, {1, {0x1ed4 }}}, - { 0x1ed7, {1, {0x1ed6 }}}, - { 0x1ed9, {1, {0x1ed8 }}}, - { 0x1edb, {1, {0x1eda }}}, - { 0x1edd, {1, {0x1edc }}}, - { 0x1edf, {1, {0x1ede }}}, - { 0x1ee1, {1, {0x1ee0 }}}, - { 0x1ee3, {1, {0x1ee2 }}}, - { 0x1ee5, {1, {0x1ee4 }}}, - { 0x1ee7, {1, {0x1ee6 }}}, - { 0x1ee9, {1, {0x1ee8 }}}, - { 0x1eeb, {1, {0x1eea }}}, - { 0x1eed, {1, {0x1eec }}}, - { 0x1eef, {1, {0x1eee }}}, - { 0x1ef1, {1, {0x1ef0 }}}, - { 0x1ef3, {1, {0x1ef2 }}}, - { 0x1ef5, {1, {0x1ef4 }}}, - { 0x1ef7, {1, {0x1ef6 }}}, - { 0x1ef9, {1, {0x1ef8 }}}, - { 0x1f00, {1, {0x1f08 }}}, - { 0x1f01, {1, {0x1f09 }}}, - { 0x1f02, {1, {0x1f0a }}}, - { 0x1f03, {1, {0x1f0b }}}, - { 0x1f04, {1, {0x1f0c }}}, - { 0x1f05, {1, {0x1f0d }}}, - { 0x1f06, {1, {0x1f0e }}}, - { 0x1f07, {1, {0x1f0f }}}, - { 0x1f10, {1, {0x1f18 }}}, - { 0x1f11, {1, {0x1f19 }}}, - { 0x1f12, {1, {0x1f1a }}}, - { 0x1f13, {1, {0x1f1b }}}, - { 0x1f14, {1, 
{0x1f1c }}}, - { 0x1f15, {1, {0x1f1d }}}, - { 0x1f20, {1, {0x1f28 }}}, - { 0x1f21, {1, {0x1f29 }}}, - { 0x1f22, {1, {0x1f2a }}}, - { 0x1f23, {1, {0x1f2b }}}, - { 0x1f24, {1, {0x1f2c }}}, - { 0x1f25, {1, {0x1f2d }}}, - { 0x1f26, {1, {0x1f2e }}}, - { 0x1f27, {1, {0x1f2f }}}, - { 0x1f30, {1, {0x1f38 }}}, - { 0x1f31, {1, {0x1f39 }}}, - { 0x1f32, {1, {0x1f3a }}}, - { 0x1f33, {1, {0x1f3b }}}, - { 0x1f34, {1, {0x1f3c }}}, - { 0x1f35, {1, {0x1f3d }}}, - { 0x1f36, {1, {0x1f3e }}}, - { 0x1f37, {1, {0x1f3f }}}, - { 0x1f40, {1, {0x1f48 }}}, - { 0x1f41, {1, {0x1f49 }}}, - { 0x1f42, {1, {0x1f4a }}}, - { 0x1f43, {1, {0x1f4b }}}, - { 0x1f44, {1, {0x1f4c }}}, - { 0x1f45, {1, {0x1f4d }}}, - { 0x1f51, {1, {0x1f59 }}}, - { 0x1f53, {1, {0x1f5b }}}, - { 0x1f55, {1, {0x1f5d }}}, - { 0x1f57, {1, {0x1f5f }}}, - { 0x1f60, {1, {0x1f68 }}}, - { 0x1f61, {1, {0x1f69 }}}, - { 0x1f62, {1, {0x1f6a }}}, - { 0x1f63, {1, {0x1f6b }}}, - { 0x1f64, {1, {0x1f6c }}}, - { 0x1f65, {1, {0x1f6d }}}, - { 0x1f66, {1, {0x1f6e }}}, - { 0x1f67, {1, {0x1f6f }}}, - { 0x1f70, {1, {0x1fba }}}, - { 0x1f71, {1, {0x1fbb }}}, - { 0x1f72, {1, {0x1fc8 }}}, - { 0x1f73, {1, {0x1fc9 }}}, - { 0x1f74, {1, {0x1fca }}}, - { 0x1f75, {1, {0x1fcb }}}, - { 0x1f76, {1, {0x1fda }}}, - { 0x1f77, {1, {0x1fdb }}}, - { 0x1f78, {1, {0x1ff8 }}}, - { 0x1f79, {1, {0x1ff9 }}}, - { 0x1f7a, {1, {0x1fea }}}, - { 0x1f7b, {1, {0x1feb }}}, - { 0x1f7c, {1, {0x1ffa }}}, - { 0x1f7d, {1, {0x1ffb }}}, - { 0x1fb0, {1, {0x1fb8 }}}, - { 0x1fb1, {1, {0x1fb9 }}}, - { 0x1fd0, {1, {0x1fd8 }}}, - { 0x1fd1, {1, {0x1fd9 }}}, - { 0x1fe0, {1, {0x1fe8 }}}, - { 0x1fe1, {1, {0x1fe9 }}}, - { 0x1fe5, {1, {0x1fec }}}, - { 0x2170, {1, {0x2160 }}}, - { 0x2171, {1, {0x2161 }}}, - { 0x2172, {1, {0x2162 }}}, - { 0x2173, {1, {0x2163 }}}, - { 0x2174, {1, {0x2164 }}}, - { 0x2175, {1, {0x2165 }}}, - { 0x2176, {1, {0x2166 }}}, - { 0x2177, {1, {0x2167 }}}, - { 0x2178, {1, {0x2168 }}}, - { 0x2179, {1, {0x2169 }}}, - { 0x217a, {1, {0x216a }}}, - { 0x217b, {1, {0x216b }}}, - { 0x217c, 
{1, {0x216c }}}, - { 0x217d, {1, {0x216d }}}, - { 0x217e, {1, {0x216e }}}, - { 0x217f, {1, {0x216f }}}, - { 0x24d0, {1, {0x24b6 }}}, - { 0x24d1, {1, {0x24b7 }}}, - { 0x24d2, {1, {0x24b8 }}}, - { 0x24d3, {1, {0x24b9 }}}, - { 0x24d4, {1, {0x24ba }}}, - { 0x24d5, {1, {0x24bb }}}, - { 0x24d6, {1, {0x24bc }}}, - { 0x24d7, {1, {0x24bd }}}, - { 0x24d8, {1, {0x24be }}}, - { 0x24d9, {1, {0x24bf }}}, - { 0x24da, {1, {0x24c0 }}}, - { 0x24db, {1, {0x24c1 }}}, - { 0x24dc, {1, {0x24c2 }}}, - { 0x24dd, {1, {0x24c3 }}}, - { 0x24de, {1, {0x24c4 }}}, - { 0x24df, {1, {0x24c5 }}}, - { 0x24e0, {1, {0x24c6 }}}, - { 0x24e1, {1, {0x24c7 }}}, - { 0x24e2, {1, {0x24c8 }}}, - { 0x24e3, {1, {0x24c9 }}}, - { 0x24e4, {1, {0x24ca }}}, - { 0x24e5, {1, {0x24cb }}}, - { 0x24e6, {1, {0x24cc }}}, - { 0x24e7, {1, {0x24cd }}}, - { 0x24e8, {1, {0x24ce }}}, - { 0x24e9, {1, {0x24cf }}}, - { 0x2c30, {1, {0x2c00 }}}, - { 0x2c31, {1, {0x2c01 }}}, - { 0x2c32, {1, {0x2c02 }}}, - { 0x2c33, {1, {0x2c03 }}}, - { 0x2c34, {1, {0x2c04 }}}, - { 0x2c35, {1, {0x2c05 }}}, - { 0x2c36, {1, {0x2c06 }}}, - { 0x2c37, {1, {0x2c07 }}}, - { 0x2c38, {1, {0x2c08 }}}, - { 0x2c39, {1, {0x2c09 }}}, - { 0x2c3a, {1, {0x2c0a }}}, - { 0x2c3b, {1, {0x2c0b }}}, - { 0x2c3c, {1, {0x2c0c }}}, - { 0x2c3d, {1, {0x2c0d }}}, - { 0x2c3e, {1, {0x2c0e }}}, - { 0x2c3f, {1, {0x2c0f }}}, - { 0x2c40, {1, {0x2c10 }}}, - { 0x2c41, {1, {0x2c11 }}}, - { 0x2c42, {1, {0x2c12 }}}, - { 0x2c43, {1, {0x2c13 }}}, - { 0x2c44, {1, {0x2c14 }}}, - { 0x2c45, {1, {0x2c15 }}}, - { 0x2c46, {1, {0x2c16 }}}, - { 0x2c47, {1, {0x2c17 }}}, - { 0x2c48, {1, {0x2c18 }}}, - { 0x2c49, {1, {0x2c19 }}}, - { 0x2c4a, {1, {0x2c1a }}}, - { 0x2c4b, {1, {0x2c1b }}}, - { 0x2c4c, {1, {0x2c1c }}}, - { 0x2c4d, {1, {0x2c1d }}}, - { 0x2c4e, {1, {0x2c1e }}}, - { 0x2c4f, {1, {0x2c1f }}}, - { 0x2c50, {1, {0x2c20 }}}, - { 0x2c51, {1, {0x2c21 }}}, - { 0x2c52, {1, {0x2c22 }}}, - { 0x2c53, {1, {0x2c23 }}}, - { 0x2c54, {1, {0x2c24 }}}, - { 0x2c55, {1, {0x2c25 }}}, - { 0x2c56, {1, {0x2c26 }}}, - { 
0x2c57, {1, {0x2c27 }}}, - { 0x2c58, {1, {0x2c28 }}}, - { 0x2c59, {1, {0x2c29 }}}, - { 0x2c5a, {1, {0x2c2a }}}, - { 0x2c5b, {1, {0x2c2b }}}, - { 0x2c5c, {1, {0x2c2c }}}, - { 0x2c5d, {1, {0x2c2d }}}, - { 0x2c5e, {1, {0x2c2e }}}, - { 0x2c81, {1, {0x2c80 }}}, - { 0x2c83, {1, {0x2c82 }}}, - { 0x2c85, {1, {0x2c84 }}}, - { 0x2c87, {1, {0x2c86 }}}, - { 0x2c89, {1, {0x2c88 }}}, - { 0x2c8b, {1, {0x2c8a }}}, - { 0x2c8d, {1, {0x2c8c }}}, - { 0x2c8f, {1, {0x2c8e }}}, - { 0x2c91, {1, {0x2c90 }}}, - { 0x2c93, {1, {0x2c92 }}}, - { 0x2c95, {1, {0x2c94 }}}, - { 0x2c97, {1, {0x2c96 }}}, - { 0x2c99, {1, {0x2c98 }}}, - { 0x2c9b, {1, {0x2c9a }}}, - { 0x2c9d, {1, {0x2c9c }}}, - { 0x2c9f, {1, {0x2c9e }}}, - { 0x2ca1, {1, {0x2ca0 }}}, - { 0x2ca3, {1, {0x2ca2 }}}, - { 0x2ca5, {1, {0x2ca4 }}}, - { 0x2ca7, {1, {0x2ca6 }}}, - { 0x2ca9, {1, {0x2ca8 }}}, - { 0x2cab, {1, {0x2caa }}}, - { 0x2cad, {1, {0x2cac }}}, - { 0x2caf, {1, {0x2cae }}}, - { 0x2cb1, {1, {0x2cb0 }}}, - { 0x2cb3, {1, {0x2cb2 }}}, - { 0x2cb5, {1, {0x2cb4 }}}, - { 0x2cb7, {1, {0x2cb6 }}}, - { 0x2cb9, {1, {0x2cb8 }}}, - { 0x2cbb, {1, {0x2cba }}}, - { 0x2cbd, {1, {0x2cbc }}}, - { 0x2cbf, {1, {0x2cbe }}}, - { 0x2cc1, {1, {0x2cc0 }}}, - { 0x2cc3, {1, {0x2cc2 }}}, - { 0x2cc5, {1, {0x2cc4 }}}, - { 0x2cc7, {1, {0x2cc6 }}}, - { 0x2cc9, {1, {0x2cc8 }}}, - { 0x2ccb, {1, {0x2cca }}}, - { 0x2ccd, {1, {0x2ccc }}}, - { 0x2ccf, {1, {0x2cce }}}, - { 0x2cd1, {1, {0x2cd0 }}}, - { 0x2cd3, {1, {0x2cd2 }}}, - { 0x2cd5, {1, {0x2cd4 }}}, - { 0x2cd7, {1, {0x2cd6 }}}, - { 0x2cd9, {1, {0x2cd8 }}}, - { 0x2cdb, {1, {0x2cda }}}, - { 0x2cdd, {1, {0x2cdc }}}, - { 0x2cdf, {1, {0x2cde }}}, - { 0x2ce1, {1, {0x2ce0 }}}, - { 0x2ce3, {1, {0x2ce2 }}}, - { 0x2d00, {1, {0x10a0 }}}, - { 0x2d01, {1, {0x10a1 }}}, - { 0x2d02, {1, {0x10a2 }}}, - { 0x2d03, {1, {0x10a3 }}}, - { 0x2d04, {1, {0x10a4 }}}, - { 0x2d05, {1, {0x10a5 }}}, - { 0x2d06, {1, {0x10a6 }}}, - { 0x2d07, {1, {0x10a7 }}}, - { 0x2d08, {1, {0x10a8 }}}, - { 0x2d09, {1, {0x10a9 }}}, - { 0x2d0a, {1, {0x10aa }}}, - 
{ 0x2d0b, {1, {0x10ab }}}, - { 0x2d0c, {1, {0x10ac }}}, - { 0x2d0d, {1, {0x10ad }}}, - { 0x2d0e, {1, {0x10ae }}}, - { 0x2d0f, {1, {0x10af }}}, - { 0x2d10, {1, {0x10b0 }}}, - { 0x2d11, {1, {0x10b1 }}}, - { 0x2d12, {1, {0x10b2 }}}, - { 0x2d13, {1, {0x10b3 }}}, - { 0x2d14, {1, {0x10b4 }}}, - { 0x2d15, {1, {0x10b5 }}}, - { 0x2d16, {1, {0x10b6 }}}, - { 0x2d17, {1, {0x10b7 }}}, - { 0x2d18, {1, {0x10b8 }}}, - { 0x2d19, {1, {0x10b9 }}}, - { 0x2d1a, {1, {0x10ba }}}, - { 0x2d1b, {1, {0x10bb }}}, - { 0x2d1c, {1, {0x10bc }}}, - { 0x2d1d, {1, {0x10bd }}}, - { 0x2d1e, {1, {0x10be }}}, - { 0x2d1f, {1, {0x10bf }}}, - { 0x2d20, {1, {0x10c0 }}}, - { 0x2d21, {1, {0x10c1 }}}, - { 0x2d22, {1, {0x10c2 }}}, - { 0x2d23, {1, {0x10c3 }}}, - { 0x2d24, {1, {0x10c4 }}}, - { 0x2d25, {1, {0x10c5 }}}, - { 0xff41, {1, {0xff21 }}}, - { 0xff42, {1, {0xff22 }}}, - { 0xff43, {1, {0xff23 }}}, - { 0xff44, {1, {0xff24 }}}, - { 0xff45, {1, {0xff25 }}}, - { 0xff46, {1, {0xff26 }}}, - { 0xff47, {1, {0xff27 }}}, - { 0xff48, {1, {0xff28 }}}, - { 0xff49, {1, {0xff29 }}}, - { 0xff4a, {1, {0xff2a }}}, - { 0xff4b, {1, {0xff2b }}}, - { 0xff4c, {1, {0xff2c }}}, - { 0xff4d, {1, {0xff2d }}}, - { 0xff4e, {1, {0xff2e }}}, - { 0xff4f, {1, {0xff2f }}}, - { 0xff50, {1, {0xff30 }}}, - { 0xff51, {1, {0xff31 }}}, - { 0xff52, {1, {0xff32 }}}, - { 0xff53, {1, {0xff33 }}}, - { 0xff54, {1, {0xff34 }}}, - { 0xff55, {1, {0xff35 }}}, - { 0xff56, {1, {0xff36 }}}, - { 0xff57, {1, {0xff37 }}}, - { 0xff58, {1, {0xff38 }}}, - { 0xff59, {1, {0xff39 }}}, - { 0xff5a, {1, {0xff3a }}}, - { 0x10428, {1, {0x10400 }}}, - { 0x10429, {1, {0x10401 }}}, - { 0x1042a, {1, {0x10402 }}}, - { 0x1042b, {1, {0x10403 }}}, - { 0x1042c, {1, {0x10404 }}}, - { 0x1042d, {1, {0x10405 }}}, - { 0x1042e, {1, {0x10406 }}}, - { 0x1042f, {1, {0x10407 }}}, - { 0x10430, {1, {0x10408 }}}, - { 0x10431, {1, {0x10409 }}}, - { 0x10432, {1, {0x1040a }}}, - { 0x10433, {1, {0x1040b }}}, - { 0x10434, {1, {0x1040c }}}, - { 0x10435, {1, {0x1040d }}}, - { 0x10436, {1, {0x1040e }}}, 
- { 0x10437, {1, {0x1040f }}}, - { 0x10438, {1, {0x10410 }}}, - { 0x10439, {1, {0x10411 }}}, - { 0x1043a, {1, {0x10412 }}}, - { 0x1043b, {1, {0x10413 }}}, - { 0x1043c, {1, {0x10414 }}}, - { 0x1043d, {1, {0x10415 }}}, - { 0x1043e, {1, {0x10416 }}}, - { 0x1043f, {1, {0x10417 }}}, - { 0x10440, {1, {0x10418 }}}, - { 0x10441, {1, {0x10419 }}}, - { 0x10442, {1, {0x1041a }}}, - { 0x10443, {1, {0x1041b }}}, - { 0x10444, {1, {0x1041c }}}, - { 0x10445, {1, {0x1041d }}}, - { 0x10446, {1, {0x1041e }}}, - { 0x10447, {1, {0x1041f }}}, - { 0x10448, {1, {0x10420 }}}, - { 0x10449, {1, {0x10421 }}}, - { 0x1044a, {1, {0x10422 }}}, - { 0x1044b, {1, {0x10423 }}}, - { 0x1044c, {1, {0x10424 }}}, - { 0x1044d, {1, {0x10425 }}}, - { 0x1044e, {1, {0x10426 }}}, - { 0x1044f, {1, {0x10427 }}} -}; - -static const CaseUnfold_11_Type CaseUnfold_11_Locale[] = { - { 0x0069, {1, {0x0049 }}} -}; - -static const CaseUnfold_12_Type CaseUnfold_12[] = { - { {0x0061, 0x02be}, {1, {0x1e9a }}}, - { {0x0066, 0x0066}, {1, {0xfb00 }}}, - { {0x0066, 0x0069}, {1, {0xfb01 }}}, - { {0x0066, 0x006c}, {1, {0xfb02 }}}, - { {0x0068, 0x0331}, {1, {0x1e96 }}}, - { {0x006a, 0x030c}, {1, {0x01f0 }}}, - { {0x0073, 0x0073}, {1, {0x00df }}}, - { {0x0073, 0x0074}, {2, {0xfb05, 0xfb06 }}}, - { {0x0074, 0x0308}, {1, {0x1e97 }}}, - { {0x0077, 0x030a}, {1, {0x1e98 }}}, - { {0x0079, 0x030a}, {1, {0x1e99 }}}, - { {0x02bc, 0x006e}, {1, {0x0149 }}}, - { {0x03ac, 0x03b9}, {1, {0x1fb4 }}}, - { {0x03ae, 0x03b9}, {1, {0x1fc4 }}}, - { {0x03b1, 0x0342}, {1, {0x1fb6 }}}, - { {0x03b1, 0x03b9}, {2, {0x1fb3, 0x1fbc }}}, - { {0x03b7, 0x0342}, {1, {0x1fc6 }}}, - { {0x03b7, 0x03b9}, {2, {0x1fc3, 0x1fcc }}}, - { {0x03b9, 0x0342}, {1, {0x1fd6 }}}, - { {0x03c1, 0x0313}, {1, {0x1fe4 }}}, - { {0x03c5, 0x0313}, {1, {0x1f50 }}}, - { {0x03c5, 0x0342}, {1, {0x1fe6 }}}, - { {0x03c9, 0x0342}, {1, {0x1ff6 }}}, - { {0x03c9, 0x03b9}, {2, {0x1ff3, 0x1ffc }}}, - { {0x03ce, 0x03b9}, {1, {0x1ff4 }}}, - { {0x0565, 0x0582}, {1, {0x0587 }}}, - { {0x0574, 0x0565}, {1, 
{0xfb14 }}}, - { {0x0574, 0x056b}, {1, {0xfb15 }}}, - { {0x0574, 0x056d}, {1, {0xfb17 }}}, - { {0x0574, 0x0576}, {1, {0xfb13 }}}, - { {0x057e, 0x0576}, {1, {0xfb16 }}}, - { {0x1f00, 0x03b9}, {2, {0x1f88, 0x1f80 }}}, - { {0x1f01, 0x03b9}, {2, {0x1f81, 0x1f89 }}}, - { {0x1f02, 0x03b9}, {2, {0x1f82, 0x1f8a }}}, - { {0x1f03, 0x03b9}, {2, {0x1f83, 0x1f8b }}}, - { {0x1f04, 0x03b9}, {2, {0x1f84, 0x1f8c }}}, - { {0x1f05, 0x03b9}, {2, {0x1f85, 0x1f8d }}}, - { {0x1f06, 0x03b9}, {2, {0x1f86, 0x1f8e }}}, - { {0x1f07, 0x03b9}, {2, {0x1f87, 0x1f8f }}}, - { {0x1f20, 0x03b9}, {2, {0x1f90, 0x1f98 }}}, - { {0x1f21, 0x03b9}, {2, {0x1f91, 0x1f99 }}}, - { {0x1f22, 0x03b9}, {2, {0x1f92, 0x1f9a }}}, - { {0x1f23, 0x03b9}, {2, {0x1f93, 0x1f9b }}}, - { {0x1f24, 0x03b9}, {2, {0x1f94, 0x1f9c }}}, - { {0x1f25, 0x03b9}, {2, {0x1f95, 0x1f9d }}}, - { {0x1f26, 0x03b9}, {2, {0x1f96, 0x1f9e }}}, - { {0x1f27, 0x03b9}, {2, {0x1f97, 0x1f9f }}}, - { {0x1f60, 0x03b9}, {2, {0x1fa0, 0x1fa8 }}}, - { {0x1f61, 0x03b9}, {2, {0x1fa1, 0x1fa9 }}}, - { {0x1f62, 0x03b9}, {2, {0x1fa2, 0x1faa }}}, - { {0x1f63, 0x03b9}, {2, {0x1fa3, 0x1fab }}}, - { {0x1f64, 0x03b9}, {2, {0x1fa4, 0x1fac }}}, - { {0x1f65, 0x03b9}, {2, {0x1fa5, 0x1fad }}}, - { {0x1f66, 0x03b9}, {2, {0x1fa6, 0x1fae }}}, - { {0x1f67, 0x03b9}, {2, {0x1fa7, 0x1faf }}}, - { {0x1f70, 0x03b9}, {1, {0x1fb2 }}}, - { {0x1f74, 0x03b9}, {1, {0x1fc2 }}}, - { {0x1f7c, 0x03b9}, {1, {0x1ff2 }}} -}; - -static const CaseUnfold_12_Type CaseUnfold_12_Locale[] = { - { {0x0069, 0x0307}, {1, {0x0130 }}} -}; - -static const CaseUnfold_13_Type CaseUnfold_13[] = { - { {0x0066, 0x0066, 0x0069}, {1, {0xfb03 }}}, - { {0x0066, 0x0066, 0x006c}, {1, {0xfb04 }}}, - { {0x03b1, 0x0342, 0x03b9}, {1, {0x1fb7 }}}, - { {0x03b7, 0x0342, 0x03b9}, {1, {0x1fc7 }}}, - { {0x03b9, 0x0308, 0x0300}, {1, {0x1fd2 }}}, - { {0x03b9, 0x0308, 0x0301}, {2, {0x0390, 0x1fd3 }}}, - { {0x03b9, 0x0308, 0x0342}, {1, {0x1fd7 }}}, - { {0x03c5, 0x0308, 0x0300}, {1, {0x1fe2 }}}, - { {0x03c5, 0x0308, 0x0301}, {2, 
{0x03b0, 0x1fe3 }}}, - { {0x03c5, 0x0308, 0x0342}, {1, {0x1fe7 }}}, - { {0x03c5, 0x0313, 0x0300}, {1, {0x1f52 }}}, - { {0x03c5, 0x0313, 0x0301}, {1, {0x1f54 }}}, - { {0x03c5, 0x0313, 0x0342}, {1, {0x1f56 }}}, - { {0x03c9, 0x0342, 0x03b9}, {1, {0x1ff7 }}} -}; - -#define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) -#define CODE_RANGES_NUM numberof(CodeRanges) - -extern int -onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED) -{ - if ( -#ifdef USE_UNICODE_PROPERTIES - ctype <= ONIGENC_MAX_STD_CTYPE && -#endif - code < 256) { - return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype); - } - - if (ctype >= CODE_RANGES_NUM) { - return ONIGERR_TYPE_BUG; - } - - return onig_is_in_code_range((UChar* )CodeRanges[ctype], code); -} - - -extern int -onigenc_unicode_ctype_code_range(int ctype, const OnigCodePoint* ranges[]) -{ - if (ctype >= CODE_RANGES_NUM) { - return ONIGERR_TYPE_BUG; - } - - *ranges = CodeRanges[ctype]; - - return 0; -} - -extern int -onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out, - const OnigCodePoint* ranges[], - struct OnigEncodingTypeST* enc ARG_UNUSED) -{ - *sb_out = 0x00; - return onigenc_unicode_ctype_code_range(ctype, ranges); -} - -#include "st.h" - -#define PROPERTY_NAME_MAX_SIZE MAX_WORD_LENGTH - -extern int -onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end) -{ - int len; - int ctype; - UChar buf[PROPERTY_NAME_MAX_SIZE]; - UChar *p; - OnigCodePoint code; - - p = name; - len = 0; - for (p = name; p < end; p += enclen(enc, p, end)) { - code = ONIGENC_MBC_TO_CODE(enc, p, end); - if (code == ' ' || code == '-' || code == '_') - continue; - if (code >= 0x80) - return ONIGERR_INVALID_CHAR_PROPERTY_NAME; - - buf[len++] = (UChar )TOLOWER((unsigned char)code); - if (len >= PROPERTY_NAME_MAX_SIZE) - return ONIGERR_INVALID_CHAR_PROPERTY_NAME; - } - - buf[len] = 0; - - if ((ctype = uniname2ctype(buf, len)) < 0) { - return 
ONIGERR_INVALID_CHAR_PROPERTY_NAME; - } - - return ctype; -} - - -static int -code2_cmp(OnigCodePoint* x, OnigCodePoint* y) -{ - if (x[0] == y[0] && x[1] == y[1]) return 0; - return 1; -} - -static st_index_t -code2_hash(OnigCodePoint* x) -{ - return (st_index_t )(x[0] + x[1]); -} - -static const struct st_hash_type type_code2_hash = { - code2_cmp, - code2_hash, -}; - -static int -code3_cmp(OnigCodePoint* x, OnigCodePoint* y) -{ - if (x[0] == y[0] && x[1] == y[1] && x[2] == y[2]) return 0; - return 1; -} - -static st_index_t -code3_hash(OnigCodePoint* x) -{ - return (st_index_t )(x[0] + x[1] + x[2]); -} - -static const struct st_hash_type type_code3_hash = { - code3_cmp, - code3_hash, -}; - - -static st_table* FoldTable; /* fold-1, fold-2, fold-3 */ -static st_table* Unfold1Table; -static st_table* Unfold2Table; -static st_table* Unfold3Table; -static int CaseFoldInited = 0; - -static int init_case_fold_table(void) -{ - const CaseFold_11_Type *p; - const CaseUnfold_11_Type *p1; - const CaseUnfold_12_Type *p2; - const CaseUnfold_13_Type *p3; - int i; - - THREAD_ATOMIC_START; - - FoldTable = st_init_numtable_with_size(1200); - if (ONIG_IS_NULL(FoldTable)) return ONIGERR_MEMORY; - for (i = 0; i < numberof(CaseFold); i++) { - p = &CaseFold[i]; - st_add_direct(FoldTable, (st_data_t )p->from, (st_data_t )&(p->to)); - } - for (i = 0; i < numberof(CaseFold_Locale); i++) { - p = &CaseFold_Locale[i]; - st_add_direct(FoldTable, (st_data_t )p->from, (st_data_t )&(p->to)); - } - - Unfold1Table = st_init_numtable_with_size(1000); - if (ONIG_IS_NULL(Unfold1Table)) return ONIGERR_MEMORY; - - for (i = 0; i < numberof(CaseUnfold_11); i++) { - p1 = &CaseUnfold_11[i]; - st_add_direct(Unfold1Table, (st_data_t )p1->from, (st_data_t )&(p1->to)); - } - for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) { - p1 = &CaseUnfold_11_Locale[i]; - st_add_direct(Unfold1Table, (st_data_t )p1->from, (st_data_t )&(p1->to)); - } - - Unfold2Table = st_init_table_with_size(&type_code2_hash, 200); - if 
(ONIG_IS_NULL(Unfold2Table)) return ONIGERR_MEMORY; - - for (i = 0; i < numberof(CaseUnfold_12); i++) { - p2 = &CaseUnfold_12[i]; - st_add_direct(Unfold2Table, (st_data_t )p2->from, (st_data_t )(&p2->to)); - } - for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) { - p2 = &CaseUnfold_12_Locale[i]; - st_add_direct(Unfold2Table, (st_data_t )p2->from, (st_data_t )(&p2->to)); - } - - Unfold3Table = st_init_table_with_size(&type_code3_hash, 30); - if (ONIG_IS_NULL(Unfold3Table)) return ONIGERR_MEMORY; - - for (i = 0; i < numberof(CaseUnfold_13); i++) { - p3 = &CaseUnfold_13[i]; - st_add_direct(Unfold3Table, (st_data_t )p3->from, (st_data_t )(&p3->to)); - } - - CaseFoldInited = 1; - THREAD_ATOMIC_END; - return 0; -} - -extern int -onigenc_unicode_mbc_case_fold(OnigEncoding enc, - OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end, - UChar* fold) -{ - CodePointList3 *to; - OnigCodePoint code; - int i, len, rlen; - const UChar *p = *pp; - - if (CaseFoldInited == 0) init_case_fold_table(); - - code = ONIGENC_MBC_TO_CODE(enc, p, end); - len = enclen(enc, p, end); - *pp += len; - -#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI - if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { - if (code == 0x0049) { - return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold); - } - else if (code == 0x0130) { - return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold); - } - } -#endif - - if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0) { - if (to->n == 1) { - return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold); - } - else - { - rlen = 0; - for (i = 0; i < to->n; i++) { - len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold); - fold += len; - rlen += len; - } - return rlen; - } - } - - for (i = 0; i < len; i++) { - *fold++ = *p++; - } - return len; -} - -extern int -onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, - OnigApplyAllCaseFoldFunc f, void* arg, - OnigEncoding enc ARG_UNUSED) -{ - const CaseUnfold_11_Type* p11; - OnigCodePoint code; - int i, j, k, r; - - /* if 
(CaseFoldInited == 0) init_case_fold_table(); */ - - for (i = 0; i < numberof(CaseUnfold_11); i++) { - p11 = &CaseUnfold_11[i]; - for (j = 0; j < p11->to.n; j++) { - code = p11->from; - r = (*f)(p11->to.code[j], &code, 1, arg); - if (r != 0) return r; - - code = p11->to.code[j]; - r = (*f)(p11->from, &code, 1, arg); - if (r != 0) return r; - - for (k = 0; k < j; k++) { - r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]), 1, arg); - if (r != 0) return r; - - r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]), 1, arg); - if (r != 0) return r; - } - } - } - -#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI - if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { - code = 0x0131; - r = (*f)(0x0049, &code, 1, arg); - if (r != 0) return r; - code = 0x0049; - r = (*f)(0x0131, &code, 1, arg); - if (r != 0) return r; - - code = 0x0130; - r = (*f)(0x0069, &code, 1, arg); - if (r != 0) return r; - code = 0x0069; - r = (*f)(0x0130, &code, 1, arg); - if (r != 0) return r; - } - else { -#endif - for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) { - p11 = &CaseUnfold_11_Locale[i]; - for (j = 0; j < p11->to.n; j++) { - code = p11->from; - r = (*f)(p11->to.code[j], &code, 1, arg); - if (r != 0) return r; - - code = p11->to.code[j]; - r = (*f)(p11->from, &code, 1, arg); - if (r != 0) return r; - - for (k = 0; k < j; k++) { - r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]), - 1, arg); - if (r != 0) return r; - - r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]), - 1, arg); - if (r != 0) return r; - } - } - } -#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI - } -#endif - - if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - for (i = 0; i < numberof(CaseUnfold_12); i++) { - for (j = 0; j < CaseUnfold_12[i].to.n; j++) { - r = (*f)(CaseUnfold_12[i].to.code[j], - (OnigCodePoint* )CaseUnfold_12[i].from, 2, arg); - if (r != 0) return r; - - for (k = 0; k < CaseUnfold_12[i].to.n; k++) { - if (k == j) continue; - - r = 
(*f)(CaseUnfold_12[i].to.code[j], - (OnigCodePoint* )(&CaseUnfold_12[i].to.code[k]), 1, arg); - if (r != 0) return r; - } - } - } - -#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI - if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) { -#endif - for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) { - for (j = 0; j < CaseUnfold_12_Locale[i].to.n; j++) { - r = (*f)(CaseUnfold_12_Locale[i].to.code[j], - (OnigCodePoint* )CaseUnfold_12_Locale[i].from, 2, arg); - if (r != 0) return r; - - for (k = 0; k < CaseUnfold_12_Locale[i].to.n; k++) { - if (k == j) continue; - - r = (*f)(CaseUnfold_12_Locale[i].to.code[j], - (OnigCodePoint* )(&CaseUnfold_12_Locale[i].to.code[k]), - 1, arg); - if (r != 0) return r; - } - } - } -#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI - } -#endif - - for (i = 0; i < numberof(CaseUnfold_13); i++) { - for (j = 0; j < CaseUnfold_13[i].to.n; j++) { - r = (*f)(CaseUnfold_13[i].to.code[j], - (OnigCodePoint* )CaseUnfold_13[i].from, 3, arg); - if (r != 0) return r; - - for (k = 0; k < CaseUnfold_13[i].to.n; k++) { - if (k == j) continue; - - r = (*f)(CaseUnfold_13[i].to.code[j], - (OnigCodePoint* )(&CaseUnfold_13[i].to.code[k]), 1, arg); - if (r != 0) return r; - } - } - } - } - - return 0; -} - -extern int -onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, - OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[]) -{ - int n, i, j, k, len; - OnigCodePoint code, codes[3]; - CodePointList3 *to, *z3; - CodePointList2 *z2; - - if (CaseFoldInited == 0) init_case_fold_table(); - - n = 0; - - code = ONIGENC_MBC_TO_CODE(enc, p, end); - len = enclen(enc, p, end); - -#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI - if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { - if (code == 0x0049) { - items[0].byte_len = len; - items[0].code_len = 1; - items[0].code[0] = 0x0131; - return 1; - } - else if (code == 0x0130) { - items[0].byte_len = len; - items[0].code_len = 1; - items[0].code[0] = 0x0069; - return 1; - } - else if 
(code == 0x0131) { - items[0].byte_len = len; - items[0].code_len = 1; - items[0].code[0] = 0x0049; - return 1; - } - else if (code == 0x0069) { - items[0].byte_len = len; - items[0].code_len = 1; - items[0].code[0] = 0x0130; - return 1; - } - } -#endif - - if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0) { - if (to->n == 1) { - OnigCodePoint orig_code = code; - - items[0].byte_len = len; - items[0].code_len = 1; - items[0].code[0] = to->code[0]; - n++; - - code = to->code[0]; - if (onig_st_lookup(Unfold1Table, (st_data_t )code, (void* )&to) != 0) { - for (i = 0; i < to->n; i++) { - if (to->code[i] != orig_code) { - items[n].byte_len = len; - items[n].code_len = 1; - items[n].code[0] = to->code[i]; - n++; - } - } - } - } - else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - OnigCodePoint cs[3][4]; - int fn, ncs[3]; - - for (fn = 0; fn < to->n; fn++) { - cs[fn][0] = to->code[fn]; - if (onig_st_lookup(Unfold1Table, (st_data_t )cs[fn][0], - (void* )&z3) != 0) { - for (i = 0; i < z3->n; i++) { - cs[fn][i+1] = z3->code[i]; - } - ncs[fn] = z3->n + 1; - } - else - ncs[fn] = 1; - } - - if (fn == 2) { - for (i = 0; i < ncs[0]; i++) { - for (j = 0; j < ncs[1]; j++) { - items[n].byte_len = len; - items[n].code_len = 2; - items[n].code[0] = cs[0][i]; - items[n].code[1] = cs[1][j]; - n++; - } - } - - if (onig_st_lookup(Unfold2Table, (st_data_t )to->code, - (void* )&z2) != 0) { - for (i = 0; i < z2->n; i++) { - if (z2->code[i] == code) continue; - - items[n].byte_len = len; - items[n].code_len = 1; - items[n].code[0] = z2->code[i]; - n++; - } - } - } - else { - for (i = 0; i < ncs[0]; i++) { - for (j = 0; j < ncs[1]; j++) { - for (k = 0; k < ncs[2]; k++) { - items[n].byte_len = len; - items[n].code_len = 3; - items[n].code[0] = cs[0][i]; - items[n].code[1] = cs[1][j]; - items[n].code[2] = cs[2][k]; - n++; - } - } - } - - if (onig_st_lookup(Unfold3Table, (st_data_t )to->code, - (void* )&z2) != 0) { - for (i = 0; i < z2->n; i++) { - if (z2->code[i] 
== code) continue; - - items[n].byte_len = len; - items[n].code_len = 1; - items[n].code[0] = z2->code[i]; - n++; - } - } - } - - /* multi char folded code is not head of another folded multi char */ - flag = 0; /* DISABLE_CASE_FOLD_MULTI_CHAR(flag); */ - } - } - else { - if (onig_st_lookup(Unfold1Table, (st_data_t )code, (void* )&to) != 0) { - for (i = 0; i < to->n; i++) { - items[n].byte_len = len; - items[n].code_len = 1; - items[n].code[0] = to->code[i]; - n++; - } - } - } - - - if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - p += len; - if (p < end) { - int clen; - - codes[0] = code; - code = ONIGENC_MBC_TO_CODE(enc, p, end); - if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0 - && to->n == 1) { - codes[1] = to->code[0]; - } - else - codes[1] = code; - - clen = enclen(enc, p, end); - len += clen; - if (onig_st_lookup(Unfold2Table, (st_data_t )codes, (void* )&z2) != 0) { - for (i = 0; i < z2->n; i++) { - items[n].byte_len = len; - items[n].code_len = 1; - items[n].code[0] = z2->code[i]; - n++; - } - } - - p += clen; - if (p < end) { - code = ONIGENC_MBC_TO_CODE(enc, p, end); - if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0 - && to->n == 1) { - codes[2] = to->code[0]; - } - else - codes[2] = code; - - clen = enclen(enc, p, end); - len += clen; - if (onig_st_lookup(Unfold3Table, (st_data_t )codes, - (void* )&z2) != 0) { - for (i = 0; i < z2->n; i++) { - items[n].byte_len = len; - items[n].code_len = 1; - items[n].code[0] = z2->code[i]; - n++; - } - } - } - } - } - - return n; -} -#endif //INCLUDE_ENCODING diff --git a/src/us_ascii.c b/src/us_ascii.c deleted file mode 100644 index b6e3f50cf..000000000 --- a/src/us_ascii.c +++ /dev/null @@ -1,34 +0,0 @@ -#include "mruby.h" -#ifdef INCLUDE_ENCODING -#include "regenc.h" - -static int -us_ascii_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc) -{ - if (*p & 0x80) - return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); - return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1); -} - 
-OnigEncodingDefine(us_ascii, US_ASCII) = { - us_ascii_mbc_enc_len, - "US-ASCII",/* name */ - 1, /* max byte length */ - 1, /* min byte length */ - onigenc_is_mbc_newline_0x0a, - onigenc_single_byte_mbc_to_code, - onigenc_single_byte_code_to_mbclen, - onigenc_single_byte_code_to_mbc, - onigenc_ascii_mbc_case_fold, - onigenc_ascii_apply_all_case_fold, - onigenc_ascii_get_case_fold_codes_by_str, - onigenc_minimum_property_name_to_ctype, - onigenc_ascii_is_code_ctype, - onigenc_not_support_get_ctype_code_range, - onigenc_single_byte_left_adjust_char_head, - onigenc_always_true_is_allowed_reverse_match -}; -ENC_ALIAS("ASCII", "US-ASCII") -ENC_ALIAS("ANSI_X3.4-1968", "US-ASCII") -ENC_ALIAS("646", "US-ASCII") -#endif //INCLUDE_ENCODING diff --git a/src/utf_8.c b/src/utf_8.c deleted file mode 100644 index c444a2053..000000000 --- a/src/utf_8.c +++ /dev/null @@ -1,460 +0,0 @@ -/********************************************************************** - utf_8.c - Oniguruma (regular expression library) -**********************************************************************/ -/*- - * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include "mruby.h" -#ifdef INCLUDE_ENCODING -#include "regenc.h" - -#define USE_INVALID_CODE_SCHEME - -#ifdef USE_INVALID_CODE_SCHEME -/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ -#define INVALID_CODE_FE 0xfffffffe -#define INVALID_CODE_FF 0xffffffff -#define VALID_CODE_LIMIT 0x7fffffff -#endif - -#define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) - -static const int EncLen_UTF8[] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -}; - -typedef enum { - FAILURE = -2, - ACCEPT, - S0, S1, S2, S3, - S4, S5, S6, S7 -} state_t; -#define A ACCEPT -#define F FAILURE -static const signed char trans[][0x100] = { - { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* 0 */ A, A, A, 
A, A, A, A, A, A, A, A, A, A, A, A, A, - /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, - /* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F - }, - { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, - /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F - }, - { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 1 
*/ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F - }, - { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F - }, - { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 
F, F, - /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F - }, - { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F - }, - { /* S6 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 2 */ F, F, F, F, F, F, F, F, F, F, 
F, F, F, F, F, F, - /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F - }, - { /* S7 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F - }, -}; -#undef A -#undef F - -static int -mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) -{ - int firstbyte = *p++; - state_t s; - s = trans[0][firstbyte]; - if (s < 0) return s == ACCEPT ? 
ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) : - ONIGENC_CONSTRUCT_MBCLEN_INVALID(); - - if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1); - s = trans[s][*p++]; - if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) : - ONIGENC_CONSTRUCT_MBCLEN_INVALID(); - - if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2); - s = trans[s][*p++]; - if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) : - ONIGENC_CONSTRUCT_MBCLEN_INVALID(); - - if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3); - s = trans[s][*p++]; - return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) : - ONIGENC_CONSTRUCT_MBCLEN_INVALID(); -} - -static int -is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc) -{ - if (p < end) { - if (*p == 0x0a) return 1; - -#ifdef USE_UNICODE_ALL_LINE_TERMINATORS -#ifndef USE_CRNL_AS_LINE_TERMINATOR - if (*p == 0x0d) return 1; -#endif - if (p + 1 < end) { - if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */ - return 1; - if (p + 2 < end) { - if ((*(p+2) == 0xa8 || *(p+2) == 0xa9) - && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ - return 1; - } - } -#endif - } - - return 0; -} - -static OnigCodePoint -mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) -{ - int c, len; - OnigCodePoint n; - - len = enclen(enc, p, end); - c = *p++; - if (len > 1) { - len--; - n = c & ((1 << (6 - len)) - 1); - while (len--) { - c = *p++; - n = (n << 6) | (c & ((1 << 6) - 1)); - } - return n; - } - else { -#ifdef USE_INVALID_CODE_SCHEME - if (c > 0xfd) { - return ((c == 0xfe) ? 
INVALID_CODE_FE : INVALID_CODE_FF); - } -#endif - return (OnigCodePoint )c; - } -} - -static int -code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) -{ - if ((code & 0xffffff80) == 0) return 1; - else if ((code & 0xfffff800) == 0) return 2; - else if ((code & 0xffff0000) == 0) return 3; - else if ((code & 0xffe00000) == 0) return 4; - else if ((code & 0xfc000000) == 0) return 5; - else if ((code & 0x80000000) == 0) return 6; -#ifdef USE_INVALID_CODE_SCHEME - else if (code == INVALID_CODE_FE) return 1; - else if (code == INVALID_CODE_FF) return 1; -#endif - else - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; -} - -static int -code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED) -{ -#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80) -#define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80) - - if ((code & 0xffffff80) == 0) { - *buf = (UChar )code; - return 1; - } - else { - UChar *p = buf; - - if ((code & 0xfffff800) == 0) { - *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0); - } - else if ((code & 0xffff0000) == 0) { - *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0); - *p++ = UTF8_TRAILS(code, 6); - } - else if ((code & 0xffe00000) == 0) { - *p++ = (UChar )(((code>>18) & 0x07) | 0xf0); - *p++ = UTF8_TRAILS(code, 12); - *p++ = UTF8_TRAILS(code, 6); - } - else if ((code & 0xfc000000) == 0) { - *p++ = (UChar )(((code>>24) & 0x03) | 0xf8); - *p++ = UTF8_TRAILS(code, 18); - *p++ = UTF8_TRAILS(code, 12); - *p++ = UTF8_TRAILS(code, 6); - } - else if ((code & 0x80000000) == 0) { - *p++ = (UChar )(((code>>30) & 0x01) | 0xfc); - *p++ = UTF8_TRAILS(code, 24); - *p++ = UTF8_TRAILS(code, 18); - *p++ = UTF8_TRAILS(code, 12); - *p++ = UTF8_TRAILS(code, 6); - } -#ifdef USE_INVALID_CODE_SCHEME - else if (code == INVALID_CODE_FE) { - *p = 0xfe; - return 1; - } - else if (code == INVALID_CODE_FF) { - *p = 0xff; - return 1; - } -#endif - else { - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - } - - *p++ = UTF8_TRAIL0(code); - return (int)(p - 
buf); - } -} - -static int -mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, - const UChar* end, UChar* fold, OnigEncoding enc) -{ - const UChar* p = *pp; - - if (ONIGENC_IS_MBC_ASCII(p)) { -#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI - if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { - if (*p == 0x49) { - *fold++ = 0xc4; - *fold = 0xb1; - (*pp)++; - return 2; - } - } -#endif - - *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); - (*pp)++; - return 1; /* return byte length of converted char to lower */ - } - else { - return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold); - } -} - - -static int -get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, - const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED) -{ - *sb_out = 0x80; - return onigenc_unicode_ctype_code_range(ctype, ranges); -} - - -static UChar* -left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED) -{ - const UChar *p; - - if (s <= start) return (UChar* )s; - p = s; - - while (!utf8_islead(*p) && p > start) p--; - return (UChar* )p; -} - -static int -get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[], - OnigEncoding enc) -{ - return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items); -} - -OnigEncodingDefine(utf_8, UTF_8) = { - mbc_enc_len, - "UTF-8", /* name */ - 6, /* max byte length */ - 1, /* min byte length */ - is_mbc_newline, - mbc_to_code, - code_to_mbclen, - code_to_mbc, - mbc_case_fold, - onigenc_unicode_apply_all_case_fold, - get_case_fold_codes_by_str, - onigenc_unicode_property_name_to_ctype, - onigenc_unicode_is_code_ctype, - get_ctype_code_range, - left_adjust_char_head, - onigenc_always_true_is_allowed_reverse_match -}; -ENC_ALIAS("CP65001", "UTF-8") - -/* - * Name: UTF8-MAC - * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html - * Link: 
http://developer.apple.com/qa/qa2001/qa1235.html - * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html - * Link: http://www.gnu.org/software/emacs/NEWS.23.2 - */ -ENC_REPLICATE("UTF8-MAC", "UTF-8") -ENC_ALIAS("UTF-8-MAC", "UTF8-MAC") -ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */ - -#endif //INCLUDE_ENCODING |
