diff options
| author | Yukihiro "Matz" Matsumoto <[email protected]> | 2014-04-18 00:50:45 +0900 |
|---|---|---|
| committer | Yukihiro "Matz" Matsumoto <[email protected]> | 2014-04-18 00:50:45 +0900 |
| commit | 270659da5fa280d8e835597b0ab384a7470379ef (patch) | |
| tree | eac6184aaf6eef98ee2d2936e8de249a30dffb81 | |
| parent | d000c72bc614be7edf49e04fb05ac62a4707c81a (diff) | |
| parent | db88af83df2abf364e62ee5818d39afc9e63bbcd (diff) | |
| download | mruby-270659da5fa280d8e835597b0ab384a7470379ef.tar.gz mruby-270659da5fa280d8e835597b0ab384a7470379ef.zip | |
Merge pull request #2074 from mattn/fix_string_utf8_index
Implement String#index, fixes #2073
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | mrbgems/mruby-string-utf8/src/string.c | 172 |
| -rw-r--r-- | mrbgems/mruby-string-utf8/test/string.rb | 8 |
2 files changed, 169 insertions, 11 deletions
diff --git a/mrbgems/mruby-string-utf8/src/string.c b/mrbgems/mruby-string-utf8/src/string.c index 2ed25d648..fec0752ba 100644 --- a/mrbgems/mruby-string-utf8/src/string.c +++ b/mrbgems/mruby-string-utf8/src/string.c @@ -5,6 +5,12 @@ #include <ctype.h> #include <string.h> +#define STR_EMBED_P(s) ((s)->flags & MRB_STR_EMBED) +#define STR_EMBED_LEN(s)\ + (size_t)(((s)->flags & MRB_STR_EMBED_LEN_MASK) >> MRB_STR_EMBED_LEN_SHIFT) +#define STR_PTR(s) ((STR_EMBED_P(s)) ? (s)->as.ary : (s)->as.heap.ptr) +#define STR_LEN(s) ((STR_EMBED_P(s)) ? STR_EMBED_LEN(s) : (size_t)(s)->as.heap.len) + static const char utf8len_codepage[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -17,6 +23,8 @@ static const char utf8len_codepage[256] = 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1, }; +static mrb_value mrb_fixnum_chr(mrb_state*, mrb_value); + static mrb_int utf8len(unsigned char* p) { @@ -33,11 +41,12 @@ utf8len(unsigned char* p) } static mrb_int -mrb_utf8_strlen(mrb_value str) +mrb_utf8_strlen(mrb_value str, mrb_int len) { mrb_int total = 0; unsigned char* p = (unsigned char*) RSTRING_PTR(str); - unsigned char* e = p + RSTRING_LEN(str); + unsigned char* e = p; + e += len < 0 ? 
RSTRING_LEN(str) : len; while (p<e) { p += utf8len(p); total++; @@ -48,12 +57,12 @@ mrb_utf8_strlen(mrb_value str) static mrb_value mrb_str_size(mrb_state *mrb, mrb_value str) { - mrb_int size = mrb_utf8_strlen(str); + mrb_int size = mrb_utf8_strlen(str, -1); return mrb_fixnum_value(size); } -#define RSTRING_LEN_UTF8(s) mrb_utf8_strlen(s) +#define RSTRING_LEN_UTF8(s) mrb_utf8_strlen(s, -1) static mrb_value noregexp(mrb_state *mrb, mrb_value self) @@ -118,7 +127,6 @@ str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) mrb_int i; unsigned char *p = (unsigned char*) RSTRING_PTR(str), *t; unsigned char *e = p + RSTRING_LEN(str); - for (i = 0; i < beg && p<e; i++) { p += utf8len(p); @@ -186,6 +194,35 @@ str_index(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int offset) return pos + offset; } +static mrb_int +str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) +{ + char *s, *sbeg, *t; + struct RString *ps = mrb_str_ptr(str); + mrb_int len = RSTRING_LEN(sub); + + /* substring longer than string */ + if (STR_LEN(ps) < len) return -1; + if (STR_LEN(ps) - pos < len) { + pos = STR_LEN(ps) - len; + } + sbeg = STR_PTR(ps); + s = STR_PTR(ps) + pos; + t = RSTRING_PTR(sub); + if (len) { + while (sbeg <= s) { + if (memcmp(s, t, len) == 0) { + return s - STR_PTR(ps); + } + s--; + } + return -1; + } + else { + return pos; + } +} + static mrb_value mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx) { @@ -246,19 +283,74 @@ mrb_str_aref_m(mrb_state *mrb, mrb_value str) } static mrb_value +mrb_str_index_m(mrb_state *mrb, mrb_value str) +{ + mrb_value *argv; + int argc; + + mrb_value sub; + mrb_int pos; + + mrb_get_args(mrb, "*", &argv, &argc); + if (argc == 2) { + pos = mrb_fixnum(argv[1]); + sub = argv[0]; + } + else { + pos = 0; + if (argc > 0) + sub = argv[0]; + else + sub = mrb_nil_value(); + + } + regexp_check(mrb, sub); + if (pos < 0) { + pos += RSTRING_LEN(str); + if (pos < 0) { + return mrb_nil_value(); + } + } + + if (mrb_type(sub) 
== MRB_TT_FIXNUM) { + sub = mrb_fixnum_chr(mrb, sub); + } + + switch (mrb_type(sub)) { + default: { + mrb_value tmp; + + tmp = mrb_check_string_type(mrb, sub); + if (mrb_nil_p(tmp)) { + mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub); + } + sub = tmp; + } + /* fall through */ + case MRB_TT_STRING: + pos = str_index(mrb, str, sub, pos); + break; + } + + if (pos == -1) return mrb_nil_value(); + return mrb_fixnum_value(mrb_utf8_strlen(str, pos)); +} + +static mrb_value mrb_str_reverse_bang(mrb_state *mrb, mrb_value str) { - mrb_int utf8_len = mrb_utf8_strlen(str); + mrb_int utf8_len = mrb_utf8_strlen(str, -1); if (utf8_len > 1) { mrb_int len = RSTRING_LEN(str); char *buf = (char *)mrb_malloc(mrb, (size_t)len); unsigned char* p = (unsigned char*)buf; unsigned char* e = (unsigned char*)buf + len; - unsigned char* r = (unsigned char*)RSTRING_END(str); - + unsigned char* r; + memcpy(buf, RSTRING_PTR(str), len); mrb_str_modify(mrb, mrb_str_ptr(str)); - + r = (unsigned char*)RSTRING_PTR(str) + len; + while (p<e) { mrb_int clen = utf8len(p); r -= clen; @@ -267,11 +359,67 @@ mrb_str_reverse_bang(mrb_state *mrb, mrb_value str) } mrb_free(mrb, buf); } - + return str; } static mrb_value +mrb_str_rindex_m(mrb_state *mrb, mrb_value str) +{ + mrb_value *argv; + int argc; + mrb_value sub; + mrb_value vpos; + mrb_int pos, len = RSTRING_LEN(str); + + mrb_get_args(mrb, "*", &argv, &argc); + if (argc == 2) { + sub = argv[0]; + vpos = argv[1]; + pos = mrb_fixnum(vpos); + if (pos < 0) { + pos += len; + if (pos < 0) { + regexp_check(mrb, sub); + return mrb_nil_value(); + } + } + if (pos > len) pos = len; + } + else { + pos = len; + if (argc > 0) + sub = argv[0]; + else + sub = mrb_nil_value(); + } + regexp_check(mrb, sub); + + if (mrb_type(sub) == MRB_TT_FIXNUM) { + sub = mrb_fixnum_chr(mrb, sub); + } + + switch (mrb_type(sub)) { + default: { + mrb_value tmp; + + tmp = mrb_check_string_type(mrb, sub); + if (mrb_nil_p(tmp)) { + mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S 
given", sub); + } + sub = tmp; + } + /* fall through */ + case MRB_TT_STRING: + pos = str_rindex(mrb, str, sub, pos); + break; + } + + if (pos == -1) return mrb_nil_value(); + return mrb_fixnum_value(mrb_utf8_strlen(str, pos)); +} + +static mrb_value mrb_str_reverse(mrb_state *mrb, mrb_value str) { return mrb_str_reverse_bang(mrb, mrb_str_dup(mrb, str)); @@ -319,10 +467,12 @@ mrb_mruby_string_utf8_gem_init(mrb_state* mrb) mrb_define_method(mrb, s, "size", mrb_str_size, MRB_ARGS_NONE()); mrb_define_method(mrb, s, "length", mrb_str_size, MRB_ARGS_NONE()); + mrb_define_method(mrb, s, "index", mrb_str_index_m, MRB_ARGS_ANY()); mrb_define_method(mrb, s, "[]", mrb_str_aref_m, MRB_ARGS_ANY()); mrb_define_method(mrb, s, "slice", mrb_str_aref_m, MRB_ARGS_ANY()); - mrb_define_method(mrb, s, "reverse", mrb_str_reverse, MRB_ARGS_NONE()); + mrb_define_method(mrb, s, "reverse", mrb_str_reverse, MRB_ARGS_NONE()); mrb_define_method(mrb, s, "reverse!", mrb_str_reverse_bang, MRB_ARGS_NONE()); + mrb_define_method(mrb, s, "rindex", mrb_str_rindex_m, MRB_ARGS_ANY()); mrb_define_method(mrb, mrb->fixnum_class, "chr", mrb_fixnum_chr, MRB_ARGS_NONE()); } diff --git a/mrbgems/mruby-string-utf8/test/string.rb b/mrbgems/mruby-string-utf8/test/string.rb index 325c6eb39..939ac24bb 100644 --- a/mrbgems/mruby-string-utf8/test/string.rb +++ b/mrbgems/mruby-string-utf8/test/string.rb @@ -37,3 +37,11 @@ assert('String#size') do assert_not_equal str.bytesize, str.size assert_equal 2, str[1, 2].size end + +assert('String#index') do + str = "こんにちわ世界!\nこんにちわ世界!" + assert_nil str.index('さ') + assert_equal 3, str.index('ち') + assert_equal 12, str.index('ち', 10) + assert_equal nil, str.index("さ") +end |
