summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorYukihiro "Matz" Matsumoto <[email protected]>2019-08-17 15:52:34 +0900
committerGitHub <[email protected]>2019-08-17 15:52:34 +0900
commit277c91e94cc1e3df52c28048859a267335571d6c (patch)
tree4a6c18e059060d4c09d8fb292e11444c43bdd1a4
parentb55e72b2d67c1042a5170f4cc9cac809f4ed2c90 (diff)
parentbc272730874b5f0e22f0b7126ec46bc761d335b1 (diff)
downloadmruby-277c91e94cc1e3df52c28048859a267335571d6c.tar.gz
mruby-277c91e94cc1e3df52c28048859a267335571d6c.zip
Merge pull request #4625 from dearblue/rindex-broken-utf8
Fix `String#rindex` with invalid UTF-8 string
-rw-r--r--src/string.c75
-rw-r--r--test/t/string.rb14
2 files changed, 74 insertions, 15 deletions
diff --git a/src/string.c b/src/string.c
index 2b9a5cfde..9d2ff34ad 100644
--- a/src/string.c
+++ b/src/string.c
@@ -337,19 +337,69 @@ chars2bytes(mrb_value s, mrb_int off, mrb_int idx)
/* map byte offset to character index */
static mrb_int
-bytes2chars(char *p, mrb_int bi)
+bytes2chars(char *p, mrb_int len, mrb_int bi)
{
- mrb_int i, b, n;
+ const char *e = p + (size_t)len;
+ const char *pivot = p + bi;
+ mrb_int i;
- for (b=i=0; b<bi; i++) {
- n = utf8len_codepage[(unsigned char)*p];
- b += n;
- p += n;
+ for (i = 0; p < pivot; i ++) {
+ p += utf8len(p, e);
}
- if (b != bi) return -1;
+ if (p != pivot) return -1;
return i;
}
+static const char *
+char_adjust(const char *beg, const char *end, const char *ptr)
+{
+ if ((ptr > beg || ptr < end) && (*ptr & 0xc0) == 0x80) {
+ const int utf8_adjust_max = 3;
+ const char *p;
+
+ if (ptr - beg > utf8_adjust_max) {
+ beg = ptr - utf8_adjust_max;
+ }
+
+ p = ptr;
+ while (p > beg) {
+ p --;
+ if ((*p & 0xc0) != 0x80) {
+ int clen = utf8len(p, end);
+ if (clen > ptr - p) return p;
+ break;
+ }
+ }
+ }
+
+ return ptr;
+}
+
+static const char *
+char_backtrack(const char *ptr, const char *end)
+{
+ if (ptr < end) {
+ const int utf8_bytelen_max = 4;
+ const char *p;
+
+ if (end - ptr > utf8_bytelen_max) {
+ ptr = end - utf8_bytelen_max;
+ }
+
+ p = end;
+ while (p > ptr) {
+ p --;
+ if ((*p & 0xc0) != 0x80) {
+ int clen = utf8len_codepage[(unsigned char)*p];
+ if (clen == end - p) { return p; }
+ break;
+ }
+ }
+ }
+
+ return end - 1;
+}
+
static mrb_int
str_index_str_by_char_search(mrb_state *mrb, const char *p, const char *pend, const char *s, const mrb_int slen, mrb_int off)
{
@@ -412,7 +462,9 @@ str_index_str_by_char(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
#else
#define RSTRING_CHAR_LEN(s) RSTRING_LEN(s)
#define chars2bytes(p, off, ci) (ci)
-#define bytes2chars(p, bi) (bi)
+#define bytes2chars(p, end, bi) (bi)
+#define char_adjust(beg, end, ptr) (ptr)
+#define char_backtrack(ptr, end) ((end) - 1)
#define BYTES_ALIGN_CHECK(pos)
#define str_index_str_by_char(mrb, str, sub, pos) str_index_str(mrb, str, sub, pos)
#endif
@@ -624,7 +676,7 @@ str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2)
static mrb_int
str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
{
- char *s, *sbeg, *t;
+ const char *s, *sbeg, *t;
struct RString *ps = mrb_str_ptr(str);
mrb_int len = RSTRING_LEN(sub);
@@ -637,11 +689,12 @@ str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
s = RSTR_PTR(ps) + pos;
t = RSTRING_PTR(sub);
if (len) {
+ s = char_adjust(sbeg, sbeg + RSTR_LEN(ps), s);
while (sbeg <= s) {
if (memcmp(s, t, len) == 0) {
return (mrb_int)(s - RSTR_PTR(ps));
}
- s--;
+ s = char_backtrack(sbeg, s);
}
return -1;
}
@@ -2016,7 +2069,7 @@ mrb_str_rindex(mrb_state *mrb, mrb_value str)
case MRB_TT_STRING:
pos = str_rindex(mrb, str, sub, pos);
if (pos >= 0) {
- pos = bytes2chars(RSTRING_PTR(str), pos);
+ pos = bytes2chars(RSTRING_PTR(str), RSTRING_LEN(str), pos);
BYTES_ALIGN_CHECK(pos);
return mrb_fixnum_value(pos);
}
diff --git a/test/t/string.rb b/test/t/string.rb
index 46cbe6e2a..7e3c327b1 100644
--- a/test/t/string.rb
+++ b/test/t/string.rb
@@ -557,10 +557,16 @@ end
assert('String#rindex(UTF-8)', '15.2.10.5.31') do
str = "こんにちは世界!\nこんにちは世界!"
- assert_nil str.index('さ')
- assert_equal 3, str.index('ち')
- assert_equal 12, str.index('ち', 10)
- assert_equal nil, str.index("さ")
+ assert_nil str.rindex('さ')
+ assert_equal 12, str.rindex('ち')
+ assert_equal 3, str.rindex('ち', 10)
+
+ broken = "\xf0☀\xf1☁\xf2☂\xf3☃\xf0☀\xf1☁\xf2☂\xf3☃"
+ assert_nil broken.rindex("\x81") # "\x81" is a part of "☁" ("\xe2\x98\x81")
+ assert_equal 11, broken.rindex("☁")
+ assert_equal 11, broken.rindex("☁", 12)
+ assert_equal 11, broken.rindex("☁", 11)
+ assert_equal 3, broken.rindex("☁", 10)
end if UTF8STRING
# assert('String#scan', '15.2.10.5.32') do