diff options
| author | ksss <[email protected]> | 2017-04-03 23:01:41 +0900 |
|---|---|---|
| committer | ksss <[email protected]> | 2017-04-04 10:06:30 +0900 |
| commit | a80a745b699965169b7ffebbfbe2a7568d7afd98 (patch) | |
| tree | 6ee3c7b71d64803662530629310b33a42c3bd750 | |
| parent | 88a7fedea413568a1ff0410e109ff55a03b63a5f (diff) | |
| download | mruby-a80a745b699965169b7ffebbfbe2a7568d7afd98.tar.gz mruby-a80a745b699965169b7ffebbfbe2a7568d7afd98.zip | |
Support unpack template "U"
| -rw-r--r-- | src/pack.c | 83 | ||||
| -rw-r--r-- | test/pack.rb | 14 |
2 files changed, 95 insertions, 2 deletions
diff --git a/src/pack.c b/src/pack.c
index 6a0075f5a..8c76b2638 100644
--- a/src/pack.c
+++ b/src/pack.c
@@ -448,6 +448,80 @@ pack_utf8(mrb_state *mrb, mrb_value o, mrb_value str, mrb_int sidx, long count,
   return len;
 }
 
+static const unsigned long utf8_limits[] = {
+  0x0, /* 1 */
+  0x80, /* 2 */
+  0x800, /* 3 */
+  0x10000, /* 4 */
+  0x200000, /* 5 */
+  0x4000000, /* 6 */
+  0x80000000, /* 7 */
+};
+
+static unsigned long
+utf8_to_uv(mrb_state *mrb, const char *p, long *lenp)
+{
+  int c = *p++ & 0xff;
+  unsigned long uv = c;
+  long n;
+
+  if (!(uv & 0x80)) {
+    *lenp = 1;
+    return uv;
+  }
+  if (!(uv & 0x40)) {
+    *lenp = 1;
+    mrb_raise(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character");
+  }
+
+  if (!(uv & 0x20)) { n = 2; uv &= 0x1f; }
+  else if (!(uv & 0x10)) { n = 3; uv &= 0x0f; }
+  else if (!(uv & 0x08)) { n = 4; uv &= 0x07; }
+  else if (!(uv & 0x04)) { n = 5; uv &= 0x03; }
+  else if (!(uv & 0x02)) { n = 6; uv &= 0x01; }
+  else {
+    *lenp = 1;
+    mrb_raise(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character");
+  }
+  if (n > *lenp) {
+    mrb_raisef(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character (expected %S bytes, given %S bytes)",
+               mrb_fixnum_value(n), mrb_fixnum_value(*lenp));
+  }
+  *lenp = n--;
+  if (n != 0) {
+    while (n--) {
+      c = *p++ & 0xff;
+      if ((c & 0xc0) != 0x80) {
+        *lenp -= n + 1;
+        mrb_raisef(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character");
+      }
+      else {
+        c &= 0x3f;
+        uv = uv << 6 | c;
+      }
+    }
+  }
+  n = *lenp - 1;
+  if (uv < utf8_limits[n]) {
+    mrb_raisef(mrb, E_ARGUMENT_ERROR, "redundant UTF-8 sequence");
+  }
+  return uv;
+}
+
+static int
+unpack_utf8(mrb_state *mrb, const unsigned char * src, int srclen, mrb_value ary, unsigned int flags)
+{
+  unsigned long uv;
+  long lenp = srclen;
+
+  if (srclen == 0) {
+    return 1;
+  }
+  uv = utf8_to_uv(mrb, (const char *)src, &lenp);
+  mrb_ary_push(mrb, ary, mrb_fixnum_value((mrb_int)uv));
+  return (int)lenp;
+}
+
 static int
 pack_a(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, unsigned int flags)
 {
@@ -482,7 +556,7 @@ pack_a(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, u
   while (padlen-- > 0) {
     *dptr++ = pad;
   }
-  
+
   return dptr - dptr0;
 }
 
@@ -541,7 +615,7 @@ pack_h(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, u
   } else if (slen > count) {
     slen = count;
   }
-  
+
   dst = str_len_ensure(mrb, dst, didx + count);
   dptr = RSTRING_PTR(dst) + didx;
 
@@ -1151,6 +1225,11 @@ mrb_pack_unpack(mrb_state *mrb, mrb_value str)
     case PACK_DIR_DOUBLE:
       srcidx += unpack_double(mrb, sptr, srclen - srcidx, result, flags);
       break;
+    case PACK_DIR_UTF8:
+      srcidx += unpack_utf8(mrb, sptr, srclen - srcidx, result, flags);
+      break;
+    default:
+      mrb_raise(mrb, E_RUNTIME_ERROR, "mruby-pack's bug");
     }
     if (count > 0) {
       count--;
diff --git a/test/pack.rb b/test/pack.rb
index 5e9932f4f..e9f5fb040 100644
--- a/test/pack.rb
+++ b/test/pack.rb
@@ -145,3 +145,17 @@ assert 'pack/unpack "I"' do
   end
   assert_pack 'I', str, [12345]
 end
+
+assert 'pack/unpack "U"' do
+  assert_equal [], "".unpack("U")
+  assert_equal [], "".unpack("U*")
+  assert_equal [65, 66], "ABC".unpack("U2")
+  assert_equal [12371, 12435, 12395, 12385, 12399, 19990, 30028], "こんにちは世界".unpack("U*")
+
+  assert_equal "", [].pack("U")
+  assert_equal "", [].pack("U*")
+  assert_equal "AB", [65, 66, 67].pack("U2")
+  assert_equal "こんにちは世界", [12371, 12435, 12395, 12385, 12399, 19990, 30028].pack("U*")
+
+  assert_equal "\000", [0].pack("U")
+end
|
