diff options
| -rw-r--r-- | README.md | 1 | ||||
| -rw-r--r-- | src/pack.c | 103 | ||||
| -rw-r--r-- | test/pack.rb | 18 |
3 files changed, 114 insertions, 8 deletions
@@ -38,6 +38,7 @@ There is no dependency on other mrbgems. - q : 64-bit signed, native endian (`int64_t`) - S : 16-bit unsigned, native endian (`uint16_t`) - s : 16-bit signed, native endian (`int16_t`) + - U : UTF-8 character - V : 32-bit unsigned, VAX (little-endian) byte order - v : 16-bit unsigned, VAX (little-endian) byte order - x : null byte diff --git a/src/pack.c b/src/pack.c index 5a5b97b75..3dd7eb1de 100644 --- a/src/pack.c +++ b/src/pack.c @@ -77,7 +77,7 @@ check_little_endian(void) static unsigned int hex2int(unsigned char ch) { - if (ch >= '0' && ch <= '9') + if (ch >= '0' && ch <= '9') return ch - '0'; else if (ch >= 'A' && ch <= 'F') return 10 + (ch - 'A'); @@ -414,8 +414,12 @@ pack_utf8(mrb_state *mrb, mrb_value o, mrb_value str, mrb_int sidx, long count, { char utf8[4]; int len; - - unsigned long c = mrb_fixnum(o); + unsigned long c = 0; + + if (mrb_float_p(o)) { + goto range_error; + } + c = mrb_fixnum(o); /* Unicode character */ /* from mruby-compiler gem */ @@ -434,20 +438,98 @@ pack_utf8(mrb_state *mrb, mrb_value o, mrb_value str, mrb_int sidx, long count, utf8[2] = (char)(0x80 | ( c & 0x3F)); len = 3; } - else { + else if (c < 0x200000) { utf8[0] = (char)(0xF0 | (c >> 18) ); utf8[1] = (char)(0x80 | ((c >> 12) & 0x3F)); utf8[2] = (char)(0x80 | ((c >> 6) & 0x3F)); utf8[3] = (char)(0x80 | ( c & 0x3F)); len = 4; } - + else { +range_error: + mrb_raise(mrb, E_RANGE_ERROR, "pack(U): value out of range"); + } + str = str_len_ensure(mrb, str, sidx + len); memcpy(RSTRING_PTR(str) + sidx, utf8, len); - + return len; } +static const unsigned long utf8_limits[] = { + 0x0, /* 1 */ + 0x80, /* 2 */ + 0x800, /* 3 */ + 0x10000, /* 4 */ + 0x200000, /* 5 */ + 0x4000000, /* 6 */ + 0x80000000, /* 7 */ +}; + +static unsigned long +utf8_to_uv(mrb_state *mrb, const char *p, long *lenp) +{ + int c = *p++ & 0xff; + unsigned long uv = c; + long n; + + if (!(uv & 0x80)) { + *lenp = 1; + return uv; + } + if (!(uv & 0x40)) { + *lenp = 1; + mrb_raise(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character"); + } + + if (!(uv & 0x20)) { n = 2; uv &= 0x1f; } + else if (!(uv & 0x10)) { n = 3; uv &= 0x0f; } + else if (!(uv & 0x08)) { n = 4; uv &= 0x07; } + else if (!(uv & 0x04)) { n = 5; uv &= 0x03; } + else if (!(uv & 0x02)) { n = 6; uv &= 0x01; } + else { + *lenp = 1; + mrb_raise(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character"); + } + if (n > *lenp) { + mrb_raisef(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character (expected %S bytes, given %S bytes)", + mrb_fixnum_value(n), mrb_fixnum_value(*lenp)); + } + *lenp = n--; + if (n != 0) { + while (n--) { + c = *p++ & 0xff; + if ((c & 0xc0) != 0x80) { + *lenp -= n + 1; + mrb_raisef(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character"); + } + else { + c &= 0x3f; + uv = uv << 6 | c; + } + } + } + n = *lenp - 1; + if (uv < utf8_limits[n]) { + mrb_raisef(mrb, E_ARGUMENT_ERROR, "redundant UTF-8 sequence"); + } + return uv; +} + +static int +unpack_utf8(mrb_state *mrb, const unsigned char * src, int srclen, mrb_value ary, unsigned int flags) +{ + unsigned long uv; + long lenp = srclen; + + if (srclen == 0) { + return 1; + } + uv = utf8_to_uv(mrb, (const char *)src, &lenp); + mrb_ary_push(mrb, ary, mrb_fixnum_value((mrb_int)uv)); + return (int)lenp; +} + static int pack_a(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, unsigned int flags) { @@ -482,7 +564,7 @@ pack_a(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, u while (padlen-- > 0) { *dptr++ = pad; } - + return dptr - dptr0; } @@ -541,7 +623,7 @@ pack_h(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, u } else if (slen > count) { slen = count; } - + dst = str_len_ensure(mrb, dst, didx + count); dptr = RSTRING_PTR(dst) + didx; @@ -1147,6 +1229,11 @@ mrb_pack_unpack(mrb_state *mrb, mrb_value str) case PACK_DIR_DOUBLE: srcidx += unpack_double(mrb, sptr, srclen - srcidx, result, flags); break; + case PACK_DIR_UTF8: + srcidx += unpack_utf8(mrb, sptr, srclen - srcidx, result, flags); + break; + default: + mrb_raise(mrb, E_RUNTIME_ERROR, "mruby-pack's bug"); } if (count > 0) { count--; diff --git a/test/pack.rb b/test/pack.rb index 5e9932f4f..f518ca4fa 100644 --- a/test/pack.rb +++ b/test/pack.rb @@ -145,3 +145,21 @@ assert 'pack/unpack "I"' do end assert_pack 'I', str, [12345] end + +assert 'pack/unpack "U"' do + assert_equal [], "".unpack("U") + assert_equal [], "".unpack("U*") + assert_equal [65, 66], "ABC".unpack("U2") + assert_equal [12371, 12435, 12395, 12385, 12399, 19990, 30028], "こんにちは世界".unpack("U*") + + assert_equal "", [].pack("U") + assert_equal "", [].pack("U*") + assert_equal "AB", [65, 66, 67].pack("U2") + assert_equal "こんにちは世界", [12371, 12435, 12395, 12385, 12399, 19990, 30028].pack("U*") + + assert_equal "\000", [0].pack("U") + + assert_raise(RangeError) { [-0x40000000].pack("U") } + assert_raise(RangeError) { [-1].pack("U") } + assert_raise(RangeError) { [0x40000000].pack("U") } +end |
