summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--src/pack.c83
-rw-r--r--test/pack.rb14
2 files changed, 95 insertions, 2 deletions
diff --git a/src/pack.c b/src/pack.c
index 6a0075f5a..8c76b2638 100644
--- a/src/pack.c
+++ b/src/pack.c
@@ -448,6 +448,80 @@ pack_utf8(mrb_state *mrb, mrb_value o, mrb_value str, mrb_int sidx, long count,
return len;
}
+static const unsigned long utf8_limits[] = {
+ 0x0, /* 1 */
+ 0x80, /* 2 */
+ 0x800, /* 3 */
+ 0x10000, /* 4 */
+ 0x200000, /* 5 */
+ 0x4000000, /* 6 */
+ 0x80000000, /* 7 */
+};
+
+static unsigned long
+utf8_to_uv(mrb_state *mrb, const char *p, long *lenp)
+{
+ int c = *p++ & 0xff;
+ unsigned long uv = c;
+ long n;
+
+ if (!(uv & 0x80)) {
+ *lenp = 1;
+ return uv;
+ }
+ if (!(uv & 0x40)) {
+ *lenp = 1;
+ mrb_raise(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character");
+ }
+
+ if (!(uv & 0x20)) { n = 2; uv &= 0x1f; }
+ else if (!(uv & 0x10)) { n = 3; uv &= 0x0f; }
+ else if (!(uv & 0x08)) { n = 4; uv &= 0x07; }
+ else if (!(uv & 0x04)) { n = 5; uv &= 0x03; }
+ else if (!(uv & 0x02)) { n = 6; uv &= 0x01; }
+ else {
+ *lenp = 1;
+ mrb_raise(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character");
+ }
+ if (n > *lenp) {
+ mrb_raisef(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character (expected %S bytes, given %S bytes)",
+ mrb_fixnum_value(n), mrb_fixnum_value(*lenp));
+ }
+ *lenp = n--;
+ if (n != 0) {
+ while (n--) {
+ c = *p++ & 0xff;
+ if ((c & 0xc0) != 0x80) {
+ *lenp -= n + 1;
+ mrb_raisef(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character");
+ }
+ else {
+ c &= 0x3f;
+ uv = uv << 6 | c;
+ }
+ }
+ }
+ n = *lenp - 1;
+ if (uv < utf8_limits[n]) {
+ mrb_raisef(mrb, E_ARGUMENT_ERROR, "redundant UTF-8 sequence");
+ }
+ return uv;
+}
+
+static int
+unpack_utf8(mrb_state *mrb, const unsigned char * src, int srclen, mrb_value ary, unsigned int flags)
+{
+ unsigned long uv;
+ long lenp = srclen;
+
+ if (srclen == 0) {
+ return 1;
+ }
+ uv = utf8_to_uv(mrb, (const char *)src, &lenp);
+ mrb_ary_push(mrb, ary, mrb_fixnum_value((mrb_int)uv));
+ return (int)lenp;
+}
+
static int
pack_a(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, unsigned int flags)
{
@@ -482,7 +556,7 @@ pack_a(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, u
while (padlen-- > 0) {
*dptr++ = pad;
}
-
+
return dptr - dptr0;
}
@@ -541,7 +615,7 @@ pack_h(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, u
} else if (slen > count) {
slen = count;
}
-
+
dst = str_len_ensure(mrb, dst, didx + count);
dptr = RSTRING_PTR(dst) + didx;
@@ -1151,6 +1225,11 @@ mrb_pack_unpack(mrb_state *mrb, mrb_value str)
case PACK_DIR_DOUBLE:
srcidx += unpack_double(mrb, sptr, srclen - srcidx, result, flags);
break;
+ case PACK_DIR_UTF8:
+ srcidx += unpack_utf8(mrb, sptr, srclen - srcidx, result, flags);
+ break;
+ default:
+ mrb_raise(mrb, E_RUNTIME_ERROR, "mruby-pack's bug");
}
if (count > 0) {
count--;
diff --git a/test/pack.rb b/test/pack.rb
index 5e9932f4f..e9f5fb040 100644
--- a/test/pack.rb
+++ b/test/pack.rb
@@ -145,3 +145,17 @@ assert 'pack/unpack "I"' do
end
assert_pack 'I', str, [12345]
end
+
+assert 'pack/unpack "U"' do
+ assert_equal [], "".unpack("U")
+ assert_equal [], "".unpack("U*")
+ assert_equal [65, 66], "ABC".unpack("U2")
+ assert_equal [12371, 12435, 12395, 12385, 12399, 19990, 30028], "こんにちは世界".unpack("U*")
+
+ assert_equal "", [].pack("U")
+ assert_equal "", [].pack("U*")
+ assert_equal "AB", [65, 66, 67].pack("U2")
+ assert_equal "こんにちは世界", [12371, 12435, 12395, 12385, 12399, 19990, 30028].pack("U*")
+
+ assert_equal "\000", [0].pack("U")
+end