Merge pull request #3312 from nobu/feature/multi-unicode-escape

Feature/multi unicode escape
author: Yukihiro "Matz" Matsumoto <[email protected]> 2016-12-01 18:42:40 +0900
committer: GitHub <[email protected]> 2016-12-01 18:42:40 +0900
commit: f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f (patch)
tree: f7af4f06597947fd8e5d31307bca554414a23234
parent: 4d807fc619e7feccd2bc079ef76a9e817e36fa59 (diff)
parent: 0f08914ac0d433545a4224ee1c3f8d3eb8d51e68 (diff)
download: mruby-f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f.tar.gz
mruby-f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f.zip
2 files changed, 77 insertions, 57 deletions
diff --git a/mrbgems/mruby-compiler/core/parse.y b/mrbgems/mruby-compiler/core/parse.y
index ef522d239..f0c45b85b 100644
--- a/mrbgems/mruby-compiler/core/parse.y
+++ b/mrbgems/mruby-compiler/core/parse.y
@@ -3752,6 +3752,44 @@ scan_hex(const int *start, int len, int *retlen)
   return retval;
 }
 
+static int32_t
+read_escape_unicode(parser_state *p, size_t limit)
+{
+  int32_t c;
+  int buf[9];                           /* visible callers pass limit 4 or 8 */
+  int i;
+
+  /* Read up to `limit` hex digits as one code point; callers consume any '{' before calling */
+  i = 0;
+  buf[0] = nextc(p);
+  if (buf[0] < 0) goto eof;             /* negative nextc() result is reported as a syntax error */
+  if (ISXDIGIT(buf[0])) {
+    /* buf[0] already holds the first digit; collect the rest, stopping at the first non-hex char */
+    for (i=1; i<limit; i++) {
+      buf[i] = nextc(p);
+      if (buf[i] < 0) goto eof;
+      if (!ISXDIGIT(buf[i])) {
+        pushback(p, buf[i]);            /* leave the terminator for the caller */
+        break;
+      }
+    }
+  }
+  else {
+    pushback(p, buf[0]);                /* not a hex digit: i stays 0 */
+  }
+  c = scan_hex(buf, i, &i);             /* i is overwritten via retlen -- presumably the digits consumed */
+  if (i == 0) {
+  eof:                                  /* also the target of the goto eof jumps above */
+    yyerror(p, "Invalid escape character syntax");
+    return -1;
+  }
+  if (c < 0 || c > 0x10FFFF || (c & 0xFFFFF800) == 0xD800) {  /* mask test matches 0xD800-0xDFFF (surrogates) */
+    yyerror(p, "Invalid Unicode code point");
+    return -1;
+  }
+  return c;                             /* non-negative code point; -1 means an error was reported */
+}
+
 /* Return negative to indicate Unicode code point */
 static int32_t
 read_escape(parser_state *p)
@@ -3824,53 +3862,17 @@ read_escape(parser_state *p)
   return c;
 
   case 'u':     /* Unicode */
-  {
-    int buf[9];
-    int i;
-
-    /* Look for opening brace */
-    i = 0;
-    buf[0] = nextc(p);
-    if (buf[0] < 0) goto eof;
-    if (buf[0] == '{') {
+    if (peek(p, '{')) {
       /* \u{xxxxxxxx} form */
-      for (i=0; i<9; i++) {
-        buf[i] = nextc(p);
-        if (buf[i] < 0) goto eof;
-        if (buf[i] == '}') {
-          break;
-        }
-        else if (!ISXDIGIT(buf[i])) {
-          yyerror(p, "Invalid escape character syntax");
-          pushback(p, buf[i]);
-          return 0;
-        }
-      }
-    }
-    else if (ISXDIGIT(buf[0])) {
-      /* \uxxxx form */
-      for (i=1; i<4; i++) {
-        buf[i] = nextc(p);
-        if (buf[i] < 0) goto eof;
-        if (!ISXDIGIT(buf[i])) {
-          pushback(p, buf[i]);
-          break;
-        }
-      }
+      nextc(p);
+      c = read_escape_unicode(p, 8);
+      if (c < 0) return 0;
+      if (nextc(p) != '}') goto eof;
     }
     else {
-      pushback(p, buf[0]);
-    }
-    c = scan_hex(buf, i, &i);
-    if (i == 0) {
-      yyerror(p, "Invalid escape character syntax");
-      return 0;
+      c = read_escape_unicode(p, 4);
+      if (c < 0) return 0;
     }
-    if (c < 0 || c > 0x10FFFF || (c & 0xFFFFF800) == 0xD800) {
-      yyerror(p, "Invalid Unicode code point");
-      return 0;
-    }
-  }
   return -c;
 
   case 'b':/* backspace */
@@ -3993,6 +3995,20 @@ parse_string(parser_state *p)
           tokadd(p, '\\');
           tokadd(p, c);
         }
+        else if (c == 'u' && peek(p, '{')) {
+          /* \u{xxxx xxxx xxxx} form */
+          nextc(p);
+          while (1) {
+            do c = nextc(p); while (ISSPACE(c));
+            if (c == '}') break;
+            pushback(p, c);
+            c = read_escape_unicode(p, 8);
+            if (c < 0) break;
+            tokadd(p, -c);
+          }
+          if (hinf)
+            hinf->line_head = FALSE;
+        }
         else {
           pushback(p, c);
           tokadd(p, read_escape(p));
diff --git a/test/t/unicode.rb b/test/t/unicode.rb
index 7edd65ef2..8622ae08a 100644
--- a/test/t/unicode.rb
+++ b/test/t/unicode.rb
@@ -2,34 +2,38 @@
 
 assert('bare \u notation test') do
   # Mininum and maximum one byte characters
-  assert_equal("\u0000", "\x00")
-  assert_equal("\u007F", "\x7F")
+  assert_equal("\x00", "\u0000")
+  assert_equal("\x7F", "\u007F")
 
   # Mininum and maximum two byte characters
-  assert_equal("\u0080", "\xC2\x80")
-  assert_equal("\u07FF", "\xDF\xBF")
+  assert_equal("\xC2\x80", "\u0080")
+  assert_equal("\xDF\xBF", "\u07FF")
 
   # Mininum and maximum three byte characters
-  assert_equal("\u0800", "\xE0\xA0\x80")
-  assert_equal("\uFFFF", "\xEF\xBF\xBF")
+  assert_equal("\xE0\xA0\x80", "\u0800")
+  assert_equal("\xEF\xBF\xBF", "\uFFFF")
 
   # Four byte characters require the \U notation
 end
 
 assert('braced \u notation test') do
   # Mininum and maximum one byte characters
-  assert_equal("\u{0000}", "\x00")
-  assert_equal("\u{007F}", "\x7F")
+  assert_equal("\x00", "\u{0000}")
+  assert_equal("\x7F", "\u{007F}")
 
   # Mininum and maximum two byte characters
-  assert_equal("\u{0080}", "\xC2\x80")
-  assert_equal("\u{07FF}", "\xDF\xBF")
+  assert_equal("\xC2\x80", "\u{0080}")
+  assert_equal("\xDF\xBF", "\u{07FF}")
 
   # Mininum and maximum three byte characters
-  assert_equal("\u{0800}", "\xE0\xA0\x80")
-  assert_equal("\u{FFFF}", "\xEF\xBF\xBF")
+  assert_equal("\xE0\xA0\x80", "\u{0800}")
+  assert_equal("\xEF\xBF\xBF", "\u{FFFF}")
 
   # Mininum and maximum four byte characters
-  assert_equal("\u{10000}",  "\xF0\x90\x80\x80")
-  assert_equal("\u{10FFFF}", "\xF4\x8F\xBF\xBF")
+  assert_equal("\xF0\x90\x80\x80", "\u{10000}")
+  assert_equal("\xF4\x8F\xBF\xBF", "\u{10FFFF}")
+end
+
+assert('braced multiple \u notation test') do
+  assert_equal("ABC", "\u{41 42 43}")
 end
author	Yukihiro "Matz" Matsumoto <[email protected]>	2016-12-01 18:42:40 +0900
committer	GitHub <[email protected]>	2016-12-01 18:42:40 +0900
commit	f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f (patch)
tree	f7af4f06597947fd8e5d31307bca554414a23234
parent	4d807fc619e7feccd2bc079ef76a9e817e36fa59 (diff)
parent	0f08914ac0d433545a4224ee1c3f8d3eb8d51e68 (diff)
download	mruby-f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f.tar.gz mruby-f1cf6ef8179313b31a12ed4e5eba509d3b2aed0f.zip