Merge pull request #1930 from chasonr/unicode-escape

Implement \u notation for strings and regexes.
author: Yukihiro "Matz" Matsumoto <[email protected]> 2014-03-24 17:35:32 +0900
committer: Yukihiro "Matz" Matsumoto <[email protected]> 2014-03-24 17:35:32 +0900
commit: 2ce232144b5d3eacd9c3755306ee12cd28748dcf (patch)
tree: 64a68e6920faaed056364b1db6f46cf187fff731
parent: 53d6df3fdbee55c149b0f5d2c98ad8a1939d6c34 (diff)
parent: 8162295ba25adfdf3135b61e89629c46fcc19472 (diff)
download: mruby-2ce232144b5d3eacd9c3755306ee12cd28748dcf.tar.gz
mruby-2ce232144b5d3eacd9c3755306ee12cd28748dcf.zip
2 files changed, 157 insertions, 10 deletions
diff --git a/src/parse.y b/src/parse.y
index 8e7056b75..67d1aee2f 100644
--- a/src/parse.y
+++ b/src/parse.y
@@ -40,7 +40,7 @@ static void yyerror(parser_state *p, const char *s);
 static void yywarn(parser_state *p, const char *s);
 static void yywarning(parser_state *p, const char *s);
 static void backref_error(parser_state *p, node *n);
-static void tokadd(parser_state *p, int c);
+static void tokadd(parser_state *p, int32_t c);
 
 #ifndef isascii
 #define isascii(c) (((c) & ~0x7f) == 0)
@@ -3465,10 +3465,44 @@ newtok(parser_state *p)
 }
 
 static void
-tokadd(parser_state *p, int c)
+tokadd(parser_state *p, int32_t c)
 {
-  if (p->bidx < MRB_PARSER_BUF_SIZE) {
-    p->buf[p->bidx++] = c;
+  char utf8[4];
+  unsigned len;
+
+  /* mrb_assert(-0x10FFFF <= c && c <= 0xFF); */
+  if (c >= 0) {
+    /* Single byte from source or non-Unicode escape */
+    utf8[0] = (char)c;
+    len = 1;
+  } else {
+    /* Unicode character */
+    c = -c;
+    if (c < 0x80) {
+      utf8[0] = (char)c;
+      len = 1;
+    } else if (c < 0x800) {
+      utf8[0] = (char)(0xC0 | (c >> 6));
+      utf8[1] = (char)(0x80 | (c & 0x3F));
+      len = 2;
+    } else if (c < 0x10000) {
+      utf8[0] = (char)(0xE0 |  (c >> 12)        );
+      utf8[1] = (char)(0x80 | ((c >>  6) & 0x3F));
+      utf8[2] = (char)(0x80 | ( c        & 0x3F));
+      len = 3;
+    } else {
+      utf8[0] = (char)(0xF0 |  (c >> 18)        );
+      utf8[1] = (char)(0x80 | ((c >> 12) & 0x3F));
+      utf8[2] = (char)(0x80 | ((c >>  6) & 0x3F));
+      utf8[3] = (char)(0x80 | ( c        & 0x3F));
+      len = 4;
+    }
+  }
+  if (p->bidx+len <= MRB_PARSER_BUF_SIZE) {
+    unsigned i;
+    for (i = 0; i < len; i++) {
+      p->buf[p->bidx++] = utf8[i];
+    }
   }
 }
 
@@ -3522,15 +3556,15 @@ scan_oct(const int *start, int len, int *retlen)
   return retval;
 }
 
-static int
+static int32_t
 scan_hex(const int *start, int len, int *retlen)
 {
   static const char hexdigit[] = "0123456789abcdef0123456789ABCDEF";
   const int *s = start;
-  int retval = 0;
+  int32_t retval = 0;
   char *tmp;
 
-  /* mrb_assert(len <= 2) */
+  /* mrb_assert(len <= 8) */
   while (len-- && *s && (tmp = (char*)strchr(hexdigit, *s))) {
     retval <<= 4;
     retval |= (tmp - hexdigit) & 15;
@@ -3541,10 +3575,11 @@ scan_hex(const int *start, int len, int *retlen)
   return retval;
 }
 
-static int
+/* Return negative to indicate Unicode code point */
+static int32_t
 read_escape(parser_state *p)
 {
-  int c;
+  int32_t c;
 
   switch (c = nextc(p)) {
   case '\\':/* Backslash */
@@ -3611,6 +3646,53 @@ read_escape(parser_state *p)
   }
   return c;
 
+  case 'u':     /* Unicode */
+  {
+    int buf[9];
+    int i;
+
+    /* Look for opening brace */
+    i = 0;
+    buf[0] = nextc(p);
+    if (buf[0] < 0) goto eof;
+    if (buf[0] == '{') {
+      /* \u{xxxxxxxx} form */
+      for (i=0; i<9; i++) {
+        buf[i] = nextc(p);
+        if (buf[i] < 0) goto eof;
+        if (buf[i] == '}') {
+          break;
+        } else if (!ISXDIGIT(buf[i])) {
+          yyerror(p, "Invalid escape character syntax");
+          pushback(p, buf[i]);
+          return 0;
+        }
+      }
+    } else if (ISXDIGIT(buf[0])) {
+      /* \uxxxx form */
+      for (i=1; i<4; i++) {
+        buf[i] = nextc(p);
+        if (buf[i] < 0) goto eof;
+        if (!ISXDIGIT(buf[i])) {
+          pushback(p, buf[i]);
+          break;
+        }
+      }
+    } else {
+      pushback(p, buf[0]);
+    }
+    c = scan_hex(buf, i, &i);
+    if (i == 0) {
+      yyerror(p, "Invalid escape character syntax");
+      return 0;
+    }
+    if (c < 0 || c > 0x10FFFF || (c & 0xFFFFF800) == 0xD800) {
+      yyerror(p, "Invalid Unicode code point");
+      return 0;
+    }
+  }
+  return -c;
+
   case 'b':/* backspace */
     return '\010';
 
@@ -3726,9 +3808,14 @@ parse_string(parser_state *p)
         }
         else {
           if (type & STR_FUNC_REGEXP) {
+            if (c == 'u') {
+              pushback(p, c);
+              tokadd(p, read_escape(p));
+            } else {
             tokadd(p, '\\');
             if (c >= 0)
               tokadd(p, c);
+            }
           } else {
             pushback(p, c);
             tokadd(p, read_escape(p));
@@ -3932,7 +4019,7 @@ arg_ambiguous(parser_state *p)
 static int
 parser_yylex(parser_state *p)
 {
-  int c;
+  int32_t c;
   int space_seen = 0;
   int cmd_state;
   enum mrb_lex_state_enum last_state;
diff --git a/test/t/unicode.rb b/test/t/unicode.rb
new file mode 100644
index 000000000..a8e8c0e14
--- /dev/null
+++ b/test/t/unicode.rb
@@ -0,0 +1,60 @@
+# Test of the \u notation
+
+assert('bare \u notation test') do
+  # Minimum and maximum one byte characters
+  assert_equal("\u0000", "\x00")
+  assert_equal("\u007F", "\x7F")
+
+  # Minimum and maximum two byte characters
+  assert_equal("\u0080", "\xC2\x80")
+  assert_equal("\u07FF", "\xDF\xBF")
+
+  # Minimum and maximum three byte characters
+  assert_equal("\u0800", "\xE0\xA0\x80")
+  assert_equal("\uFFFF", "\xEF\xBF\xBF")
+
+  # Four byte characters require the braced \u{...} notation
+end
+
+assert('braced \u notation test') do
+  # Minimum and maximum one byte characters
+  assert_equal("\u{0000}", "\x00")
+  assert_equal("\u{007F}", "\x7F")
+
+  # Minimum and maximum two byte characters
+  assert_equal("\u{0080}", "\xC2\x80")
+  assert_equal("\u{07FF}", "\xDF\xBF")
+
+  # Minimum and maximum three byte characters
+  assert_equal("\u{0800}", "\xE0\xA0\x80")
+  assert_equal("\u{FFFF}", "\xEF\xBF\xBF")
+
+  # Minimum and maximum four byte characters
+  assert_equal("\u{10000}",  "\xF0\x90\x80\x80")
+  assert_equal("\u{10FFFF}", "\xF4\x8F\xBF\xBF")
+end
+
+# Test regular expressions only if implemented
+begin
+  Regexp
+  have_regexp = true
+rescue NameError
+  have_regexp = false
+end
+if have_regexp then
+  assert('Testing \u in regular expressions') do
+    # The regular expression uses the unbraced notation where the string uses
+    # the braced notation, and vice versa, so these tests will fail if the \u
+    # modification is not applied
+
+    # Test of unbraced \u notation in a regular expression
+    assert_false(/\u0300/ =~ "\u{02FF}")
+    assert_true( /\u0300/ =~ "\u{0300}")
+    assert_false(/\u0300/ =~ "\u{0301}")
+
+    # Test of braced \u notation in a regular expression
+    assert_false(/\u{0300}/ =~ "\u02FF")
+    assert_true( /\u{0300}/ =~ "\u0300")
+    assert_false(/\u{0300}/ =~ "\u0301")
+  end
+end
author	Yukihiro "Matz" Matsumoto <[email protected]>	2014-03-24 17:35:32 +0900
committer	Yukihiro "Matz" Matsumoto <[email protected]>	2014-03-24 17:35:32 +0900
commit	2ce232144b5d3eacd9c3755306ee12cd28748dcf (patch)
tree	64a68e6920faaed056364b1db6f46cf187fff731
parent	53d6df3fdbee55c149b0f5d2c98ad8a1939d6c34 (diff)
parent	8162295ba25adfdf3135b61e89629c46fcc19472 (diff)
download	mruby-2ce232144b5d3eacd9c3755306ee12cd28748dcf.tar.gz mruby-2ce232144b5d3eacd9c3755306ee12cd28748dcf.zip