summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Yukihiro "Matz" Matsumoto <[email protected]> 2014-03-24 17:35:32 +0900
committer Yukihiro "Matz" Matsumoto <[email protected]> 2014-03-24 17:35:32 +0900
commit 2ce232144b5d3eacd9c3755306ee12cd28748dcf (patch)
tree 64a68e6920faaed056364b1db6f46cf187fff731
parent 53d6df3fdbee55c149b0f5d2c98ad8a1939d6c34 (diff)
parent 8162295ba25adfdf3135b61e89629c46fcc19472 (diff)
download mruby-2ce232144b5d3eacd9c3755306ee12cd28748dcf.tar.gz
mruby-2ce232144b5d3eacd9c3755306ee12cd28748dcf.zip
Merge pull request #1930 from chasonr/unicode-escape
Implement \u notation for strings and regexes.
-rw-r--r-- src/parse.y 107
-rw-r--r-- test/t/unicode.rb 60
2 files changed, 157 insertions, 10 deletions
diff --git a/src/parse.y b/src/parse.y
index 8e7056b75..67d1aee2f 100644
--- a/src/parse.y
+++ b/src/parse.y
@@ -40,7 +40,7 @@ static void yyerror(parser_state *p, const char *s);
static void yywarn(parser_state *p, const char *s);
static void yywarning(parser_state *p, const char *s);
static void backref_error(parser_state *p, node *n);
-static void tokadd(parser_state *p, int c);
+static void tokadd(parser_state *p, int32_t c);
#ifndef isascii
#define isascii(c) (((c) & ~0x7f) == 0)
@@ -3465,10 +3465,44 @@ newtok(parser_state *p)
}
static void
-tokadd(parser_state *p, int c)
+tokadd(parser_state *p, int32_t c)
{
- if (p->bidx < MRB_PARSER_BUF_SIZE) {
- p->buf[p->bidx++] = c;
+ char utf8[4];
+ unsigned len;
+
+ /* mrb_assert(-0x10FFFF <= c && c <= 0xFF); */
+ if (c >= 0) {
+ /* Single byte from source or non-Unicode escape */
+ utf8[0] = (char)c;
+ len = 1;
+ } else {
+ /* Unicode character */
+ c = -c;
+ if (c < 0x80) {
+ utf8[0] = (char)c;
+ len = 1;
+ } else if (c < 0x800) {
+ utf8[0] = (char)(0xC0 | (c >> 6));
+ utf8[1] = (char)(0x80 | (c & 0x3F));
+ len = 2;
+ } else if (c < 0x10000) {
+ utf8[0] = (char)(0xE0 | (c >> 12) );
+ utf8[1] = (char)(0x80 | ((c >> 6) & 0x3F));
+ utf8[2] = (char)(0x80 | ( c & 0x3F));
+ len = 3;
+ } else {
+ utf8[0] = (char)(0xF0 | (c >> 18) );
+ utf8[1] = (char)(0x80 | ((c >> 12) & 0x3F));
+ utf8[2] = (char)(0x80 | ((c >> 6) & 0x3F));
+ utf8[3] = (char)(0x80 | ( c & 0x3F));
+ len = 4;
+ }
+ }
+ if (p->bidx+len <= MRB_PARSER_BUF_SIZE) {
+ unsigned i;
+ for (i = 0; i < len; i++) {
+ p->buf[p->bidx++] = utf8[i];
+ }
}
}
@@ -3522,15 +3556,15 @@ scan_oct(const int *start, int len, int *retlen)
return retval;
}
-static int
+static int32_t
scan_hex(const int *start, int len, int *retlen)
{
static const char hexdigit[] = "0123456789abcdef0123456789ABCDEF";
const int *s = start;
- int retval = 0;
+ int32_t retval = 0;
char *tmp;
- /* mrb_assert(len <= 2) */
+ /* mrb_assert(len <= 8) */
while (len-- && *s && (tmp = (char*)strchr(hexdigit, *s))) {
retval <<= 4;
retval |= (tmp - hexdigit) & 15;
@@ -3541,10 +3575,11 @@ scan_hex(const int *start, int len, int *retlen)
return retval;
}
-static int
+/* Return negative to indicate Unicode code point */
+static int32_t
read_escape(parser_state *p)
{
- int c;
+ int32_t c;
switch (c = nextc(p)) {
case '\\':/* Backslash */
@@ -3611,6 +3646,53 @@ read_escape(parser_state *p)
}
return c;
+ case 'u': /* Unicode */
+ {
+ int buf[9];
+ int i;
+
+ /* Look for opening brace */
+ i = 0;
+ buf[0] = nextc(p);
+ if (buf[0] < 0) goto eof;
+ if (buf[0] == '{') {
+ /* \u{xxxxxxxx} form */
+ for (i=0; i<9; i++) {
+ buf[i] = nextc(p);
+ if (buf[i] < 0) goto eof;
+ if (buf[i] == '}') {
+ break;
+ } else if (!ISXDIGIT(buf[i])) {
+ yyerror(p, "Invalid escape character syntax");
+ pushback(p, buf[i]);
+ return 0;
+ }
+ }
+ } else if (ISXDIGIT(buf[0])) {
+ /* \uxxxx form */
+ for (i=1; i<4; i++) {
+ buf[i] = nextc(p);
+ if (buf[i] < 0) goto eof;
+ if (!ISXDIGIT(buf[i])) {
+ pushback(p, buf[i]);
+ break;
+ }
+ }
+ } else {
+ pushback(p, buf[0]);
+ }
+ c = scan_hex(buf, i, &i);
+ if (i == 0) {
+ yyerror(p, "Invalid escape character syntax");
+ return 0;
+ }
+ if (c < 0 || c > 0x10FFFF || (c & 0xFFFFF800) == 0xD800) {
+ yyerror(p, "Invalid Unicode code point");
+ return 0;
+ }
+ }
+ return -c;
+
case 'b':/* backspace */
return '\010';
@@ -3726,9 +3808,14 @@ parse_string(parser_state *p)
}
else {
if (type & STR_FUNC_REGEXP) {
+ if (c == 'u') {
+ pushback(p, c);
+ tokadd(p, read_escape(p));
+ } else {
tokadd(p, '\\');
if (c >= 0)
tokadd(p, c);
+ }
} else {
pushback(p, c);
tokadd(p, read_escape(p));
@@ -3932,7 +4019,7 @@ arg_ambiguous(parser_state *p)
static int
parser_yylex(parser_state *p)
{
- int c;
+ int32_t c;
int space_seen = 0;
int cmd_state;
enum mrb_lex_state_enum last_state;
diff --git a/test/t/unicode.rb b/test/t/unicode.rb
new file mode 100644
index 000000000..a8e8c0e14
--- /dev/null
+++ b/test/t/unicode.rb
@@ -0,0 +1,60 @@
+# Test of the \u notation
+
+assert('bare \u notation test') do
+ # Minimum and maximum one byte characters
+ assert_equal("\u0000", "\x00")
+ assert_equal("\u007F", "\x7F")
+
+ # Minimum and maximum two byte characters
+ assert_equal("\u0080", "\xC2\x80")
+ assert_equal("\u07FF", "\xDF\xBF")
+
+ # Minimum and maximum three byte characters
+ assert_equal("\u0800", "\xE0\xA0\x80")
+ assert_equal("\uFFFF", "\xEF\xBF\xBF")
+
+ # Four byte characters require the braced \u{...} notation
+end
+
+assert('braced \u notation test') do
+ # Minimum and maximum one byte characters
+ assert_equal("\u{0000}", "\x00")
+ assert_equal("\u{007F}", "\x7F")
+
+ # Minimum and maximum two byte characters
+ assert_equal("\u{0080}", "\xC2\x80")
+ assert_equal("\u{07FF}", "\xDF\xBF")
+
+ # Minimum and maximum three byte characters
+ assert_equal("\u{0800}", "\xE0\xA0\x80")
+ assert_equal("\u{FFFF}", "\xEF\xBF\xBF")
+
+ # Minimum and maximum four byte characters
+ assert_equal("\u{10000}", "\xF0\x90\x80\x80")
+ assert_equal("\u{10FFFF}", "\xF4\x8F\xBF\xBF")
+end
+
+# Test regular expressions only if implemented
+begin
+ Regexp
+ have_regexp = true
+rescue NameError
+ have_regexp = false
+end
+if have_regexp then
+ assert('Testing \u in regular expressions') do
+ # The regular expression uses the unbraced notation where the string uses
+ # the braced notation, and vice versa, so these tests will fail if the \u
+ # modification is not applied
+
+ # Test of unbraced \u notation in a regular expression
+ assert_false(/\u0300/ =~ "\u{02FF}")
+ assert_true( /\u0300/ =~ "\u{0300}")
+ assert_false(/\u0300/ =~ "\u{0301}")
+
+ # Test of braced \u notation in a regular expression
+ assert_false(/\u{0300}/ =~ "\u02FF")
+ assert_true( /\u{0300}/ =~ "\u0300")
+ assert_false(/\u{0300}/ =~ "\u0301")
+ end
+end