Merge pull request #4258 from Parashurama/add_unicode_escape

adds support for unicode hexcode in string literals.
author: Andreas Rumpf <rumpf_a@web.de> 2016-06-02 09:55:27 +0200
committer: Andreas Rumpf <rumpf_a@web.de> 2016-06-02 09:55:27 +0200
commit: ca6986b89c6d5a6cb3f33dda1237147596928cfe (patch)
tree: 2ea8211716c4987c0045dd30d23f222da34f9a22
parent: c11de219e5d4384a4449d226239daa4ed79cb5a6 (diff)
parent: 8ce9739f117c7807076bfffd20662ac11ded57a9 (diff)
download: Nim-ca6986b89c6d5a6cb3f33dda1237147596928cfe.tar.gz
2 files changed, 26 insertions, 3 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim
index 0a4c01ba8..8b201431e 100644
--- a/compiler/lexer.nim
+++ b/compiler/lexer.nim
@@ -138,6 +138,8 @@ proc getLineInfo*(L: TLexer, tok: TToken): TLineInfo {.inline.} =
 proc isKeyword*(kind: TTokType): bool =
   result = (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)
 
+template ones(n: expr): expr = ((1 shl n)-1) # for utf-8 conversion
+
 proc isNimIdentifier*(s: string): bool =
   if s[0] in SymStartChars:
     var i = 1
@@ -589,12 +591,29 @@ proc getEscapedChar(L: var TLexer, tok: var TToken) =
   of '\\':
     add(tok.literal, '\\')
     inc(L.bufpos)
-  of 'x', 'X':
+  of 'x', 'X', 'u', 'U':
+    var tp = L.buf[L.bufpos]
     inc(L.bufpos)
     var xi = 0
     handleHexChar(L, xi)
     handleHexChar(L, xi)
-    add(tok.literal, chr(xi))
+    if tp in {'u', 'U'}:
+      handleHexChar(L, xi)
+      handleHexChar(L, xi)
+      # inlined toUTF-8 to avoid unicode and strutils dependencies.
+      if xi <=% 127:
+        add(tok.literal, xi.char )
+      elif xi <=% 0x07FF:
+        add(tok.literal, ((xi shr 6) or 0b110_00000).char )
+        add(tok.literal, ((xi and ones(6)) or 0b10_0000_00).char )
+      elif xi <=% 0xFFFF:
+        add(tok.literal, (xi shr 12 or 0b1110_0000).char )
+        add(tok.literal, (xi shr 6 and ones(6) or 0b10_0000_00).char )
+        add(tok.literal, (xi and ones(6) or 0b10_0000_00).char )
+      else: # value is above 0xFFFF: emit U+FFFF (UTF-8 EF BF BF) instead
+        add(tok.literal, "\xef\xbf\xbf" )
+    else:
+      add(tok.literal, chr(xi))
   of '0'..'9':
     if matchTwoChars(L, '0', {'0'..'9'}):
       lexMessage(L, warnOctalEscape)
diff --git a/tests/lexer/tstrlits.nim b/tests/lexer/tstrlits.nim
index f5b7ce937..cc8872f60 100644
--- a/tests/lexer/tstrlits.nim
+++ b/tests/lexer/tstrlits.nim
@@ -1,6 +1,6 @@
 discard """
   file: "tstrlits.nim"
-  output: "a\"\"long string\"\"\"\"\"abc\"def"
+  output: "a\"\"long string\"\"\"\"\"abc\"def_'2'●"
 """
 # Test the new different string literals
 
@@ -11,9 +11,13 @@ const
 
   raw = r"abc""def"
 
+  escaped = "\x5f'\50'\u25cf"
+
+
 stdout.write(rawQuote)
 stdout.write(tripleEmpty)
 stdout.write(raw)
+stdout.write(escaped)
 #OUT a""long string"""""abc"def
author	Andreas Rumpf <rumpf_a@web.de>	2016-06-02 09:55:27 +0200
committer	Andreas Rumpf <rumpf_a@web.de>	2016-06-02 09:55:27 +0200
commit	ca6986b89c6d5a6cb3f33dda1237147596928cfe (patch)
tree	2ea8211716c4987c0045dd30d23f222da34f9a22
parent	c11de219e5d4384a4449d226239daa4ed79cb5a6 (diff)
parent	8ce9739f117c7807076bfffd20662ac11ded57a9 (diff)
download	Nim-ca6986b89c6d5a6cb3f33dda1237147596928cfe.tar.gz