summary refs log tree commit diff stats
path: root/compiler/lexer.nim
diff options
context:
space:
mode:
authorParashurama <Rhagdamaziel@ymail.com>2016-06-01 23:36:46 +0200
committerParashurama <Rhagdamaziel@ymail.com>2016-06-01 23:36:46 +0200
commit8ce9739f117c7807076bfffd20662ac11ded57a9 (patch)
treee8bf27900c3c2ae60e51f532f9f07374ca537db6 /compiler/lexer.nim
parent7f09d6bf1f0eafe2ce4336483035f76ae0ef3539 (diff)
downloadNim-8ce9739f117c7807076bfffd20662ac11ded57a9.tar.gz
adds support for unicode hexcode in string literals.
Diffstat (limited to 'compiler/lexer.nim')
-rw-r--r--compiler/lexer.nim23
1 files changed, 21 insertions, 2 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim
index 0a4c01ba8..8b201431e 100644
--- a/compiler/lexer.nim
+++ b/compiler/lexer.nim
@@ -138,6 +138,8 @@ proc getLineInfo*(L: TLexer, tok: TToken): TLineInfo {.inline.} =
 proc isKeyword*(kind: TTokType): bool =
   result = (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)
 
+template ones(n: expr): expr = ((1 shl n)-1) # for utf-8 conversion
+
 proc isNimIdentifier*(s: string): bool =
   if s[0] in SymStartChars:
     var i = 1
@@ -589,12 +591,29 @@ proc getEscapedChar(L: var TLexer, tok: var TToken) =
   of '\\':
     add(tok.literal, '\\')
     inc(L.bufpos)
-  of 'x', 'X':
+  of 'x', 'X', 'u', 'U':
+    var tp = L.buf[L.bufpos]
     inc(L.bufpos)
     var xi = 0
     handleHexChar(L, xi)
     handleHexChar(L, xi)
-    add(tok.literal, chr(xi))
+    if tp in {'u', 'U'}:
+      handleHexChar(L, xi)
+      handleHexChar(L, xi)
+      # inlined toUTF-8 to avoid unicode and strutils dependencies.
+      if xi <=% 127:
+        add(tok.literal, xi.char )
+      elif xi <=% 0x07FF:
+        add(tok.literal, ((xi shr 6) or 0b110_00000).char )
+        add(tok.literal, ((xi and ones(6)) or 0b10_0000_00).char )
+      elif xi <=% 0xFFFF:
+        add(tok.literal, (xi shr 12 or 0b1110_0000).char )
+        add(tok.literal, (xi shr 6 and ones(6) or 0b10_0000_00).char )
+        add(tok.literal, (xi and ones(6) or 0b10_0000_00).char )
+      else: # value is 0xFFFF
+        add(tok.literal, "\xef\xbf\xbf" )
+    else:
+      add(tok.literal, chr(xi))
   of '0'..'9':
     if matchTwoChars(L, '0', {'0'..'9'}):
       lexMessage(L, warnOctalEscape)