diff options
author | Andreas Rumpf <rumpf_a@web.de> | 2016-06-02 09:55:27 +0200 |
---|---|---|
committer | Andreas Rumpf <rumpf_a@web.de> | 2016-06-02 09:55:27 +0200 |
commit | ca6986b89c6d5a6cb3f33dda1237147596928cfe (patch) | |
tree | 2ea8211716c4987c0045dd30d23f222da34f9a22 | |
parent | c11de219e5d4384a4449d226239daa4ed79cb5a6 (diff) | |
parent | 8ce9739f117c7807076bfffd20662ac11ded57a9 (diff) | |
download | Nim-ca6986b89c6d5a6cb3f33dda1237147596928cfe.tar.gz |
Merge pull request #4258 from Parashurama/add_unicode_escape
adds support for unicode hexcode in string literals.
-rw-r--r-- | compiler/lexer.nim | 23 | ||||
-rw-r--r-- | tests/lexer/tstrlits.nim | 6 |
2 files changed, 26 insertions, 3 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim index 0a4c01ba8..8b201431e 100644 --- a/compiler/lexer.nim +++ b/compiler/lexer.nim @@ -138,6 +138,8 @@ proc getLineInfo*(L: TLexer, tok: TToken): TLineInfo {.inline.} = proc isKeyword*(kind: TTokType): bool = result = (kind >= tokKeywordLow) and (kind <= tokKeywordHigh) +template ones(n: expr): expr = ((1 shl n)-1) # for utf-8 conversion + proc isNimIdentifier*(s: string): bool = if s[0] in SymStartChars: var i = 1 @@ -589,12 +591,29 @@ proc getEscapedChar(L: var TLexer, tok: var TToken) = of '\\': add(tok.literal, '\\') inc(L.bufpos) - of 'x', 'X': + of 'x', 'X', 'u', 'U': + var tp = L.buf[L.bufpos] inc(L.bufpos) var xi = 0 handleHexChar(L, xi) handleHexChar(L, xi) - add(tok.literal, chr(xi)) + if tp in {'u', 'U'}: + handleHexChar(L, xi) + handleHexChar(L, xi) + # inlined toUTF-8 to avoid unicode and strutils dependencies. + if xi <=% 127: + add(tok.literal, xi.char ) + elif xi <=% 0x07FF: + add(tok.literal, ((xi shr 6) or 0b110_00000).char ) + add(tok.literal, ((xi and ones(6)) or 0b10_0000_00).char ) + elif xi <=% 0xFFFF: + add(tok.literal, (xi shr 12 or 0b1110_0000).char ) + add(tok.literal, (xi shr 6 and ones(6) or 0b10_0000_00).char ) + add(tok.literal, (xi and ones(6) or 0b10_0000_00).char ) + else: # value is 0xFFFF + add(tok.literal, "\xef\xbf\xbf" ) + else: + add(tok.literal, chr(xi)) of '0'..'9': if matchTwoChars(L, '0', {'0'..'9'}): lexMessage(L, warnOctalEscape) diff --git a/tests/lexer/tstrlits.nim b/tests/lexer/tstrlits.nim index f5b7ce937..cc8872f60 100644 --- a/tests/lexer/tstrlits.nim +++ b/tests/lexer/tstrlits.nim @@ -1,6 +1,6 @@ discard """ file: "tstrlits.nim" - output: "a\"\"long string\"\"\"\"\"abc\"def" + output: "a\"\"long string\"\"\"\"\"abc\"def_'2'●" """ # Test the new different string literals @@ -11,9 +11,13 @@ const raw = r"abc""def" + escaped = "\x5f'\50'\u25cf" + + stdout.write(rawQuote) stdout.write(tripleEmpty) stdout.write(raw) +stdout.write(escaped) #OUT a""long string"""""abc"def |