diff options
author | Daniel Hertz <daniel@dhertz.com> | 2015-10-13 14:35:17 -0400 |
---|---|---|
committer | Daniel Hertz <daniel@dhertz.com> | 2015-10-13 14:42:24 -0400 |
commit | e5bcd287f863f865b43a9c6807b085f44e00bbfe (patch) | |
tree | e4a2ae5b1ba718afbb8be4dde1b5909eaa7ed75a | |
parent | 7f4f37eaa20ea8aa5cf8c34e497aaa8245c590b3 (diff) | |
download | Nim-e5bcd287f863f865b43a9c6807b085f44e00bbfe.tar.gz |
Make sure the json module decodes UTF16 correctly
JavaScript uses UTF-16 as its internal representation of strings, so JSON escape sequences do so as well. This means that we could encounter surrogate pairs: codepoints above 0xFFFF take 2 escape codes to decode.
-rw-r--r-- | lib/pure/json.nim | 30 |
1 files changed, 25 insertions, 5 deletions
diff --git a/lib/pure/json.nim b/lib/pure/json.nim index 06d5a13e2..e419ecebb 100644 --- a/lib/pure/json.nim +++ b/lib/pure/json.nim @@ -203,6 +203,15 @@ proc handleHexChar(c: char, x: var int): bool = of 'A'..'F': x = (x shl 4) or (ord(c) - ord('A') + 10) else: result = false # error +proc parseEscapedUTF16(buf: cstring, pos: var int): int = + result = 0 + #UTF-16 escape is always 4 bytes. + for _ in 0..3: + if handleHexChar(buf[pos], result): + inc(pos) + else: + return -1 + proc parseString(my: var JsonParser): TokKind = result = tkString var pos = my.bufpos + 1 @@ -238,11 +247,22 @@ proc parseString(my: var JsonParser): TokKind = inc(pos, 2) of 'u': inc(pos, 2) - var r: int - if handleHexChar(buf[pos], r): inc(pos) - if handleHexChar(buf[pos], r): inc(pos) - if handleHexChar(buf[pos], r): inc(pos) - if handleHexChar(buf[pos], r): inc(pos) + var r = parseEscapedUTF16(buf, pos) + if r < 0: + my.err = errInvalidToken + break + # Deal with surrogates + if (r and 0xfc00) == 0xd800: + if buf[pos] & buf[pos+1] != "\\u": + my.err = errInvalidToken + break + inc(pos, 2) + var s = parseEscapedUTF16(buf, pos) + if (s and 0xfc00) == 0xdc00 and s > 0: + r = 0x10000 + (((r - 0xd800) shl 10) or (s - 0xdc00)) + else: + my.err = errInvalidToken + break add(my.a, toUTF8(Rune(r))) else: # don't bother with the error |