summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorDaniel Hertz <daniel@dhertz.com>2015-10-13 14:35:17 -0400
committerDaniel Hertz <daniel@dhertz.com>2015-10-13 14:42:24 -0400
commite5bcd287f863f865b43a9c6807b085f44e00bbfe (patch)
treee4a2ae5b1ba718afbb8be4dde1b5909eaa7ed75a
parent7f4f37eaa20ea8aa5cf8c34e497aaa8245c590b3 (diff)
downloadNim-e5bcd287f863f865b43a9c6807b085f44e00bbfe.tar.gz
Make sure the json module decodes UTF16 correctly
JavaScript uses UTF-16 as its internal representation of strings,
so JSON does so as well. This means that we could have surrogate
pairs: code points above 0xFFFF take two \u escape codes to
decode.
-rw-r--r--lib/pure/json.nim30
1 files changed, 25 insertions, 5 deletions
diff --git a/lib/pure/json.nim b/lib/pure/json.nim
index 06d5a13e2..e419ecebb 100644
--- a/lib/pure/json.nim
+++ b/lib/pure/json.nim
@@ -203,6 +203,15 @@ proc handleHexChar(c: char, x: var int): bool =
   of 'A'..'F': x = (x shl 4) or (ord(c) - ord('A') + 10)
   else: result = false # error
 
+proc parseEscapedUTF16(buf: cstring, pos: var int): int =
+  result = 0
+  #UTF-16 escape is always 4 bytes.
+  for _ in 0..3:
+    if handleHexChar(buf[pos], result):
+      inc(pos)
+    else:
+      return -1
+
 proc parseString(my: var JsonParser): TokKind =
   result = tkString
   var pos = my.bufpos + 1
@@ -238,11 +247,22 @@ proc parseString(my: var JsonParser): TokKind =
         inc(pos, 2)
       of 'u':
         inc(pos, 2)
-        var r: int
-        if handleHexChar(buf[pos], r): inc(pos)
-        if handleHexChar(buf[pos], r): inc(pos)
-        if handleHexChar(buf[pos], r): inc(pos)
-        if handleHexChar(buf[pos], r): inc(pos)
+        var r = parseEscapedUTF16(buf, pos)
+        if r < 0:
+          my.err = errInvalidToken
+          break
+        # Deal with surrogates
+        if (r and 0xfc00) == 0xd800:
+          if buf[pos] & buf[pos+1] != "\\u":
+            my.err = errInvalidToken
+            break
+          inc(pos, 2)
+          var s = parseEscapedUTF16(buf, pos)
+          if (s and 0xfc00) == 0xdc00 and s > 0:
+            r = 0x10000 + (((r - 0xd800) shl 10) or (s - 0xdc00))
+          else:
+            my.err = errInvalidToken
+            break
         add(my.a, toUTF8(Rune(r)))
       else:
         # don't bother with the error