diff options
author | jangko <jangko128@gmail.com> | 2015-08-21 10:43:31 +0700 |
---|---|---|
committer | jangko <jangko128@gmail.com> | 2015-08-21 10:43:31 +0700 |
commit | 7c757599f1c9157a65e8e2238d4b11eedeeb01bf (patch) | |
tree | 3b5f0bc75b8f080ec77030625a3026fc80c9351d /lib | |
parent | c103eddc737a48d3de04e64c2086549b7ec33d6d (diff) | |
download | Nim-7c757599f1c9157a65e8e2238d4b11eedeeb01bf.tar.gz |
fixed UTF-16 to UTF-8 conversion in widestrs.nim
This was the source of the problem reported in issue #3228. Also adds a test for the entire range of valid UTF-16 and a test for invalid UTF-16 sequences.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/system/widestrs.nim | 26 |
1 files changed, 20 insertions, 6 deletions
diff --git a/lib/system/widestrs.nim b/lib/system/widestrs.nim index 94ae3e26b..77310b289 100644 --- a/lib/system/widestrs.nim +++ b/lib/system/widestrs.nim @@ -114,7 +114,7 @@ proc newWideCString*(s: cstring): WideCString = proc newWideCString*(s: string): WideCString = result = newWideCString(s, s.len) -proc `$`*(w: WideCString, estimate: int): string = +proc `$`*(w: WideCString, estimate: int, replacement: int = 0xFFFD): string = result = newStringOfCap(estimate + estimate shr 2) var i = 0 @@ -124,9 +124,18 @@ proc `$`*(w: WideCString, estimate: int): string = if ch >= UNI_SUR_HIGH_START and ch <= UNI_SUR_HIGH_END: # If the 16 bits following the high surrogate are in the source buffer... let ch2 = int(cast[uint16](w[i])) - ch = (((ch and halfMask) shl halfShift) + (ch2 and halfMask)) + halfBase - inc i - + + # If it's a low surrogate, convert to UTF32: + if ch2 >= UNI_SUR_LOW_START and ch2 <= UNI_SUR_LOW_END: + ch = (((ch and halfMask) shl halfShift) + (ch2 and halfMask)) + halfBase + inc i + else: + #invalid UTF-16 + ch = replacement + elif ch >= UNI_SUR_LOW_START and ch <= UNI_SUR_LOW_END: + #invalid UTF-16 + ch = replacement + if ch < 0x80: result.add chr(ch) elif ch < 0x800: @@ -136,11 +145,16 @@ proc `$`*(w: WideCString, estimate: int): string = result.add chr((ch shr 12) or 0xe0) result.add chr(((ch shr 6) and 0x3f) or 0x80) result.add chr((ch and 0x3f) or 0x80) - else: + elif ch <= 0x10FFFF: result.add chr((ch shr 18) or 0xf0) result.add chr(((ch shr 12) and 0x3f) or 0x80) result.add chr(((ch shr 6) and 0x3f) or 0x80) result.add chr((ch and 0x3f) or 0x80) - + else: + # replacement char(in case user give very large number): + result.add chr(0xFFFD shr 12 or 0b1110_0000) + result.add chr(0xFFFD shr 6 and ones(6) or 0b10_0000_00) + result.add chr(0xFFFD and ones(6) or 0b10_0000_00) + proc `$`*(s: WideCString): string = result = s $ 80 |