diff options
author | amzak <amzak@users.noreply.github.com> | 2019-08-06 10:30:47 +0300 |
---|---|---|
committer | Varriount <Varriount@users.noreply.github.com> | 2019-08-06 03:30:47 -0400 |
commit | 3d7d5cf1214f85d64d1efca9fc560912eb6af8a2 (patch) | |
tree | bc1e290b52a4816bb77d2e20b0724f36215aeadc /lib | |
parent | ce148e71ef49dab3d8e61499bce40fd5718ecff4 (diff) | |
download | Nim-3d7d5cf1214f85d64d1efca9fc560912eb6af8a2.tar.gz |
Fixed utf8<->utf16 conversions on windows (#11888)
Diffstat (limited to 'lib')
-rw-r--r-- | lib/pure/encodings.nim | 147 |
1 file changed, 108 insertions, 39 deletions
diff --git a/lib/pure/encodings.nim b/lib/pure/encodings.nim index 277d138e2..460ffbd4a 100644 --- a/lib/pure/encodings.nim +++ b/lib/pure/encodings.nim @@ -324,32 +324,33 @@ proc close*(c: EncodingConverter) = iconvClose(c) when defined(windows): - proc convert*(c: EncodingConverter, s: string): string = - ## converts `s` to `destEncoding` that was given to the converter `c`. It - ## assumed that `s` is in `srcEncoding`. - + proc convertToWideString(codePage: CodePage, s: string): string = # special case: empty string: needed because MultiByteToWideChar - # return 0 in case of error: + # return 0 in case of error if s.len == 0: return "" + # educated guess of capacity: var cap = s.len + s.len shr 2 result = newString(cap*2) # convert to utf-16 LE - var m = multiByteToWideChar(codePage = c.src, dwFlags = 0'i32, + var m = multiByteToWideChar(codePage, + dwFlags = 0'i32, lpMultiByteStr = cstring(s), cbMultiByte = cint(s.len), lpWideCharStr = cstring(result), cchWideChar = cint(cap)) if m == 0: # try again; ask for capacity: - cap = multiByteToWideChar(codePage = c.src, dwFlags = 0'i32, + cap = multiByteToWideChar(codePage, + dwFlags = 0'i32, lpMultiByteStr = cstring(s), cbMultiByte = cint(s.len), lpWideCharStr = nil, cchWideChar = cint(0)) # and do the conversion properly: result = newString(cap*2) - m = multiByteToWideChar(codePage = c.src, dwFlags = 0'i32, + m = multiByteToWideChar(codePage, + dwFlags = 0'i32, lpMultiByteStr = cstring(s), cbMultiByte = cint(s.len), lpWideCharStr = cstring(result), @@ -361,45 +362,60 @@ when defined(windows): else: assert(false) # cannot happen - # if already utf-16 LE, no further need to do something: - if int(c.dest) == 1200: return - # otherwise the fun starts again: - cap = s.len + s.len shr 2 - var res = newString(cap) - m = wideCharToMultiByte( - codePage = c.dest, - dwFlags = 0'i32, - lpWideCharStr = cstring(result), - cchWideChar = cint(result.len div 2), - lpMultiByteStr = cstring(res), - cbMultiByte = cap.cint) + proc 
convertFromWideString(codePage: CodePage, s: string): string = + let charCount = s.len div 2 + var cap = s.len + s.len shr 2 + result = newString(cap) + var m = wideCharToMultiByte(codePage, + dwFlags = 0'i32, + lpWideCharStr = cstring(s), + cchWideChar = cint(charCount), + lpMultiByteStr = cstring(result), + cbMultiByte = cap.cint) if m == 0: # try again; ask for capacity: - cap = wideCharToMultiByte( - codePage = c.dest, - dwFlags = 0'i32, - lpWideCharStr = cstring(result), - cchWideChar = cint(result.len div 2), - lpMultiByteStr = nil, - cbMultiByte = cint(0)) + cap = wideCharToMultiByte(codePage, + dwFlags = 0'i32, + lpWideCharStr = cstring(s), + cchWideChar = cint(charCount), + lpMultiByteStr = nil, + cbMultiByte = cint(0)) # and do the conversion properly: - res = newString(cap) - m = wideCharToMultiByte( - codePage = c.dest, - dwFlags = 0'i32, - lpWideCharStr = cstring(result), - cchWideChar = cint(result.len div 2), - lpMultiByteStr = cstring(res), - cbMultiByte = cap.cint) + result = newString(cap) + m = wideCharToMultiByte(codePage, + dwFlags = 0'i32, + lpWideCharStr = cstring(s), + cchWideChar = cint(charCount), + lpMultiByteStr = cstring(result), + cbMultiByte = cap.cint) if m == 0: raiseOSError(osLastError()) - setLen(res, m) - result = res + setLen(result, m) elif m <= cap: - setLen(res, m) - result = res + setLen(result, m) else: assert(false) # cannot happen + proc convertWin(codePageFrom: CodePage, codePageTo: CodePage, s: string): string = + # multiByteToWideChar does not support encoding from code pages below + let unsupported = [1201, 12000, 12001] + + if int(codePageFrom) in unsupported: + let message = "encoding from " & codePageToName(codePageFrom) & " is not supported on windows" + raise newException(EncodingError, message) + + if int(codePageTo) in unsupported: + let message = "encoding to " & codePageToName(codePageTo) & " is not supported on windows" + raise newException(EncodingError, message) + + # in case it's already UTF-16 little 
endian - conversion can be simplified + let wideString = if int(codePageFrom) == 1200: s else: convertToWideString(codePageFrom, s) + return if int(codePageTo) == 1200: wideString else: convertFromWideString(codePageTo, wideString) + + proc convert*(c: EncodingConverter, s: string): string = + ## converts `s` to `destEncoding` that was given to the converter `c`. It + ## assumed that `s` is in `srcEncoding`. + ## utf-16BE, utf-32 conversions not supported on windows + result = convertWin(c.src, c.dest, s) else: proc convert*(c: EncodingConverter, s: string): string = result = newString(s.len) @@ -445,6 +461,7 @@ proc convert*(s: string, destEncoding = "UTF-8", ## converts `s` to `destEncoding`. It assumed that `s` is in `srcEncoding`. ## This opens a converter, uses it and closes it again and is thus more ## convienent but also likely less efficient than re-using a converter. + ## utf-16BE, utf-32 conversions not supported on windows var c = open(destEncoding, srcEncoding) try: result = convert(c, s) @@ -461,3 +478,55 @@ when not defined(testing) and isMainModule: echo "Forced ibm850 encoding: ", ibm850 echo "Current encoding: ", current echo "From ibm850 to current: ", convert(ibm850, current, "ibm850") + +when not defined(testing) and isMainModule and defined(windows): + block should_throw_on_unsupported_conversions: + let original = "some string" + + doAssertRaises(EncodingError): + discard convert(original, "utf-8", "utf-32") + + doAssertRaises(EncodingError): + discard convert(original, "utf-8", "unicodeFFFE") + + doAssertRaises(EncodingError): + discard convert(original, "utf-8", "utf-32BE") + + doAssertRaises(EncodingError): + discard convert(original, "unicodeFFFE", "utf-8") + + doAssertRaises(EncodingError): + discard convert(original, "utf-32", "utf-8") + + doAssertRaises(EncodingError): + discard convert(original, "utf-32BE", "utf-8") + + block should_convert_from_utf16_to_utf8: + let original = "\x42\x04\x35\x04\x41\x04\x42\x04" # utf-16 little endian 
test string "тест" + let result = convert(original, "utf-8", "utf-16") + doAssert(result == "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82") + + block should_convert_from_utf16_to_win1251: + let original = "\x42\x04\x35\x04\x41\x04\x42\x04" # utf-16 little endian test string "тест" + let result = convert(original, "windows-1251", "utf-16") + doAssert(result == "\xf2\xe5\xf1\xf2") + + block should_convert_from_win1251_to_koi8r: + let original = "\xf2\xe5\xf1\xf2" # win1251 test string "тест" + let result = convert(original, "koi8-r", "windows-1251") + doAssert(result == "\xd4\xc5\xd3\xd4") + + block should_convert_from_koi8r_to_win1251: + let original = "\xd4\xc5\xd3\xd4" # koi8r test string "тест" + let result = convert(original, "windows-1251", "koi8-r") + doAssert(result == "\xf2\xe5\xf1\xf2") + + block should_convert_from_utf8_to_win1251: + let original = "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82" # utf-8 test string "тест" + let result = convert(original, "windows-1251", "utf-8") + doAssert(result == "\xf2\xe5\xf1\xf2") + + block should_convert_from_utf8_to_utf16: + let original = "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82" # utf-8 test string "тест" + let result = convert(original, "utf-16", "utf-8") + doAssert(result == "\x42\x04\x35\x04\x41\x04\x42\x04") \ No newline at end of file |