diff options
Diffstat (limited to 'lib/pure/encodings.nim')
-rw-r--r-- | lib/pure/encodings.nim | 142 |
1 files changed, 53 insertions, 89 deletions
diff --git a/lib/pure/encodings.nim b/lib/pure/encodings.nim index 1d8512018..bbadca655 100644 --- a/lib/pure/encodings.nim +++ b/lib/pure/encodings.nim @@ -7,15 +7,46 @@ # distribution, for details about the copyright. # -## Converts between different character encodings. On UNIX, this uses +## Routines for converting between different character encodings. On UNIX, this uses ## the `iconv`:idx: library, on Windows the Windows API. +## +## The following example shows how to change character encodings. +runnableExamples: + when defined(windows): + let + orig = "öäüß" + # convert `orig` from "UTF-8" to "CP1252" + cp1252 = convert(orig, "CP1252", "UTF-8") + # convert `cp1252` from "CP1252" to "ibm850" + ibm850 = convert(cp1252, "ibm850", "CP1252") + current = getCurrentEncoding() + assert orig == "\195\182\195\164\195\188\195\159" + assert ibm850 == "\148\132\129\225" + assert convert(ibm850, current, "ibm850") == orig + +## The example below uses a reuseable `EncodingConverter` object which is +## created by `open` with `destEncoding` and `srcEncoding` specified. You can use +## `convert` on this object multiple times. +runnableExamples: + when defined(windows): + var fromGB2312 = open("utf-8", "gb2312") + let first = "\203\173\197\194\163\191\210\187" & + "\203\242\209\204\211\234\200\206\198\189\201\250" + assert fromGB2312.convert(first) == "谁怕?一蓑烟雨任平生" + + let second = "\211\208\176\215\205\183\200\231" & + "\208\194\163\172\199\227\184\199\200\231\185\202" + assert fromGB2312.convert(second) == "有白头如新,倾盖如故" -import os + +import std/os +when defined(nimPreviewSlimSystem): + import std/assertions when not defined(windows): type ConverterObj = object - EncodingConverter* = ptr ConverterObj ## can convert between two character sets + EncodingConverter* = ptr ConverterObj ## Can convert between two character sets. else: type @@ -24,11 +55,11 @@ else: dest, src: CodePage type - EncodingError* = object of ValueError ## exception that is raised - ## for encoding errors + EncodingError* = object of ValueError ## Exception that is raised + ## for encoding errors. when defined(windows): - import parseutils, strutils + import std/[parseutils, strutils] proc eqEncodingNames(a, b: string): bool = var i = 0 var j = 0 @@ -72,6 +103,7 @@ when defined(windows): (875, "cp875"), # IBM EBCDIC Greek Modern (932, "shift_jis"), # ANSI/OEM Japanese; Japanese (Shift-JIS) (936, "gb2312"), # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) + (936, "gbk"), # Alias for GB2312 encoding (949, "ks_c_5601-1987"), # ANSI/OEM Korean (Unified Hangul Code) (950, "big5"), # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) (1026, "IBM1026"), # IBM EBCDIC Turkish (Latin 5) @@ -281,8 +313,10 @@ else: var errno {.importc, header: "<errno.h>".}: cint - when defined(freebsd) or defined(netbsd): + when defined(bsd): {.pragma: importIconv, cdecl, header: "<iconv.h>".} + when defined(openbsd): + {.passL: "-liconv".} else: {.pragma: importIconv, cdecl, dynlib: iconvDll.} @@ -295,7 +329,7 @@ else: importc: "iconv", importIconv.} proc getCurrentEncoding*(uiApp = false): string = - ## retrieves the current encoding. On Unix, always "UTF-8" is returned. + ## Retrieves the current encoding. On Unix, "UTF-8" is always returned. ## The `uiApp` parameter is Windows specific. If true, the UI's code-page ## is returned, if false, the Console's code-page is returned. when defined(windows): @@ -304,11 +338,11 @@ proc getCurrentEncoding*(uiApp = false): string = result = "UTF-8" proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): EncodingConverter = - ## opens a converter that can convert from `srcEncoding` to `destEncoding`. - ## Raises `IOError` if it cannot fulfill the request. + ## Opens a converter that can convert from `srcEncoding` to `destEncoding`. + ## Raises `EncodingError` if it cannot fulfill the request. when not defined(windows): result = iconvOpen(destEncoding, srcEncoding) - if result == nil: + if result == cast[EncodingConverter](-1): raise newException(EncodingError, "cannot create encoding converter from " & srcEncoding & " to " & destEncoding) @@ -323,7 +357,7 @@ proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): EncodingConverter = "cannot find encoding " & srcEncoding) proc close*(c: EncodingConverter) = - ## frees the resources the converter `c` holds. + ## Frees the resources the converter `c` holds. when not defined(windows): iconvClose(c) @@ -418,12 +452,13 @@ when defined(windows): else: convertFromWideString(codePageTo, wideString) proc convert*(c: EncodingConverter, s: string): string = - ## converts `s` to `destEncoding` that was given to the converter `c`. It - ## assumed that `s` is in `srcEncoding`. - ## utf-16BE, utf-32 conversions not supported on windows result = convertWin(c.src, c.dest, s) else: proc convert*(c: EncodingConverter, s: string): string = + ## Converts `s` to `destEncoding` that was given to the converter `c`. It + ## assumes that `s` is in `srcEncoding`. + ## + ## .. warning:: UTF-16BE and UTF-32 conversions are not supported on Windows. result = newString(s.len) var inLen = csize_t len(s) var outLen = csize_t len(result) @@ -464,84 +499,13 @@ else: proc convert*(s: string, destEncoding = "UTF-8", srcEncoding = "CP1252"): string = - ## converts `s` to `destEncoding`. It assumed that `s` is in `srcEncoding`. + ## Converts `s` to `destEncoding`. It assumed that `s` is in `srcEncoding`. ## This opens a converter, uses it and closes it again and is thus more ## convenient but also likely less efficient than re-using a converter. - ## utf-16BE, utf-32 conversions not supported on windows + ## + ## .. warning:: UTF-16BE and UTF-32 conversions are not supported on Windows. var c = open(destEncoding, srcEncoding) try: result = convert(c, s) finally: close(c) - -when not defined(testing) and isMainModule: - let - orig = "öäüß" - cp1252 = convert(orig, "CP1252", "UTF-8") - ibm850 = convert(cp1252, "ibm850", "CP1252") - current = getCurrentEncoding() - echo "Original string from source code: ", orig - echo "Forced ibm850 encoding: ", ibm850 - echo "Current encoding: ", current - echo "From ibm850 to current: ", convert(ibm850, current, "ibm850") - -when not defined(testing) and isMainModule and defined(windows): - block should_throw_on_unsupported_conversions: - let original = "some string" - - doAssertRaises(EncodingError): - discard convert(original, "utf-8", "utf-32") - - doAssertRaises(EncodingError): - discard convert(original, "utf-8", "unicodeFFFE") - - doAssertRaises(EncodingError): - discard convert(original, "utf-8", "utf-32BE") - - doAssertRaises(EncodingError): - discard convert(original, "unicodeFFFE", "utf-8") - - doAssertRaises(EncodingError): - discard convert(original, "utf-32", "utf-8") - - doAssertRaises(EncodingError): - discard convert(original, "utf-32BE", "utf-8") - - block should_convert_from_utf16_to_utf8: - let original = "\x42\x04\x35\x04\x41\x04\x42\x04" # utf-16 little endian test string "тест" - let result = convert(original, "utf-8", "utf-16") - doAssert(result == "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82") - - block should_convert_from_utf16_to_win1251: - let original = "\x42\x04\x35\x04\x41\x04\x42\x04" # utf-16 little endian test string "тест" - let result = convert(original, "windows-1251", "utf-16") - doAssert(result == "\xf2\xe5\xf1\xf2") - - block should_convert_from_win1251_to_koi8r: - let original = "\xf2\xe5\xf1\xf2" # win1251 test string "тест" - let result = convert(original, "koi8-r", "windows-1251") - doAssert(result == "\xd4\xc5\xd3\xd4") - - block should_convert_from_koi8r_to_win1251: - let original = "\xd4\xc5\xd3\xd4" # koi8r test string "тест" - let result = convert(original, "windows-1251", "koi8-r") - doAssert(result == "\xf2\xe5\xf1\xf2") - - block should_convert_from_utf8_to_win1251: - let original = "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82" # utf-8 test string "тест" - let result = convert(original, "windows-1251", "utf-8") - doAssert(result == "\xf2\xe5\xf1\xf2") - - block should_convert_from_utf8_to_utf16: - let original = "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82" # utf-8 test string "тест" - let result = convert(original, "utf-16", "utf-8") - doAssert(result == "\x42\x04\x35\x04\x41\x04\x42\x04") - - block should_handle_empty_string_for_any_conversion: - let original = "" - var result = convert(original, "utf-16", "utf-8") - doAssert(result == "") - result = convert(original, "utf-8", "utf-16") - doAssert(result == "") - result = convert(original, "windows-1251", "koi8-r") - doAssert(result == "") |