summary refs log tree commit diff stats
path: root/lib/pure/encodings.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pure/encodings.nim')
-rw-r--r--lib/pure/encodings.nim174
1 files changed, 68 insertions, 106 deletions
diff --git a/lib/pure/encodings.nim b/lib/pure/encodings.nim
index e3be1764b..bbadca655 100644
--- a/lib/pure/encodings.nim
+++ b/lib/pure/encodings.nim
@@ -7,15 +7,46 @@
 #    distribution, for details about the copyright.
 #
 
-## Converts between different character encodings. On UNIX, this uses
+## Routines for converting between different character encodings. On UNIX, this uses
 ## the `iconv`:idx: library, on Windows the Windows API.
+##
+## The following example shows how to change character encodings.
+runnableExamples:
+  when defined(windows):
+    let
+      orig = "öäüß"
+      # convert `orig` from "UTF-8" to "CP1252"
+      cp1252 = convert(orig, "CP1252", "UTF-8")
+      # convert `cp1252` from "CP1252" to "ibm850"
+      ibm850 = convert(cp1252, "ibm850", "CP1252")
+      current = getCurrentEncoding()
+    assert orig == "\195\182\195\164\195\188\195\159"
+    assert ibm850 == "\148\132\129\225"
+    assert convert(ibm850, current, "ibm850") == orig
+
+## The example below uses a reuseable `EncodingConverter` object which is
+## created by `open` with `destEncoding` and `srcEncoding` specified. You can use
+## `convert` on this object multiple times.
+runnableExamples:
+  when defined(windows):
+    var fromGB2312 = open("utf-8", "gb2312")
+    let first = "\203\173\197\194\163\191\210\187" &
+        "\203\242\209\204\211\234\200\206\198\189\201\250"
+    assert fromGB2312.convert(first) == "谁怕?一蓑烟雨任平生"
+
+    let second = "\211\208\176\215\205\183\200\231" &
+        "\208\194\163\172\199\227\184\199\200\231\185\202"
+    assert fromGB2312.convert(second) == "有白头如新,倾盖如故"
 
-import os, parseutils, strutils
+
+import std/os
+when defined(nimPreviewSlimSystem):
+  import std/assertions
 
 when not defined(windows):
   type
     ConverterObj = object
-    EncodingConverter* = ptr ConverterObj ## can convert between two character sets
+    EncodingConverter* = ptr ConverterObj ## Can convert between two character sets.
 
 else:
   type
@@ -24,10 +55,11 @@ else:
       dest, src: CodePage
 
 type
-  EncodingError* = object of ValueError ## exception that is raised
-                                        ## for encoding errors
+  EncodingError* = object of ValueError ## Exception that is raised
+                                        ## for encoding errors.
 
 when defined(windows):
+  import std/[parseutils, strutils]
   proc eqEncodingNames(a, b: string): bool =
     var i = 0
     var j = 0
@@ -71,6 +103,7 @@ when defined(windows):
       (875, "cp875"),          # IBM EBCDIC Greek Modern
       (932, "shift_jis"),      # ANSI/OEM Japanese; Japanese (Shift-JIS)
       (936, "gb2312"),         # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
+      (936, "gbk"),            # Alias for GB2312 encoding
       (949, "ks_c_5601-1987"), # ANSI/OEM Korean (Unified Hangul Code)
       (950, "big5"),           # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
       (1026, "IBM1026"),       # IBM EBCDIC Turkish (Latin 5)
@@ -280,8 +313,10 @@ else:
 
   var errno {.importc, header: "<errno.h>".}: cint
 
-  when defined(freebsd) or defined(netbsd):
+  when defined(bsd):
     {.pragma: importIconv, cdecl, header: "<iconv.h>".}
+    when defined(openbsd):
+      {.passL: "-liconv".}
   else:
     {.pragma: importIconv, cdecl, dynlib: iconvDll.}
 
@@ -289,15 +324,12 @@ else:
     importc: "iconv_open", importIconv.}
   proc iconvClose(c: EncodingConverter) {.
     importc: "iconv_close", importIconv.}
-  proc iconv(c: EncodingConverter, inbuf: var cstring, inbytesLeft: var int,
-             outbuf: var cstring, outbytesLeft: var int): int {.
-    importc: "iconv", importIconv.}
-  proc iconv(c: EncodingConverter, inbuf: pointer, inbytesLeft: pointer,
-             outbuf: var cstring, outbytesLeft: var int): int {.
+  proc iconv(c: EncodingConverter, inbuf: ptr cstring, inbytesLeft: ptr csize_t,
+             outbuf: ptr cstring, outbytesLeft: ptr csize_t): csize_t {.
     importc: "iconv", importIconv.}
 
 proc getCurrentEncoding*(uiApp = false): string =
-  ## retrieves the current encoding. On Unix, always "UTF-8" is returned.
+  ## Retrieves the current encoding. On Unix, "UTF-8" is always returned.
   ## The `uiApp` parameter is Windows specific. If true, the UI's code-page
   ## is returned, if false, the Console's code-page is returned.
   when defined(windows):
@@ -306,11 +338,11 @@ proc getCurrentEncoding*(uiApp = false): string =
     result = "UTF-8"
 
 proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): EncodingConverter =
-  ## opens a converter that can convert from `srcEncoding` to `destEncoding`.
-  ## Raises `IOError` if it cannot fulfill the request.
+  ## Opens a converter that can convert from `srcEncoding` to `destEncoding`.
+  ## Raises `EncodingError` if it cannot fulfill the request.
   when not defined(windows):
     result = iconvOpen(destEncoding, srcEncoding)
-    if result == nil:
+    if result == cast[EncodingConverter](-1):
       raise newException(EncodingError,
         "cannot create encoding converter from " &
         srcEncoding & " to " & destEncoding)
@@ -325,7 +357,7 @@ proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): EncodingConverter =
         "cannot find encoding " & srcEncoding)
 
 proc close*(c: EncodingConverter) =
-  ## frees the resources the converter `c` holds.
+  ## Frees the resources the converter `c` holds.
   when not defined(windows):
     iconvClose(c)
 
@@ -420,21 +452,22 @@ when defined(windows):
            else: convertFromWideString(codePageTo, wideString)
 
   proc convert*(c: EncodingConverter, s: string): string =
-    ## converts `s` to `destEncoding` that was given to the converter `c`. It
-    ## assumed that `s` is in `srcEncoding`.
-    ## utf-16BE, utf-32 conversions not supported on windows
     result = convertWin(c.src, c.dest, s)
 else:
   proc convert*(c: EncodingConverter, s: string): string =
+    ## Converts `s` to `destEncoding` that was given to the converter `c`. It
+    ## assumes that `s` is in `srcEncoding`.
+    ##
+    ## .. warning:: UTF-16BE and UTF-32 conversions are not supported on Windows.
     result = newString(s.len)
-    var inLen = len(s)
-    var outLen = len(result)
+    var inLen = csize_t len(s)
+    var outLen = csize_t len(result)
     var src = cstring(s)
     var dst = cstring(result)
-    var iconvres: int
+    var iconvres: csize_t
     while inLen > 0:
-      iconvres = iconv(c, src, inLen, dst, outLen)
-      if iconvres == -1:
+      iconvres = iconv(c, addr src, addr inLen, addr dst, addr outLen)
+      if iconvres == high(csize_t):
         var lerr = errno
         if lerr == EILSEQ or lerr == EINVAL:
           # unknown char, skip
@@ -445,105 +478,34 @@ else:
           dec(outLen)
         elif lerr == E2BIG:
           var offset = cast[int](dst) - cast[int](cstring(result))
-          setLen(result, len(result)+inLen*2+5)
+          setLen(result, len(result) + inLen.int * 2 + 5)
           # 5 is minimally one utf-8 char
           dst = cast[cstring](cast[int](cstring(result)) + offset)
-          outLen = len(result) - offset
+          outLen = csize_t(len(result) - offset)
         else:
           raiseOSError(lerr.OSErrorCode)
     # iconv has a buffer that needs flushing, specially if the last char is
     # not '\0'
-    discard iconv(c, nil, nil, dst, outLen)
-    if iconvres == cint(-1) and errno == E2BIG:
+    discard iconv(c, nil, nil, addr dst, addr outLen)
+    if iconvres == high(csize_t) and errno == E2BIG:
       var offset = cast[int](dst) - cast[int](cstring(result))
-      setLen(result, len(result)+inLen*2+5)
+      setLen(result, len(result) + inLen.int * 2 + 5)
       # 5 is minimally one utf-8 char
       dst = cast[cstring](cast[int](cstring(result)) + offset)
-      outLen = len(result) - offset
-      discard iconv(c, nil, nil, dst, outLen)
+      outLen = csize_t(len(result) - offset)
+      discard iconv(c, nil, nil, addr dst, addr outLen)
     # trim output buffer
-    setLen(result, len(result) - outLen)
+    setLen(result, len(result) - outLen.int)
 
 proc convert*(s: string, destEncoding = "UTF-8",
                          srcEncoding = "CP1252"): string =
-  ## converts `s` to `destEncoding`. It assumed that `s` is in `srcEncoding`.
+  ## Converts `s` to `destEncoding`. It assumed that `s` is in `srcEncoding`.
   ## This opens a converter, uses it and closes it again and is thus more
   ## convenient but also likely less efficient than re-using a converter.
-  ## utf-16BE, utf-32 conversions not supported on windows
+  ##
+  ## .. warning:: UTF-16BE and UTF-32 conversions are not supported on Windows.
   var c = open(destEncoding, srcEncoding)
   try:
     result = convert(c, s)
   finally:
     close(c)
-
-when not defined(testing) and isMainModule:
-  let
-    orig = "öäüß"
-    cp1252 = convert(orig, "CP1252", "UTF-8")
-    ibm850 = convert(cp1252, "ibm850", "CP1252")
-    current = getCurrentEncoding()
-  echo "Original string from source code: ", orig
-  echo "Forced ibm850 encoding: ", ibm850
-  echo "Current encoding: ", current
-  echo "From ibm850 to current: ", convert(ibm850, current, "ibm850")
-
-when not defined(testing) and isMainModule and defined(windows):
-  block should_throw_on_unsupported_conversions:
-    let original = "some string"
-
-    doAssertRaises(EncodingError):
-      discard convert(original, "utf-8", "utf-32")
-
-    doAssertRaises(EncodingError):
-      discard convert(original, "utf-8", "unicodeFFFE")
-
-    doAssertRaises(EncodingError):
-      discard convert(original, "utf-8", "utf-32BE")
-
-    doAssertRaises(EncodingError):
-      discard convert(original, "unicodeFFFE", "utf-8")
-
-    doAssertRaises(EncodingError):
-      discard convert(original, "utf-32", "utf-8")
-
-    doAssertRaises(EncodingError):
-      discard convert(original, "utf-32BE", "utf-8")
-
-  block should_convert_from_utf16_to_utf8:
-    let original = "\x42\x04\x35\x04\x41\x04\x42\x04" # utf-16 little endian test string "тест"
-    let result = convert(original, "utf-8", "utf-16")
-    doAssert(result == "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82")
-
-  block should_convert_from_utf16_to_win1251:
-    let original = "\x42\x04\x35\x04\x41\x04\x42\x04" # utf-16 little endian test string "тест"
-    let result = convert(original, "windows-1251", "utf-16")
-    doAssert(result == "\xf2\xe5\xf1\xf2")
-
-  block should_convert_from_win1251_to_koi8r:
-    let original = "\xf2\xe5\xf1\xf2" # win1251 test string "тест"
-    let result = convert(original, "koi8-r", "windows-1251")
-    doAssert(result == "\xd4\xc5\xd3\xd4")
-
-  block should_convert_from_koi8r_to_win1251:
-    let original = "\xd4\xc5\xd3\xd4" # koi8r test string "тест"
-    let result = convert(original, "windows-1251", "koi8-r")
-    doAssert(result == "\xf2\xe5\xf1\xf2")
-
-  block should_convert_from_utf8_to_win1251:
-    let original = "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82" # utf-8 test string "тест"
-    let result = convert(original, "windows-1251", "utf-8")
-    doAssert(result == "\xf2\xe5\xf1\xf2")
-
-  block should_convert_from_utf8_to_utf16:
-    let original = "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82" # utf-8 test string "тест"
-    let result = convert(original, "utf-16", "utf-8")
-    doAssert(result == "\x42\x04\x35\x04\x41\x04\x42\x04")
-
-  block should_handle_empty_string_for_any_conversion:
-    let original = ""
-    var result = convert(original, "utf-16", "utf-8")
-    doAssert(result == "")
-    result = convert(original, "utf-8", "utf-16")
-    doAssert(result == "")
-    result = convert(original, "windows-1251", "koi8-r")
-    doAssert(result == "")