added encodings stdlib

author: Araq <rumpf_a@web.de> 2011-06-16 02:03:33 +0200
committer: Araq <rumpf_a@web.de> 2011-06-16 02:03:33 +0200
commit: de659eba65b5ad098a403449c4b2b1067d04a2cf (patch)
tree: 0674a5c9e9569295a7bea36b746624af1b14cf70
parent: 9a8b39c85b22518a9c353dcf5d693e701ac272b7 (diff)
download: Nim-de659eba65b5ad098a403449c4b2b1067d04a2cf.tar.gz
5 files changed, 453 insertions, 4 deletions
diff --git a/doc/lib.txt b/doc/lib.txt
index 46fc85da5..a205f173e 100755
--- a/doc/lib.txt
+++ b/doc/lib.txt
@@ -79,6 +79,10 @@ String handling
 
 * `unicode <unicode.html>`_
   This module provides support to handle the Unicode UTF-8 encoding.
+  
+* `encodings <encodings.html>`_
+  Converts between different character encodings. On UNIX, this uses 
+  the ``iconv`` library, on Windows the Windows API.
 
 * `pegs <pegs.html>`_
   This module contains procedures and operators for handling PEGs.
diff --git a/lib/pure/encodings.nim b/lib/pure/encodings.nim
new file mode 100644
index 000000000..74c3043c8
--- /dev/null
+++ b/lib/pure/encodings.nim
@@ -0,0 +1,445 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2011 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Converts between different character encodings. On UNIX, this uses 
+## the `iconv`:idx: library, on Windows the Windows API.
+
+import os, parseutils, strutils
+
+when not defined(windows):
+  type
+    TConverter = object {.pure, final.} # field-less; only used via PConverter
+    PConverter* = ptr TConverter ## can convert between two character sets
+    
+else:
+  type
+    TCodePage = distinct int32 # numeric code page identifier
+    PConverter* = object {.pure.}
+      dest, src: TCodePage # code pages to convert to (dest) and from (src)
+    
+type
+  EInvalidEncoding* = object of EInvalidValue ## exception that is raised
+                                              ## for encoding errors
+
+when defined(windows):
+  proc EqEncodingNames(a, b: string): bool =
+    var i = 0 # position in `a`
+    var j = 0 # position in `b`
+    while i < a.len and j < b.len:
+      if a[i] in {'-', '_'}: inc i # one separator per step is ignored
+      if b[j] in {'-', '_'}: inc j # the skip may reach the end: checked below
+      if i < a.len and j < b.len and a[i].tolower != b[j].tolower: return false
+      inc i
+      inc j
+    result = i == a.len and j == b.len # equal only if both names are used up
+
+  const 
+    winEncodings = [
+      (037, "IBM037"), # IBM EBCDIC US-Canada 
+      (437, "IBM437"), # OEM United States 
+      (500, "IBM500"), # IBM EBCDIC International 
+      (708, "ASMO-708"), # Arabic (ASMO 708) 
+      (709, "ASMO_449"), # Arabic (ASMO-449+, BCON V4) 
+      (710, ""), # Arabic - Transparent Arabic 
+      (720, "DOS-720"), # Arabic (Transparent ASMO); Arabic (DOS) 
+      (737, "ibm737"), # OEM Greek (formerly 437G); Greek (DOS) 
+      (775, "ibm775"), # OEM Baltic; Baltic (DOS) 
+      (850, "ibm850"), # OEM Multilingual Latin 1; Western European (DOS) 
+      (852, "ibm852"), # OEM Latin 2; Central European (DOS) 
+      (855, "IBM855"), # OEM Cyrillic (primarily Russian) 
+      (857, "ibm857"), # OEM Turkish; Turkish (DOS) 
+      (858, "IBM00858"), # OEM Multilingual Latin 1 + Euro symbol 
+      (860, "IBM860"), # OEM Portuguese; Portuguese (DOS) 
+      (861, "ibm861"), # OEM Icelandic; Icelandic (DOS) 
+      (862, "DOS-862"), # OEM Hebrew; Hebrew (DOS) 
+      (863, "IBM863"), # OEM French Canadian; French Canadian (DOS) 
+      (864, "IBM864"), # OEM Arabic; Arabic (864) 
+      (865, "IBM865"), # OEM Nordic; Nordic (DOS) 
+      (866, "cp866"), # OEM Russian; Cyrillic (DOS) 
+      (869, "ibm869"), # OEM Modern Greek; Greek, Modern (DOS) 
+      (870, "IBM870"), # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 
+      (874, "windows-874"), # ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) 
+      (875, "cp875"), # IBM EBCDIC Greek Modern 
+      (932, "shift_jis"), # ANSI/OEM Japanese; Japanese (Shift-JIS) 
+      (936, "gb2312"), # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) 
+      (949, "ks_c_5601-1987"), # ANSI/OEM Korean (Unified Hangul Code) 
+      (950, "big5"), # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) 
+      (1026, "IBM1026"), # IBM EBCDIC Turkish (Latin 5) 
+      (1047, "IBM01047"), # IBM EBCDIC Latin 1/Open System 
+      (1140, "IBM01140"), # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) 
+      (1141, "IBM01141"), # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) 
+      (1142, "IBM01142"), # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) 
+      (1143, "IBM01143"), # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) 
+      (1144, "IBM01144"), # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) 
+      (1145, "IBM01145"), # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) 
+      (1146, "IBM01146"), # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) 
+      (1147, "IBM01147"), # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) 
+      (1148, "IBM01148"), # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) 
+      (1149, "IBM01149"), # IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) 
+      (1200, "utf-16"), # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications 
+      (1201, "unicodeFFFE"), # Unicode UTF-16, big endian byte order; available only to managed applications 
+      (1250, "windows-1250"), # ANSI Central European; Central European (Windows) 
+      (1251, "windows-1251"), # ANSI Cyrillic; Cyrillic (Windows) 
+      (1252, "windows-1252"), # ANSI Latin 1; Western European (Windows) 
+      (1253, "windows-1253"), # ANSI Greek; Greek (Windows) 
+      (1254, "windows-1254"), # ANSI Turkish; Turkish (Windows) 
+      (1255, "windows-1255"), # ANSI Hebrew; Hebrew (Windows) 
+      (1256, "windows-1256"), # ANSI Arabic; Arabic (Windows) 
+      (1257, "windows-1257"), # ANSI Baltic; Baltic (Windows) 
+      (1258, "windows-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows) 
+      (1361, "Johab"), # Korean (Johab) 
+      (10000, "macintosh"), # MAC Roman; Western European (Mac) 
+      (10001, "x-mac-japanese"), # Japanese (Mac) 
+      (10002, "x-mac-chinesetrad"), # MAC Traditional Chinese (Big5); Chinese Traditional (Mac) 
+      (10003, "x-mac-korean"), # Korean (Mac) 
+      (10004, "x-mac-arabic"), # Arabic (Mac) 
+      (10005, "x-mac-hebrew"), # Hebrew (Mac) 
+      (10006, "x-mac-greek"), # Greek (Mac) 
+      (10007, "x-mac-cyrillic"), # Cyrillic (Mac) 
+      (10008, "x-mac-chinesesimp"), # MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) 
+      (10010, "x-mac-romanian"), # Romanian (Mac) 
+      (10017, "x-mac-ukrainian"), # Ukrainian (Mac) 
+      (10021, "x-mac-thai"), # Thai (Mac) 
+      (10029, "x-mac-ce"), # MAC Latin 2; Central European (Mac) 
+      (10079, "x-mac-icelandic"), # Icelandic (Mac) 
+      (10081, "x-mac-turkish"), # Turkish (Mac) 
+      (10082, "x-mac-croatian"), # Croatian (Mac) 
+      (12000, "utf-32"), # Unicode UTF-32, little endian byte order; available only to managed applications 
+      (12001, "utf-32BE"), # Unicode UTF-32, big endian byte order; available only to managed applications 
+      (20000, "x-Chinese_CNS"), # CNS Taiwan; Chinese Traditional (CNS) 
+      (20001, "x-cp20001"), # TCA Taiwan 
+      (20002, "x_Chinese-Eten"), # Eten Taiwan; Chinese Traditional (Eten) 
+      (20003, "x-cp20003"), # IBM5550 Taiwan 
+      (20004, "x-cp20004"), # TeleText Taiwan 
+      (20005, "x-cp20005"), # Wang Taiwan 
+      (20105, "x-IA5"), # IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) 
+      (20106, "x-IA5-German"), # IA5 German (7-bit) 
+      (20107, "x-IA5-Swedish"), # IA5 Swedish (7-bit) 
+      (20108, "x-IA5-Norwegian"), # IA5 Norwegian (7-bit) 
+      (20127, "us-ascii"), # US-ASCII (7-bit) 
+      (20261, "x-cp20261"), # T.61 
+      (20269, "x-cp20269"), # ISO 6937 Non-Spacing Accent 
+      (20273, "IBM273"), # IBM EBCDIC Germany 
+      (20277, "IBM277"), # IBM EBCDIC Denmark-Norway 
+      (20278, "IBM278"), # IBM EBCDIC Finland-Sweden 
+      (20280, "IBM280"), # IBM EBCDIC Italy 
+      (20284, "IBM284"), # IBM EBCDIC Latin America-Spain 
+      (20285, "IBM285"), # IBM EBCDIC United Kingdom 
+      (20290, "IBM290"), # IBM EBCDIC Japanese Katakana Extended 
+      (20297, "IBM297"), # IBM EBCDIC France 
+      (20420, "IBM420"), # IBM EBCDIC Arabic 
+      (20423, "IBM423"), # IBM EBCDIC Greek 
+      (20424, "IBM424"), # IBM EBCDIC Hebrew 
+      (20833, "x-EBCDIC-KoreanExtended"), # IBM EBCDIC Korean Extended 
+      (20838, "IBM-Thai"), # IBM EBCDIC Thai 
+      (20866, "koi8-r"), # Russian (KOI8-R); Cyrillic (KOI8-R) 
+      (20871, "IBM871"), # IBM EBCDIC Icelandic 
+      (20880, "IBM880"), # IBM EBCDIC Cyrillic Russian 
+      (20905, "IBM905"), # IBM EBCDIC Turkish 
+      (20924, "IBM00924"), # IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) 
+      (20932, "EUC-JP"), # Japanese (JIS 0208-1990 and 0121-1990) 
+      (20936, "x-cp20936"), # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) 
+      (20949, "x-cp20949"), # Korean Wansung 
+      (21025, "cp1025"), # IBM EBCDIC Cyrillic Serbian-Bulgarian 
+      (21027, ""), # (deprecated) 
+      (21866, "koi8-u"), # Ukrainian (KOI8-U); Cyrillic (KOI8-U) 
+      (28591, "iso-8859-1"), # ISO 8859-1 Latin 1; Western European (ISO) 
+      (28592, "iso-8859-2"), # ISO 8859-2 Central European; Central European (ISO) 
+      (28593, "iso-8859-3"), # ISO 8859-3 Latin 3 
+      (28594, "iso-8859-4"), # ISO 8859-4 Baltic 
+      (28595, "iso-8859-5"), # ISO 8859-5 Cyrillic 
+      (28596, "iso-8859-6"), # ISO 8859-6 Arabic 
+      (28597, "iso-8859-7"), # ISO 8859-7 Greek 
+      (28598, "iso-8859-8"), # ISO 8859-8 Hebrew; Hebrew (ISO-Visual) 
+      (28599, "iso-8859-9"), # ISO 8859-9 Turkish 
+      (28603, "iso-8859-13"), # ISO 8859-13 Estonian 
+      (28605, "iso-8859-15"), # ISO 8859-15 Latin 9 
+      (29001, "x-Europa"), # Europa 3 
+      (38598, "iso-8859-8-i"), # ISO 8859-8 Hebrew; Hebrew (ISO-Logical) 
+      (50220, "iso-2022-jp"), # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) 
+      (50221, "csISO2022JP"), # ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) 
+      (50222, "iso-2022-jp"), # ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) 
+      (50225, "iso-2022-kr"), # ISO 2022 Korean 
+      (50227, "x-cp50227"), # ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) 
+      (50229, ""), # ISO 2022 Traditional Chinese 
+      (50930, ""), # EBCDIC Japanese (Katakana) Extended 
+      (50931, ""), # EBCDIC US-Canada and Japanese 
+      (50933, ""), # EBCDIC Korean Extended and Korean 
+      (50935, ""), # EBCDIC Simplified Chinese Extended and Simplified Chinese 
+      (50936, ""), # EBCDIC Simplified Chinese 
+      (50937, ""), # EBCDIC US-Canada and Traditional Chinese 
+      (50939, ""), # EBCDIC Japanese (Latin) Extended and Japanese 
+      (51932, "euc-jp"), # EUC Japanese 
+      (51936, "EUC-CN"), # EUC Simplified Chinese; Chinese Simplified (EUC) 
+      (51949, "euc-kr"), # EUC Korean 
+      (51950, ""), # EUC Traditional Chinese 
+      (52936, "hz-gb-2312"), # HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) 
+      (54936, "GB18030"), # Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) 
+      (57002, "x-iscii-de"), # ISCII Devanagari 
+      (57003, "x-iscii-be"), # ISCII Bengali 
+      (57004, "x-iscii-ta"), # ISCII Tamil 
+      (57005, "x-iscii-te"), # ISCII Telugu 
+      (57006, "x-iscii-as"), # ISCII Assamese 
+      (57007, "x-iscii-or"), # ISCII Oriya 
+      (57008, "x-iscii-ka"), # ISCII Kannada 
+      (57009, "x-iscii-ma"), # ISCII Malayalam 
+      (57010, "x-iscii-gu"), # ISCII Gujarati 
+      (57011, "x-iscii-pa"), # ISCII Punjabi 
+      (65000, "utf-7"), # Unicode (UTF-7) 
+      (65001, "utf-8")] # Unicode (UTF-8) 
+  
+  when false:
+    # not needed yet:
+    type
+      TCpInfo = object {.pure.}
+        MaxCharSize: int32
+        DefaultChar: array[0..1, char]
+        LeadByte: array[0..12-1, char]
+
+    proc GetCPInfo(CodePage: TCodePage, lpCPInfo: var TCpInfo): int32 {.
+      stdcall, importc: "GetCPInfo", dynlib: "kernel32".}
+  
+  proc nameToCodePage(name: string): TCodePage =
+    var nameAsInt: int # `name` as a code page number; -1 if it is not numeric
+    if parseInt(name, nameAsInt) == 0: nameAsInt = -1
+    for no, na in items(winEncodings):
+      if no == nameAsInt or (na.len != 0 and EqEncodingNames(na, name)): return TCodePage(no)
+    result = TCodePage(-1) # unknown; unnamed ("") entries only match by number
+    
+  proc codePageToName(c: TCodePage): string =
+    for no, na in items(winEncodings):
+      if no == int(c): 
+        return if na.len != 0: na else: $no # unnamed entry: use its number
+    result = "" # `c` is not listed in `winEncodings`
+  
+  proc GetACP(): TCodePage {.stdcall, importc: "GetACP", dynlib: "kernel32".}
+  
+  proc MultiByteToWideChar(
+    CodePage: TCodePage,
+    dwFlags: int32,
+    lpMultiByteStr: cstring,
+    cbMultiByte: cint,
+    lpWideCharStr: cstring,
+    cchWideChar: cint): cint {.
+      stdcall, importc: "MultiByteToWideChar", dynlib: "kernel32".} # 0: error
+
+  proc WideCharToMultiByte(
+    CodePage: TCodePage,
+    dwFlags: int32,
+    lpWideCharStr: cstring,
+    cchWideChar: cint,
+    lpMultiByteStr: cstring,
+    cbMultiByte: cint,
+    lpDefaultChar: cstring=nil,
+    lpUsedDefaultChar: pointer=nil): cint {.
+      stdcall, importc: "WideCharToMultiByte", dynlib: "kernel32".} # 0: error
+  
+else:
+  when defined(haiku):
+    const iconvDll = "(libc.so.6|libiconv.so|libtextencoding.so)"
+  else:
+    const iconvDll = "(libc.so.6|libiconv.so)"
+
+  when defined(macosx) and defined(powerpc32):
+    const prefix = "lib"
+  else:
+    const prefix = ""
+
+  const
+    E2BIG = 7.cint # errno on which `convert` enlarges its output buffer
+    EINVAL = 22.cint # errno on which `convert` copies the input byte as is
+  when defined(linux):
+    const EILSEQ = 84.cint # handled like EINVAL; its value differs per system
+  elif defined(macosx):
+    const EILSEQ = 92.cint
+  elif defined(bsd):
+    const EILSEQ = 86.cint
+  elif defined(solaris):
+    const EILSEQ = 88.cint # NOTE(review): no EILSEQ for any other platform
+
+  var errno {.importc, header: "<errno.h>".}: cint
+
+  proc iconvOpen(tocode, fromcode: cstring): PConverter {.
+    importc: prefix & "iconv_open", cdecl, dynlib: iconvDll.} # target first
+  proc iconvClose(c: PConverter) {.
+    importc: prefix & "iconv_close", cdecl, dynlib: iconvDll.}
+  proc iconv(c: PConverter, inbuf: var cstring, inbytesLeft: var int,
+             outbuf: var cstring, outbytesLeft: var int): int {.
+    importc: prefix & "iconv", cdecl, dynlib: iconvDll.} # -1 signals an error
+  proc iconv(c: PConverter, inbuf: pointer, inbytesLeft: pointer,
+             outbuf: var cstring, outbytesLeft: var int): int {.
+    importc: prefix & "iconv", cdecl, dynlib: iconvDll.} # nil input: flush
+  
+proc getCurrentEncoding*(): string =
+  ## retrieves the current encoding. Windows: name of the `GetACP` code page; otherwise always "UTF-8".
+  when defined(windows):
+    result = codePageToName(GetACP()) # "" if the code page is not in the table
+  else:
+    result = "UTF-8"
+  
+proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): PConverter =
+  ## opens a converter that can convert from `srcEncoding` to `destEncoding`.
+  ## Raises `EInvalidEncoding` if it cannot fulfill the request.
+  when not defined(windows):
+    result = iconvOpen(destEncoding, srcEncoding) # target first; -1 on failure
+    if result == nil or cast[int](result) == -1:
+      raise newException(EInvalidEncoding, 
+        "cannot create encoding converter from " & 
+        srcEncoding & " to " & destEncoding)
+  else:
+    result.dest = nameToCodePage(destEncoding)
+    result.src = nameToCodePage(srcEncoding)
+    if int(result.dest) == -1:
+      raise newException(EInvalidEncoding, 
+        "cannot find encoding " & destEncoding)
+    if int(result.src) == -1:
+      raise newException(EInvalidEncoding, 
+        "cannot find encoding " & srcEncoding)
+
+proc close*(c: PConverter) =
+  ## frees the resources the converter `c` holds. This does nothing on Windows.
+  when not defined(windows):
+    iconvClose(c)
+
+when defined(windows):
+
+  proc convert*(c: PConverter, s: string): string =
+    ## converts `s` to `destEncoding` that was given to the converter `c`. It is
+    ## assumed that `s` is in `srcEncoding`.
+    
+    # special case: empty string: needed because MultiByteToWideChar
+    # returns 0 in case of error:
+    if s.len == 0: return ""
+    # educated guess of capacity (counted in 2-byte wide chars):
+    var cap = s.len + s.len shr 2
+    result = newStringOfCap(cap*2)
+    # first step: convert from the source code page to utf-16 LE
+    var m = MultiByteToWideChar(CodePage = c.src, dwFlags = 0'i32, 
+                                lpMultiByteStr = cstring(s),
+                                cbMultiByte = cint(s.len),
+                                lpWideCharStr = cstring(result),
+                                cchWideChar = cint(cap))
+    if m == 0: 
+      # the guess did not work; ask for the required capacity:
+      cap = MultiByteToWideChar(CodePage = c.src, dwFlags = 0'i32, 
+                                lpMultiByteStr = cstring(s),
+                                cbMultiByte = cint(s.len),
+                                lpWideCharStr = nil,
+                                cchWideChar = cint(0))
+      # and do the conversion properly:
+      result = newStringOfCap(cap*2)
+      m = MultiByteToWideChar(CodePage = c.src, dwFlags = 0'i32, 
+                              lpMultiByteStr = cstring(s),
+                              cbMultiByte = cint(s.len),
+                              lpWideCharStr = cstring(result),
+                              cchWideChar = cint(cap))
+      if m == 0: OSError()
+      setLen(result, m*2)
+    elif m <= cap:
+      setLen(result, m*2)
+    else:
+      assert(false) # cannot happen
+    
+    # if the destination is utf-16 LE (code page 1200), we are done:
+    if int(c.dest) == 1200: return
+    # second step: convert the utf-16 text to the destination code page:
+    cap = s.len + s.len shr 2
+    var res = newStringOfCap(cap)
+    m = WideCharToMultiByte(
+      CodePage = c.dest,
+      dwFlags = 0'i32,
+      lpWideCharStr = cstring(result),
+      cchWideChar = cint(result.len div 2),
+      lpMultiByteStr = cstring(res),
+      cbMultiByte = cap)
+    if m == 0:
+      # the guess did not work; ask for the required capacity:
+      cap = WideCharToMultiByte(
+        CodePage = c.dest,
+        dwFlags = 0'i32,
+        lpWideCharStr = cstring(result),
+        cchWideChar = cint(result.len div 2),
+        lpMultiByteStr = nil,
+        cbMultiByte = cint(0))
+      # and do the conversion properly:
+      res = newStringOfCap(cap)
+      m = WideCharToMultiByte(
+        CodePage = c.dest,
+        dwFlags = 0'i32,
+        lpWideCharStr = cstring(result),
+        cchWideChar = cint(result.len div 2),
+        lpMultiByteStr = cstring(res),
+        cbMultiByte = cap)
+      if m == 0: OSError()
+      setLen(res, m)
+      result = res
+    elif m <= cap:
+      setLen(res, m)
+      result = res
+    else:
+      assert(false) # cannot happen
+
+else:
+
+  proc convert*(c: PConverter, s: string): string =
+    ## converts `s` with the converter `c`; input iconv rejects is copied as is.
+    result = newString(s.len)
+    var inLen = len(s)
+    var outLen = len(result)
+    var src = cstring(s)
+    var dst = cstring(result)
+    var iconvres: int
+    while inLen > 0:
+      iconvres = iconv(c, src, inLen, dst, outLen)
+      if iconvres == -1:
+        var lerr = errno
+        if lerr == E2BIG or outLen == 0:
+          # no room left: grow `result` and re-base `dst` onto the resized buffer
+          var offset = cast[int](dst) - cast[int](cstring(result))
+          setLen(result, len(result)+inLen*2+5) # 5 is minimally one utf-8 char
+          dst = cast[cstring](cast[int](cstring(result)) + offset)
+          outLen = len(result) - offset
+        elif lerr == EILSEQ or lerr == EINVAL:
+          # invalid or incomplete input: copy one byte verbatim and continue;
+          # the branch above guarantees that there is room for it
+          dst[0] = src[0]
+          src = cast[cstring](cast[int](src) + 1)
+          dst = cast[cstring](cast[int](dst) + 1)
+          dec(inLen)
+          dec(outLen)
+        else:
+          OSError()
+    # iconv has a buffer that needs flushing, specially if the last char is 
+    # not '\0'; the flush itself can run out of space, so keep its result:
+    iconvres = iconv(c, nil, nil, dst, outLen)
+    if iconvres == -1 and errno == E2BIG:
+      var offset = cast[int](dst) - cast[int](cstring(result))
+      setLen(result, len(result)+inLen*2+5) # 5 is minimally one utf-8 char
+      dst = cast[cstring](cast[int](cstring(result)) + offset)
+      outLen = len(result) - offset
+      discard iconv(c, nil, nil, dst, outLen)
+    setLen(result, len(result) - outLen) # trim the unused rest of the buffer
+
+proc convert*(s: string, destEncoding = "UTF-8", 
+                         srcEncoding = "CP1252"): string =
+  ## converts `s` to `destEncoding`. It is assumed that `s` is in `srcEncoding`.
+  ## This opens a converter, uses it and closes it again and is thus more
+  ## convenient but also likely less efficient than re-using a converter.
+  var c = open(destEncoding, srcEncoding)
+  try:
+    result = convert(c, s)
+  finally:
+    close(c)
+
+when IsMainModule:
+  var orig = "öäüß"
+  var crap = convert(orig, "CP1252", "UTF-8") # dest CP1252, source UTF-8
+  echo convert(crap) # back again with the defaults: CP1252 to UTF-8
+
+
diff --git a/todo.txt b/todo.txt
index 8826877ad..7b7feba10 100755
--- a/todo.txt
+++ b/todo.txt
@@ -4,10 +4,9 @@ High priority (version 0.8.12)
 
 * add --deadlock_prevention:on|off switch? timeout for locks?
 * built-in serialization
-- bug: invoking a generic iterator twice triggers a code gen bug
+- bug: invoking a generic iterator twice triggers a code gen bug (titer2)
 - pegs: the anchor '^' does not work because many procs use a linear search
   and matchLen()
-- conversion between character sets
 
 
 version 0.9.0
@@ -43,6 +42,7 @@ version 0.9.XX
 
 - distinct types for array/seq indexes
 - GC: marker procs for native Nimrod GC and Boehm GC
+- code concerning 'assert' is wasteful and unnecessarily complex
 - implicit ref/ptr->var conversion; the compiler may store an object
   implicitly on the heap for write barrier efficiency
 - resizing of strings/sequences could take into account the memory that
@@ -52,7 +52,6 @@ version 0.9.XX
   is hard because of partial evaluation --> symbol files will fix this as
   a side effect
 - EcmaScript needs a new and better code gen: simply adapt the C code gen to it
-- prefer proc in current module over other procs with same overloading result?
 - generalized case statement (requires better transf)
 - tlastmod returns wrong results on BSD (Linux, MacOS X: works)
 - nested tuple unpacking
diff --git a/web/news.txt b/web/news.txt
index 321550dfe..84cc34487 100755
--- a/web/news.txt
+++ b/web/news.txt
@@ -54,6 +54,7 @@ Additions
 - Added ``intsets`` module which contains a specialized int set data type.
 - Added ``scgi`` module.
 - Added ``smtp`` module.
+- Added ``encodings`` module.
 - Added ``re.findAll``, ``pegs.findAll``.
 - Added ``os.findExe``.
 - Added ``parseutils.parseUntil`` and ``parseutils.parseWhile``.
diff --git a/web/nimrod.ini b/web/nimrod.ini
index 50716012e..b092f7453 100755
--- a/web/nimrod.ini
+++ b/web/nimrod.ini
@@ -39,7 +39,7 @@ srcdoc: "pure/xmlparser;pure/htmlparser;pure/xmltree;pure/colors"
 srcdoc: "pure/json;pure/base64;pure/scgi;pure/redis;impure/graphics"
 srcdoc: "impure/rdstdin;wrappers/zmq;wrappers/sphinx"
 srcdoc: "pure/collections/tables;pure/collections/sets;pure/collections/lists"
-srcdoc: "pure/collections/intsets"
+srcdoc: "pure/collections/intsets;pure/encodings"
 
 webdoc: "wrappers/libcurl;pure/md5;wrappers/mysql;wrappers/iup"
 webdoc: "wrappers/sqlite3;wrappers/postgres;wrappers/tinyc"
author	Araq <rumpf_a@web.de>	2011-06-16 02:03:33 +0200
committer	Araq <rumpf_a@web.de>	2011-06-16 02:03:33 +0200
commit	de659eba65b5ad098a403449c4b2b1067d04a2cf (patch)
tree	0674a5c9e9569295a7bea36b746624af1b14cf70
parent	9a8b39c85b22518a9c353dcf5d693e701ac272b7 (diff)
download	Nim-de659eba65b5ad098a403449c4b2b1067d04a2cf.tar.gz