# summary refs log blame commit diff stats
# path: root/lib/pure/encodings.nim
# blob: ffdfaa9b45adc5d799fb7a567243a911aae1c161 (plain) (tree)






























































































                                                                                                                              










                                                                             


























































































































































































































































































































































                                                                                                                    

                                        
 
#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2011 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Converts between different character encodings. On UNIX, this uses 
## the `iconv`:idx: library, on Windows the Windows API.

import os, parseutils, strutils

# `PConverter` is the converter handle used throughout this module. Its
# representation depends on the platform: on non-Windows systems it is a
# pointer to an opaque object (the handle returned by `iconvOpen`), on Windows
# it is an object that merely stores the two code pages involved.
when not defined(windows):
  type
    TConverter = object {.pure, final.}
    PConverter* = ptr TConverter ## can convert between two character sets
    
else:
  type
    TCodePage = distinct int32 # numeric identifier of a Windows code page
    PConverter* = object {.pure.}
      dest, src: TCodePage # code page to convert to / to convert from
    
type
  EInvalidEncoding* = object of EInvalidValue ## exception that is raised
                                              ## for encoding errors; `open`
                                              ## raises it when it cannot
                                              ## set up a converter

when defined(windows):
  proc EqEncodingNames(a, b: string): bool =
    ## compares two encoding names for equality. Character case and the
    ## separators '-' and '_' are ignored, so "UTF-8", "utf_8" and "utf8" are
    ## all considered equal.
    var i = 0
    var j = 0
    while true:
      # skip *all* separators at the current positions, but never step past
      # the end of either name:
      while i < a.len and a[i] in {'-', '_'}: inc i
      while j < b.len and b[j] in {'-', '_'}: inc j
      if i >= a.len or j >= b.len: break
      if a[i].tolower != b[j].tolower: return false
      inc i
      inc j
    # equal only if both names have been consumed completely:
    result = i == a.len and j == b.len

  # Table of Windows code pages as (code page number, encoding name) pairs.
  # `nameToCodePage` and `codePageToName` search it linearly from the top, so
  # for a number or name that occurs more than once the first entry wins.
  # Entries with an empty name have no textual name; `codePageToName` yields
  # the number as a string for them. The "cp-125x" entries are additional
  # names for the "windows-125x" code pages.
  const
    winEncodings = [
      (037, "IBM037"), # IBM EBCDIC US-Canada
      (437, "IBM437"), # OEM United States
      (500, "IBM500"), # IBM EBCDIC International
      (708, "ASMO-708"), # Arabic (ASMO 708)
      (709, "ASMO_449"), # Arabic (ASMO-449+, BCON V4)
      (710, ""), # Arabic - Transparent Arabic
      (720, "DOS-720"), # Arabic (Transparent ASMO); Arabic (DOS)
      (737, "ibm737"), # OEM Greek (formerly 437G); Greek (DOS)
      (775, "ibm775"), # OEM Baltic; Baltic (DOS)
      (850, "ibm850"), # OEM Multilingual Latin 1; Western European (DOS)
      (852, "ibm852"), # OEM Latin 2; Central European (DOS)
      (855, "IBM855"), # OEM Cyrillic (primarily Russian)
      (857, "ibm857"), # OEM Turkish; Turkish (DOS)
      (858, "IBM00858"), # OEM Multilingual Latin 1 + Euro symbol
      (860, "IBM860"), # OEM Portuguese; Portuguese (DOS)
      (861, "ibm861"), # OEM Icelandic; Icelandic (DOS)
      (862, "DOS-862"), # OEM Hebrew; Hebrew (DOS)
      (863, "IBM863"), # OEM French Canadian; French Canadian (DOS)
      (864, "IBM864"), # OEM Arabic; Arabic (864)
      (865, "IBM865"), # OEM Nordic; Nordic (DOS)
      (866, "cp866"), # OEM Russian; Cyrillic (DOS)
      (869, "ibm869"), # OEM Modern Greek; Greek, Modern (DOS)
      (870, "IBM870"), # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
      (874, "windows-874"), # ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows)
      (875, "cp875"), # IBM EBCDIC Greek Modern
      (932, "shift_jis"), # ANSI/OEM Japanese; Japanese (Shift-JIS)
      (936, "gb2312"), # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
      (949, "ks_c_5601-1987"), # ANSI/OEM Korean (Unified Hangul Code)
      (950, "big5"), # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
      (1026, "IBM1026"), # IBM EBCDIC Turkish (Latin 5)
      (1047, "IBM01047"), # IBM EBCDIC Latin 1/Open System
      (1140, "IBM01140"), # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
      (1141, "IBM01141"), # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
      (1142, "IBM01142"), # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
      (1143, "IBM01143"), # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
      (1144, "IBM01144"), # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
      (1145, "IBM01145"), # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
      (1146, "IBM01146"), # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
      (1147, "IBM01147"), # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
      (1148, "IBM01148"), # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
      (1149, "IBM01149"), # IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
      (1200, "utf-16"), # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
      (1201, "unicodeFFFE"), # Unicode UTF-16, big endian byte order; available only to managed applications
      (1250, "windows-1250"), # ANSI Central European; Central European (Windows)
      (1251, "windows-1251"), # ANSI Cyrillic; Cyrillic (Windows)
      (1252, "windows-1252"), # ANSI Latin 1; Western European (Windows)
      (1253, "windows-1253"), # ANSI Greek; Greek (Windows)
      (1254, "windows-1254"), # ANSI Turkish; Turkish (Windows)
      (1255, "windows-1255"), # ANSI Hebrew; Hebrew (Windows)
      (1256, "windows-1256"), # ANSI Arabic; Arabic (Windows)
      (1257, "windows-1257"), # ANSI Baltic; Baltic (Windows)
      (1258, "windows-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows)

      (1250, "cp-1250"), # ANSI Central European; Central European (Windows)
      (1251, "cp-1251"), # ANSI Cyrillic; Cyrillic (Windows)
      (1252, "cp-1252"), # ANSI Latin 1; Western European (Windows)
      (1253, "cp-1253"), # ANSI Greek; Greek (Windows)
      (1254, "cp-1254"), # ANSI Turkish; Turkish (Windows)
      (1255, "cp-1255"), # ANSI Hebrew; Hebrew (Windows)
      (1256, "cp-1256"), # ANSI Arabic; Arabic (Windows)
      (1257, "cp-1257"), # ANSI Baltic; Baltic (Windows)
      (1258, "cp-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows)

      (1361, "Johab"), # Korean (Johab)
      (10000, "macintosh"), # MAC Roman; Western European (Mac)
      (10001, "x-mac-japanese"), # Japanese (Mac)
      (10002, "x-mac-chinesetrad"), # MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
      (10003, "x-mac-korean"), # Korean (Mac)
      (10004, "x-mac-arabic"), # Arabic (Mac)
      (10005, "x-mac-hebrew"), # Hebrew (Mac)
      (10006, "x-mac-greek"), # Greek (Mac)
      (10007, "x-mac-cyrillic"), # Cyrillic (Mac)
      (10008, "x-mac-chinesesimp"), # MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
      (10010, "x-mac-romanian"), # Romanian (Mac)
      (10017, "x-mac-ukrainian"), # Ukrainian (Mac)
      (10021, "x-mac-thai"), # Thai (Mac)
      (10029, "x-mac-ce"), # MAC Latin 2; Central European (Mac)
      (10079, "x-mac-icelandic"), # Icelandic (Mac)
      (10081, "x-mac-turkish"), # Turkish (Mac)
      (10082, "x-mac-croatian"), # Croatian (Mac)
      (12000, "utf-32"), # Unicode UTF-32, little endian byte order; available only to managed applications
      (12001, "utf-32BE"), # Unicode UTF-32, big endian byte order; available only to managed applications
      (20000, "x-Chinese_CNS"), # CNS Taiwan; Chinese Traditional (CNS)
      (20001, "x-cp20001"), # TCA Taiwan
      (20002, "x_Chinese-Eten"), # Eten Taiwan; Chinese Traditional (Eten)
      (20003, "x-cp20003"), # IBM5550 Taiwan
      (20004, "x-cp20004"), # TeleText Taiwan
      (20005, "x-cp20005"), # Wang Taiwan
      (20105, "x-IA5"), # IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
      (20106, "x-IA5-German"), # IA5 German (7-bit)
      (20107, "x-IA5-Swedish"), # IA5 Swedish (7-bit)
      (20108, "x-IA5-Norwegian"), # IA5 Norwegian (7-bit)
      (20127, "us-ascii"), # US-ASCII (7-bit)
      (20261, "x-cp20261"), # T.61
      (20269, "x-cp20269"), # ISO 6937 Non-Spacing Accent
      (20273, "IBM273"), # IBM EBCDIC Germany
      (20277, "IBM277"), # IBM EBCDIC Denmark-Norway
      (20278, "IBM278"), # IBM EBCDIC Finland-Sweden
      (20280, "IBM280"), # IBM EBCDIC Italy
      (20284, "IBM284"), # IBM EBCDIC Latin America-Spain
      (20285, "IBM285"), # IBM EBCDIC United Kingdom
      (20290, "IBM290"), # IBM EBCDIC Japanese Katakana Extended
      (20297, "IBM297"), # IBM EBCDIC France
      (20420, "IBM420"), # IBM EBCDIC Arabic
      (20423, "IBM423"), # IBM EBCDIC Greek
      (20424, "IBM424"), # IBM EBCDIC Hebrew
      (20833, "x-EBCDIC-KoreanExtended"), # IBM EBCDIC Korean Extended
      (20838, "IBM-Thai"), # IBM EBCDIC Thai
      (20866, "koi8-r"), # Russian (KOI8-R); Cyrillic (KOI8-R)
      (20871, "IBM871"), # IBM EBCDIC Icelandic
      (20880, "IBM880"), # IBM EBCDIC Cyrillic Russian
      (20905, "IBM905"), # IBM EBCDIC Turkish
      (20924, "IBM00924"), # IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
      (20932, "EUC-JP"), # Japanese (JIS 0208-1990 and 0121-1990)
      (20936, "x-cp20936"), # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
      (20949, "x-cp20949"), # Korean Wansung
      (21025, "cp1025"), # IBM EBCDIC Cyrillic Serbian-Bulgarian
      (21027, ""), # (deprecated)
      (21866, "koi8-u"), # Ukrainian (KOI8-U); Cyrillic (KOI8-U)
      (28591, "iso-8859-1"), # ISO 8859-1 Latin 1; Western European (ISO)
      (28592, "iso-8859-2"), # ISO 8859-2 Central European; Central European (ISO)
      (28593, "iso-8859-3"), # ISO 8859-3 Latin 3
      (28594, "iso-8859-4"), # ISO 8859-4 Baltic
      (28595, "iso-8859-5"), # ISO 8859-5 Cyrillic
      (28596, "iso-8859-6"), # ISO 8859-6 Arabic
      (28597, "iso-8859-7"), # ISO 8859-7 Greek
      (28598, "iso-8859-8"), # ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
      (28599, "iso-8859-9"), # ISO 8859-9 Turkish
      (28603, "iso-8859-13"), # ISO 8859-13 Estonian
      (28605, "iso-8859-15"), # ISO 8859-15 Latin 9
      (29001, "x-Europa"), # Europa 3
      (38598, "iso-8859-8-i"), # ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
      (50220, "iso-2022-jp"), # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
      (50221, "csISO2022JP"), # ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
      (50222, "iso-2022-jp"), # ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
      (50225, "iso-2022-kr"), # ISO 2022 Korean
      (50227, "x-cp50227"), # ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
      (50229, ""), # ISO 2022 Traditional Chinese
      (50930, ""), # EBCDIC Japanese (Katakana) Extended
      (50931, ""), # EBCDIC US-Canada and Japanese
      (50933, ""), # EBCDIC Korean Extended and Korean
      (50935, ""), # EBCDIC Simplified Chinese Extended and Simplified Chinese
      (50936, ""), # EBCDIC Simplified Chinese
      (50937, ""), # EBCDIC US-Canada and Traditional Chinese
      (50939, ""), # EBCDIC Japanese (Latin) Extended and Japanese
      (51932, "euc-jp"), # EUC Japanese
      (51936, "EUC-CN"), # EUC Simplified Chinese; Chinese Simplified (EUC)
      (51949, "euc-kr"), # EUC Korean
      (51950, ""), # EUC Traditional Chinese
      (52936, "hz-gb-2312"), # HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
      (54936, "GB18030"), # Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
      (57002, "x-iscii-de"), # ISCII Devanagari
      (57003, "x-iscii-be"), # ISCII Bengali
      (57004, "x-iscii-ta"), # ISCII Tamil
      (57005, "x-iscii-te"), # ISCII Telugu
      (57006, "x-iscii-as"), # ISCII Assamese
      (57007, "x-iscii-or"), # ISCII Oriya
      (57008, "x-iscii-ka"), # ISCII Kannada
      (57009, "x-iscii-ma"), # ISCII Malayalam
      (57010, "x-iscii-gu"), # ISCII Gujarati
      (57011, "x-iscii-pa"), # ISCII Punjabi
      (65000, "utf-7"), # Unicode (UTF-7)
      (65001, "utf-8")] # Unicode (UTF-8)
  
  when false:
    # not needed yet: disabled declarations for querying information about a
    # code page through the `GetCPInfo` function of kernel32. Nothing in this
    # module refers to them.
    # NOTE(review): `TCpInfo` presumably mirrors the Win32 `CPINFO` struct --
    # confirm the field types against the Windows headers before enabling.
    type
      TCpInfo = object {.pure.}
        MaxCharSize: int32
        DefaultChar: array[0..1, char]
        LeadByte: array[0..12-1, char]

    proc GetCPInfo(CodePage: TCodePage, lpCPInfo: var TCpInfo): int32 {.
      stdcall, importc: "GetCPInfo", dynlib: "kernel32".}
  
  proc nameToCodePage(name: string): TCodePage =
    ## maps an encoding name to its Windows code page. `name` is either a
    ## code page number written in decimal ("1252") or one of the names listed
    ## in `winEncodings`; names are compared with `EqEncodingNames`.
    ## Returns ``TCodePage(-1)`` if no entry matches.
    var nameAsInt: int
    # `parseInt` returns the number of characters it consumed; `name` only
    # counts as a code page number if it is numeric in its entirety
    # ("1252abc" must not select code page 1252):
    if name.len == 0 or parseInt(name, nameAsInt) != name.len: nameAsInt = -1
    for no, na in items(winEncodings):
      if no == nameAsInt: return TCodePage(no)
      # entries without a name can only be selected by number; otherwise an
      # empty `name` would match the first unnamed entry:
      if na.len != 0 and EqEncodingNames(na, name): return TCodePage(no)
    result = TCodePage(-1)
    
  proc codePageToName(c: TCodePage): string =
    ## yields the name of the first `winEncodings` entry whose number is `c`.
    ## If that entry has no name, the number itself is returned as a string.
    ## An unknown code page yields the empty string.
    var wanted = int(c)
    for no, na in items(winEncodings):
      if no != wanted: continue
      if na.len == 0: return $no
      return na
    result = ""
  
  # binding to `GetACP` from kernel32; `getCurrentEncoding` turns the code
  # page it returns into the name of the current encoding.
  proc GetACP(): TCodePage {.stdcall, importc: "GetACP", dynlib: "kernel32".}
  
  # binding to `MultiByteToWideChar` from kernel32. `convert` uses it to turn
  # text in the source code page into UTF-16. There it is treated as
  # returning the number of wide characters written and 0 on error; when it
  # is called with a nil output buffer and `cchWideChar = 0`, `convert` takes
  # the result as the required capacity.
  proc MultiByteToWideChar(
    CodePage: TCodePage,
    dwFlags: int32,
    lpMultiByteStr: cstring,
    cbMultiByte: cint,
    lpWideCharStr: cstring,
    cchWideChar: cint): cint {.
      stdcall, importc: "MultiByteToWideChar", dynlib: "kernel32".}

  # binding to `WideCharToMultiByte` from kernel32. `convert` uses it to turn
  # the intermediate UTF-16 text into the destination code page. There it is
  # treated as returning the number of bytes written and 0 on error; when it
  # is called with a nil output buffer and `cbMultiByte = 0`, `convert` takes
  # the result as the required capacity. The last two parameters default to
  # nil and are never passed by this module.
  proc WideCharToMultiByte(
    CodePage: TCodePage,
    dwFlags: int32,
    lpWideCharStr: cstring,
    cchWideChar: cint,
    lpMultiByteStr: cstring,
    cbMultiByte: cint,
    lpDefaultChar: cstring=nil,
    lpUsedDefaultChar: pointer=nil): cint {.
      stdcall, importc: "WideCharToMultiByte", dynlib: "kernel32".}
  
else:
  # `iconvDll` lists the alternative libraries the iconv functions are loaded
  # from; haiku gets one more candidate than the other platforms.
  when defined(haiku):
    const iconvDll = "(libc.so.6|libiconv.so|libtextencoding.so)"
  else:
    const iconvDll = "(libc.so.6|libiconv.so)"

  # `prefix` is prepended to the imported symbol names ("iconv_open" becomes
  # "libiconv_open"); it is only non-empty on 32 bit PowerPC Mac OS X.
  when defined(macosx) and defined(powerpc32):
    const prefix = "lib"
  else:
    const prefix = ""

  # errno values that `convert` checks for after a failed `iconv` call. They
  # are hard-coded per platform instead of being imported from <errno.h>.
  # NOTE(review): `EILSEQ` is only defined for linux, macosx, bsd and solaris;
  # on any other non-Windows target nothing declares it -- confirm which
  # further platforms need a value.
  const
    E2BIG = 7.cint # `convert` enlarges its output buffer on this error
    EINVAL = 22.cint # `convert` copies the offending byte and moves on
  when defined(linux):
    const EILSEQ = 84.cint # handled by `convert` exactly like `EINVAL`
  elif defined(macosx):
    const EILSEQ = 92.cint
  elif defined(bsd):
    const EILSEQ = 86.cint
  elif defined(solaris):
    const EILSEQ = 88.cint

  # the `errno` variable of C, read by `convert` after `iconv` returned -1
  var errno {.importc, header: "<errno.h>".}: cint

  # bindings to the iconv functions, loaded dynamically from `iconvDll`.

  # creates a converter; note the parameter order: the encoding to convert
  # *to* comes first, the encoding to convert *from* second.
  proc iconvOpen(tocode, fromcode: cstring): PConverter {.
    importc: prefix & "iconv_open", cdecl, dynlib: iconvDll.}
  # releases a converter created by `iconvOpen`.
  proc iconvClose(c: PConverter) {.
    importc: prefix & "iconv_close", cdecl, dynlib: iconvDll.}
  # converts input from `inbuf` into `outbuf`; the buffer pointers and the
  # byte counts are passed as `var` so that the call can update them.
  # `convert` treats a result of -1 as failure and then inspects `errno`.
  proc iconv(c: PConverter, inbuf: var cstring, inbytesLeft: var int,
             outbuf: var cstring, outbytesLeft: var int): int {.
    importc: prefix & "iconv", cdecl, dynlib: iconvDll.}
  # overload of the same C function that allows passing nil for the input
  # buffer and input length; `convert` uses it for its final flush call.
  proc iconv(c: PConverter, inbuf: pointer, inbytesLeft: pointer,
             outbuf: var cstring, outbytesLeft: var int): int {.
    importc: prefix & "iconv", cdecl, dynlib: iconvDll.}
  
proc getCurrentEncoding*(): string =
  ## returns the name of the current encoding. On Windows this is the name of
  ## the code page reported by ``GetACP``; everywhere else the result is
  ## always "UTF-8".
  when not defined(windows):
    result = "UTF-8"
  else:
    result = codePageToName(GetACP())
  
proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): PConverter =
  ## opens a converter that can convert from `srcEncoding` to `destEncoding`.
  ## Raises `EInvalidEncoding` if it cannot fulfill the request.
  when not defined(windows):
    # `iconvOpen` takes the target encoding first and the source encoding
    # second:
    result = iconvOpen(destEncoding, srcEncoding)
    # iconv_open reports failure by returning ``(iconv_t)-1``, not a nil
    # pointer; the nil check is kept as an additional safeguard:
    if result == nil or cast[int](result) == -1:
      raise newException(EInvalidEncoding, 
        "cannot create encoding converter from " & 
        srcEncoding & " to " & destEncoding)
  else:
    # on Windows a converter is just the pair of code pages; -1 marks a name
    # that `nameToCodePage` could not resolve:
    result.dest = nameToCodePage(destEncoding)
    result.src = nameToCodePage(srcEncoding)
    if int(result.dest) == -1:
      raise newException(EInvalidEncoding, 
        "cannot find encoding " & destEncoding)
    if int(result.src) == -1:
      raise newException(EInvalidEncoding, 
        "cannot find encoding " & srcEncoding)

proc close*(c: PConverter) =
  ## releases the resources held by the converter `c`. Only the iconv based
  ## implementation has something to release; on Windows this does nothing.
  when not defined(windows):
    c.iconvClose()

when defined(windows):

  proc convert*(c: PConverter, s: string): string =
    ## converts `s` to `destEncoding` that was given to the converter `c`. It
    ## is assumed that `s` is in `srcEncoding`.
    ##
    ## The text is first converted to UTF-16 LE and from there to the
    ## destination code page. `OSError` is called if the Windows API reports
    ## a failure.
    
    # special case: empty string: needed because MultiByteToWideChar
    # returns 0 in case of error:
    if s.len == 0: return ""
    # educated guess of capacity (counted in UTF-16 code units):
    var cap = s.len + s.len shr 2
    # the API writes through the raw pointer, so the string has to really be
    # that long (having the capacity is not enough); it is cut down to the
    # size actually used afterwards:
    result = newString(cap*2)
    # convert to utf-16 LE
    var m = MultiByteToWideChar(CodePage = c.src, dwFlags = 0'i32, 
                                lpMultiByteStr = cstring(s),
                                cbMultiByte = cint(s.len),
                                lpWideCharStr = cstring(result),
                                cchWideChar = cint(cap))
    if m == 0: 
      # try again; ask for capacity:
      cap = MultiByteToWideChar(CodePage = c.src, dwFlags = 0'i32, 
                                lpMultiByteStr = cstring(s),
                                cbMultiByte = cint(s.len),
                                lpWideCharStr = nil,
                                cchWideChar = cint(0))
      # and do the conversion properly:
      result = newString(cap*2)
      m = MultiByteToWideChar(CodePage = c.src, dwFlags = 0'i32, 
                              lpMultiByteStr = cstring(s),
                              cbMultiByte = cint(s.len),
                              lpWideCharStr = cstring(result),
                              cchWideChar = cint(cap))
      if m == 0: OSError()
      setLen(result, m*2)
    elif m <= cap:
      setLen(result, m*2)
    else:
      assert(false) # cannot happen
    
    # if already utf-16 LE, no further need to do something:
    if int(c.dest) == 1200: return
    # otherwise the fun starts again (this time `cap` counts bytes):
    cap = s.len + s.len shr 2
    var res = newString(cap)
    m = WideCharToMultiByte(
      CodePage = c.dest,
      dwFlags = 0'i32,
      lpWideCharStr = cstring(result),
      cchWideChar = cint(result.len div 2),
      lpMultiByteStr = cstring(res),
      cbMultiByte = cint(cap))
    if m == 0:
      # try again; ask for capacity:
      cap = WideCharToMultiByte(
        CodePage = c.dest,
        dwFlags = 0'i32,
        lpWideCharStr = cstring(result),
        cchWideChar = cint(result.len div 2),
        lpMultiByteStr = nil,
        cbMultiByte = cint(0))
      # and do the conversion properly:
      res = newString(cap)
      m = WideCharToMultiByte(
        CodePage = c.dest,
        dwFlags = 0'i32,
        lpWideCharStr = cstring(result),
        cchWideChar = cint(result.len div 2),
        lpMultiByteStr = cstring(res),
        cbMultiByte = cint(cap))
      if m == 0: OSError()
      setLen(res, m)
      result = res
    elif m <= cap:
      setLen(res, m)
      result = res
    else:
      assert(false) # cannot happen

else:

  proc growOutput(buf: var string, dst: var cstring, outLen: var int,
                  inLen: int) =
    ## enlarges the output buffer `buf` by ``inLen*2+5`` bytes. Growing may
    ## move the buffer, so `dst` is re-pointed at the same offset within the
    ## new buffer and `outLen` is set to the space that is left behind `dst`.
    var offset = cast[int](dst) - cast[int](cstring(buf))
    setLen(buf, len(buf)+inLen*2+5)
    # 5 is minimally one utf-8 char
    dst = cast[cstring](cast[int](cstring(buf)) + offset)
    outLen = len(buf) - offset

  proc convert*(c: PConverter, s: string): string =
    ## converts `s` to `destEncoding` that was given to the converter `c`. It
    ## is assumed that `s` is in `srcEncoding`.
    ##
    ## Bytes that iconv rejects are copied to the output unchanged. `OSError`
    ## is called if iconv fails for any other reason.
    result = newString(s.len)
    var inLen = len(s)
    var outLen = len(result)
    var src = cstring(s)
    var dst = cstring(result)
    var iconvres: int
    while inLen > 0:
      iconvres = iconv(c, src, inLen, dst, outLen)
      if iconvres == -1:
        var lerr = errno
        if lerr == EILSEQ or lerr == EINVAL:
          # unknown char, skip: copy the byte as it is. Make sure there is
          # room for it first; the output buffer may be exactly full.
          if outLen <= 0: growOutput(result, dst, outLen, inLen)
          dst[0] = src[0]
          src = cast[cstring](cast[int](src) + 1)
          dst = cast[cstring](cast[int](dst) + 1)
          dec(inLen)
          dec(outLen)
        elif lerr == E2BIG:
          growOutput(result, dst, outLen, inLen)
        else:
          OSError()
    # iconv has a buffer that needs flushing, specially if the last char is 
    # not '\0'. The result of the flush itself has to be inspected, not the
    # result of the last conversion call in the loop above:
    iconvres = iconv(c, nil, nil, dst, outLen)
    if iconvres == -1 and errno == E2BIG:
      growOutput(result, dst, outLen, inLen)
      discard iconv(c, nil, nil, dst, outLen)
    # trim output buffer
    setLen(result, len(result) - outLen)

proc convert*(s: string, destEncoding = "UTF-8", 
                         srcEncoding = "CP1252"): string =
  ## converts `s` to `destEncoding`. It is assumed that `s` is in
  ## `srcEncoding`. A converter is opened for this single call and closed
  ## again afterwards, even if the conversion raises. This is more convenient
  ## but also likely less efficient than re-using a converter.
  var conv = open(destEncoding, srcEncoding)
  try:
    result = conv.convert(s)
  finally:
    conv.close()

when IsMainModule:
  # smoke test: take a UTF-8 literal to CP1252, print that text converted
  # further to ibm850 and finally print the name of the current encoding.
  var utf8Text = "öäüß"
  var cp1252Text = convert(utf8Text, "CP1252", "UTF-8")
  echo convert(cp1252Text, "ibm850", "CP1252")
  echo getCurrentEncoding()