diff options
Diffstat (limited to 'lib/pure/encodings.nim')
-rw-r--r-- | lib/pure/encodings.nim | 619 |
1 files changed, 333 insertions, 286 deletions
diff --git a/lib/pure/encodings.nim b/lib/pure/encodings.nim index 25c7ad9ef..bbadca655 100644 --- a/lib/pure/encodings.nim +++ b/lib/pure/encodings.nim @@ -7,15 +7,46 @@ # distribution, for details about the copyright. # -## Converts between different character encodings. On UNIX, this uses +## Routines for converting between different character encodings. On UNIX, this uses ## the `iconv`:idx: library, on Windows the Windows API. +## +## The following example shows how to change character encodings. +runnableExamples: + when defined(windows): + let + orig = "öäüß" + # convert `orig` from "UTF-8" to "CP1252" + cp1252 = convert(orig, "CP1252", "UTF-8") + # convert `cp1252` from "CP1252" to "ibm850" + ibm850 = convert(cp1252, "ibm850", "CP1252") + current = getCurrentEncoding() + assert orig == "\195\182\195\164\195\188\195\159" + assert ibm850 == "\148\132\129\225" + assert convert(ibm850, current, "ibm850") == orig + +## The example below uses a reuseable `EncodingConverter` object which is +## created by `open` with `destEncoding` and `srcEncoding` specified. You can use +## `convert` on this object multiple times. +runnableExamples: + when defined(windows): + var fromGB2312 = open("utf-8", "gb2312") + let first = "\203\173\197\194\163\191\210\187" & + "\203\242\209\204\211\234\200\206\198\189\201\250" + assert fromGB2312.convert(first) == "谁怕?一蓑烟雨任平生" + + let second = "\211\208\176\215\205\183\200\231" & + "\208\194\163\172\199\227\184\199\200\231\185\202" + assert fromGB2312.convert(second) == "有白头如新,倾盖如故" + -import os, parseutils, strutils +import std/os +when defined(nimPreviewSlimSystem): + import std/assertions when not defined(windows): type ConverterObj = object - EncodingConverter* = ptr ConverterObj ## can convert between two character sets + EncodingConverter* = ptr ConverterObj ## Can convert between two character sets. else: type @@ -24,216 +55,220 @@ else: dest, src: CodePage type - EncodingError* = object of ValueError ## exception that is raised - ## for encoding errors - -{.deprecated: [EInvalidEncoding: EncodingError, PConverter: EncodingConverter].} + EncodingError* = object of ValueError ## Exception that is raised + ## for encoding errors. when defined(windows): + import std/[parseutils, strutils] proc eqEncodingNames(a, b: string): bool = var i = 0 var j = 0 while i < a.len and j < b.len: if a[i] in {'-', '_'}: inc i if b[j] in {'-', '_'}: inc j - if a[i].toLower != b[j].toLower: return false + if i < a.len and j < b.len and + a[i].toLowerAscii != b[j].toLowerAscii: + return false inc i inc j result = i == a.len and j == b.len - const + const winEncodings = [ - (1, "OEMCP"), # current OEM codepage - (037, "IBM037"), # IBM EBCDIC US-Canada - (437, "IBM437"), # OEM United States - (500, "IBM500"), # IBM EBCDIC International - (708, "ASMO-708"), # Arabic (ASMO 708) - (709, "ASMO_449"), # Arabic (ASMO-449+, BCON V4) - (710, ""), # Arabic - Transparent Arabic - (720, "DOS-720"), # Arabic (Transparent ASMO); Arabic (DOS) - (737, "ibm737"), # OEM Greek (formerly 437G); Greek (DOS) - (775, "ibm775"), # OEM Baltic; Baltic (DOS) - (850, "ibm850"), # OEM Multilingual Latin 1; Western European (DOS) - (852, "ibm852"), # OEM Latin 2; Central European (DOS) - (855, "IBM855"), # OEM Cyrillic (primarily Russian) - (857, "ibm857"), # OEM Turkish; Turkish (DOS) - (858, "IBM00858"), # OEM Multilingual Latin 1 + Euro symbol - (860, "IBM860"), # OEM Portuguese; Portuguese (DOS) - (861, "ibm861"), # OEM Icelandic; Icelandic (DOS) - (862, "DOS-862"), # OEM Hebrew; Hebrew (DOS) - (863, "IBM863"), # OEM French Canadian; French Canadian (DOS) - (864, "IBM864"), # OEM Arabic; Arabic (864) - (865, "IBM865"), # OEM Nordic; Nordic (DOS) - (866, "cp866"), # OEM Russian; Cyrillic (DOS) - (869, "ibm869"), # OEM Modern Greek; Greek, Modern (DOS) - (870, "IBM870"), # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 - (874, "windows-874"), # ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) - (875, "cp875"), # IBM EBCDIC Greek Modern - (932, "shift_jis"), # ANSI/OEM Japanese; Japanese (Shift-JIS) - (936, "gb2312"), # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) - (949, "ks_c_5601-1987"), # ANSI/OEM Korean (Unified Hangul Code) - (950, "big5"), # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) - (1026, "IBM1026"), # IBM EBCDIC Turkish (Latin 5) - (1047, "IBM01047"), # IBM EBCDIC Latin 1/Open System - (1140, "IBM01140"), # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) - (1141, "IBM01141"), # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) - (1142, "IBM01142"), # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) - (1143, "IBM01143"), # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) - (1144, "IBM01144"), # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) - (1145, "IBM01145"), # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) - (1146, "IBM01146"), # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) - (1147, "IBM01147"), # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) - (1148, "IBM01148"), # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) - (1149, "IBM01149"), # IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) - (1200, "utf-16"), # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications - (1201, "unicodeFFFE"), # Unicode UTF-16, big endian byte order; available only to managed applications - (1250, "windows-1250"), # ANSI Central European; Central European (Windows) - (1251, "windows-1251"), # ANSI Cyrillic; Cyrillic (Windows) - (1252, "windows-1252"), # ANSI Latin 1; Western European (Windows) - (1253, "windows-1253"), # ANSI Greek; Greek (Windows) - (1254, "windows-1254"), # ANSI Turkish; Turkish (Windows) - (1255, "windows-1255"), # ANSI Hebrew; Hebrew (Windows) - (1256, "windows-1256"), # ANSI Arabic; Arabic (Windows) - (1257, "windows-1257"), # ANSI Baltic; Baltic (Windows) - (1258, "windows-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows) - - (1250, "cp-1250"), # ANSI Central European; Central European (Windows) - (1251, "cp-1251"), # ANSI Cyrillic; Cyrillic (Windows) - (1252, "cp-1252"), # ANSI Latin 1; Western European (Windows) - (1253, "cp-1253"), # ANSI Greek; Greek (Windows) - (1254, "cp-1254"), # ANSI Turkish; Turkish (Windows) - (1255, "cp-1255"), # ANSI Hebrew; Hebrew (Windows) - (1256, "cp-1256"), # ANSI Arabic; Arabic (Windows) - (1257, "cp-1257"), # ANSI Baltic; Baltic (Windows) - (1258, "cp-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows) - - (1361, "Johab"), # Korean (Johab) - (10000, "macintosh"), # MAC Roman; Western European (Mac) - (10001, "x-mac-japanese"), # Japanese (Mac) - (10002, "x-mac-chinesetrad"), # MAC Traditional Chinese (Big5); Chinese Traditional (Mac) - (10003, "x-mac-korean"), # Korean (Mac) - (10004, "x-mac-arabic"), # Arabic (Mac) - (10005, "x-mac-hebrew"), # Hebrew (Mac) - (10006, "x-mac-greek"), # Greek (Mac) - (10007, "x-mac-cyrillic"), # Cyrillic (Mac) - (10008, "x-mac-chinesesimp"), # MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) - (10010, "x-mac-romanian"), # Romanian (Mac) - (10017, "x-mac-ukrainian"), # Ukrainian (Mac) - (10021, "x-mac-thai"), # Thai (Mac) - (10029, "x-mac-ce"), # MAC Latin 2; Central European (Mac) - (10079, "x-mac-icelandic"), # Icelandic (Mac) - (10081, "x-mac-turkish"), # Turkish (Mac) - (10082, "x-mac-croatian"), # Croatian (Mac) - (12000, "utf-32"), # Unicode UTF-32, little endian byte order; available only to managed applications - (12001, "utf-32BE"), # Unicode UTF-32, big endian byte order; available only to managed applications - (20000, "x-Chinese_CNS"), # CNS Taiwan; Chinese Traditional (CNS) - (20001, "x-cp20001"), # TCA Taiwan - (20002, "x_Chinese-Eten"), # Eten Taiwan; Chinese Traditional (Eten) - (20003, "x-cp20003"), # IBM5550 Taiwan - (20004, "x-cp20004"), # TeleText Taiwan - (20005, "x-cp20005"), # Wang Taiwan - (20105, "x-IA5"), # IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) - (20106, "x-IA5-German"), # IA5 German (7-bit) - (20107, "x-IA5-Swedish"), # IA5 Swedish (7-bit) - (20108, "x-IA5-Norwegian"), # IA5 Norwegian (7-bit) - (20127, "us-ascii"), # US-ASCII (7-bit) - (20261, "x-cp20261"), # T.61 - (20269, "x-cp20269"), # ISO 6937 Non-Spacing Accent - (20273, "IBM273"), # IBM EBCDIC Germany - (20277, "IBM277"), # IBM EBCDIC Denmark-Norway - (20278, "IBM278"), # IBM EBCDIC Finland-Sweden - (20280, "IBM280"), # IBM EBCDIC Italy - (20284, "IBM284"), # IBM EBCDIC Latin America-Spain - (20285, "IBM285"), # IBM EBCDIC United Kingdom - (20290, "IBM290"), # IBM EBCDIC Japanese Katakana Extended - (20297, "IBM297"), # IBM EBCDIC France - (20420, "IBM420"), # IBM EBCDIC Arabic - (20423, "IBM423"), # IBM EBCDIC Greek - (20424, "IBM424"), # IBM EBCDIC Hebrew - (20833, "x-EBCDIC-KoreanExtended"), # IBM EBCDIC Korean Extended - (20838, "IBM-Thai"), # IBM EBCDIC Thai - (20866, "koi8-r"), # Russian (KOI8-R); Cyrillic (KOI8-R) - (20871, "IBM871"), # IBM EBCDIC Icelandic - (20880, "IBM880"), # IBM EBCDIC Cyrillic Russian - (20905, "IBM905"), # IBM EBCDIC Turkish - (20924, "IBM00924"), # IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) - (20932, "EUC-JP"), # Japanese (JIS 0208-1990 and 0121-1990) - (20936, "x-cp20936"), # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) - (20949, "x-cp20949"), # Korean Wansung - (21025, "cp1025"), # IBM EBCDIC Cyrillic Serbian-Bulgarian - (21027, ""), # (deprecated) - (21866, "koi8-u"), # Ukrainian (KOI8-U); Cyrillic (KOI8-U) - (28591, "iso-8859-1"), # ISO 8859-1 Latin 1; Western European (ISO) - (28592, "iso-8859-2"), # ISO 8859-2 Central European; Central European (ISO) - (28593, "iso-8859-3"), # ISO 8859-3 Latin 3 - (28594, "iso-8859-4"), # ISO 8859-4 Baltic - (28595, "iso-8859-5"), # ISO 8859-5 Cyrillic - (28596, "iso-8859-6"), # ISO 8859-6 Arabic - (28597, "iso-8859-7"), # ISO 8859-7 Greek - (28598, "iso-8859-8"), # ISO 8859-8 Hebrew; Hebrew (ISO-Visual) - (28599, "iso-8859-9"), # ISO 8859-9 Turkish - (28603, "iso-8859-13"), # ISO 8859-13 Estonian - (28605, "iso-8859-15"), # ISO 8859-15 Latin 9 - (29001, "x-Europa"), # Europa 3 - (38598, "iso-8859-8-i"), # ISO 8859-8 Hebrew; Hebrew (ISO-Logical) - (50220, "iso-2022-jp"), # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) - (50221, "csISO2022JP"), # ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) - (50222, "iso-2022-jp"), # ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) - (50225, "iso-2022-kr"), # ISO 2022 Korean - (50227, "x-cp50227"), # ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) - (50229, ""), # ISO 2022 Traditional Chinese - (50930, ""), # EBCDIC Japanese (Katakana) Extended - (50931, ""), # EBCDIC US-Canada and Japanese - (50933, ""), # EBCDIC Korean Extended and Korean - (50935, ""), # EBCDIC Simplified Chinese Extended and Simplified Chinese - (50936, ""), # EBCDIC Simplified Chinese - (50937, ""), # EBCDIC US-Canada and Traditional Chinese - (50939, ""), # EBCDIC Japanese (Latin) Extended and Japanese - (51932, "euc-jp"), # EUC Japanese - (51936, "EUC-CN"), # EUC Simplified Chinese; Chinese Simplified (EUC) - (51949, "euc-kr"), # EUC Korean - (51950, ""), # EUC Traditional Chinese - (52936, "hz-gb-2312"), # HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) - (54936, "GB18030"), # Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) - (57002, "x-iscii-de"), # ISCII Devanagari - (57003, "x-iscii-be"), # ISCII Bengali - (57004, "x-iscii-ta"), # ISCII Tamil - (57005, "x-iscii-te"), # ISCII Telugu - (57006, "x-iscii-as"), # ISCII Assamese - (57007, "x-iscii-or"), # ISCII Oriya - (57008, "x-iscii-ka"), # ISCII Kannada - (57009, "x-iscii-ma"), # ISCII Malayalam - (57010, "x-iscii-gu"), # ISCII Gujarati - (57011, "x-iscii-pa"), # ISCII Punjabi - (65000, "utf-7"), # Unicode (UTF-7) - (65001, "utf-8")] # Unicode (UTF-8) - + (1, "OEMCP"), # current OEM codepage + (037, "IBM037"), # IBM EBCDIC US-Canada + (437, "IBM437"), # OEM United States + (500, "IBM500"), # IBM EBCDIC International + (708, "ASMO-708"), # Arabic (ASMO 708) + (709, "ASMO_449"), # Arabic (ASMO-449+, BCON V4) + (710, ""), # Arabic - Transparent Arabic + (720, "DOS-720"), # Arabic (Transparent ASMO); Arabic (DOS) + (737, "ibm737"), # OEM Greek (formerly 437G); Greek (DOS) + (775, "ibm775"), # OEM Baltic; Baltic (DOS) + (850, "ibm850"), # OEM Multilingual Latin 1; Western European (DOS) + (852, "ibm852"), # OEM Latin 2; Central European (DOS) + (855, "IBM855"), # OEM Cyrillic (primarily Russian) + (857, "ibm857"), # OEM Turkish; Turkish (DOS) + (858, "IBM00858"), # OEM Multilingual Latin 1 + Euro symbol + (860, "IBM860"), # OEM Portuguese; Portuguese (DOS) + (861, "ibm861"), # OEM Icelandic; Icelandic (DOS) + (862, "DOS-862"), # OEM Hebrew; Hebrew (DOS) + (863, "IBM863"), # OEM French Canadian; French Canadian (DOS) + (864, "IBM864"), # OEM Arabic; Arabic (864) + (865, "IBM865"), # OEM Nordic; Nordic (DOS) + (866, "cp866"), # OEM Russian; Cyrillic (DOS) + (869, "ibm869"), # OEM Modern Greek; Greek, Modern (DOS) + (870, "IBM870"), # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 + (874, "windows-874"), # ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) + (875, "cp875"), # IBM EBCDIC Greek Modern + (932, "shift_jis"), # ANSI/OEM Japanese; Japanese (Shift-JIS) + (936, "gb2312"), # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) + (936, "gbk"), # Alias for GB2312 encoding + (949, "ks_c_5601-1987"), # ANSI/OEM Korean (Unified Hangul Code) + (950, "big5"), # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) + (1026, "IBM1026"), # IBM EBCDIC Turkish (Latin 5) + (1047, "IBM01047"), # IBM EBCDIC Latin 1/Open System + (1140, "IBM01140"), # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) + (1141, "IBM01141"), # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) + (1142, "IBM01142"), # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) + (1143, "IBM01143"), # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) + (1144, "IBM01144"), # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) + (1145, "IBM01145"), # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) + (1146, "IBM01146"), # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) + (1147, "IBM01147"), # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) + (1148, "IBM01148"), # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) + (1149, "IBM01149"), # IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) + (1200, "utf-16"), # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications + (1201, "unicodeFFFE"), # Unicode UTF-16, big endian byte order; available only to managed applications + (1250, "windows-1250"), # ANSI Central European; Central European (Windows) + (1251, "windows-1251"), # ANSI Cyrillic; Cyrillic (Windows) + (1252, "windows-1252"), # ANSI Latin 1; Western European (Windows) + (1253, "windows-1253"), # ANSI Greek; Greek (Windows) + (1254, "windows-1254"), # ANSI Turkish; Turkish (Windows) + (1255, "windows-1255"), # ANSI Hebrew; Hebrew (Windows) + (1256, "windows-1256"), # ANSI Arabic; Arabic (Windows) + (1257, "windows-1257"), # ANSI Baltic; Baltic (Windows) + (1258, "windows-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows) + + (1250, "cp-1250"), # ANSI Central European; Central European (Windows) + (1251, "cp-1251"), # ANSI Cyrillic; Cyrillic (Windows) + (1252, "cp-1252"), # ANSI Latin 1; Western European (Windows) + (1253, "cp-1253"), # ANSI Greek; Greek (Windows) + (1254, "cp-1254"), # ANSI Turkish; Turkish (Windows) + (1255, "cp-1255"), # ANSI Hebrew; Hebrew (Windows) + (1256, "cp-1256"), # ANSI Arabic; Arabic (Windows) + (1257, "cp-1257"), # ANSI Baltic; Baltic (Windows) + (1258, "cp-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows) + + (1361, "Johab"), # Korean (Johab) + (10000, "macintosh"), # MAC Roman; Western European (Mac) + (10001, "x-mac-japanese"), # Japanese (Mac) + (10002, "x-mac-chinesetrad"), # MAC Traditional Chinese (Big5); Chinese Traditional (Mac) + (10003, "x-mac-korean"), # Korean (Mac) + (10004, "x-mac-arabic"), # Arabic (Mac) + (10005, "x-mac-hebrew"), # Hebrew (Mac) + (10006, "x-mac-greek"), # Greek (Mac) + (10007, "x-mac-cyrillic"), # Cyrillic (Mac) + (10008, "x-mac-chinesesimp"), # MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) + (10010, "x-mac-romanian"), # Romanian (Mac) + (10017, "x-mac-ukrainian"), # Ukrainian (Mac) + (10021, "x-mac-thai"), # Thai (Mac) + (10029, "x-mac-ce"), # MAC Latin 2; Central European (Mac) + (10079, "x-mac-icelandic"), # Icelandic (Mac) + (10081, "x-mac-turkish"), # Turkish (Mac) + (10082, "x-mac-croatian"), # Croatian (Mac) + (12000, "utf-32"), # Unicode UTF-32, little endian byte order; available only to managed applications + (12001, "utf-32BE"), # Unicode UTF-32, big endian byte order; available only to managed applications + (20000, "x-Chinese_CNS"), # CNS Taiwan; Chinese Traditional (CNS) + (20001, "x-cp20001"), # TCA Taiwan + (20002, "x_Chinese-Eten"), # Eten Taiwan; Chinese Traditional (Eten) + (20003, "x-cp20003"), # IBM5550 Taiwan + (20004, "x-cp20004"), # TeleText Taiwan + (20005, "x-cp20005"), # Wang Taiwan + (20105, "x-IA5"), # IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) + (20106, "x-IA5-German"), # IA5 German (7-bit) + (20107, "x-IA5-Swedish"), # IA5 Swedish (7-bit) + (20108, "x-IA5-Norwegian"), # IA5 Norwegian (7-bit) + (20127, "us-ascii"), # US-ASCII (7-bit) + (20261, "x-cp20261"), # T.61 + (20269, "x-cp20269"), # ISO 6937 Non-Spacing Accent + (20273, "IBM273"), # IBM EBCDIC Germany + (20277, "IBM277"), # IBM EBCDIC Denmark-Norway + (20278, "IBM278"), # IBM EBCDIC Finland-Sweden + (20280, "IBM280"), # IBM EBCDIC Italy + (20284, "IBM284"), # IBM EBCDIC Latin America-Spain + (20285, "IBM285"), # IBM EBCDIC United Kingdom + (20290, "IBM290"), # IBM EBCDIC Japanese Katakana Extended + (20297, "IBM297"), # IBM EBCDIC France + (20420, "IBM420"), # IBM EBCDIC Arabic + (20423, "IBM423"), # IBM EBCDIC Greek + (20424, "IBM424"), # IBM EBCDIC Hebrew + (20833, "x-EBCDIC-KoreanExtended"), # IBM EBCDIC Korean Extended + (20838, "IBM-Thai"), # IBM EBCDIC Thai + (20866, "koi8-r"), # Russian (KOI8-R); Cyrillic (KOI8-R) + (20871, "IBM871"), # IBM EBCDIC Icelandic + (20880, "IBM880"), # IBM EBCDIC Cyrillic Russian + (20905, "IBM905"), # IBM EBCDIC Turkish + (20924, "IBM00924"), # IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) + (20932, "EUC-JP"), # Japanese (JIS 0208-1990 and 0121-1990) + (20936, "x-cp20936"), # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) + (20949, "x-cp20949"), # Korean Wansung + (21025, "cp1025"), # IBM EBCDIC Cyrillic Serbian-Bulgarian + (21027, ""), # (deprecated) + (21866, "koi8-u"), # Ukrainian (KOI8-U); Cyrillic (KOI8-U) + (28591, "iso-8859-1"), # ISO 8859-1 Latin 1; Western European (ISO) + (28592, "iso-8859-2"), # ISO 8859-2 Central European; Central European (ISO) + (28593, "iso-8859-3"), # ISO 8859-3 Latin 3 + (28594, "iso-8859-4"), # ISO 8859-4 Baltic + (28595, "iso-8859-5"), # ISO 8859-5 Cyrillic + (28596, "iso-8859-6"), # ISO 8859-6 Arabic + (28597, "iso-8859-7"), # ISO 8859-7 Greek + (28598, "iso-8859-8"), # ISO 8859-8 Hebrew; Hebrew (ISO-Visual) + (28599, "iso-8859-9"), # ISO 8859-9 Turkish + (28603, "iso-8859-13"), # ISO 8859-13 Estonian + (28605, "iso-8859-15"), # ISO 8859-15 Latin 9 + (29001, "x-Europa"), # Europa 3 + (38598, "iso-8859-8-i"), # ISO 8859-8 Hebrew; Hebrew (ISO-Logical) + (50220, "iso-2022-jp"), # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) + (50221, "csISO2022JP"), # ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) + (50222, "iso-2022-jp"), # ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) + (50225, "iso-2022-kr"), # ISO 2022 Korean + (50227, "x-cp50227"), # ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) + (50229, ""), # ISO 2022 Traditional Chinese + (50930, ""), # EBCDIC Japanese (Katakana) Extended + (50931, ""), # EBCDIC US-Canada and Japanese + (50933, ""), # EBCDIC Korean Extended and Korean + (50935, ""), # EBCDIC Simplified Chinese Extended and Simplified Chinese + (50936, ""), # EBCDIC Simplified Chinese + (50937, ""), # EBCDIC US-Canada and Traditional Chinese + (50939, ""), # EBCDIC Japanese (Latin) Extended and Japanese + (51932, "euc-jp"), # EUC Japanese + (51936, "EUC-CN"), # EUC Simplified Chinese; Chinese Simplified (EUC) + (51949, "euc-kr"), # EUC Korean + (51950, ""), # EUC Traditional Chinese + (52936, "hz-gb-2312"), # HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) + (54936, "GB18030"), # Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) + (57002, "x-iscii-de"), # ISCII Devanagari + (57003, "x-iscii-be"), # ISCII Bengali + (57004, "x-iscii-ta"), # ISCII Tamil + (57005, "x-iscii-te"), # ISCII Telugu + (57006, "x-iscii-as"), # ISCII Assamese + (57007, "x-iscii-or"), # ISCII Oriya + (57008, "x-iscii-ka"), # ISCII Kannada + (57009, "x-iscii-ma"), # ISCII Malayalam + (57010, "x-iscii-gu"), # ISCII Gujarati + (57011, "x-iscii-pa"), # ISCII Punjabi + (65000, "utf-7"), # Unicode (UTF-7) + (65001, "utf-8")] # Unicode (UTF-8) + when false: # not needed yet: type - TCpInfo = object + CpInfo = object maxCharSize: int32 defaultChar: array[0..1, char] leadByte: array[0..12-1, char] - proc getCPInfo(codePage: CodePage, lpCPInfo: var TCpInfo): int32 {. + proc getCPInfo(codePage: CodePage, lpCPInfo: var CpInfo): int32 {. stdcall, importc: "GetCPInfo", dynlib: "kernel32".} - - proc nameToCodePage(name: string): CodePage = + + proc nameToCodePage*(name: string): CodePage = var nameAsInt: int if parseInt(name, nameAsInt) == 0: nameAsInt = -1 for no, na in items(winEncodings): if no == nameAsInt or eqEncodingNames(na, name): return CodePage(no) result = CodePage(-1) - - proc codePageToName(c: CodePage): string = + + proc codePageToName*(c: CodePage): string = for no, na in items(winEncodings): if no == int(c): return if na.len != 0: na else: $no result = "" - + proc getACP(): CodePage {.stdcall, importc: "GetACP", dynlib: "kernel32".} - + proc getGetConsoleCP(): CodePage {.stdcall, importc: "GetConsoleCP", + dynlib: "kernel32".} + proc multiByteToWideChar( codePage: CodePage, dwFlags: int32, @@ -250,23 +285,18 @@ when defined(windows): cchWideChar: cint, lpMultiByteStr: cstring, cbMultiByte: cint, - lpDefaultChar: cstring=nil, - lpUsedDefaultChar: pointer=nil): cint {. + lpDefaultChar: cstring = nil, + lpUsedDefaultChar: pointer = nil): cint {. stdcall, importc: "WideCharToMultiByte", dynlib: "kernel32".} - + else: when defined(haiku): - const iconvDll = "(libc.so.6|libiconv.so|libtextencoding.so)" + const iconvDll = "libiconv.so" elif defined(macosx): const iconvDll = "libiconv.dylib" else: const iconvDll = "(libc.so.6|libiconv.so)" - when defined(macosx) and defined(powerpc): - const prefix = "lib" - else: - const prefix = "" - const E2BIG = 7.cint EINVAL = 22.cint @@ -278,78 +308,83 @@ else: const EILSEQ = 86.cint elif defined(solaris): const EILSEQ = 88.cint + elif defined(haiku): + const EILSEQ = -2147454938.cint var errno {.importc, header: "<errno.h>".}: cint + when defined(bsd): + {.pragma: importIconv, cdecl, header: "<iconv.h>".} + when defined(openbsd): + {.passL: "-liconv".} + else: + {.pragma: importIconv, cdecl, dynlib: iconvDll.} + proc iconvOpen(tocode, fromcode: cstring): EncodingConverter {. - importc: prefix & "iconv_open", cdecl, dynlib: iconvDll.} + importc: "iconv_open", importIconv.} proc iconvClose(c: EncodingConverter) {. - importc: prefix & "iconv_close", cdecl, dynlib: iconvDll.} - proc iconv(c: EncodingConverter, inbuf: var cstring, inbytesLeft: var int, - outbuf: var cstring, outbytesLeft: var int): int {. - importc: prefix & "iconv", cdecl, dynlib: iconvDll.} - proc iconv(c: EncodingConverter, inbuf: pointer, inbytesLeft: pointer, - outbuf: var cstring, outbytesLeft: var int): int {. - importc: prefix & "iconv", cdecl, dynlib: iconvDll.} - -proc getCurrentEncoding*(): string = - ## retrieves the current encoding. On Unix, always "UTF-8" is returned. + importc: "iconv_close", importIconv.} + proc iconv(c: EncodingConverter, inbuf: ptr cstring, inbytesLeft: ptr csize_t, + outbuf: ptr cstring, outbytesLeft: ptr csize_t): csize_t {. + importc: "iconv", importIconv.} + +proc getCurrentEncoding*(uiApp = false): string = + ## Retrieves the current encoding. On Unix, "UTF-8" is always returned. + ## The `uiApp` parameter is Windows specific. If true, the UI's code-page + ## is returned, if false, the Console's code-page is returned. when defined(windows): - result = codePageToName(getACP()) + result = codePageToName(if uiApp: getACP() else: getGetConsoleCP()) else: result = "UTF-8" - + proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): EncodingConverter = - ## opens a converter that can convert from `srcEncoding` to `destEncoding`. - ## Raises `EIO` if it cannot fulfill the request. + ## Opens a converter that can convert from `srcEncoding` to `destEncoding`. + ## Raises `EncodingError` if it cannot fulfill the request. when not defined(windows): result = iconvOpen(destEncoding, srcEncoding) - if result == nil: - raise newException(EncodingError, - "cannot create encoding converter from " & + if result == cast[EncodingConverter](-1): + raise newException(EncodingError, + "cannot create encoding converter from " & srcEncoding & " to " & destEncoding) else: result.dest = nameToCodePage(destEncoding) result.src = nameToCodePage(srcEncoding) if int(result.dest) == -1: - raise newException(EncodingError, + raise newException(EncodingError, "cannot find encoding " & destEncoding) if int(result.src) == -1: - raise newException(EncodingError, + raise newException(EncodingError, "cannot find encoding " & srcEncoding) proc close*(c: EncodingConverter) = - ## frees the resources the converter `c` holds. + ## Frees the resources the converter `c` holds. when not defined(windows): iconvClose(c) when defined(windows): - proc convert*(c: EncodingConverter, s: string): string = - ## converts `s` to `destEncoding` that was given to the converter `c`. It - ## assumed that `s` is in `srcEncoding`. - - # special case: empty string: needed because MultiByteToWideChar - # return 0 in case of error: - if s.len == 0: return "" + proc convertToWideString(codePage: CodePage, s: string): string = # educated guess of capacity: var cap = s.len + s.len shr 2 - result = newStringOfCap(cap*2) + result = newString(cap*2) # convert to utf-16 LE - var m = multiByteToWideChar(codePage = c.src, dwFlags = 0'i32, + var m = multiByteToWideChar(codePage, + dwFlags = 0'i32, lpMultiByteStr = cstring(s), cbMultiByte = cint(s.len), lpWideCharStr = cstring(result), cchWideChar = cint(cap)) - if m == 0: + if m == 0: # try again; ask for capacity: - cap = multiByteToWideChar(codePage = c.src, dwFlags = 0'i32, + cap = multiByteToWideChar(codePage, + dwFlags = 0'i32, lpMultiByteStr = cstring(s), cbMultiByte = cint(s.len), lpWideCharStr = nil, cchWideChar = cint(0)) # and do the conversion properly: - result = newStringOfCap(cap*2) - m = multiByteToWideChar(codePage = c.src, dwFlags = 0'i32, + result = newString(cap*2) + m = multiByteToWideChar(codePage, + dwFlags = 0'i32, lpMultiByteStr = cstring(s), cbMultiByte = cint(s.len), lpWideCharStr = cstring(result), @@ -360,57 +395,79 @@ when defined(windows): setLen(result, m*2) else: assert(false) # cannot happen - - # if already utf-16 LE, no further need to do something: - if int(c.dest) == 1200: return - # otherwise the fun starts again: - cap = s.len + s.len shr 2 - var res = newStringOfCap(cap) - m = wideCharToMultiByte( - codePage = c.dest, - dwFlags = 0'i32, - lpWideCharStr = cstring(result), - cchWideChar = cint(result.len div 2), - lpMultiByteStr = cstring(res), - cbMultiByte = cap.cint) + + proc convertFromWideString(codePage: CodePage, s: string): string = + let charCount = s.len div 2 + var cap = s.len + s.len shr 2 + result = newString(cap) + var m = wideCharToMultiByte(codePage, + dwFlags = 0'i32, + lpWideCharStr = cstring(s), + cchWideChar = cint(charCount), + lpMultiByteStr = cstring(result), + cbMultiByte = cap.cint) if m == 0: # try again; ask for capacity: - cap = wideCharToMultiByte( - codePage = c.dest, - dwFlags = 0'i32, - lpWideCharStr = cstring(result), - cchWideChar = cint(result.len div 2), - lpMultiByteStr = nil, - cbMultiByte = cint(0)) + cap = wideCharToMultiByte(codePage, + dwFlags = 0'i32, + lpWideCharStr = cstring(s), + cchWideChar = cint(charCount), + lpMultiByteStr = nil, + cbMultiByte = cint(0)) # and do the conversion properly: - res = newStringOfCap(cap) - m = wideCharToMultiByte( - codePage = c.dest, - dwFlags = 0'i32, - lpWideCharStr = cstring(result), - cchWideChar = cint(result.len div 2), - lpMultiByteStr = cstring(res), - cbMultiByte = cap.cint) + result = newString(cap) + m = wideCharToMultiByte(codePage, + dwFlags = 0'i32, + lpWideCharStr = cstring(s), + cchWideChar = cint(charCount), + lpMultiByteStr = cstring(result), + cbMultiByte = cap.cint) if m == 0: raiseOSError(osLastError()) - setLen(res, m) - result = res + setLen(result, m) elif m <= cap: - setLen(res, m) - result = res + setLen(result, m) else: assert(false) # cannot happen + proc convertWin(codePageFrom: CodePage, codePageTo: CodePage, + s: string): string = + # special case: empty string: needed because MultiByteToWideChar, WideCharToMultiByte + # return 0 in case of error + if s.len == 0: return "" + # multiByteToWideChar does not support encoding from code pages below + let unsupported = [1201, 12000, 12001] + + if int(codePageFrom) in unsupported: + let message = "encoding from " & codePageToName(codePageFrom) & " is not supported on windows" + raise newException(EncodingError, message) + + if int(codePageTo) in unsupported: + let message = "encoding to " & codePageToName(codePageTo) & " is not supported on windows" + raise newException(EncodingError, message) + + # in case it's already UTF-16 little endian - conversion can be simplified + let wideString = if int(codePageFrom) == 1200: s + else: convertToWideString(codePageFrom, s) + return if int(codePageTo) == 1200: wideString + else: convertFromWideString(codePageTo, wideString) + + proc convert*(c: EncodingConverter, s: string): string = + result = convertWin(c.src, c.dest, s) else: proc convert*(c: EncodingConverter, s: string): string = + ## Converts `s` to `destEncoding` that was given to the converter `c`. It + ## assumes that `s` is in `srcEncoding`. + ## + ## .. warning:: UTF-16BE and UTF-32 conversions are not supported on Windows. result = newString(s.len) - var inLen = len(s) - var outLen = len(result) + var inLen = csize_t len(s) + var outLen = csize_t len(result) var src = cstring(s) var dst = cstring(result) - var iconvres: int + var iconvres: csize_t while inLen > 0: - iconvres = iconv(c, src, inLen, dst, outLen) - if iconvres == -1: + iconvres = iconv(c, addr src, addr inLen, addr dst, addr outLen) + if iconvres == high(csize_t): var lerr = errno if lerr == EILSEQ or lerr == EINVAL: # unknown char, skip @@ -421,44 +478,34 @@ else: dec(outLen) elif lerr == E2BIG: var offset = cast[int](dst) - cast[int](cstring(result)) - setLen(result, len(result)+inLen*2+5) + setLen(result, len(result) + inLen.int * 2 + 5) # 5 is minimally one utf-8 char dst = cast[cstring](cast[int](cstring(result)) + offset) - outLen = len(result) - offset + outLen = csize_t(len(result) - offset) else: raiseOSError(lerr.OSErrorCode) - # iconv has a buffer that needs flushing, specially if the last char is + # iconv has a buffer that needs flushing, specially if the last char is # not '\0' - discard iconv(c, nil, nil, dst, outLen) - if iconvres == cint(-1) and errno == E2BIG: + discard iconv(c, nil, nil, addr dst, addr outLen) + if iconvres == high(csize_t) and errno == E2BIG: var offset = cast[int](dst) - cast[int](cstring(result)) - setLen(result, len(result)+inLen*2+5) + setLen(result, len(result) + inLen.int * 2 + 5) # 5 is minimally one utf-8 char dst = cast[cstring](cast[int](cstring(result)) + offset) - outLen = len(result) - offset - discard iconv(c, nil, nil, dst, outLen) + outLen = csize_t(len(result) - offset) + discard iconv(c, nil, nil, addr dst, addr outLen) # trim output buffer - setLen(result, len(result) - outLen) + setLen(result, len(result) - outLen.int) -proc convert*(s: string, destEncoding = "UTF-8", +proc convert*(s: string, destEncoding = "UTF-8", srcEncoding = "CP1252"): string = - ## converts `s` to `destEncoding`. It assumed that `s` is in `srcEncoding`. + ## Converts `s` to `destEncoding`. It assumed that `s` is in `srcEncoding`. ## This opens a converter, uses it and closes it again and is thus more - ## convienent but also likely less efficient than re-using a converter. + ## convenient but also likely less efficient than re-using a converter. + ## + ## .. warning:: UTF-16BE and UTF-32 conversions are not supported on Windows. var c = open(destEncoding, srcEncoding) try: result = convert(c, s) finally: close(c) - -when isMainModule: - let - orig = "öäüß" - cp1252 = convert(orig, "CP1252", "UTF-8") - ibm850 = convert(cp1252, "ibm850", "CP1252") - current = getCurrentEncoding() - echo "Original string from source code: ", orig - echo "Forced ibm850 encoding: ", ibm850 - echo "Current encoding: ", current - echo "From ibm850 to current: ", convert(ibm850, current, "ibm850") - |