version 0.7.2

author: Andreas Rumpf <rumpf_a@web.de> 2008-12-12 14:02:27 +0100
committer: Andreas Rumpf <rumpf_a@web.de> 2008-12-12 14:02:27 +0100
commit: ddaedab835fa7ea3457f21a772d636921defdc46 (patch)
tree: 8f96b5a3a6700704e0a64bdcdedee1d2caf68517 /lib/unicode.nim
parent: 2cd136cf7a0210e3cfde7a6f8ba32c9f09560047 (diff)
download: Nim-ddaedab835fa7ea3457f21a772d636921defdc46.tar.gz
1 files changed, 62 insertions, 12 deletions
diff --git a/lib/unicode.nim b/lib/unicode.nim
index de3b80c94..e6665fbe2 100644
--- a/lib/unicode.nim
+++ b/lib/unicode.nim
@@ -7,6 +7,8 @@
 #    distribution, for details about the copyright.
 #
 
+## This module provides a way to handle various Unicode (or other) encodings.
+
 type
   TUniChar* = int32 ## type that can hold any Unicode character
   TUniChar16* = int16 ## 16 bit Unicode character
@@ -17,19 +19,15 @@ proc uniCharLen*(s: string): int =
   ## returns the number of Unicode characters of the string `s`.
   var i = 0
   while i < len(s):
-    if ord(s[i]) <= 127:
-      inc(i)
-    elif ord(s[i]) shr 5 == 0b110:
-      inc(i, 2)
-    elif ord(s[i]) shr 4 == 0b1110:
-      inc(i, 3)
-    elif ord(s[i]) shr 3 == 0b11110:
-      inc(i, 4)
-    else:
-      assert(false)
+    if ord(s[i]) <= 127: inc(i)
+    elif ord(s[i]) shr 5 == 0b110: inc(i, 2)
+    elif ord(s[i]) shr 4 == 0b1110: inc(i, 3)
+    elif ord(s[i]) shr 3 == 0b11110: inc(i, 4)
+    else: assert(false)
     inc(result)
 
 proc uniCharAt*(s: string, i: int): TUniChar =
+  ## returns the unicode character in `s` at byte index `i`
   if ord(s[i]) <= 127:
     result = ord(s[i])
   elif ord(s[i]) shr 5 == 0b110:
@@ -53,8 +51,7 @@ proc uniCharAt*(s: string, i: int): TUniChar =
     assert(false)
 
 iterator unichars*(s: string): TUniChar =
-  ## iterates over any unicode character of the string `s`. Fastest possible
-  ## method.
+  ## iterates over any unicode character of the string `s`.
   var
     i = 0
     result: TUniChar
@@ -79,6 +76,59 @@ iterator unichars*(s: string): TUniChar =
     else:
       assert(false)
     yield result
+    
+type
+  TCharacterSet = enum ## character sets convertible to and from Unicode
+    cs8859_1, cs8859_2
+    
+const
+  # name for each character set: exactly one entry per TCharacterSet
+  # member, in declaration order
+  characterSetToName: array [TCharacterSet, string] = [
+    "ISO/IEC 8859-1:1998",
+    "ISO 8859-2:1999"
+    ]
+    
+  cs8859_2toUnicode: array [0xA1..0xff, TUniChar16] = [ # index = byte value
+    0x0104'i16, 0x02D8'i16, 0x0141'i16, 0x00A4'i16, 0x013D'i16, 0x015A'i16,  
+    0x00A7'i16, 0x00A8'i16, 0x0160'i16, 0x015E'i16, 0x0164'i16, 0x0179'i16,  
+    0x00AD'i16, 0x017D'i16, 0x017B'i16, 0x00B0'i16, 0x0105'i16, 0x02DB'i16,  
+    0x0142'i16, 0x00B4'i16, 0x013E'i16, 0x015B'i16, 0x02C7'i16, 0x00B8'i16,  
+    0x0161'i16, 0x015F'i16, 0x0165'i16, 0x017A'i16, 0x02DD'i16, 0x017E'i16,  
+    0x017C'i16, 0x0154'i16, 0x00C1'i16, 0x00C2'i16, 0x0102'i16, 0x00C4'i16,  
+    0x0139'i16, 0x0106'i16, 0x00C7'i16, 0x010C'i16, 0x00C9'i16, 0x0118'i16,  
+    0x00CB'i16, 0x011A'i16, 0x00CD'i16, 0x00CE'i16, 0x010E'i16, 0x0110'i16,  
+    0x0143'i16, 0x0147'i16, 0x00D3'i16, 0x00D4'i16, 0x0150'i16, 0x00D6'i16,  
+    0x00D7'i16, 0x0158'i16, 0x016E'i16, 0x00DA'i16, 0x0170'i16, 0x00DC'i16,  
+    0x00DD'i16, 0x0162'i16, 0x00DF'i16, 0x0155'i16, 0x00E1'i16, 0x00E2'i16,  
+    0x0103'i16, 0x00E4'i16, 0x013A'i16, 0x0107'i16, 0x00E7'i16, 0x010D'i16,  
+    0x00E9'i16, 0x0119'i16, 0x00EB'i16, 0x011B'i16, 0x00ED'i16, 0x00EE'i16,  
+    0x010F'i16, 0x0111'i16, 0x0144'i16, 0x0148'i16, 0x00F3'i16, 0x00F4'i16,  
+    0x0151'i16, 0x00F6'i16, 0x00F7'i16, 0x0159'i16, 0x016F'i16, 0x00FA'i16,  
+    0x0171'i16, 0x00FC'i16, 0x00FD'i16, 0x0163'i16, 0x02D9'i16]
+    
+proc searchTable(tab: openarray[TUniChar16], u: TUniChar16): int8 =
+  var idx = find(tab, u) # zero-based position of `u` in `tab`
+  assert(idx >= 0) # position 0 is a valid hit, so only reject negatives
+  result = toU8(idx)
+    
+proc csToUnicode(cs: TCharacterSet, c: int8): TUniChar16 =
+  ## converts the byte `c` of the character set `cs` into its 16 bit
+  ## Unicode character.
+  # zero extension alone is used for all of cs8859_1 and for cs8859_2
+  # bytes up to 0xA0; only the remaining cs8859_2 bytes go through the table
+  result = ze16(c)
+  if cs == cs8859_2 and 0xA0'i8 <% c:
+    result = cs8859_2toUnicode[ze(c)]
+
+proc unicodeToCS(cs: TCharacterSet, u: TUniChar16): int8 =
+  ## converts the 16 bit Unicode character `u` into its byte in `cs`.
+  case cs
+  of cs8859_1: result = toU8(u) # no table lookup necessary
+  of cs8859_2:
+    if u <=% 0x00A0'i16: result = toU8(u)
+    else: # table position 0 stands for byte 0xA1, hence the offset
+      result = searchTable(cs8859_2toUnicode, u) +% 0xA1'i8
 
 proc utf8toLocale*(s: string): string
 proc localeToUtf8*(s: string): string
author	Andreas Rumpf <rumpf_a@web.de>	2008-12-12 14:02:27 +0100
committer	Andreas Rumpf <rumpf_a@web.de>	2008-12-12 14:02:27 +0100
commit	ddaedab835fa7ea3457f21a772d636921defdc46 (patch)
tree	8f96b5a3a6700704e0a64bdcdedee1d2caf68517 /lib/unicode.nim
parent	2cd136cf7a0210e3cfde7a6f8ba32c9f09560047 (diff)
download	Nim-ddaedab835fa7ea3457f21a772d636921defdc46.tar.gz