summary refs log tree commit diff stats
path: root/lib/system/widestrs.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/system/widestrs.nim')
-rw-r--r-- lib/system/widestrs.nim 359
1 files changed, 124 insertions, 235 deletions
diff --git a/lib/system/widestrs.nim b/lib/system/widestrs.nim
index 588093d10..cf1f0910c 100644
--- a/lib/system/widestrs.nim
+++ b/lib/system/widestrs.nim
@@ -12,249 +12,138 @@
 

 type
   TUtf16Char* = distinct int16  ## one 16-bit unit of a wide string
-  WideCString* = ptr array[0.. 1_000_000, TUtf16Char]
+  WideCString* = ref array[0.. 1_000_000, TUtf16Char]  ## units ended by a zero unit; `ref` replaces the former `ptr`

 

 proc len*(w: WideCString): int =
   ## Returns the number of 16-bit units in `w` that precede the terminating
   ## zero unit. Every call walks the string from its start, so the cost grows
   ## with the length. `w` is indexed without a nil check.
   while int16(w[result]) != 0'i16: inc result  # `result` starts at 0

 

-when true:

-  const

-    UNI_REPLACEMENT_CHAR = TUtf16Char(0xFFFD'i16)

-    UNI_MAX_BMP = 0x0000FFFF

-    UNI_MAX_UTF16 = 0x0010FFFF

-    UNI_MAX_UTF32 = 0x7FFFFFFF

-    UNI_MAX_LEGAL_UTF32 = 0x0010FFFF

-

-    halfShift = 10

-    halfBase = 0x0010000

-    halfMask = 0x3FF

-

-    UNI_SUR_HIGH_START = 0xD800

-    UNI_SUR_HIGH_END = 0xDBFF

-    UNI_SUR_LOW_START = 0xDC00

-    UNI_SUR_LOW_END = 0xDFFF

-

-  template ones(n: expr): expr = ((1 shl n)-1)

-

-  template fastRuneAt(s: cstring, i: int, result: expr, doInc = true) =

-    ## Returns the unicode character ``s[i]`` in `result`. If ``doInc == true``

-    ## `i` is incremented by the number of bytes that have been processed.

-    bind ones

-

-    if ord(s[i]) <=% 127:

-      result = ord(s[i])

-      when doInc: inc(i)

-    elif ord(s[i]) shr 5 == 0b110:

-      #assert(ord(s[i+1]) shr 6 == 0b10)

-      result = (ord(s[i]) and (ones(5))) shl 6 or (ord(s[i+1]) and ones(6))

-      when doInc: inc(i, 2)

-    elif ord(s[i]) shr 4 == 0b1110:

-      #assert(ord(s[i+1]) shr 6 == 0b10)

-      #assert(ord(s[i+2]) shr 6 == 0b10)

-      result = (ord(s[i]) and ones(4)) shl 12 or

-               (ord(s[i+1]) and ones(6)) shl 6 or

-               (ord(s[i+2]) and ones(6))

-      when doInc: inc(i, 3)

-    elif ord(s[i]) shr 3 == 0b11110:

-      #assert(ord(s[i+1]) shr 6 == 0b10)

-      #assert(ord(s[i+2]) shr 6 == 0b10)

-      #assert(ord(s[i+3]) shr 6 == 0b10)

-      result = (ord(s[i]) and ones(3)) shl 18 or

-               (ord(s[i+1]) and ones(6)) shl 12 or

-               (ord(s[i+2]) and ones(6)) shl 6 or

-               (ord(s[i+3]) and ones(6))

-      when doInc: inc(i, 4)

-    else:

-      result = 0xFFFD

-      when doInc: inc(i)

-

-  iterator runes(s: cstring): int =

-    var

-      i = 0

-      result: int

-    while s[i] != '\0':

-      fastRuneAt(s, i, result, true)

-      yield result

-

-  proc allocWideCString*(source: cstring, L: int): WideCString =

-    ## free after usage with `dealloc`.

-    result = cast[wideCString](alloc(L * 4 + 2))

-    var d = 0

-    for ch in runes(source):

-      if ch <=% UNI_MAX_BMP:

-        if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_LOW_END:

-          result[d] = UNI_REPLACEMENT_CHAR

-        else:

-          result[d] = TUtf16Char(toU16(ch))

-      elif ch >% UNI_MAX_UTF16:

+const
+  UNI_REPLACEMENT_CHAR = TUtf16Char(0xFFFD'i16)  # U+FFFD, stored for input that cannot be encoded
+  UNI_MAX_BMP = 0x0000FFFF  # largest code point written as a single 16-bit unit
+  UNI_MAX_UTF16 = 0x0010FFFF  # code points above this are replaced by U+FFFD
+  UNI_MAX_UTF32 = 0x7FFFFFFF  # not referenced in the lines visible here
+  UNI_MAX_LEGAL_UTF32 = 0x0010FFFF  # not referenced in the lines visible here
+
+  halfShift = 10  # number of payload bits in each surrogate half
+  halfBase = 0x0010000  # offset removed from a code point before it is split into surrogates
+  halfMask = 0x3FF  # selects the low 10 bits
+
+  UNI_SUR_HIGH_START = 0xD800  # first high (leading) surrogate
+  UNI_SUR_HIGH_END = 0xDBFF  # last high (leading) surrogate
+  UNI_SUR_LOW_START = 0xDC00  # first low (trailing) surrogate
+  UNI_SUR_LOW_END = 0xDFFF  # last low (trailing) surrogate

+

+template ones(n: expr): expr = ((1 shl n)-1)  # value whose low `n` bits are all set

+

+template fastRuneAt(s: cstring, i: int, result: expr, doInc = true) =
+  ## Decodes the UTF-8 sequence that starts at ``s[i]`` and stores its code point in `result`.
+  ## If ``doInc == true``, `i` is advanced by the number of bytes consumed (1 to 4).
+  bind ones
+  # NOTE(review): continuation bytes are unchecked (asserts disabled); a truncated sequence can move `i` past the NUL
+  if ord(s[i]) <=% 127:  # single byte, value used as is
+    result = ord(s[i])
+    when doInc: inc(i)
+  elif ord(s[i]) shr 5 == 0b110:  # lead byte 110xxxxx: two-byte sequence
+    #assert(ord(s[i+1]) shr 6 == 0b10)
+    result = (ord(s[i]) and (ones(5))) shl 6 or (ord(s[i+1]) and ones(6))
+    when doInc: inc(i, 2)
+  elif ord(s[i]) shr 4 == 0b1110:  # lead byte 1110xxxx: three-byte sequence
+    #assert(ord(s[i+1]) shr 6 == 0b10)
+    #assert(ord(s[i+2]) shr 6 == 0b10)
+    result = (ord(s[i]) and ones(4)) shl 12 or
+             (ord(s[i+1]) and ones(6)) shl 6 or
+             (ord(s[i+2]) and ones(6))
+    when doInc: inc(i, 3)
+  elif ord(s[i]) shr 3 == 0b11110:  # lead byte 11110xxx: four-byte sequence
+    #assert(ord(s[i+1]) shr 6 == 0b10)
+    #assert(ord(s[i+2]) shr 6 == 0b10)
+    #assert(ord(s[i+3]) shr 6 == 0b10)
+    result = (ord(s[i]) and ones(3)) shl 18 or
+             (ord(s[i+1]) and ones(6)) shl 12 or
+             (ord(s[i+2]) and ones(6)) shl 6 or
+             (ord(s[i+3]) and ones(6))
+    when doInc: inc(i, 4)
+  else:  # any other lead byte: produce U+FFFD and step over one byte
+    result = 0xFFFD
+    when doInc: inc(i)

+

+iterator runes(s: cstring): int =  # yields each code point decoded from `s`, stopping at the first NUL byte
+  var
+    i = 0  # byte offset into `s`
+    result: int  # receives the code point written by fastRuneAt
+  while s[i] != '\0':
+    fastRuneAt(s, i, result, true)  # decodes one sequence and advances `i`
+    yield result

+

+proc newWideCString*(source: cstring, L: int): WideCString =

+  unsafeNew(result, L * 4 + 2)
+  #result = cast[wideCString](alloc(L * 4 + 2))

+  var d = 0

+  for ch in runes(source):

+    if ch <=% UNI_MAX_BMP:

+      if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_LOW_END:

         result[d] = UNI_REPLACEMENT_CHAR

       else:

-        let ch = ch -% halfBase

-        result[d] = TUtf16Char(toU16((ch shr halfShift) +% UNI_SUR_HIGH_START))

-        inc d

-        result[d] = TUtf16Char(toU16((ch and halfMask) +% UNI_SUR_LOW_START))

+        result[d] = TUtf16Char(toU16(ch))

+    elif ch >% UNI_MAX_UTF16:

+      result[d] = UNI_REPLACEMENT_CHAR

+    else:

+      let ch = ch -% halfBase

+      result[d] = TUtf16Char(toU16((ch shr halfShift) +% UNI_SUR_HIGH_START))

       inc d

-    result[d] = TUtf16Char(0'i16)

-

-  proc allocWideCString*(s: cstring): WideCString =

-    ## free after usage with `dealloc`.

-    if s.isNil: return nil

-

-    when not defined(c_strlen):

-      proc c_strlen(a: CString): int {.nodecl, noSideEffect, importc: "strlen".}

-

-    let L = cstrlen(s)

-    result = allocWideCString(s, L)

-

-  proc allocWideCString*(s: string): WideCString =

-    ## free after usage with `dealloc`.

-    result = allocWideCString(s, s.len)

-

-  proc `$`*(w: wideCString, estimate: int): string =

-    result = newStringOfCap(estimate + estimate shr 2)

-

-    var i = 0

-    while w[i].int16 != 0'i16:

-      var ch = w[i].int

-      inc i

-      if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_HIGH_END:

-        # If the 16 bits following the high surrogate are in the source buffer...

-        let ch2 = w[i].int

-        # If it's a low surrogate, convert to UTF32:

-        if ch2 >=% UNI_SUR_LOW_START and ch2 <=% UNI_SUR_LOW_END:

-          ch = ((ch -% UNI_SUR_HIGH_START) shr halfShift) +%

-                (ch2 -% UNI_SUR_LOW_START) +% halfBase

-          inc i

-          

-      if ch <=% 127:

-        result.add chr(ch)

-      elif ch <=% 0x07FF:

-        result.add chr((ch shr 6) or 0b110_00000)

-        result.add chr((ch and ones(6)) or 0b10_000000)

-      elif ch <=% 0xFFFF:

-        result.add chr(ch shr 12 or 0b1110_0000)

-        result.add chr(ch shr 6 and ones(6) or 0b10_0000_00)

-        result.add chr(ch and ones(6) or 0b10_0000_00)

-      elif ch <=% 0x0010FFFF:

-        result.add chr(ch shr 18 or 0b1111_0000)

-        result.add chr(ch shr 12 and ones(6) or 0b10_0000_00)

-        result.add chr(ch shr 6 and ones(6) or 0b10_0000_00)

-        result.add chr(ch and ones(6) or 0b10_0000_00)

-      else:

-        # replacement char:

-        result.add chr(0xFFFD shr 12 or 0b1110_0000)

-        result.add chr(0xFFFD shr 6 and ones(6) or 0b10_0000_00)

-        result.add chr(0xFFFD and ones(6) or 0b10_0000_00)

-

-  proc `$`*(s: WideCString): string =

-    result = s $ 80

-

-else:

-  const

-    utf8Encoding = 65001

-    

-  proc MultiByteToWideChar*(

-    CodePage: int32,

-    dwFlags: int32,

-    lpMultiByteStr: cstring,

-    cbMultiByte: cint,

-    lpWideCharStr: WideCString,

-    cchWideChar: cint): cint {.

-      stdcall, importc: "MultiByteToWideChar", dynlib: "kernel32".}

-

-  proc WideCharToMultiByte*(

-    CodePage: int32,

-    dwFlags: int32,

-    lpWideCharStr: WideCString,

-    cchWideChar: cint,

-    lpMultiByteStr: cstring,

-    cbMultiByte: cint,

-    lpDefaultChar: cstring=nil,

-    lpUsedDefaultChar: pointer=nil): cint {.

-      stdcall, importc: "WideCharToMultiByte", dynlib: "kernel32".}

-

-  proc raiseEncodingError() {.noinline, noreturn.} =

-    raise newException(EOS, "error in unicode conversion")

-

-  proc `$`*(s: WideCString, len: int): string =

-    # special case: empty string: needed because MultiByteToWideChar

-    # returns 0 in case of error:

-    if len == 0: return ""

-

-    # educated guess of capacity:

-    var cap = len + len shr 2

-    result = newStringOfCap(cap)

-    

-    let m = WideCharToMultiByte(

-      CodePage = utf8Encoding,

-      dwFlags = 0'i32,

-      lpWideCharStr = s,

-      cchWideChar = cint(len),

-      lpMultiByteStr = cstring(result),

-      cbMultiByte = cap)

-    if m == 0:

-      # try again; ask for capacity:

-      cap = WideCharToMultiByte(

-        CodePage = utf8Encoding,

-        dwFlags = 0'i32,

-        lpWideCharStr = s,

-        cchWideChar = cint(len),

-        lpMultiByteStr = nil,

-        cbMultiByte = cint(0))

-      # and do the conversion properly:

-      result = newStringOfCap(cap)

-      let m = WideCharToMultiByte(

-        CodePage = utf8Encoding,

-        dwFlags = 0'i32,

-        lpWideCharStr = s,

-        cchWideChar = cint(len),

-        lpMultiByteStr = cstring(result),

-        cbMultiByte = cap)

-      if m == 0: raiseEncodingError()

-      setLen(result, m)

-    elif m <= cap:

-      setLen(result, m)

+      result[d] = TUtf16Char(toU16((ch and halfMask) +% UNI_SUR_LOW_START))

+    inc d

+  result[d] = TUtf16Char(0'i16)

+

+proc newWideCString*(s: cstring): WideCString =  # cstring overload: measures `s`, then delegates to the (cstring, int) overload
+  if s.isNil: return nil  # a nil input gives a nil result
+
+  when not defined(c_strlen):  # declare the binding only if it is not defined yet
+    proc c_strlen(a: CString): int {.nodecl, noSideEffect, importc: "strlen".}  # imported as C `strlen`
+
+  let L = cstrlen(s)  # same identifier as `c_strlen`: Nim ignores underscores when comparing names
+  result = newWideCString(s, L)

+

+proc newWideCString*(s: string): WideCString =  # string overload of the conversion
+  result = newWideCString(s, s.len)  # forwards `s` and its length to the (cstring, int) overload

+

+proc `$`*(w: wideCString, estimate: int): string =
+  ## Converts the zero-terminated UTF-16 string `w` to a UTF-8 `string`.
+  ## `estimate` is only a hint used to size the initial capacity of the result.
+  ## A high surrogate that is followed by a low surrogate is combined into one
+  ## code point; an unpaired surrogate is emitted as a 3-byte sequence.
+  result = newStringOfCap(estimate + estimate shr 2)
+
+  var i = 0
+  while w[i].int16 != 0'i16:
+    # TUtf16Char is a signed 16-bit value: mask after widening so that units
+    # >= 0x8000 (all surrogates among them) do not become negative numbers.
+    var ch = w[i].int and 0xFFFF
+    inc i
+    if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_HIGH_END:
+      # If the 16 bits following the high surrogate are in the source buffer...
+      let ch2 = w[i].int and 0xFFFF
+      # If it's a low surrogate, convert to UTF32:
+      if ch2 >=% UNI_SUR_LOW_START and ch2 <=% UNI_SUR_LOW_END:
+        # the high surrogate carries the upper 10 bits, so it is shifted left
+        ch = ((ch -% UNI_SUR_HIGH_START) shl halfShift) +%
+              (ch2 -% UNI_SUR_LOW_START) +% halfBase
+        inc i
+        
+    if ch <=% 127:
+      result.add chr(ch)
+    elif ch <=% 0x07FF:
+      result.add chr((ch shr 6) or 0b110_00000)
+      result.add chr((ch and ones(6)) or 0b10_000000)
+    elif ch <=% 0xFFFF:
+      result.add chr(ch shr 12 or 0b1110_0000)
+      result.add chr(ch shr 6 and ones(6) or 0b10_0000_00)
+      result.add chr(ch and ones(6) or 0b10_0000_00)
+    elif ch <=% 0x0010FFFF:
+      result.add chr(ch shr 18 or 0b1111_0000)
+      result.add chr(ch shr 12 and ones(6) or 0b10_0000_00)
+      result.add chr(ch shr 6 and ones(6) or 0b10_0000_00)
+      result.add chr(ch and ones(6) or 0b10_0000_00)
     else:
-      sysAssert(false, "") # cannot happen
-    
-  proc `$`*(s: WideCString): string =
-    result = s $ s.len
-    
-  proc allocWideCString*(s: string): WideCString =
-    ## free after usage with `dealloc`.
-    let cap = s.len+1
-    result = cast[wideCString](alloc0(cap * 2))
-    # special case: empty string: needed because MultiByteToWideChar
-    # return 0 in case of error:
-    if s.len == 0: return
-    # convert to utf-16 LE
-    let m = MultiByteToWideChar(CodePage = utf8Encoding, dwFlags = 0'i32, 
-                                lpMultiByteStr = cstring(s),
-                                cbMultiByte = cint(s.len),
-                                lpWideCharStr = result,
-                                cchWideChar = cint(cap))
-    if m == 0: raiseEncodingError()
-
-  proc allocWideCString*(s: cstring): WideCString =
-    ## free after usage with `dealloc`.
-    if s.isNil: return nil
-
-    when not defined(c_strlen):
-      proc c_strlen(a: CString): int {.nodecl, noSideEffect, importc: "strlen".}
-
-    let len = cstrlen(s)
-    let cap = len+1
-    result = cast[wideCString](alloc0(cap * 2))
-    # special case: empty string: needed because MultiByteToWideChar
-    # return 0 in case of error:
-    if s.len == 0: return
-    # convert to utf-16 LE
-    let m = MultiByteToWideChar(CodePage = utf8Encoding, dwFlags = 0'i32, 
-                                lpMultiByteStr = s,
-                                cbMultiByte = cint(len),
-                                lpWideCharStr = result,
-                                cchWideChar = cint(cap))
-    if m == 0: raiseEncodingError()
+      # replacement char (kept as a safety net; after masking, `ch` cannot
+      # exceed 0x10FFFF):
+      result.add chr(0xFFFD shr 12 or 0b1110_0000)
+      result.add chr(0xFFFD shr 6 and ones(6) or 0b10_0000_00)
+      result.add chr(0xFFFD and ones(6) or 0b10_0000_00)

 

+proc `$`*(s: WideCString): string =  # converts `s` to a `string` without a caller-supplied size hint
+  result = s $ 80  # delegates to the two-argument `$`, passing 80 as the capacity estimate