summary refs log blame commit diff stats
path: root/lib/system/widestrs.nim
blob: 588093d108a08e2e359b2baa77b52d2552bb0e70 (plain) (tree)



































































































































































































































































                                                                                  
#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Nimrod support for C/C++'s `wide strings`:idx:. This is part of the system
## module! Do not import it directly!

type
  # A single UTF-16 code unit. Note that the base type is a *signed* 16 bit
  # integer, so a plain conversion to a wider integer type sign-extends it.
  TUtf16Char* = distinct int16
  # Pointer to a buffer of UTF-16 code units. `len` finds the end of the
  # string by scanning for a zero code unit.
  # NOTE(review): the array bound looks nominal rather than a real size
  # limit - confirm.
  WideCString* = ptr array[0.. 1_000_000, TUtf16Char]

proc len*(w: WideCString): int =
  ## Counts the code units of `w` that precede the terminating zero. The
  ## string is scanned from its start on every call, so the cost grows with
  ## the length of the string.
  var count = 0
  while true:
    if int16(w[count]) == 0'i16: break
    inc count
  result = count

when true:
  const
    # Code unit stored by `allocWideCString` for code points that it does not
    # encode (surrogate code points and values above UNI_MAX_UTF16).
    UNI_REPLACEMENT_CHAR = TUtf16Char(0xFFFD'i16)
    # Code points up to this value are stored as a single UTF-16 code unit.
    UNI_MAX_BMP = 0x0000FFFF
    # Largest code point that `allocWideCString` encodes as a surrogate pair.
    UNI_MAX_UTF16 = 0x0010FFFF
    # The next two constants are not referenced in this file.
    UNI_MAX_UTF32 = 0x7FFFFFFF
    UNI_MAX_LEGAL_UTF32 = 0x0010FFFF

    # Used to split a code point above UNI_MAX_BMP into a surrogate pair:
    # `halfBase` is subtracted first, then the value is divided into an upper
    # part (``shr halfShift``) and a lower part (``and halfMask``).
    halfShift = 10
    halfBase = 0x0010000
    halfMask = 0x3FF

    # Inclusive ranges of the high (first) and low (second) surrogate code
    # units.
    UNI_SUR_HIGH_START = 0xD800
    UNI_SUR_HIGH_END = 0xDBFF
    UNI_SUR_LOW_START = 0xDC00
    UNI_SUR_LOW_END = 0xDFFF

  # Expands to a value whose `n` lowest bits are set, i.e. ``(1 shl n) - 1``.
  template ones(n: expr): expr = ((1 shl n)-1)

  template fastRuneAt(s: cstring, i: int, result: expr, doInc = true) =
    ## Decodes the UTF-8 sequence that starts at ``s[i]`` and stores the code
    ## point in `result`. If ``doInc == true`` `i` is incremented by the number
    ## of bytes that have been processed.
    ##
    ## A lead byte is only accepted if every continuation byte it announces
    ## has the form ``0b10xxxxxx``; otherwise `result` becomes 0xFFFD and a
    ## single byte is consumed. The checks are evaluated left to right and
    ## stop at the first failure, and the terminating zero is never a
    ## continuation byte, so a truncated sequence can neither be read past the
    ## end of a zero-terminated `s` nor make `i` jump over the terminator.
    bind ones

    if ord(s[i]) <=% 127:
      # plain ASCII
      result = ord(s[i])
      when doInc: inc(i)
    elif (ord(s[i]) shr 5 == 0b110) and (ord(s[i+1]) shr 6 == 0b10):
      # two byte sequence
      result = (ord(s[i]) and (ones(5))) shl 6 or (ord(s[i+1]) and ones(6))
      when doInc: inc(i, 2)
    elif (ord(s[i]) shr 4 == 0b1110) and (ord(s[i+1]) shr 6 == 0b10) and
         (ord(s[i+2]) shr 6 == 0b10):
      # three byte sequence
      result = (ord(s[i]) and ones(4)) shl 12 or
               (ord(s[i+1]) and ones(6)) shl 6 or
               (ord(s[i+2]) and ones(6))
      when doInc: inc(i, 3)
    elif (ord(s[i]) shr 3 == 0b11110) and (ord(s[i+1]) shr 6 == 0b10) and
         (ord(s[i+2]) shr 6 == 0b10) and (ord(s[i+3]) shr 6 == 0b10):
      # four byte sequence
      result = (ord(s[i]) and ones(3)) shl 18 or
               (ord(s[i+1]) and ones(6)) shl 12 or
               (ord(s[i+2]) and ones(6)) shl 6 or
               (ord(s[i+3]) and ones(6))
      when doInc: inc(i, 4)
    else:
      # invalid lead byte, stray continuation byte or truncated sequence:
      # emit the replacement character and resynchronize on the next byte.
      result = 0xFFFD
      when doInc: inc(i)

  iterator runes(s: cstring, L = high(int)): int =
    ## Yields the code points of the UTF-8 string `s`. Decoding stops at the
    ## terminating zero or as soon as `L` bytes have been consumed, whichever
    ## comes first. With the default for `L` only the terminator ends the
    ## iteration, so ``runes(s)`` behaves as before.
    ##
    ## A multi-byte sequence that starts before position `L` is still decoded
    ## as a whole, so `s` should be zero-terminated.
    var
      i = 0
      rune: int
    while i < L and s[i] != '\0':
      fastRuneAt(s, i, rune, true)
      yield rune

  proc allocWideCString*(source: cstring, L: int): WideCString =
    ## Converts the UTF-8 text in `source` into a newly allocated,
    ## zero-terminated UTF-16 string. `L` is the length of `source` in bytes;
    ## no sequence starting at or after byte `L` is converted, so the result
    ## always fits into the buffer that is sized from `L`.
    ##
    ## Surrogate code points and values above 0x10FFFF are stored as the
    ## replacement character.
    ##
    ## free after usage with `dealloc`.
    result = cast[WideCString](alloc(L * 4 + 2))
    var d = 0
    # Honour `L` here: the buffer above is sized from it, so converting up to
    # the terminator regardless of `L` could write past the allocation.
    for ch in runes(source, L):
      if ch <=% UNI_MAX_BMP:
        if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_LOW_END:
          # surrogate code points are not valid in UTF-8 input
          result[d] = UNI_REPLACEMENT_CHAR
        else:
          result[d] = TUtf16Char(toU16(ch))
      elif ch >% UNI_MAX_UTF16:
        result[d] = UNI_REPLACEMENT_CHAR
      else:
        # encode as a surrogate pair: high surrogate first, then low surrogate
        let ch = ch -% halfBase
        result[d] = TUtf16Char(toU16((ch shr halfShift) +% UNI_SUR_HIGH_START))
        inc d
        result[d] = TUtf16Char(toU16((ch and halfMask) +% UNI_SUR_LOW_START))
      inc d
    result[d] = TUtf16Char(0'i16)

  proc allocWideCString*(s: cstring): WideCString =
    ## Converts the zero-terminated UTF-8 string `s` into a newly allocated
    ## wide string. Returns nil if `s` is nil.
    ## free after usage with `dealloc`.
    when not defined(c_strlen):
      proc c_strlen(a: CString): int {.nodecl, noSideEffect, importc: "strlen".}

    if s.isNil:
      result = nil
    else:
      # the byte length is measured with C's strlen
      result = allocWideCString(s, c_strlen(s))

  proc allocWideCString*(s: string): WideCString =
    ## Converts the UTF-8 string `s` into a newly allocated wide string,
    ## passing ``s.len`` as the byte length.
    ## free after usage with `dealloc`.
    let byteCount = len(s)
    return allocWideCString(s, byteCount)

  proc `$`*(w: WideCString, estimate: int): string =
    ## Converts the zero-terminated UTF-16 string `w` to a UTF-8 string.
    ## `estimate` is only a hint for the initial capacity of the result
    ## (``estimate + estimate shr 2`` bytes are reserved); the result grows as
    ## needed.
    ##
    ## A high surrogate that is followed by a low surrogate is combined into
    ## a single code point. An unpaired surrogate is emitted as U+FFFD.
    result = newStringOfCap(estimate + estimate shr 2)

    var i = 0
    while w[i].int16 != 0'i16:
      # `TUtf16Char` is a signed 16 bit type: mask after widening, otherwise
      # every code unit >= 0x8000 turns negative and fails all of the range
      # tests below.
      var ch = w[i].int and 0xFFFF
      inc i
      if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_HIGH_END:
        # The next unit may be the terminating zero; that is simply not a low
        # surrogate and is left for the loop condition to see.
        let ch2 = w[i].int and 0xFFFF
        # If it's a low surrogate, convert to UTF32:
        if ch2 >=% UNI_SUR_LOW_START and ch2 <=% UNI_SUR_LOW_END:
          # the high surrogate carries the upper 10 bits: shift it *left*
          ch = ((ch -% UNI_SUR_HIGH_START) shl halfShift) +%
                (ch2 -% UNI_SUR_LOW_START) +% halfBase
          inc i
        else:
          # high surrogate without its partner
          ch = 0xFFFD
      elif ch >=% UNI_SUR_LOW_START and ch <=% UNI_SUR_LOW_END:
        # low surrogate without a preceding high surrogate
        ch = 0xFFFD

      if ch <=% 127:
        result.add chr(ch)
      elif ch <=% 0x07FF:
        result.add chr((ch shr 6) or 0b110_00000)
        result.add chr((ch and ones(6)) or 0b10_000000)
      elif ch <=% 0xFFFF:
        result.add chr(ch shr 12 or 0b1110_0000)
        result.add chr(ch shr 6 and ones(6) or 0b10_0000_00)
        result.add chr(ch and ones(6) or 0b10_0000_00)
      else:
        # Only reachable for a combined surrogate pair, which is at most
        # 0x10FFFF, so four bytes are always enough.
        result.add chr(ch shr 18 or 0b1111_0000)
        result.add chr(ch shr 12 and ones(6) or 0b10_0000_00)
        result.add chr(ch shr 6 and ones(6) or 0b10_0000_00)
        result.add chr(ch and ones(6) or 0b10_0000_00)

  proc `$`*(s: WideCString): string =
    ## Converts the wide string `s` to a string, using 80 as the size
    ## estimate for the conversion.
    const sizeEstimate = 80
    return `$`(s, sizeEstimate)

else:
  const
    # Passed as the `CodePage` argument of `MultiByteToWideChar` and
    # `WideCharToMultiByte` to request conversion from/to UTF-8.
    utf8Encoding = 65001
    
  proc MultiByteToWideChar*(
    CodePage: int32,
    dwFlags: int32,
    lpMultiByteStr: cstring,
    cbMultiByte: cint,
    lpWideCharStr: WideCString,
    cchWideChar: cint): cint {.
      stdcall, importc: "MultiByteToWideChar", dynlib: "kernel32".}
    ## Binding for `MultiByteToWideChar` from kernel32 (stdcall). In this
    ## file it is called with `utf8Encoding` to fill a wide string buffer
    ## from a byte string; the callers treat a return value of 0 as an error.

  proc WideCharToMultiByte*(
    CodePage: int32,
    dwFlags: int32,
    lpWideCharStr: WideCString,
    cchWideChar: cint,
    lpMultiByteStr: cstring,
    cbMultiByte: cint,
    lpDefaultChar: cstring=nil,
    lpUsedDefaultChar: pointer=nil): cint {.
      stdcall, importc: "WideCharToMultiByte", dynlib: "kernel32".}
    ## Binding for `WideCharToMultiByte` from kernel32 (stdcall). In this
    ## file it is called with `utf8Encoding` to convert a wide string into a
    ## byte buffer; calling it with a nil buffer and a size of 0 is used to
    ## ask for the required capacity, and a return value of 0 is treated as
    ## an error.

  proc raiseEncodingError() {.noinline, noreturn.} =
    ## Raises `EOS` to report a failed unicode conversion. Never returns.
    var err = newException(EOS, "error in unicode conversion")
    raise err

  proc `$`*(s: WideCString, len: int): string =
    ## Converts the first `len` code units of the wide string `s` to a UTF-8
    ## string with `WideCharToMultiByte`. Calls `raiseEncodingError` if the
    ## conversion fails.
    # special case: empty string: needed because WideCharToMultiByte
    # returns 0 in case of error:
    if len == 0: return ""

    # educated guess of capacity:
    var cap = len + len shr 2
    # Create a string whose *length* is `cap` (not only its capacity), so the
    # API writes exclusively into bytes that belong to the string and the
    # final `setLen` merely shrinks it.
    result = newString(cap)

    var m = WideCharToMultiByte(
      CodePage = utf8Encoding,
      dwFlags = 0'i32,
      lpWideCharStr = s,
      cchWideChar = cint(len),
      lpMultiByteStr = cstring(result),
      cbMultiByte = cint(cap))
    if m == 0:
      # try again; ask for capacity:
      cap = WideCharToMultiByte(
        CodePage = utf8Encoding,
        dwFlags = 0'i32,
        lpWideCharStr = s,
        cchWideChar = cint(len),
        lpMultiByteStr = nil,
        cbMultiByte = cint(0))
      # a size of 0 means the failure was not caused by the buffer size
      if cap == 0: raiseEncodingError()
      # and do the conversion properly:
      result = newString(cap)
      m = WideCharToMultiByte(
        CodePage = utf8Encoding,
        dwFlags = 0'i32,
        lpWideCharStr = s,
        cchWideChar = cint(len),
        lpMultiByteStr = cstring(result),
        cbMultiByte = cint(cap))
      if m == 0: raiseEncodingError()
    sysAssert(m <= cap, "") # cannot happen
    setLen(result, m)
    
  proc `$`*(s: WideCString): string =
    ## Converts the zero-terminated wide string `s` to a string; the number
    ## of code units to convert is determined with `len`.
    let units = len(s)
    return `$`(s, units)
    
  proc allocWideCString*(s: string): WideCString =
    ## Converts the UTF-8 string `s` into a newly allocated wide string with
    ## `MultiByteToWideChar`. The buffer holds ``s.len + 1`` zero-initialized
    ## code units. Calls `raiseEncodingError` if the conversion fails; the
    ## buffer is released first so that it does not leak.
    ## free after usage with `dealloc`.
    let cap = s.len+1
    result = cast[WideCString](alloc0(cap * 2))
    # special case: empty string: needed because MultiByteToWideChar
    # return 0 in case of error:
    if s.len == 0: return
    # convert to utf-16 LE
    let m = MultiByteToWideChar(CodePage = utf8Encoding, dwFlags = 0'i32,
                                lpMultiByteStr = cstring(s),
                                cbMultiByte = cint(s.len),
                                lpWideCharStr = result,
                                cchWideChar = cint(cap))
    if m == 0:
      # nobody else knows about the buffer yet: free it before raising
      dealloc(result)
      result = nil
      raiseEncodingError()

  proc allocWideCString*(s: cstring): WideCString =
    ## Converts the zero-terminated UTF-8 string `s` into a newly allocated
    ## wide string with `MultiByteToWideChar`. Returns nil if `s` is nil.
    ## Calls `raiseEncodingError` if the conversion fails; the buffer is
    ## released first so that it does not leak.
    ## free after usage with `dealloc`.
    if s.isNil: return nil

    when not defined(c_strlen):
      proc c_strlen(a: CString): int {.nodecl, noSideEffect, importc: "strlen".}

    # measured once and reused below; not named `len` so that the `len`
    # procs stay visible
    let byteLen = c_strlen(s)
    let cap = byteLen+1
    result = cast[WideCString](alloc0(cap * 2))
    # special case: empty string: needed because MultiByteToWideChar
    # return 0 in case of error:
    if byteLen == 0: return
    # convert to utf-16 LE
    let m = MultiByteToWideChar(CodePage = utf8Encoding, dwFlags = 0'i32,
                                lpMultiByteStr = s,
                                cbMultiByte = cint(byteLen),
                                lpWideCharStr = result,
                                cchWideChar = cint(cap))
    if m == 0:
      # nobody else knows about the buffer yet: free it before raising
      dealloc(result)
      result = nil
      raiseEncodingError()