#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Nimrod support for C/C++'s `wide strings`:idx:. This is part of the system
## module! Do not import it directly!

type
  TUtf16Char* = distinct int16  ## a single UTF-16 code unit
  WideCString* = ptr array[0.. 1_000_000, TUtf16Char] ## \
    ## pointer to a zero-terminated sequence of UTF-16 code units

proc len*(w: WideCString): int =
  ## returns the length of a widestring. This traverses the whole string to
  ## find the binary zero end marker!
  while int16(w[result]) != 0'i16: inc result

when true:
  # Portable implementation: the UTF-8 <-> UTF-16 conversion is done by hand.
  const
    UNI_REPLACEMENT_CHAR = TUtf16Char(0xFFFD'i16)
    UNI_MAX_BMP = 0x0000FFFF
    UNI_MAX_UTF16 = 0x0010FFFF
    UNI_MAX_UTF32 = 0x7FFFFFFF
    UNI_MAX_LEGAL_UTF32 = 0x0010FFFF

    halfShift = 10
    halfBase = 0x0010000
    halfMask = 0x3FF

    UNI_SUR_HIGH_START = 0xD800
    UNI_SUR_HIGH_END = 0xDBFF
    UNI_SUR_LOW_START = 0xDC00
    UNI_SUR_LOW_END = 0xDFFF

  template ones(n: expr): expr = ((1 shl n)-1)
    ## bit mask with the lowest `n` bits set

  template fastRuneAt(s: cstring, i: int, result: expr, doInc = true) =
    ## Returns the unicode character ``s[i]`` in `result`. If ``doInc == true``
    ## `i` is incremented by the number of bytes that have been processed.
    ##
    ## A lead byte that is not followed by the required number of continuation
    ## bytes (``0b10xxxxxx``) yields U+FFFD and consumes a single byte. Because
    ## the terminating zero is not a continuation byte and the checks are
    ## evaluated left to right with short-circuit ``and``, a truncated sequence
    ## at the end of `s` can never move `i` past the terminator.
    bind ones
    if ord(s[i]) <=% 127:
      result = ord(s[i])
      when doInc: inc(i)
    elif ord(s[i]) shr 5 == 0b110 and ord(s[i+1]) shr 6 == 0b10:
      result = (ord(s[i]) and (ones(5))) shl 6 or (ord(s[i+1]) and ones(6))
      when doInc: inc(i, 2)
    elif ord(s[i]) shr 4 == 0b1110 and ord(s[i+1]) shr 6 == 0b10 and
         ord(s[i+2]) shr 6 == 0b10:
      result = (ord(s[i]) and ones(4)) shl 12 or
               (ord(s[i+1]) and ones(6)) shl 6 or
               (ord(s[i+2]) and ones(6))
      when doInc: inc(i, 3)
    elif ord(s[i]) shr 3 == 0b11110 and ord(s[i+1]) shr 6 == 0b10 and
         ord(s[i+2]) shr 6 == 0b10 and ord(s[i+3]) shr 6 == 0b10:
      result = (ord(s[i]) and ones(3)) shl 18 or
               (ord(s[i+1]) and ones(6)) shl 12 or
               (ord(s[i+2]) and ones(6)) shl 6 or
               (ord(s[i+3]) and ones(6))
      when doInc: inc(i, 4)
    else:
      # invalid lead byte or malformed/truncated sequence
      result = 0xFFFD
      when doInc: inc(i)

  iterator runes(s: cstring): int =
    ## yields the code points of the zero-terminated UTF-8 string `s`.
    var
      i = 0
      result: int
    while s[i] != '\0':
      fastRuneAt(s, i, result, true)
      yield result

  proc allocWideCString*(source: cstring, L: int): WideCString =
    ## converts the zero-terminated UTF-8 string `source` to UTF-16.
    ## `L` is used to size the result buffer; conversion itself runs up to the
    ## terminating zero of `source`, so `L` must not be smaller than the
    ## number of bytes before that terminator.
    ## free after usage with `dealloc`.
    result = cast[WideCString](alloc(L * 4 + 2))
    var d = 0
    for ch in runes(source):
      if ch <=% UNI_MAX_BMP:
        if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_LOW_END:
          # surrogate code points must not appear in UTF-8 input
          result[d] = UNI_REPLACEMENT_CHAR
        else:
          result[d] = TUtf16Char(toU16(ch))
      elif ch >% UNI_MAX_UTF16:
        result[d] = UNI_REPLACEMENT_CHAR
      else:
        # supplementary plane: emit a high/low surrogate pair
        let ch = ch -% halfBase
        result[d] = TUtf16Char(toU16((ch shr halfShift) +% UNI_SUR_HIGH_START))
        inc d
        result[d] = TUtf16Char(toU16((ch and halfMask) +% UNI_SUR_LOW_START))
      inc d
    result[d] = TUtf16Char(0'i16)

  proc allocWideCString*(s: cstring): WideCString =
    ## converts the zero-terminated UTF-8 string `s` to UTF-16; returns nil
    ## if `s` is nil.
    ## free after usage with `dealloc`.
    if s.isNil: return nil

    when not defined(c_strlen):
      proc c_strlen(a: cstring): int {.
        nodecl, noSideEffect, importc: "strlen".}

    let L = c_strlen(s)
    result = allocWideCString(s, L)

  proc allocWideCString*(s: string): WideCString =
    ## converts the UTF-8 string `s` to UTF-16.
    ## free after usage with `dealloc`.
    result = allocWideCString(s, s.len)

  proc `$`*(w: WideCString, estimate: int): string =
    ## converts the zero-terminated UTF-16 string `w` to UTF-8. `estimate` is
    ## only used to pre-size the result; the conversion always runs up to the
    ## zero terminator of `w`.
    result = newStringOfCap(estimate + estimate shr 2)

    var i = 0
    while w[i].int16 != 0'i16:
      # TUtf16Char is a signed 16 bit type: mask so that code units
      # >= 0x8000 are not sign-extended into negative values.
      var ch = w[i].int and 0xFFFF
      inc i
      if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_HIGH_END:
        # If the 16 bits following the high surrogate are in the source
        # buffer... (a terminating zero simply fails the range check below)
        let ch2 = w[i].int and 0xFFFF
        # If it's a low surrogate, convert to UTF32:
        if ch2 >=% UNI_SUR_LOW_START and ch2 <=% UNI_SUR_LOW_END:
          ch = ((ch -% UNI_SUR_HIGH_START) shl halfShift) +%
               (ch2 -% UNI_SUR_LOW_START) +% halfBase
          inc i

      if ch <=% 127:
        result.add chr(ch)
      elif ch <=% 0x07FF:
        result.add chr((ch shr 6) or 0b110_00000)
        result.add chr((ch and ones(6)) or 0b10_000000)
      elif ch <=% 0xFFFF:
        result.add chr(ch shr 12 or 0b1110_0000)
        result.add chr(ch shr 6 and ones(6) or 0b10_0000_00)
        result.add chr(ch and ones(6) or 0b10_0000_00)
      elif ch <=% 0x0010FFFF:
        result.add chr(ch shr 18 or 0b1111_0000)
        result.add chr(ch shr 12 and ones(6) or 0b10_0000_00)
        result.add chr(ch shr 6 and ones(6) or 0b10_0000_00)
        result.add chr(ch and ones(6) or 0b10_0000_00)
      else:
        # replacement char:
        result.add chr(0xFFFD shr 12 or 0b1110_0000)
        result.add chr(0xFFFD shr 6 and ones(6) or 0b10_0000_00)
        result.add chr(0xFFFD and ones(6) or 0b10_0000_00)

  proc `$`*(s: WideCString): string =
    ## converts the zero-terminated UTF-16 string `s` to UTF-8.
    result = s $ 80

else:
  # Alternative implementation on top of the Win32 API (kernel32). Currently
  # disabled by the ``when true`` above.
  const
    utf8Encoding = 65001

  proc MultiByteToWideChar*(
    CodePage: int32,
    dwFlags: int32,
    lpMultiByteStr: cstring,
    cbMultiByte: cint,
    lpWideCharStr: WideCString,
    cchWideChar: cint): cint {.
      stdcall, importc: "MultiByteToWideChar", dynlib: "kernel32".}

  proc WideCharToMultiByte*(
    CodePage: int32,
    dwFlags: int32,
    lpWideCharStr: WideCString,
    cchWideChar: cint,
    lpMultiByteStr: cstring,
    cbMultiByte: cint,
    lpDefaultChar: cstring=nil,
    lpUsedDefaultChar: pointer=nil): cint {.
      stdcall, importc: "WideCharToMultiByte", dynlib: "kernel32".}

  proc raiseEncodingError() {.noinline, noreturn.} =
    raise newException(EOS, "error in unicode conversion")

  proc `$`*(s: WideCString, len: int): string =
    ## converts the first `len` UTF-16 code units of `s` to UTF-8.
    # special case: empty string: needed because WideCharToMultiByte
    # returns 0 in case of error:
    if len == 0: return ""

    # educated guess of capacity:
    var cap = len + len shr 2
    result = newStringOfCap(cap)

    let m = WideCharToMultiByte(
      CodePage = utf8Encoding,
      dwFlags = 0'i32,
      lpWideCharStr = s,
      cchWideChar = cint(len),
      lpMultiByteStr = cstring(result),
      cbMultiByte = cint(cap))
    if m == 0:
      # try again; ask for capacity:
      cap = WideCharToMultiByte(
        CodePage = utf8Encoding,
        dwFlags = 0'i32,
        lpWideCharStr = s,
        cchWideChar = cint(len),
        lpMultiByteStr = nil,
        cbMultiByte = cint(0))
      # and do the conversion properly:
      result = newStringOfCap(cap)
      let m = WideCharToMultiByte(
        CodePage = utf8Encoding,
        dwFlags = 0'i32,
        lpWideCharStr = s,
        cchWideChar = cint(len),
        lpMultiByteStr = cstring(result),
        cbMultiByte = cint(cap))
      if m == 0: raiseEncodingError()
      setLen(result, m)
    elif m <= cap:
      setLen(result, m)
    else:
      sysAssert(false, "") # cannot happen

  proc `$`*(s: WideCString): string =
    ## converts the zero-terminated UTF-16 string `s` to UTF-8.
    result = s $ s.len

  proc allocWideCString*(s: string): WideCString =
    ## converts the UTF-8 string `s` to UTF-16.
    ## free after usage with `dealloc`.
    let cap = s.len+1
    result = cast[WideCString](alloc0(cap * 2))
    # special case: empty string: needed because MultiByteToWideChar
    # return 0 in case of error:
    if s.len == 0: return
    # convert to utf-16 LE
    let m = MultiByteToWideChar(CodePage = utf8Encoding, dwFlags = 0'i32,
                                lpMultiByteStr = cstring(s),
                                cbMultiByte = cint(s.len),
                                lpWideCharStr = result,
                                cchWideChar = cint(cap))
    if m == 0: raiseEncodingError()

  proc allocWideCString*(s: cstring): WideCString =
    ## converts the zero-terminated UTF-8 string `s` to UTF-16; returns nil
    ## if `s` is nil.
    ## free after usage with `dealloc`.
    if s.isNil: return nil

    when not defined(c_strlen):
      proc c_strlen(a: cstring): int {.
        nodecl, noSideEffect, importc: "strlen".}

    let len = c_strlen(s)
    let cap = len+1
    result = cast[WideCString](alloc0(cap * 2))
    # special case: empty string: needed because MultiByteToWideChar
    # return 0 in case of error:
    if len == 0: return
    # convert to utf-16 LE
    let m = MultiByteToWideChar(CodePage = utf8Encoding, dwFlags = 0'i32,
                                lpMultiByteStr = s,
                                cbMultiByte = cint(len),
                                lpWideCharStr = result,
                                cchWideChar = cint(cap))
    if m == 0: raiseEncodingError()