diff options
Diffstat (limited to 'lib/system/widestrs.nim')
-rw-r--r-- | lib/system/widestrs.nim | 359 |
1 files changed, 124 insertions, 235 deletions
diff --git a/lib/system/widestrs.nim b/lib/system/widestrs.nim index 588093d10..cf1f0910c 100644 --- a/lib/system/widestrs.nim +++ b/lib/system/widestrs.nim @@ -12,249 +12,138 @@ type TUtf16Char* = distinct int16 - WideCString* = ptr array[0.. 1_000_000, TUtf16Char] + WideCString* = ref array[0.. 1_000_000, TUtf16Char] proc len*(w: WideCString): int = ## returns the length of a widestring. This traverses the whole string to ## find the binary zero end marker! while int16(w[result]) != 0'i16: inc result -when true: - const - UNI_REPLACEMENT_CHAR = TUtf16Char(0xFFFD'i16) - UNI_MAX_BMP = 0x0000FFFF - UNI_MAX_UTF16 = 0x0010FFFF - UNI_MAX_UTF32 = 0x7FFFFFFF - UNI_MAX_LEGAL_UTF32 = 0x0010FFFF - - halfShift = 10 - halfBase = 0x0010000 - halfMask = 0x3FF - - UNI_SUR_HIGH_START = 0xD800 - UNI_SUR_HIGH_END = 0xDBFF - UNI_SUR_LOW_START = 0xDC00 - UNI_SUR_LOW_END = 0xDFFF - - template ones(n: expr): expr = ((1 shl n)-1) - - template fastRuneAt(s: cstring, i: int, result: expr, doInc = true) = - ## Returns the unicode character ``s[i]`` in `result`. If ``doInc == true`` - ## `i` is incremented by the number of bytes that have been processed. - bind ones - - if ord(s[i]) <=% 127: - result = ord(s[i]) - when doInc: inc(i) - elif ord(s[i]) shr 5 == 0b110: - #assert(ord(s[i+1]) shr 6 == 0b10) - result = (ord(s[i]) and (ones(5))) shl 6 or (ord(s[i+1]) and ones(6)) - when doInc: inc(i, 2) - elif ord(s[i]) shr 4 == 0b1110: - #assert(ord(s[i+1]) shr 6 == 0b10) - #assert(ord(s[i+2]) shr 6 == 0b10) - result = (ord(s[i]) and ones(4)) shl 12 or - (ord(s[i+1]) and ones(6)) shl 6 or - (ord(s[i+2]) and ones(6)) - when doInc: inc(i, 3) - elif ord(s[i]) shr 3 == 0b11110: - #assert(ord(s[i+1]) shr 6 == 0b10) - #assert(ord(s[i+2]) shr 6 == 0b10) - #assert(ord(s[i+3]) shr 6 == 0b10) - result = (ord(s[i]) and ones(3)) shl 18 or - (ord(s[i+1]) and ones(6)) shl 12 or - (ord(s[i+2]) and ones(6)) shl 6 or - (ord(s[i+3]) and ones(6)) - when doInc: inc(i, 4) - else: - result = 0xFFFD - when doInc: inc(i) - - iterator runes(s: cstring): int = - var - i = 0 - result: int - while s[i] != '\0': - fastRuneAt(s, i, result, true) - yield result - - proc allocWideCString*(source: cstring, L: int): WideCString = - ## free after usage with `dealloc`. - result = cast[wideCString](alloc(L * 4 + 2)) - var d = 0 - for ch in runes(source): - if ch <=% UNI_MAX_BMP: - if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_LOW_END: - result[d] = UNI_REPLACEMENT_CHAR - else: - result[d] = TUtf16Char(toU16(ch)) - elif ch >% UNI_MAX_UTF16: +const + UNI_REPLACEMENT_CHAR = TUtf16Char(0xFFFD'i16) + UNI_MAX_BMP = 0x0000FFFF + UNI_MAX_UTF16 = 0x0010FFFF + UNI_MAX_UTF32 = 0x7FFFFFFF + UNI_MAX_LEGAL_UTF32 = 0x0010FFFF + + halfShift = 10 + halfBase = 0x0010000 + halfMask = 0x3FF + + UNI_SUR_HIGH_START = 0xD800 + UNI_SUR_HIGH_END = 0xDBFF + UNI_SUR_LOW_START = 0xDC00 + UNI_SUR_LOW_END = 0xDFFF + +template ones(n: expr): expr = ((1 shl n)-1) + +template fastRuneAt(s: cstring, i: int, result: expr, doInc = true) = + ## Returns the unicode character ``s[i]`` in `result`. If ``doInc == true`` + ## `i` is incremented by the number of bytes that have been processed. + bind ones + + if ord(s[i]) <=% 127: + result = ord(s[i]) + when doInc: inc(i) + elif ord(s[i]) shr 5 == 0b110: + #assert(ord(s[i+1]) shr 6 == 0b10) + result = (ord(s[i]) and (ones(5))) shl 6 or (ord(s[i+1]) and ones(6)) + when doInc: inc(i, 2) + elif ord(s[i]) shr 4 == 0b1110: + #assert(ord(s[i+1]) shr 6 == 0b10) + #assert(ord(s[i+2]) shr 6 == 0b10) + result = (ord(s[i]) and ones(4)) shl 12 or + (ord(s[i+1]) and ones(6)) shl 6 or + (ord(s[i+2]) and ones(6)) + when doInc: inc(i, 3) + elif ord(s[i]) shr 3 == 0b11110: + #assert(ord(s[i+1]) shr 6 == 0b10) + #assert(ord(s[i+2]) shr 6 == 0b10) + #assert(ord(s[i+3]) shr 6 == 0b10) + result = (ord(s[i]) and ones(3)) shl 18 or + (ord(s[i+1]) and ones(6)) shl 12 or + (ord(s[i+2]) and ones(6)) shl 6 or + (ord(s[i+3]) and ones(6)) + when doInc: inc(i, 4) + else: + result = 0xFFFD + when doInc: inc(i) + +iterator runes(s: cstring): int = + var + i = 0 + result: int + while s[i] != '\0': + fastRuneAt(s, i, result, true) + yield result + +proc newWideCString*(source: cstring, L: int): WideCString = + unsafeNew(result, L * 4 + 2) + #result = cast[wideCString](alloc(L * 4 + 2)) + var d = 0 + for ch in runes(source): + if ch <=% UNI_MAX_BMP: + if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_LOW_END: result[d] = UNI_REPLACEMENT_CHAR else: - let ch = ch -% halfBase - result[d] = TUtf16Char(toU16((ch shr halfShift) +% UNI_SUR_HIGH_START)) - inc d - result[d] = TUtf16Char(toU16((ch and halfMask) +% UNI_SUR_LOW_START)) + result[d] = TUtf16Char(toU16(ch)) + elif ch >% UNI_MAX_UTF16: + result[d] = UNI_REPLACEMENT_CHAR + else: + let ch = ch -% halfBase + result[d] = TUtf16Char(toU16((ch shr halfShift) +% UNI_SUR_HIGH_START)) inc d - result[d] = TUtf16Char(0'i16) - - proc allocWideCString*(s: cstring): WideCString = - ## free after usage with `dealloc`. - if s.isNil: return nil - - when not defined(c_strlen): - proc c_strlen(a: CString): int {.nodecl, noSideEffect, importc: "strlen".} - - let L = cstrlen(s) - result = allocWideCString(s, L) - - proc allocWideCString*(s: string): WideCString = - ## free after usage with `dealloc`. - result = allocWideCString(s, s.len) - - proc `$`*(w: wideCString, estimate: int): string = - result = newStringOfCap(estimate + estimate shr 2) - - var i = 0 - while w[i].int16 != 0'i16: - var ch = w[i].int - inc i - if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_HIGH_END: - # If the 16 bits following the high surrogate are in the source buffer... - let ch2 = w[i].int - # If it's a low surrogate, convert to UTF32: - if ch2 >=% UNI_SUR_LOW_START and ch2 <=% UNI_SUR_LOW_END: - ch = ((ch -% UNI_SUR_HIGH_START) shr halfShift) +% - (ch2 -% UNI_SUR_LOW_START) +% halfBase - inc i - - if ch <=% 127: - result.add chr(ch) - elif ch <=% 0x07FF: - result.add chr((ch shr 6) or 0b110_00000) - result.add chr((ch and ones(6)) or 0b10_000000) - elif ch <=% 0xFFFF: - result.add chr(ch shr 12 or 0b1110_0000) - result.add chr(ch shr 6 and ones(6) or 0b10_0000_00) - result.add chr(ch and ones(6) or 0b10_0000_00) - elif ch <=% 0x0010FFFF: - result.add chr(ch shr 18 or 0b1111_0000) - result.add chr(ch shr 12 and ones(6) or 0b10_0000_00) - result.add chr(ch shr 6 and ones(6) or 0b10_0000_00) - result.add chr(ch and ones(6) or 0b10_0000_00) - else: - # replacement char: - result.add chr(0xFFFD shr 12 or 0b1110_0000) - result.add chr(0xFFFD shr 6 and ones(6) or 0b10_0000_00) - result.add chr(0xFFFD and ones(6) or 0b10_0000_00) - - proc `$`*(s: WideCString): string = - result = s $ 80 - -else: - const - utf8Encoding = 65001 - - proc MultiByteToWideChar*( - CodePage: int32, - dwFlags: int32, - lpMultiByteStr: cstring, - cbMultiByte: cint, - lpWideCharStr: WideCString, - cchWideChar: cint): cint {. - stdcall, importc: "MultiByteToWideChar", dynlib: "kernel32".} - - proc WideCharToMultiByte*( - CodePage: int32, - dwFlags: int32, - lpWideCharStr: WideCString, - cchWideChar: cint, - lpMultiByteStr: cstring, - cbMultiByte: cint, - lpDefaultChar: cstring=nil, - lpUsedDefaultChar: pointer=nil): cint {. - stdcall, importc: "WideCharToMultiByte", dynlib: "kernel32".} - - proc raiseEncodingError() {.noinline, noreturn.} = - raise newException(EOS, "error in unicode conversion") - - proc `$`*(s: WideCString, len: int): string = - # special case: empty string: needed because MultiByteToWideChar - # returns 0 in case of error: - if len == 0: return "" - - # educated guess of capacity: - var cap = len + len shr 2 - result = newStringOfCap(cap) - - let m = WideCharToMultiByte( - CodePage = utf8Encoding, - dwFlags = 0'i32, - lpWideCharStr = s, - cchWideChar = cint(len), - lpMultiByteStr = cstring(result), - cbMultiByte = cap) - if m == 0: - # try again; ask for capacity: - cap = WideCharToMultiByte( - CodePage = utf8Encoding, - dwFlags = 0'i32, - lpWideCharStr = s, - cchWideChar = cint(len), - lpMultiByteStr = nil, - cbMultiByte = cint(0)) - # and do the conversion properly: - result = newStringOfCap(cap) - let m = WideCharToMultiByte( - CodePage = utf8Encoding, - dwFlags = 0'i32, - lpWideCharStr = s, - cchWideChar = cint(len), - lpMultiByteStr = cstring(result), - cbMultiByte = cap) - if m == 0: raiseEncodingError() - setLen(result, m) - elif m <= cap: - setLen(result, m) + result[d] = TUtf16Char(toU16((ch and halfMask) +% UNI_SUR_LOW_START)) + inc d + result[d] = TUtf16Char(0'i16) + +proc newWideCString*(s: cstring): WideCString = + if s.isNil: return nil + + when not defined(c_strlen): + proc c_strlen(a: CString): int {.nodecl, noSideEffect, importc: "strlen".} + + let L = cstrlen(s) + result = newWideCString(s, L) + +proc newWideCString*(s: string): WideCString = + result = newWideCString(s, s.len) + +proc `$`*(w: wideCString, estimate: int): string = + result = newStringOfCap(estimate + estimate shr 2) + + var i = 0 + while w[i].int16 != 0'i16: + var ch = w[i].int + inc i + if ch >=% UNI_SUR_HIGH_START and ch <=% UNI_SUR_HIGH_END: + # If the 16 bits following the high surrogate are in the source buffer... + let ch2 = w[i].int + # If it's a low surrogate, convert to UTF32: + if ch2 >=% UNI_SUR_LOW_START and ch2 <=% UNI_SUR_LOW_END: + ch = ((ch -% UNI_SUR_HIGH_START) shr halfShift) +% + (ch2 -% UNI_SUR_LOW_START) +% halfBase + inc i + + if ch <=% 127: + result.add chr(ch) + elif ch <=% 0x07FF: + result.add chr((ch shr 6) or 0b110_00000) + result.add chr((ch and ones(6)) or 0b10_000000) + elif ch <=% 0xFFFF: + result.add chr(ch shr 12 or 0b1110_0000) + result.add chr(ch shr 6 and ones(6) or 0b10_0000_00) + result.add chr(ch and ones(6) or 0b10_0000_00) + elif ch <=% 0x0010FFFF: + result.add chr(ch shr 18 or 0b1111_0000) + result.add chr(ch shr 12 and ones(6) or 0b10_0000_00) + result.add chr(ch shr 6 and ones(6) or 0b10_0000_00) + result.add chr(ch and ones(6) or 0b10_0000_00) else: - sysAssert(false, "") # cannot happen - - proc `$`*(s: WideCString): string = - result = s $ s.len - - proc allocWideCString*(s: string): WideCString = - ## free after usage with `dealloc`. - let cap = s.len+1 - result = cast[wideCString](alloc0(cap * 2)) - # special case: empty string: needed because MultiByteToWideChar - # return 0 in case of error: - if s.len == 0: return - # convert to utf-16 LE - let m = MultiByteToWideChar(CodePage = utf8Encoding, dwFlags = 0'i32, - lpMultiByteStr = cstring(s), - cbMultiByte = cint(s.len), - lpWideCharStr = result, - cchWideChar = cint(cap)) - if m == 0: raiseEncodingError() - - proc allocWideCString*(s: cstring): WideCString = - ## free after usage with `dealloc`. - if s.isNil: return nil - - when not defined(c_strlen): - proc c_strlen(a: CString): int {.nodecl, noSideEffect, importc: "strlen".} - - let len = cstrlen(s) - let cap = len+1 - result = cast[wideCString](alloc0(cap * 2)) - # special case: empty string: needed because MultiByteToWideChar - # return 0 in case of error: - if s.len == 0: return - # convert to utf-16 LE - let m = MultiByteToWideChar(CodePage = utf8Encoding, dwFlags = 0'i32, - lpMultiByteStr = s, - cbMultiByte = cint(len), - lpWideCharStr = result, - cchWideChar = cint(cap)) - if m == 0: raiseEncodingError() + # replacement char: + result.add chr(0xFFFD shr 12 or 0b1110_0000) + result.add chr(0xFFFD shr 6 and ones(6) or 0b10_0000_00) + result.add chr(0xFFFD and ones(6) or 0b10_0000_00) +proc `$`*(s: WideCString): string = + result = s $ 80 |