diff options
-rw-r--r-- | lib/pure/unicode.nim | 72 | ||||
-rw-r--r-- | lib/system/widestrs.nim | 49 |
2 files changed, 78 insertions, 43 deletions
diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim index 6ba966816..0c4f15c91 100644 --- a/lib/pure/unicode.nim +++ b/lib/pure/unicode.nim @@ -49,6 +49,8 @@ proc runeLenAt*(s: string, i: Natural): int = elif ord(s[i]) shr 1 == 0b1111110: result = 6 else: result = 1 +const replRune = Rune(0xFFFD) + template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) = ## Returns the Unicode character ``s[i]`` in ``result``. If ``doInc == true`` ## ``i`` is incremented by the number of bytes that have been processed. @@ -58,49 +60,69 @@ template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) = when doInc: inc(i) elif ord(s[i]) shr 5 == 0b110: # assert(ord(s[i+1]) shr 6 == 0b10) - result = Rune((ord(s[i]) and (ones(5))) shl 6 or - (ord(s[i+1]) and ones(6))) - when doInc: inc(i, 2) + if i <= s.len - 2: + result = Rune((ord(s[i]) and (ones(5))) shl 6 or + (ord(s[i+1]) and ones(6))) + when doInc: inc(i, 2) + else: + result = replRune + when doInc: inc(i) elif ord(s[i]) shr 4 == 0b1110: # assert(ord(s[i+1]) shr 6 == 0b10) # assert(ord(s[i+2]) shr 6 == 0b10) - result = Rune((ord(s[i]) and ones(4)) shl 12 or - (ord(s[i+1]) and ones(6)) shl 6 or - (ord(s[i+2]) and ones(6))) - when doInc: inc(i, 3) + if i <= s.len - 3: + result = Rune((ord(s[i]) and ones(4)) shl 12 or + (ord(s[i+1]) and ones(6)) shl 6 or + (ord(s[i+2]) and ones(6))) + when doInc: inc(i, 3) + else: + result = replRune + when doInc: inc(i) elif ord(s[i]) shr 3 == 0b11110: # assert(ord(s[i+1]) shr 6 == 0b10) # assert(ord(s[i+2]) shr 6 == 0b10) # assert(ord(s[i+3]) shr 6 == 0b10) - result = Rune((ord(s[i]) and ones(3)) shl 18 or - (ord(s[i+1]) and ones(6)) shl 12 or - (ord(s[i+2]) and ones(6)) shl 6 or - (ord(s[i+3]) and ones(6))) - when doInc: inc(i, 4) + if i <= s.len - 4: + result = Rune((ord(s[i]) and ones(3)) shl 18 or + (ord(s[i+1]) and ones(6)) shl 12 or + (ord(s[i+2]) and ones(6)) shl 6 or + (ord(s[i+3]) and ones(6))) + when doInc: inc(i, 4) + else: + result = replRune + when doInc: inc(i) elif ord(s[i]) shr 2 == 0b111110: # assert(ord(s[i+1]) shr 6 == 0b10) # assert(ord(s[i+2]) shr 6 == 0b10) # assert(ord(s[i+3]) shr 6 == 0b10) # assert(ord(s[i+4]) shr 6 == 0b10) - result = Rune((ord(s[i]) and ones(2)) shl 24 or - (ord(s[i+1]) and ones(6)) shl 18 or - (ord(s[i+2]) and ones(6)) shl 12 or - (ord(s[i+3]) and ones(6)) shl 6 or - (ord(s[i+4]) and ones(6))) - when doInc: inc(i, 5) + if i <= s.len - 5: + result = Rune((ord(s[i]) and ones(2)) shl 24 or + (ord(s[i+1]) and ones(6)) shl 18 or + (ord(s[i+2]) and ones(6)) shl 12 or + (ord(s[i+3]) and ones(6)) shl 6 or + (ord(s[i+4]) and ones(6))) + when doInc: inc(i, 5) + else: + result = replRune + when doInc: inc(i) elif ord(s[i]) shr 1 == 0b1111110: # assert(ord(s[i+1]) shr 6 == 0b10) # assert(ord(s[i+2]) shr 6 == 0b10) # assert(ord(s[i+3]) shr 6 == 0b10) # assert(ord(s[i+4]) shr 6 == 0b10) # assert(ord(s[i+5]) shr 6 == 0b10) - result = Rune((ord(s[i]) and ones(1)) shl 30 or - (ord(s[i+1]) and ones(6)) shl 24 or - (ord(s[i+2]) and ones(6)) shl 18 or - (ord(s[i+3]) and ones(6)) shl 12 or - (ord(s[i+4]) and ones(6)) shl 6 or - (ord(s[i+5]) and ones(6))) - when doInc: inc(i, 6) + if i <= s.len - 6: + result = Rune((ord(s[i]) and ones(1)) shl 30 or + (ord(s[i+1]) and ones(6)) shl 24 or + (ord(s[i+2]) and ones(6)) shl 18 or + (ord(s[i+3]) and ones(6)) shl 12 or + (ord(s[i+4]) and ones(6)) shl 6 or + (ord(s[i+5]) and ones(6))) + when doInc: inc(i, 6) + else: + result = replRune + when doInc: inc(i) else: result = Rune(ord(s[i])) when doInc: inc(i) diff --git a/lib/system/widestrs.nim b/lib/system/widestrs.nim index 3c957476f..dda547abe 100644 --- a/lib/system/widestrs.nim +++ b/lib/system/widestrs.nim @@ -38,10 +38,11 @@ const UNI_SUR_HIGH_END = 0xDBFF UNI_SUR_LOW_START = 0xDC00 UNI_SUR_LOW_END = 0xDFFF + UNI_REPL = 0xFFFD template ones(n: untyped): untyped = ((1 shl n)-1) -template fastRuneAt(s: cstring, i: int, result: untyped, doInc = true) = +template fastRuneAt(s: cstring, i, L: int, result: untyped, doInc = true) = ## Returns the unicode character ``s[i]`` in `result`. If ``doInc == true`` ## `i` is incremented by the number of bytes that have been processed. bind ones @@ -51,24 +52,36 @@ template fastRuneAt(s: cstring, i: int, result: untyped, doInc = true) = when doInc: inc(i) elif ord(s[i]) shr 5 == 0b110: #assert(ord(s[i+1]) shr 6 == 0b10) - result = (ord(s[i]) and (ones(5))) shl 6 or (ord(s[i+1]) and ones(6)) - when doInc: inc(i, 2) + if i <= L - 2: + result = (ord(s[i]) and (ones(5))) shl 6 or (ord(s[i+1]) and ones(6)) + when doInc: inc(i, 2) + else: + result = UNI_REPL + when doInc: inc(i) elif ord(s[i]) shr 4 == 0b1110: - #assert(ord(s[i+1]) shr 6 == 0b10) - #assert(ord(s[i+2]) shr 6 == 0b10) - result = (ord(s[i]) and ones(4)) shl 12 or - (ord(s[i+1]) and ones(6)) shl 6 or - (ord(s[i+2]) and ones(6)) - when doInc: inc(i, 3) + if i <= L - 3: + #assert(ord(s[i+1]) shr 6 == 0b10) + #assert(ord(s[i+2]) shr 6 == 0b10) + result = (ord(s[i]) and ones(4)) shl 12 or + (ord(s[i+1]) and ones(6)) shl 6 or + (ord(s[i+2]) and ones(6)) + when doInc: inc(i, 3) + else: + result = UNI_REPL + when doInc: inc(i) elif ord(s[i]) shr 3 == 0b11110: - #assert(ord(s[i+1]) shr 6 == 0b10) - #assert(ord(s[i+2]) shr 6 == 0b10) - #assert(ord(s[i+3]) shr 6 == 0b10) - result = (ord(s[i]) and ones(3)) shl 18 or - (ord(s[i+1]) and ones(6)) shl 12 or - (ord(s[i+2]) and ones(6)) shl 6 or - (ord(s[i+3]) and ones(6)) - when doInc: inc(i, 4) + if i <= L - 4: + #assert(ord(s[i+1]) shr 6 == 0b10) + #assert(ord(s[i+2]) shr 6 == 0b10) + #assert(ord(s[i+3]) shr 6 == 0b10) + result = (ord(s[i]) and ones(3)) shl 18 or + (ord(s[i+1]) and ones(6)) shl 12 or + (ord(s[i+2]) and ones(6)) shl 6 or + (ord(s[i+3]) and ones(6)) + when doInc: inc(i, 4) + else: + result = UNI_REPL + when doInc: inc(i) else: result = 0xFFFD when doInc: inc(i) @@ -78,7 +91,7 @@ iterator runes(s: cstring, L: int): int = i = 0 result: int while i < L: - fastRuneAt(s, i, result, true) + fastRuneAt(s, i, L, result, true) yield result proc newWideCString*(source: cstring, L: int): WideCString = |