diff options
Diffstat (limited to 'lib/pure/unicode.nim')
-rw-r--r-- | lib/pure/unicode.nim | 2549 |
1 files changed, 1082 insertions, 1467 deletions
diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim index 978f569ac..8cbe117bb 100644 --- a/lib/pure/unicode.nim +++ b/lib/pure/unicode.nim @@ -8,163 +8,216 @@ # ## This module provides support to handle the Unicode UTF-8 encoding. - -{.deadCodeElim: on.} # dce option deprecated +## +## There are no specialized ``insert``, ``delete``, ``add`` and ``contains`` +## procedures for ``seq[Rune]`` in this module because the generic variants +## of these procedures in the system module already work with it. +## +## The current version is compatible with Unicode v12.0.0. +## +## **See also:** +## * `strutils module <strutils.html>`_ +## * `unidecode module <unidecode.html>`_ +## * `encodings module <encodings.html>`_ include "system/inclrtl" +import std/strbasics +template toOa(s: string): auto = s.toOpenArray(0, s.high) + +proc substr(s: openArray[char] , first, last: int): string = + # Copied substr from system + let first = max(first, 0) + let L = max(min(last, high(s)) - first + 1, 0) + result = newString(L) + for i in 0 .. L-1: + result[i] = s[i+first] type RuneImpl = int32 # underlying type of Rune - Rune* = distinct RuneImpl ## type that can hold any Unicode character - Rune16* = distinct int16 ## 16 bit Unicode character - -{.deprecated: [TRune: Rune, TRune16: Rune16].} - -proc `<=%`*(a, b: Rune): bool = return int(a) <=% int(b) -proc `<%`*(a, b: Rune): bool = return int(a) <% int(b) -proc `==`*(a, b: Rune): bool = return int(a) == int(b) + Rune* = distinct RuneImpl ## \ + ## Type that can hold a single Unicode code point. + ## + ## A Rune may be composed with other Runes to a character on the screen. + ## `RuneImpl` is the underlying type used to store Runes, currently `int32`. template ones(n: untyped): untyped = ((1 shl n)-1) -proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} = - ## Returns the number of Unicode characters of the string ``s`` +proc runeLen*(s: openArray[char]): int {.rtl, extern: "nuc$1".} = + ## Returns the number of runes of the string ``s``. + runnableExamples: + let a = "añyóng" + doAssert a.runeLen == 6 + ## note: a.len == 8 + + result = 0 var i = 0 while i < len(s): - if ord(s[i]) <=% 127: inc(i) - elif ord(s[i]) shr 5 == 0b110: inc(i, 2) - elif ord(s[i]) shr 4 == 0b1110: inc(i, 3) - elif ord(s[i]) shr 3 == 0b11110: inc(i, 4) - elif ord(s[i]) shr 2 == 0b111110: inc(i, 5) - elif ord(s[i]) shr 1 == 0b1111110: inc(i, 6) + if uint(s[i]) <= 127: inc(i) + elif uint(s[i]) shr 5 == 0b110: inc(i, 2) + elif uint(s[i]) shr 4 == 0b1110: inc(i, 3) + elif uint(s[i]) shr 3 == 0b11110: inc(i, 4) + elif uint(s[i]) shr 2 == 0b111110: inc(i, 5) + elif uint(s[i]) shr 1 == 0b1111110: inc(i, 6) else: inc i inc(result) -proc runeLenAt*(s: string, i: Natural): int = - ## Returns the number of bytes the rune starting at ``s[i]`` takes - if ord(s[i]) <=% 127: result = 1 - elif ord(s[i]) shr 5 == 0b110: result = 2 - elif ord(s[i]) shr 4 == 0b1110: result = 3 - elif ord(s[i]) shr 3 == 0b11110: result = 4 - elif ord(s[i]) shr 2 == 0b111110: result = 5 - elif ord(s[i]) shr 1 == 0b1111110: result = 6 +proc runeLenAt*(s: openArray[char], i: Natural): int = + ## Returns the number of bytes the rune starting at ``s[i]`` takes. + ## + ## See also: + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeLenAt(0) == 1 + doAssert a.runeLenAt(1) == 2 + + if uint(s[i]) <= 127: result = 1 + elif uint(s[i]) shr 5 == 0b110: result = 2 + elif uint(s[i]) shr 4 == 0b1110: result = 3 + elif uint(s[i]) shr 3 == 0b11110: result = 4 + elif uint(s[i]) shr 2 == 0b111110: result = 5 + elif uint(s[i]) shr 1 == 0b1111110: result = 6 else: result = 1 const replRune = Rune(0xFFFD) -template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) = - ## Returns the Unicode character ``s[i]`` in ``result``. If ``doInc == true`` - ## ``i`` is incremented by the number of bytes that have been processed. +template fastRuneAt*(s: openArray[char] or string, i: int, result: untyped, doInc = true) = + ## Returns the rune ``s[i]`` in ``result``. + ## + ## If ``doInc == true`` (default), ``i`` is incremented by the number + ## of bytes that have been processed. bind ones - if ord(s[i]) <=% 127: - result = Rune(ord(s[i])) + if uint(s[i]) <= 127: + result = Rune(uint(s[i])) when doInc: inc(i) - elif ord(s[i]) shr 5 == 0b110: - # assert(ord(s[i+1]) shr 6 == 0b10) + elif uint(s[i]) shr 5 == 0b110: + # assert(uint(s[i+1]) shr 6 == 0b10) if i <= s.len - 2: - result = Rune((ord(s[i]) and (ones(5))) shl 6 or - (ord(s[i+1]) and ones(6))) + result = Rune((uint(s[i]) and (ones(5))) shl 6 or + (uint(s[i+1]) and ones(6))) when doInc: inc(i, 2) else: result = replRune when doInc: inc(i) - elif ord(s[i]) shr 4 == 0b1110: - # assert(ord(s[i+1]) shr 6 == 0b10) - # assert(ord(s[i+2]) shr 6 == 0b10) + elif uint(s[i]) shr 4 == 0b1110: + # assert(uint(s[i+1]) shr 6 == 0b10) + # assert(uint(s[i+2]) shr 6 == 0b10) if i <= s.len - 3: - result = Rune((ord(s[i]) and ones(4)) shl 12 or - (ord(s[i+1]) and ones(6)) shl 6 or - (ord(s[i+2]) and ones(6))) + result = Rune((uint(s[i]) and ones(4)) shl 12 or + (uint(s[i+1]) and ones(6)) shl 6 or + (uint(s[i+2]) and ones(6))) when doInc: inc(i, 3) else: result = replRune when doInc: inc(i) - elif ord(s[i]) shr 3 == 0b11110: - # assert(ord(s[i+1]) shr 6 == 0b10) - # assert(ord(s[i+2]) shr 6 == 0b10) - # assert(ord(s[i+3]) shr 6 == 0b10) + elif uint(s[i]) shr 3 == 0b11110: + # assert(uint(s[i+1]) shr 6 == 0b10) + # assert(uint(s[i+2]) shr 6 == 0b10) + # assert(uint(s[i+3]) shr 6 == 0b10) if i <= s.len - 4: - result = Rune((ord(s[i]) and ones(3)) shl 18 or - (ord(s[i+1]) and ones(6)) shl 12 or - (ord(s[i+2]) and ones(6)) shl 6 or - (ord(s[i+3]) and ones(6))) + result = Rune((uint(s[i]) and ones(3)) shl 18 or + (uint(s[i+1]) and ones(6)) shl 12 or + (uint(s[i+2]) and ones(6)) shl 6 or + (uint(s[i+3]) and ones(6))) when doInc: inc(i, 4) else: result = replRune when doInc: inc(i) - elif ord(s[i]) shr 2 == 0b111110: - # assert(ord(s[i+1]) shr 6 == 0b10) - # assert(ord(s[i+2]) shr 6 == 0b10) - # assert(ord(s[i+3]) shr 6 == 0b10) - # assert(ord(s[i+4]) shr 6 == 0b10) + elif uint(s[i]) shr 2 == 0b111110: + # assert(uint(s[i+1]) shr 6 == 0b10) + # assert(uint(s[i+2]) shr 6 == 0b10) + # assert(uint(s[i+3]) shr 6 == 0b10) + # assert(uint(s[i+4]) shr 6 == 0b10) if i <= s.len - 5: - result = Rune((ord(s[i]) and ones(2)) shl 24 or - (ord(s[i+1]) and ones(6)) shl 18 or - (ord(s[i+2]) and ones(6)) shl 12 or - (ord(s[i+3]) and ones(6)) shl 6 or - (ord(s[i+4]) and ones(6))) + result = Rune((uint(s[i]) and ones(2)) shl 24 or + (uint(s[i+1]) and ones(6)) shl 18 or + (uint(s[i+2]) and ones(6)) shl 12 or + (uint(s[i+3]) and ones(6)) shl 6 or + (uint(s[i+4]) and ones(6))) when doInc: inc(i, 5) else: result = replRune when doInc: inc(i) - elif ord(s[i]) shr 1 == 0b1111110: - # assert(ord(s[i+1]) shr 6 == 0b10) - # assert(ord(s[i+2]) shr 6 == 0b10) - # assert(ord(s[i+3]) shr 6 == 0b10) - # assert(ord(s[i+4]) shr 6 == 0b10) - # assert(ord(s[i+5]) shr 6 == 0b10) + elif uint(s[i]) shr 1 == 0b1111110: + # assert(uint(s[i+1]) shr 6 == 0b10) + # assert(uint(s[i+2]) shr 6 == 0b10) + # assert(uint(s[i+3]) shr 6 == 0b10) + # assert(uint(s[i+4]) shr 6 == 0b10) + # assert(uint(s[i+5]) shr 6 == 0b10) if i <= s.len - 6: - result = Rune((ord(s[i]) and ones(1)) shl 30 or - (ord(s[i+1]) and ones(6)) shl 24 or - (ord(s[i+2]) and ones(6)) shl 18 or - (ord(s[i+3]) and ones(6)) shl 12 or - (ord(s[i+4]) and ones(6)) shl 6 or - (ord(s[i+5]) and ones(6))) + result = Rune((uint(s[i]) and ones(1)) shl 30 or + (uint(s[i+1]) and ones(6)) shl 24 or + (uint(s[i+2]) and ones(6)) shl 18 or + (uint(s[i+3]) and ones(6)) shl 12 or + (uint(s[i+4]) and ones(6)) shl 6 or + (uint(s[i+5]) and ones(6))) when doInc: inc(i, 6) else: result = replRune when doInc: inc(i) else: - result = Rune(ord(s[i])) + result = Rune(uint(s[i])) when doInc: inc(i) -proc validateUtf8*(s: string): int = +proc runeAt*(s: openArray[char], i: Natural): Rune = + ## Returns the rune in ``s`` at **byte index** ``i``. + ## + ## See also: + ## * `runeAtPos proc <#runeAtPos,string,int>`_ + ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeAt(1) == "ñ".runeAt(0) + doAssert a.runeAt(2) == "ñ".runeAt(1) + doAssert a.runeAt(3) == "y".runeAt(0) + fastRuneAt(s, i, result, false) + +proc validateUtf8*(s: openArray[char]): int = ## Returns the position of the invalid byte in ``s`` if the string ``s`` does ## not hold valid UTF-8 data. Otherwise ``-1`` is returned. + ## + ## See also: + ## * `toUTF8 proc <#toUTF8,Rune>`_ + ## * `$ proc <#$,Rune>`_ alias for `toUTF8` + ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_ var i = 0 let L = s.len while i < L: - if ord(s[i]) <=% 127: + if uint(s[i]) <= 127: inc(i) - elif ord(s[i]) shr 5 == 0b110: - if ord(s[i]) < 0xc2: return i # Catch overlong ascii representations. - if i+1 < L and ord(s[i+1]) shr 6 == 0b10: inc(i, 2) + elif uint(s[i]) shr 5 == 0b110: + if uint(s[i]) < 0xc2: return i # Catch overlong ascii representations. + if i+1 < L and uint(s[i+1]) shr 6 == 0b10: inc(i, 2) else: return i - elif ord(s[i]) shr 4 == 0b1110: - if i+2 < L and ord(s[i+1]) shr 6 == 0b10 and ord(s[i+2]) shr 6 == 0b10: + elif uint(s[i]) shr 4 == 0b1110: + if i+2 < L and uint(s[i+1]) shr 6 == 0b10 and uint(s[i+2]) shr 6 == 0b10: inc i, 3 else: return i - elif ord(s[i]) shr 3 == 0b11110: - if i+3 < L and ord(s[i+1]) shr 6 == 0b10 and - ord(s[i+2]) shr 6 == 0b10 and - ord(s[i+3]) shr 6 == 0b10: + elif uint(s[i]) shr 3 == 0b11110: + if i+3 < L and uint(s[i+1]) shr 6 == 0b10 and + uint(s[i+2]) shr 6 == 0b10 and + uint(s[i+3]) shr 6 == 0b10: inc i, 4 else: return i else: return i return -1 -proc runeAt*(s: string, i: Natural): Rune = - ## Returns the unicode character in ``s`` at byte index ``i`` - fastRuneAt(s, i, result, false) - template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) = - ## Copies UTF-8 representation of `c` into the preallocated string `s` - ## starting at position `pos`. If `doInc == true`, `pos` is incremented + ## Copies UTF-8 representation of ``c`` into the preallocated string ``s`` + ## starting at position ``pos``. + ## + ## If ``doInc == true`` (default), ``pos`` is incremented ## by the number of bytes that have been processed. ## - ## To be the most efficient, make sure `s` is preallocated - ## with an additional amount equal to the byte length of - ## `c`. + ## To be the most efficient, make sure ``s`` is preallocated + ## with an additional amount equal to the byte length of ``c``. + ## + ## See also: + ## * `validateUtf8 proc <#validateUtf8,string>`_ + ## * `toUTF8 proc <#toUTF8,Rune>`_ + ## * `$ proc <#$,Rune>`_ alias for `toUTF8` var i = RuneImpl(c) if i <=% 127: s.setLen(pos+1) @@ -209,27 +262,71 @@ template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) = discard # error, exception? proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} = - ## Converts a rune into its UTF-8 representation + ## Converts a rune into its UTF-8 representation. + ## + ## See also: + ## * `validateUtf8 proc <#validateUtf8,string>`_ + ## * `$ proc <#$,Rune>`_ alias for `toUTF8` + ## * `utf8 iterator <#utf8.i,string>`_ + ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeAt(1).toUTF8 == "ñ" + result = "" fastToUTF8Copy(c, result, 0, false) +proc add*(s: var string; c: Rune) = + ## Adds a rune ``c`` to a string ``s``. + runnableExamples: + var s = "abc" + let c = "ä".runeAt(0) + s.add(c) + doAssert s == "abcä" + + let pos = s.len + fastToUTF8Copy(c, s, pos, false) + proc `$`*(rune: Rune): string = - ## Converts a Rune to a string + ## An alias for `toUTF8 <#toUTF8,Rune>`_. + ## + ## See also: + ## * `validateUtf8 proc <#validateUtf8,string>`_ + ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_ rune.toUTF8 proc `$`*(runes: seq[Rune]): string = - ## Converts a sequence of Runes to a string + ## Converts a sequence of Runes to a string. + ## + ## See also: + ## * `toRunes <#toRunes,string>`_ for a reverse operation + runnableExamples: + let + someString = "öÑ" + someRunes = toRunes(someString) + doAssert $someRunes == someString + result = "" - for rune in runes: result.add(rune.toUTF8) + for rune in runes: + result.add rune -proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int = - ## Returns the byte position of unicode character - ## at position pos in s with an optional start byte position. - ## returns the special value -1 if it runs out of the string +proc runeOffset*(s: openArray[char], pos: Natural, start: Natural = 0): int = + ## Returns the byte position of rune + ## at position ``pos`` in ``s`` with an optional start byte position. + ## Returns the special value -1 if it runs out of the string. ## - ## Beware: This can lead to unoptimized code and slow execution! - ## Most problems are solve more efficient by using an iterator + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeOffset(1) == 1 + doAssert a.runeOffset(3) == 4 + doAssert a.runeOffset(4) == 6 + var i = 0 o = start @@ -240,66 +337,87 @@ proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int = inc i return o -proc runeAtPos*(s: string, pos: int): Rune = - ## Returns the unicode character at position pos +proc runeReverseOffset*(s: openArray[char], rev: Positive): (int, int) = + ## Returns a tuple with the byte offset of the + ## rune at position ``rev`` in ``s``, counting + ## from the end (starting with 1) and the total + ## number of runes in the string. ## - ## Beware: This can lead to unoptimized code and slow execution! - ## Most problems are solve more efficient by using an iterator - ## or conversion to a seq of Rune. - fastRuneAt(s, runeOffset(s, pos), result, false) - -proc runeStrAtPos*(s: string, pos: Natural): string = - ## Returns the unicode character at position pos as UTF8 String + ## Returns a negative value for offset if there are too few runes in + ## the string to satisfy the request. ## - ## Beware: This can lead to unoptimized code and slow execution! - ## Most problems are solve more efficient by using an iterator + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator ## or conversion to a seq of Rune. - let o = runeOffset(s, pos) - s[o.. (o+runeLenAt(s, o)-1)] - -proc runeReverseOffset*(s: string, rev:Positive): (int, int) = - ## Returns a tuple with the the byte offset of the - ## unicode character at position ``rev`` in s counting - ## from the end (starting with 1) and the total - ## number of runes in the string. Returns a negative value - ## for offset if there are to few runes in the string to - ## satisfy the request. ## - ## Beware: This can lead to unoptimized code and slow execution! - ## Most problems are solve more efficient by using an iterator - ## or conversion to a seq of Rune. + ## See also: + ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_ var a = rev.int o = 0 x = 0 + let times = 2*rev.int-s.runeLen # transformed from rev.int - a < s.runeLen - rev.int while o < s.len: let r = runeLenAt(s, o) o += r - if a < 0: + if a > times: x += r dec a + result = if a > 0: (-a, rev.int-a) else: (x, -a+rev.int) - if a > 0: - return (-a, rev.int-a) - return (x, -a+rev.int) +proc runeAtPos*(s: openArray[char], pos: int): Rune = + ## Returns the rune at position ``pos``. + ## + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeAt proc <#runeAt,string,Natural>`_ + ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + fastRuneAt(s, runeOffset(s, pos), result, false) -proc runeSubStr*(s: string, pos:int, len:int = int.high): string = - ## Returns the UTF-8 substring starting at codepoint pos - ## with len codepoints. If pos or len is negative they count from - ## the end of the string. If len is not given it means the longest - ## possible string. +proc runeStrAtPos*(s: openArray[char], pos: Natural): string = + ## Returns the rune at position ``pos`` as UTF8 String. + ## + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeAt proc <#runeAt,string,Natural>`_ + ## * `runeAtPos proc <#runeAtPos,string,int>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + let o = runeOffset(s, pos) + substr(s.toOpenArray(o, (o+runeLenAt(s, o)-1))) + +proc runeSubStr*(s: openArray[char], pos: int, len: int = int.high): string = + ## Returns the UTF-8 substring starting at code point ``pos`` + ## with ``len`` code points. ## - ## (Needs some examples) + ## If ``pos`` or ``len`` is negative they count from + ## the end of the string. If ``len`` is not given it means the longest + ## possible string. + runnableExamples: + let s = "Hänsel ««: 10,00€" + doAssert(runeSubStr(s, 0, 2) == "Hä") + doAssert(runeSubStr(s, 10, 1) == ":") + doAssert(runeSubStr(s, -6) == "10,00€") + doAssert(runeSubStr(s, 10) == ": 10,00€") + doAssert(runeSubStr(s, 12, 5) == "10,00") + doAssert(runeSubStr(s, -6, 3) == "10,") + if pos < 0: let (o, rl) = runeReverseOffset(s, -pos) if len >= rl: - result = s.substr(o, s.len-1) + result = s.substr(o, s.high) elif len < 0: let e = rl + len if e < 0: result = "" else: - result = s.substr(o, runeOffset(s, e-(rl+pos) , o)-1) + result = s.substr(o, runeOffset(s, e-(rl+pos), o)-1) else: result = s.substr(o, runeOffset(s, len, o)-1) else: @@ -321,972 +439,32 @@ proc runeSubStr*(s: string, pos:int, len:int = int.high): string = e = s.len result = s.substr(o, e-1) -const - alphaRanges = [ - 0x00d8, 0x00f6, # - - 0x00f8, 0x01f5, # - - 0x0250, 0x02a8, # - - 0x038e, 0x03a1, # - - 0x03a3, 0x03ce, # - - 0x03d0, 0x03d6, # - - 0x03e2, 0x03f3, # - - 0x0490, 0x04c4, # - - 0x0561, 0x0587, # - - 0x05d0, 0x05ea, # - - 0x05f0, 0x05f2, # - - 0x0621, 0x063a, # - - 0x0640, 0x064a, # - - 0x0671, 0x06b7, # - - 0x06ba, 0x06be, # - - 0x06c0, 0x06ce, # - - 0x06d0, 0x06d3, # - - 0x0905, 0x0939, # - - 0x0958, 0x0961, # - - 0x0985, 0x098c, # - - 0x098f, 0x0990, # - - 0x0993, 0x09a8, # - - 0x09aa, 0x09b0, # - - 0x09b6, 0x09b9, # - - 0x09dc, 0x09dd, # - - 0x09df, 0x09e1, # - - 0x09f0, 0x09f1, # - - 0x0a05, 0x0a0a, # - - 0x0a0f, 0x0a10, # - - 0x0a13, 0x0a28, # - - 0x0a2a, 0x0a30, # - - 0x0a32, 0x0a33, # - - 0x0a35, 0x0a36, # - - 0x0a38, 0x0a39, # - - 0x0a59, 0x0a5c, # - - 0x0a85, 0x0a8b, # - - 0x0a8f, 0x0a91, # - - 0x0a93, 0x0aa8, # - - 0x0aaa, 0x0ab0, # - - 0x0ab2, 0x0ab3, # - - 0x0ab5, 0x0ab9, # - - 0x0b05, 0x0b0c, # - - 0x0b0f, 0x0b10, # - - 0x0b13, 0x0b28, # - - 0x0b2a, 0x0b30, # - - 0x0b32, 0x0b33, # - - 0x0b36, 0x0b39, # - - 0x0b5c, 0x0b5d, # - - 0x0b5f, 0x0b61, # - - 0x0b85, 0x0b8a, # - - 0x0b8e, 0x0b90, # - - 0x0b92, 0x0b95, # - - 0x0b99, 0x0b9a, # - - 0x0b9e, 0x0b9f, # - - 0x0ba3, 0x0ba4, # - - 0x0ba8, 0x0baa, # - - 0x0bae, 0x0bb5, # - - 0x0bb7, 0x0bb9, # - - 0x0c05, 0x0c0c, # - - 0x0c0e, 0x0c10, # - - 0x0c12, 0x0c28, # - - 0x0c2a, 0x0c33, # - - 0x0c35, 0x0c39, # - - 0x0c60, 0x0c61, # - - 0x0c85, 0x0c8c, # - - 0x0c8e, 0x0c90, # - - 0x0c92, 0x0ca8, # - - 0x0caa, 0x0cb3, # - - 0x0cb5, 0x0cb9, # - - 0x0ce0, 0x0ce1, # - - 0x0d05, 0x0d0c, # - - 0x0d0e, 0x0d10, # - - 0x0d12, 0x0d28, # - - 0x0d2a, 0x0d39, # - - 0x0d60, 0x0d61, # - - 0x0e01, 0x0e30, # - - 0x0e32, 0x0e33, # - - 0x0e40, 0x0e46, # - - 0x0e5a, 0x0e5b, # - - 0x0e81, 0x0e82, # - - 0x0e87, 0x0e88, # - - 0x0e94, 0x0e97, # - - 0x0e99, 0x0e9f, # - - 0x0ea1, 0x0ea3, # - - 0x0eaa, 0x0eab, # - - 0x0ead, 0x0eae, # - - 0x0eb2, 0x0eb3, # - - 0x0ec0, 0x0ec4, # - - 0x0edc, 0x0edd, # - - 0x0f18, 0x0f19, # - - 0x0f40, 0x0f47, # - - 0x0f49, 0x0f69, # - - 0x10d0, 0x10f6, # - - 0x1100, 0x1159, # - - 0x115f, 0x11a2, # - - 0x11a8, 0x11f9, # - - 0x1e00, 0x1e9b, # - - 0x1f50, 0x1f57, # - - 0x1f80, 0x1fb4, # - - 0x1fb6, 0x1fbc, # - - 0x1fc2, 0x1fc4, # - - 0x1fc6, 0x1fcc, # - - 0x1fd0, 0x1fd3, # - - 0x1fd6, 0x1fdb, # - - 0x1fe0, 0x1fec, # - - 0x1ff2, 0x1ff4, # - - 0x1ff6, 0x1ffc, # - - 0x210a, 0x2113, # - - 0x2115, 0x211d, # - - 0x2120, 0x2122, # - - 0x212a, 0x2131, # - - 0x2133, 0x2138, # - - 0x3041, 0x3094, # - - 0x30a1, 0x30fa, # - - 0x3105, 0x312c, # - - 0x3131, 0x318e, # - - 0x3192, 0x319f, # - - 0x3260, 0x327b, # - - 0x328a, 0x32b0, # - - 0x32d0, 0x32fe, # - - 0x3300, 0x3357, # - - 0x3371, 0x3376, # - - 0x337b, 0x3394, # - - 0x3399, 0x339e, # - - 0x33a9, 0x33ad, # - - 0x33b0, 0x33c1, # - - 0x33c3, 0x33c5, # - - 0x33c7, 0x33d7, # - - 0x33d9, 0x33dd, # - - 0x4e00, 0x9fff, # - - 0xac00, 0xd7a3, # - - 0xf900, 0xfb06, # - - 0xfb13, 0xfb17, # - - 0xfb1f, 0xfb28, # - - 0xfb2a, 0xfb36, # - - 0xfb38, 0xfb3c, # - - 0xfb40, 0xfb41, # - - 0xfb43, 0xfb44, # - - 0xfb46, 0xfbb1, # - - 0xfbd3, 0xfd3d, # - - 0xfd50, 0xfd8f, # - - 0xfd92, 0xfdc7, # - - 0xfdf0, 0xfdf9, # - - 0xfe70, 0xfe72, # - - 0xfe76, 0xfefc, # - - 0xff66, 0xff6f, # - - 0xff71, 0xff9d, # - - 0xffa0, 0xffbe, # - - 0xffc2, 0xffc7, # - - 0xffca, 0xffcf, # - - 0xffd2, 0xffd7, # - - 0xffda, 0xffdc] # - - - alphaSinglets = [ - 0x00aa, # - 0x00b5, # - 0x00ba, # - 0x03da, # - 0x03dc, # - 0x03de, # - 0x03e0, # - 0x06d5, # - 0x09b2, # - 0x0a5e, # - 0x0a8d, # - 0x0ae0, # - 0x0b9c, # - 0x0cde, # - 0x0e4f, # - 0x0e84, # - 0x0e8a, # - 0x0e8d, # - 0x0ea5, # - 0x0ea7, # - 0x0eb0, # - 0x0ebd, # - 0x1fbe, # - 0x207f, # - 0x20a8, # - 0x2102, # - 0x2107, # - 0x2124, # - 0x2126, # - 0x2128, # - 0xfb3e, # - 0xfe74] # - - spaceRanges = [ - 0x0009, 0x000d, # tab and newline - 0x0020, 0x0020, # space - 0x0085, 0x0085, # next line - 0x00a0, 0x00a0, # - 0x1680, 0x1680, # Ogham space mark - 0x2000, 0x200b, # en dash .. zero-width space - 0x200e, 0x200f, # LTR mark .. RTL mark (pattern whitespace) - 0x2028, 0x2029, # - 0x3000, 0x3000, # - 0x202f, 0x202f, # narrow no-break space - 0x205f, 0x205f, # medium mathematical space - 0x3000, 0x3000, # ideographic space - 0xfeff, 0xfeff] # - - toupperRanges = [ - 0x0061, 0x007a, 468, # a-z A-Z - 0x00e0, 0x00f6, 468, # - - - 0x00f8, 0x00fe, 468, # - - - 0x0256, 0x0257, 295, # - - - 0x0258, 0x0259, 298, # - - - 0x028a, 0x028b, 283, # - - - 0x03ad, 0x03af, 463, # - - - 0x03b1, 0x03c1, 468, # - - - 0x03c3, 0x03cb, 468, # - - - 0x03cd, 0x03ce, 437, # - - - 0x0430, 0x044f, 468, # - - - 0x0451, 0x045c, 420, # - - - 0x045e, 0x045f, 420, # - - - 0x0561, 0x0586, 452, # - - - 0x1f00, 0x1f07, 508, # - - - 0x1f10, 0x1f15, 508, # - - - 0x1f20, 0x1f27, 508, # - - - 0x1f30, 0x1f37, 508, # - - - 0x1f40, 0x1f45, 508, # - - - 0x1f60, 0x1f67, 508, # - - - 0x1f70, 0x1f71, 574, # - - - 0x1f72, 0x1f75, 586, # - - - 0x1f76, 0x1f77, 600, # - - - 0x1f78, 0x1f79, 628, # - - - 0x1f7a, 0x1f7b, 612, # - - - 0x1f7c, 0x1f7d, 626, # - - - 0x1f80, 0x1f87, 508, # - - - 0x1f90, 0x1f97, 508, # - - - 0x1fa0, 0x1fa7, 508, # - - - 0x1fb0, 0x1fb1, 508, # - - - 0x1fd0, 0x1fd1, 508, # - - - 0x1fe0, 0x1fe1, 508, # - - - 0x2170, 0x217f, 484, # - - - 0x24d0, 0x24e9, 474, # - - - 0xff41, 0xff5a, 468] # - - - - toupperSinglets = [ - 0x00ff, 621, # - 0x0101, 499, # - 0x0103, 499, # - 0x0105, 499, # - 0x0107, 499, # - 0x0109, 499, # - 0x010b, 499, # - 0x010d, 499, # - 0x010f, 499, # - 0x0111, 499, # - 0x0113, 499, # - 0x0115, 499, # - 0x0117, 499, # - 0x0119, 499, # - 0x011b, 499, # - 0x011d, 499, # - 0x011f, 499, # - 0x0121, 499, # - 0x0123, 499, # - 0x0125, 499, # - 0x0127, 499, # - 0x0129, 499, # - 0x012b, 499, # - 0x012d, 499, # - 0x012f, 499, # - 0x0131, 268, # I - 0x0133, 499, # - 0x0135, 499, # - 0x0137, 499, # - 0x013a, 499, # - 0x013c, 499, # - 0x013e, 499, # - 0x0140, 499, # - 0x0142, 499, # - 0x0144, 499, # - 0x0146, 499, # - 0x0148, 499, # - 0x014b, 499, # - 0x014d, 499, # - 0x014f, 499, # - 0x0151, 499, # - 0x0153, 499, # - 0x0155, 499, # - 0x0157, 499, # - 0x0159, 499, # - 0x015b, 499, # - 0x015d, 499, # - 0x015f, 499, # - 0x0161, 499, # - 0x0163, 499, # - 0x0165, 499, # - 0x0167, 499, # - 0x0169, 499, # - 0x016b, 499, # - 0x016d, 499, # - 0x016f, 499, # - 0x0171, 499, # - 0x0173, 499, # - 0x0175, 499, # - 0x0177, 499, # - 0x017a, 499, # - 0x017c, 499, # - 0x017e, 499, # - 0x017f, 200, # S - 0x0183, 499, # - 0x0185, 499, # - 0x0188, 499, # - 0x018c, 499, # - 0x0192, 499, # - 0x0199, 499, # - 0x01a1, 499, # - 0x01a3, 499, # - 0x01a5, 499, # - 0x01a8, 499, # - 0x01ad, 499, # - 0x01b0, 499, # - 0x01b4, 499, # - 0x01b6, 499, # - 0x01b9, 499, # - 0x01bd, 499, # - 0x01c5, 499, # - 0x01c6, 498, # - 0x01c8, 499, # - 0x01c9, 498, # - 0x01cb, 499, # - 0x01cc, 498, # - 0x01ce, 499, # - 0x01d0, 499, # - 0x01d2, 499, # - 0x01d4, 499, # - 0x01d6, 499, # - 0x01d8, 499, # - 0x01da, 499, # - 0x01dc, 499, # - 0x01df, 499, # - 0x01e1, 499, # - 0x01e3, 499, # - 0x01e5, 499, # - 0x01e7, 499, # - 0x01e9, 499, # - 0x01eb, 499, # - 0x01ed, 499, # - 0x01ef, 499, # - 0x01f2, 499, # - 0x01f3, 498, # - 0x01f5, 499, # - 0x01fb, 499, # - 0x01fd, 499, # - 0x01ff, 499, # - 0x0201, 499, # - 0x0203, 499, # - 0x0205, 499, # - 0x0207, 499, # - 0x0209, 499, # - 0x020b, 499, # - 0x020d, 499, # - 0x020f, 499, # - 0x0211, 499, # - 0x0213, 499, # - 0x0215, 499, # - 0x0217, 499, # - 0x0253, 290, # - 0x0254, 294, # - 0x025b, 297, # - 0x0260, 295, # - 0x0263, 293, # - 0x0268, 291, # - 0x0269, 289, # - 0x026f, 289, # - 0x0272, 287, # - 0x0283, 282, # - 0x0288, 282, # - 0x0292, 281, # - 0x03ac, 462, # - 0x03cc, 436, # - 0x03d0, 438, # - 0x03d1, 443, # - 0x03d5, 453, # - 0x03d6, 446, # - 0x03e3, 499, # - 0x03e5, 499, # - 0x03e7, 499, # - 0x03e9, 499, # - 0x03eb, 499, # - 0x03ed, 499, # - 0x03ef, 499, # - 0x03f0, 414, # - 0x03f1, 420, # - 0x0461, 499, # - 0x0463, 499, # - 0x0465, 499, # - 0x0467, 499, # - 0x0469, 499, # - 0x046b, 499, # - 0x046d, 499, # - 0x046f, 499, # - 0x0471, 499, # - 0x0473, 499, # - 0x0475, 499, # - 0x0477, 499, # - 0x0479, 499, # - 0x047b, 499, # - 0x047d, 499, # - 0x047f, 499, # - 0x0481, 499, # - 0x0491, 499, # - 0x0493, 499, # - 0x0495, 499, # - 0x0497, 499, # - 0x0499, 499, # - 0x049b, 499, # - 0x049d, 499, # - 0x049f, 499, # - 0x04a1, 499, # - 0x04a3, 499, # - 0x04a5, 499, # - 0x04a7, 499, # - 0x04a9, 499, # - 0x04ab, 499, # - 0x04ad, 499, # - 0x04af, 499, # - 0x04b1, 499, # - 0x04b3, 499, # - 0x04b5, 499, # - 0x04b7, 499, # - 0x04b9, 499, # - 0x04bb, 499, # - 0x04bd, 499, # - 0x04bf, 499, # - 0x04c2, 499, # - 0x04c4, 499, # - 0x04c8, 499, # - 0x04cc, 499, # - 0x04d1, 499, # - 0x04d3, 499, # - 0x04d5, 499, # - 0x04d7, 499, # - 0x04d9, 499, # - 0x04db, 499, # - 0x04dd, 499, # - 0x04df, 499, # - 0x04e1, 499, # - 0x04e3, 499, # - 0x04e5, 499, # - 0x04e7, 499, # - 0x04e9, 499, # - 0x04eb, 499, # - 0x04ef, 499, # - 0x04f1, 499, # - 0x04f3, 499, # - 0x04f5, 499, # - 0x04f9, 499, # - 0x1e01, 499, # - 0x1e03, 499, # - 0x1e05, 499, # - 0x1e07, 499, # - 0x1e09, 499, # - 0x1e0b, 499, # - 0x1e0d, 499, # - 0x1e0f, 499, # - 0x1e11, 499, # - 0x1e13, 499, # - 0x1e15, 499, # - 0x1e17, 499, # - 0x1e19, 499, # - 0x1e1b, 499, # - 0x1e1d, 499, # - 0x1e1f, 499, # - 0x1e21, 499, # - 0x1e23, 499, # - 0x1e25, 499, # - 0x1e27, 499, # - 0x1e29, 499, # - 0x1e2b, 499, # - 0x1e2d, 499, # - 0x1e2f, 499, # - 0x1e31, 499, # - 0x1e33, 499, # - 0x1e35, 499, # - 0x1e37, 499, # - 0x1e39, 499, # - 0x1e3b, 499, # - 0x1e3d, 499, # - 0x1e3f, 499, # - 0x1e41, 499, # - 0x1e43, 499, # - 0x1e45, 499, # - 0x1e47, 499, # - 0x1e49, 499, # - 0x1e4b, 499, # - 0x1e4d, 499, # - 0x1e4f, 499, # - 0x1e51, 499, # - 0x1e53, 499, # - 0x1e55, 499, # - 0x1e57, 499, # - 0x1e59, 499, # - 0x1e5b, 499, # - 0x1e5d, 499, # - 0x1e5f, 499, # - 0x1e61, 499, # - 0x1e63, 499, # - 0x1e65, 499, # - 0x1e67, 499, # - 0x1e69, 499, # - 0x1e6b, 499, # - 0x1e6d, 499, # - 0x1e6f, 499, # - 0x1e71, 499, # - 0x1e73, 499, # - 0x1e75, 499, # - 0x1e77, 499, # - 0x1e79, 499, # - 0x1e7b, 499, # - 0x1e7d, 499, # - 0x1e7f, 499, # - 0x1e81, 499, # - 0x1e83, 499, # - 0x1e85, 499, # - 0x1e87, 499, # - 0x1e89, 499, # - 0x1e8b, 499, # - 0x1e8d, 499, # - 0x1e8f, 499, # - 0x1e91, 499, # - 0x1e93, 499, # - 0x1e95, 499, # - 0x1ea1, 499, # - 0x1ea3, 499, # - 0x1ea5, 499, # - 0x1ea7, 499, # - 0x1ea9, 499, # - 0x1eab, 499, # - 0x1ead, 499, # - 0x1eaf, 499, # - 0x1eb1, 499, # - 0x1eb3, 499, # - 0x1eb5, 499, # - 0x1eb7, 499, # - 0x1eb9, 499, # - 0x1ebb, 499, # - 0x1ebd, 499, # - 0x1ebf, 499, # - 0x1ec1, 499, # - 0x1ec3, 499, # - 0x1ec5, 499, # - 0x1ec7, 499, # - 0x1ec9, 499, # - 0x1ecb, 499, # - 0x1ecd, 499, # - 0x1ecf, 499, # - 0x1ed1, 499, # - 0x1ed3, 499, # - 0x1ed5, 499, # - 0x1ed7, 499, # - 0x1ed9, 499, # - 0x1edb, 499, # - 0x1edd, 499, # - 0x1edf, 499, # - 0x1ee1, 499, # - 0x1ee3, 499, # - 0x1ee5, 499, # - 0x1ee7, 499, # - 0x1ee9, 499, # - 0x1eeb, 499, # - 0x1eed, 499, # - 0x1eef, 499, # - 0x1ef1, 499, # - 0x1ef3, 499, # - 0x1ef5, 499, # - 0x1ef7, 499, # - 0x1ef9, 499, # - 0x1f51, 508, # - 0x1f53, 508, # - 0x1f55, 508, # - 0x1f57, 508, # - 0x1fb3, 509, # - 0x1fc3, 509, # - 0x1fe5, 507, # - 0x1ff3, 509] # - - tolowerRanges = [ - 0x0041, 0x005a, 532, # A-Z a-z - 0x00c0, 0x00d6, 532, # - - - 0x00d8, 0x00de, 532, # - - - 0x0189, 0x018a, 705, # - - - 0x018e, 0x018f, 702, # - - - 0x01b1, 0x01b2, 717, # - - - 0x0388, 0x038a, 537, # - - - 0x038e, 0x038f, 563, # - - - 0x0391, 0x03a1, 532, # - - - 0x03a3, 0x03ab, 532, # - - - 0x0401, 0x040c, 580, # - - - 0x040e, 0x040f, 580, # - - - 0x0410, 0x042f, 532, # - - - 0x0531, 0x0556, 548, # - - - 0x10a0, 0x10c5, 548, # - - - 0x1f08, 0x1f0f, 492, # - - - 0x1f18, 0x1f1d, 492, # - - - 0x1f28, 0x1f2f, 492, # - - - 0x1f38, 0x1f3f, 492, # - - - 0x1f48, 0x1f4d, 492, # - - - 0x1f68, 0x1f6f, 492, # - - - 0x1f88, 0x1f8f, 492, # - - - 0x1f98, 0x1f9f, 492, # - - - 0x1fa8, 0x1faf, 492, # - - - 0x1fb8, 0x1fb9, 492, # - - - 0x1fba, 0x1fbb, 426, # - - - 0x1fc8, 0x1fcb, 414, # - - - 0x1fd8, 0x1fd9, 492, # - - - 0x1fda, 0x1fdb, 400, # - - - 0x1fe8, 0x1fe9, 492, # - - - 0x1fea, 0x1feb, 388, # - - - 0x1ff8, 0x1ff9, 372, # - - - 0x1ffa, 0x1ffb, 374, # - - - 0x2160, 0x216f, 516, # - - - 0x24b6, 0x24cf, 526, # - - - 0xff21, 0xff3a, 532] # - - - - tolowerSinglets = [ - 0x0100, 501, # - 0x0102, 501, # - 0x0104, 501, # - 0x0106, 501, # - 0x0108, 501, # - 0x010a, 501, # - 0x010c, 501, # - 0x010e, 501, # - 0x0110, 501, # - 0x0112, 501, # - 0x0114, 501, # - 0x0116, 501, # - 0x0118, 501, # - 0x011a, 501, # - 0x011c, 501, # - 0x011e, 501, # - 0x0120, 501, # - 0x0122, 501, # - 0x0124, 501, # - 0x0126, 501, # - 0x0128, 501, # - 0x012a, 501, # - 0x012c, 501, # - 0x012e, 501, # - 0x0130, 301, # i - 0x0132, 501, # - 0x0134, 501, # - 0x0136, 501, # - 0x0139, 501, # - 0x013b, 501, # - 0x013d, 501, # - 0x013f, 501, # - 0x0141, 501, # - 0x0143, 501, # - 0x0145, 501, # - 0x0147, 501, # - 0x014a, 501, # - 0x014c, 501, # - 0x014e, 501, # - 0x0150, 501, # - 0x0152, 501, # - 0x0154, 501, # - 0x0156, 501, # - 0x0158, 501, # - 0x015a, 501, # - 0x015c, 501, # - 0x015e, 501, # - 0x0160, 501, # - 0x0162, 501, # - 0x0164, 501, # - 0x0166, 501, # - 0x0168, 501, # - 0x016a, 501, # - 0x016c, 501, # - 0x016e, 501, # - 0x0170, 501, # - 0x0172, 501, # - 0x0174, 501, # - 0x0176, 501, # - 0x0178, 379, # - 0x0179, 501, # - 0x017b, 501, # - 0x017d, 501, # - 0x0181, 710, # - 0x0182, 501, # - 0x0184, 501, # - 0x0186, 706, # - 0x0187, 501, # - 0x018b, 501, # - 0x0190, 703, # - 0x0191, 501, # - 0x0193, 705, # - 0x0194, 707, # - 0x0196, 711, # - 0x0197, 709, # - 0x0198, 501, # - 0x019c, 711, # - 0x019d, 713, # - 0x01a0, 501, # - 0x01a2, 501, # - 0x01a4, 501, # - 0x01a7, 501, # - 0x01a9, 718, # - 0x01ac, 501, # - 0x01ae, 718, # - 0x01af, 501, # - 0x01b3, 501, # - 0x01b5, 501, # - 0x01b7, 719, # - 0x01b8, 501, # - 0x01bc, 501, # - 0x01c4, 502, # - 0x01c5, 501, # - 0x01c7, 502, # - 0x01c8, 501, # - 0x01ca, 502, # - 0x01cb, 501, # - 0x01cd, 501, # - 0x01cf, 501, # - 0x01d1, 501, # - 0x01d3, 501, # - 0x01d5, 501, # - 0x01d7, 501, # - 0x01d9, 501, # - 0x01db, 501, # - 0x01de, 501, # - 0x01e0, 501, # - 0x01e2, 501, # - 0x01e4, 501, # - 0x01e6, 501, # - 0x01e8, 501, # - 0x01ea, 501, # - 0x01ec, 501, # - 0x01ee, 501, # - 0x01f1, 502, # - 0x01f2, 501, # - 0x01f4, 501, # - 0x01fa, 501, # - 0x01fc, 501, # - 0x01fe, 501, # - 0x0200, 501, # - 0x0202, 501, # - 0x0204, 501, # - 0x0206, 501, # - 0x0208, 501, # - 0x020a, 501, # - 0x020c, 501, # - 0x020e, 501, # - 0x0210, 501, # - 0x0212, 501, # - 0x0214, 501, # - 0x0216, 501, # - 0x0386, 538, # - 0x038c, 564, # - 0x03e2, 501, # - 0x03e4, 501, # - 0x03e6, 501, # - 0x03e8, 501, # - 0x03ea, 501, # - 0x03ec, 501, # - 0x03ee, 501, # - 0x0460, 501, # - 0x0462, 501, # - 0x0464, 501, # - 0x0466, 501, # - 0x0468, 501, # - 0x046a, 501, # - 0x046c, 501, # - 0x046e, 501, # - 0x0470, 501, # - 0x0472, 501, # - 0x0474, 501, # - 0x0476, 501, # - 0x0478, 501, # - 0x047a, 501, # - 0x047c, 501, # - 0x047e, 501, # - 0x0480, 501, # - 0x0490, 501, # - 0x0492, 501, # - 0x0494, 501, # - 0x0496, 501, # - 0x0498, 501, # - 0x049a, 501, # - 0x049c, 501, # - 0x049e, 501, # - 0x04a0, 501, # - 0x04a2, 501, # - 0x04a4, 501, # - 0x04a6, 501, # - 0x04a8, 501, # - 0x04aa, 501, # - 0x04ac, 501, # - 0x04ae, 501, # - 0x04b0, 501, # - 0x04b2, 501, # - 0x04b4, 501, # - 0x04b6, 501, # - 0x04b8, 501, # - 0x04ba, 501, # - 0x04bc, 501, # - 0x04be, 501, # - 0x04c1, 501, # - 0x04c3, 501, # - 0x04c7, 501, # - 0x04cb, 501, # - 0x04d0, 501, # - 0x04d2, 501, # - 0x04d4, 501, # - 0x04d6, 501, # - 0x04d8, 501, # - 0x04da, 501, # - 0x04dc, 501, # - 0x04de, 501, # - 0x04e0, 501, # - 0x04e2, 501, # - 0x04e4, 501, # - 0x04e6, 501, # - 0x04e8, 501, # - 0x04ea, 501, # - 0x04ee, 501, # - 0x04f0, 501, # - 0x04f2, 501, # - 0x04f4, 501, # - 0x04f8, 501, # - 0x1e00, 501, # - 0x1e02, 501, # - 0x1e04, 501, # - 0x1e06, 501, # - 0x1e08, 501, # - 0x1e0a, 501, # - 0x1e0c, 501, # - 0x1e0e, 501, # - 0x1e10, 501, # - 0x1e12, 501, # - 0x1e14, 501, # - 0x1e16, 501, # - 0x1e18, 501, # - 0x1e1a, 501, # - 0x1e1c, 501, # - 0x1e1e, 501, # - 0x1e20, 501, # - 0x1e22, 501, # - 0x1e24, 501, # - 0x1e26, 501, # - 0x1e28, 501, # - 0x1e2a, 501, # - 0x1e2c, 501, # - 0x1e2e, 501, # - 0x1e30, 501, # - 0x1e32, 501, # - 0x1e34, 501, # - 0x1e36, 501, # - 0x1e38, 501, # - 0x1e3a, 501, # - 0x1e3c, 501, # - 0x1e3e, 501, # - 0x1e40, 501, # - 0x1e42, 501, # - 0x1e44, 501, # - 0x1e46, 501, # - 0x1e48, 501, # - 0x1e4a, 501, # - 0x1e4c, 501, # - 0x1e4e, 501, # - 0x1e50, 501, # - 0x1e52, 501, # - 0x1e54, 501, # - 0x1e56, 501, # - 0x1e58, 501, # - 0x1e5a, 501, # - 0x1e5c, 501, # - 0x1e5e, 501, # - 0x1e60, 501, # - 0x1e62, 501, # - 0x1e64, 501, # - 0x1e66, 501, # - 0x1e68, 501, # - 0x1e6a, 501, # - 0x1e6c, 501, # - 0x1e6e, 501, # - 0x1e70, 501, # - 0x1e72, 501, # - 0x1e74, 501, # - 0x1e76, 501, # - 0x1e78, 501, # - 0x1e7a, 501, # - 0x1e7c, 501, # - 0x1e7e, 501, # - 0x1e80, 501, # - 0x1e82, 501, # - 0x1e84, 501, # - 0x1e86, 501, # - 0x1e88, 501, # - 0x1e8a, 501, # - 0x1e8c, 501, # - 0x1e8e, 501, # - 0x1e90, 501, # - 0x1e92, 501, # - 0x1e94, 501, # - 0x1ea0, 501, # - 0x1ea2, 501, # - 0x1ea4, 501, # - 0x1ea6, 501, # - 0x1ea8, 501, # - 0x1eaa, 501, # - 0x1eac, 501, # - 0x1eae, 501, # - 0x1eb0, 501, # - 0x1eb2, 501, # - 0x1eb4, 501, # - 0x1eb6, 501, # - 0x1eb8, 501, # - 0x1eba, 501, # - 0x1ebc, 501, # - 0x1ebe, 501, # - 0x1ec0, 501, # - 0x1ec2, 501, # - 0x1ec4, 501, # - 0x1ec6, 501, # - 0x1ec8, 501, # - 0x1eca, 501, # - 0x1ecc, 501, # - 0x1ece, 501, # - 0x1ed0, 501, # - 0x1ed2, 501, # - 0x1ed4, 501, # - 0x1ed6, 501, # - 0x1ed8, 501, # - 0x1eda, 501, # - 0x1edc, 501, # - 0x1ede, 501, # - 0x1ee0, 501, # - 0x1ee2, 501, # - 0x1ee4, 501, # - 0x1ee6, 501, # - 0x1ee8, 501, # - 0x1eea, 501, # - 0x1eec, 501, # - 0x1eee, 501, # - 0x1ef0, 501, # - 0x1ef2, 501, # - 0x1ef4, 501, # - 0x1ef6, 501, # - 0x1ef8, 501, # - 0x1f59, 492, # - 0x1f5b, 492, # - 0x1f5d, 492, # - 0x1f5f, 492, # - 0x1fbc, 491, # - 0x1fcc, 491, # - 0x1fec, 493, # - 0x1ffc, 491] # - - toTitleSinglets = [ - 0x01c4, 501, # - 0x01c6, 499, # - 0x01c7, 501, # - 0x01c9, 499, # - 0x01ca, 501, # - 0x01cc, 499, # - 0x01f1, 501, # - 0x01f3, 499] # - -proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int = +proc `<=%`*(a, b: Rune): bool = + ## Checks if code point of `a` is smaller or equal to code point of `b`. + runnableExamples: + let + a = "ú".runeAt(0) + b = "ü".runeAt(0) + doAssert a <=% b + return int(a) <=% int(b) + +proc `<%`*(a, b: Rune): bool = + ## Checks if code point of `a` is smaller than code point of `b`. + runnableExamples: + let + a = "ú".runeAt(0) + b = "ü".runeAt(0) + doAssert a <% b + return int(a) <% int(b) + +proc `==`*(a, b: Rune): bool = + ## Checks if two runes are equal. + return int(a) == int(b) + + +include "includes/unicode_ranges" + +proc binarySearch(c: RuneImpl, tab: openArray[int32], len, stride: int): int = var n = len var t = 0 while n > 1: @@ -1301,64 +479,102 @@ proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int = return t return -1 -proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} = - ## Converts ``c`` into lower case. This works for any Unicode character. +proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1".} = + ## Converts ``c`` into lower case. This works for any rune. + ## ## If possible, prefer ``toLower`` over ``toUpper``. + ## + ## See also: + ## * `toUpper proc <#toUpper,Rune>`_ + ## * `toTitle proc <#toTitle,Rune>`_ + ## * `isLower proc <#isLower,Rune>`_ var c = RuneImpl(c) - var p = binarySearch(c, tolowerRanges, len(tolowerRanges) div 3, 3) - if p >= 0 and c >= tolowerRanges[p] and c <= tolowerRanges[p+1]: - return Rune(c + tolowerRanges[p+2] - 500) - p = binarySearch(c, tolowerSinglets, len(tolowerSinglets) div 2, 2) - if p >= 0 and c == tolowerSinglets[p]: - return Rune(c + tolowerSinglets[p+1] - 500) + var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3) + if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]: + return Rune(c + toLowerRanges[p+2] - 500) + p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2) + if p >= 0 and c == toLowerSinglets[p]: + return Rune(c + toLowerSinglets[p+1] - 500) return Rune(c) -proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} = - ## Converts ``c`` into upper case. This works for any Unicode character. +proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1".} = + ## Converts ``c`` into upper case. This works for any rune. + ## ## If possible, prefer ``toLower`` over ``toUpper``. + ## + ## See also: + ## * `toLower proc <#toLower,Rune>`_ + ## * `toTitle proc <#toTitle,Rune>`_ + ## * `isUpper proc <#isUpper,Rune>`_ var c = RuneImpl(c) - var p = binarySearch(c, toupperRanges, len(toupperRanges) div 3, 3) - if p >= 0 and c >= toupperRanges[p] and c <= toupperRanges[p+1]: - return Rune(c + toupperRanges[p+2] - 500) - p = binarySearch(c, toupperSinglets, len(toupperSinglets) div 2, 2) - if p >= 0 and c == toupperSinglets[p]: - return Rune(c + toupperSinglets[p+1] - 500) + var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3) + if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]: + return Rune(c + toUpperRanges[p+2] - 500) + p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2) + if p >= 0 and c == toUpperSinglets[p]: + return Rune(c + toUpperSinglets[p+1] - 500) return Rune(c) -proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} = - ## Converts ``c`` to title case +proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1".} = + ## Converts ``c`` to title case. + ## + ## See also: + ## * `toLower proc <#toLower,Rune>`_ + ## * `toUpper proc <#toUpper,Rune>`_ + ## * `isTitle proc <#isTitle,Rune>`_ var c = RuneImpl(c) var p = binarySearch(c, toTitleSinglets, len(toTitleSinglets) div 2, 2) if p >= 0 and c == toTitleSinglets[p]: return Rune(c + toTitleSinglets[p+1] - 500) return Rune(c) -proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = - ## Returns true iff ``c`` is a lower case Unicode character. +proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1".} = + ## Returns true if ``c`` is a lower case rune. + ## ## If possible, prefer ``isLower`` over ``isUpper``. + ## + ## See also: + ## * `toLower proc <#toLower,Rune>`_ + ## * `isUpper proc <#isUpper,Rune>`_ + ## * `isTitle proc <#isTitle,Rune>`_ var c = RuneImpl(c) # Note: toUpperRanges is correct here! - var p = binarySearch(c, toupperRanges, len(toupperRanges) div 3, 3) - if p >= 0 and c >= toupperRanges[p] and c <= toupperRanges[p+1]: + var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3) + if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]: return true - p = binarySearch(c, toupperSinglets, len(toupperSinglets) div 2, 2) - if p >= 0 and c == toupperSinglets[p]: + p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2) + if p >= 0 and c == toUpperSinglets[p]: return true -proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = - ## Returns true iff ``c`` is a upper case Unicode character. +proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1".} = + ## Returns true if ``c`` is a upper case rune. + ## ## If possible, prefer ``isLower`` over ``isUpper``. + ## + ## See also: + ## * `toUpper proc <#toUpper,Rune>`_ + ## * `isLower proc <#isLower,Rune>`_ + ## * `isTitle proc <#isTitle,Rune>`_ + ## * `isAlpha proc <#isAlpha,Rune>`_ + ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_ var c = RuneImpl(c) # Note: toLowerRanges is correct here! - var p = binarySearch(c, tolowerRanges, len(tolowerRanges) div 3, 3) - if p >= 0 and c >= tolowerRanges[p] and c <= tolowerRanges[p+1]: + var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3) + if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]: return true - p = binarySearch(c, tolowerSinglets, len(tolowerSinglets) div 2, 2) - if p >= 0 and c == tolowerSinglets[p]: + p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2) + if p >= 0 and c == toLowerSinglets[p]: return true -proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = - ## Returns true iff ``c`` is an *alpha* Unicode character (i.e., a letter) +proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1".} = + ## Returns true if ``c`` is an *alpha* rune (i.e., a letter). + ## + ## See also: + ## * `isLower proc <#isLower,Rune>`_ + ## * `isTitle proc <#isTitle,Rune>`_ + ## * `isAlpha proc <#isAlpha,Rune>`_ + ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_ + ## * `isCombining proc <#isCombining,Rune>`_ if isUpper(c) or isLower(c): return true var c = RuneImpl(c) @@ -1369,19 +585,38 @@ proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = if p >= 0 and c == alphaSinglets[p]: return true -proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = - ## Returns true iff ``c`` is a Unicode titlecase character +proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1".} = + ## Returns true if ``c`` is a Unicode titlecase code point. + ## + ## See also: + ## * `toTitle proc <#toTitle,Rune>`_ + ## * `isLower proc <#isLower,Rune>`_ + ## * `isUpper proc <#isUpper,Rune>`_ + ## * `isAlpha proc <#isAlpha,Rune>`_ + ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_ return isUpper(c) and isLower(c) -proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = - ## Returns true iff ``c`` is a Unicode whitespace character +proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1".} = + ## Returns true if ``c`` is a Unicode whitespace code point. + ## + ## See also: + ## * `isLower proc <#isLower,Rune>`_ + ## * `isUpper proc <#isUpper,Rune>`_ + ## * `isTitle proc <#isTitle,Rune>`_ + ## * `isAlpha proc <#isAlpha,Rune>`_ var c = RuneImpl(c) var p = binarySearch(c, spaceRanges, len(spaceRanges) div 2, 2) if p >= 0 and c >= spaceRanges[p] and c <= spaceRanges[p+1]: return true -proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = - ## Returns true iff ``c`` is a Unicode combining character +proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1".} = + ## Returns true if ``c`` is a Unicode combining code unit. + ## + ## See also: + ## * `isLower proc <#isLower,Rune>`_ + ## * `isUpper proc <#isUpper,Rune>`_ + ## * `isTitle proc <#isTitle,Rune>`_ + ## * `isAlpha proc <#isAlpha,Rune>`_ var c = RuneImpl(c) # Optimized to return false immediately for ASCII @@ -1394,155 +629,116 @@ proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = template runeCheck(s, runeProc) = ## Common code for isAlpha and isSpace. result = if len(s) == 0: false else: true - var i = 0 rune: Rune - while i < len(s) and result: - fastRuneAt(s, i, rune, doInc=true) + fastRuneAt(s, i, rune, doInc = true) result = runeProc(rune) and result -proc isAlpha*(s: string): bool {.noSideEffect, procvar, +proc isAlpha*(s: openArray[char]): bool {.noSideEffect, rtl, extern: "nuc$1Str".} = - ## Returns true iff `s` contains all alphabetic unicode characters. + ## Returns true if ``s`` contains all alphabetic runes. + runnableExamples: + let a = "añyóng" + doAssert a.isAlpha runeCheck(s, isAlpha) -proc isSpace*(s: string): bool {.noSideEffect, procvar, +proc isSpace*(s: openArray[char]): bool {.noSideEffect, rtl, extern: "nuc$1Str".} = - ## Returns true iff `s` contains all whitespace unicode characters. + ## Returns true if ``s`` contains all whitespace runes. + runnableExamples: + let a = "\t\l \v\r\f" + doAssert a.isSpace runeCheck(s, isWhiteSpace) -template runeCaseCheck(s, runeProc, skipNonAlpha) = - ## Common code for rune.isLower and rune.isUpper. - if len(s) == 0: return false - - var - i = 0 - rune: Rune - hasAtleastOneAlphaRune = false - - while i < len(s): - fastRuneAt(s, i, rune, doInc=true) - if skipNonAlpha: - var runeIsAlpha = isAlpha(rune) - if not hasAtleastOneAlphaRune: - hasAtleastOneAlphaRune = runeIsAlpha - if runeIsAlpha and (not runeProc(rune)): - return false - else: - if not runeProc(rune): - return false - return if skipNonAlpha: hasAtleastOneAlphaRune else: true - -proc isLower*(s: string, skipNonAlpha: bool): bool = - ## Checks whether ``s`` is lower case. - ## - ## If ``skipNonAlpha`` is true, returns true if all alphabetical - ## runes in ``s`` are lower case. Returns false if none of the - ## runes in ``s`` are alphabetical. - ## - ## If ``skipNonAlpha`` is false, returns true only if all runes in - ## ``s`` are alphabetical and lower case. - ## - ## For either value of ``skipNonAlpha``, returns false if ``s`` is - ## an empty string. - runeCaseCheck(s, isLower, skipNonAlpha) - -proc isUpper*(s: string, skipNonAlpha: bool): bool = - ## Checks whether ``s`` is upper case. - ## - ## If ``skipNonAlpha`` is true, returns true if all alphabetical - ## runes in ``s`` are upper case. Returns false if none of the - ## runes in ``s`` are alphabetical. - ## - ## If ``skipNonAlpha`` is false, returns true only if all runes in - ## ``s`` are alphabetical and upper case. - ## - ## For either value of ``skipNonAlpha``, returns false if ``s`` is - ## an empty string. - runeCaseCheck(s, isUpper, skipNonAlpha) template convertRune(s, runeProc) = - ## Convert runes in `s` using `runeProc` as the converter. + ## Convert runes in ``s`` using ``runeProc`` as the converter. result = newString(len(s)) - var i = 0 - lastIndex = 0 + resultIndex = 0 rune: Rune - while i < len(s): - lastIndex = i - fastRuneAt(s, i, rune, doInc=true) + fastRuneAt(s, i, rune, doInc = true) rune = runeProc(rune) + fastToUTF8Copy(rune, result, resultIndex, doInc = true) - rune.fastToUTF8Copy(result, lastIndex) - -proc toUpper*(s: string): string {.noSideEffect, procvar, +proc toUpper*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1Str".} = - ## Converts `s` into upper-case unicode characters. + ## Converts ``s`` into upper-case runes. + runnableExamples: + doAssert toUpper("abγ") == "ABΓ" convertRune(s, toUpper) -proc toLower*(s: string): string {.noSideEffect, procvar, +proc toLower*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1Str".} = - ## Converts `s` into lower-case unicode characters. + ## Converts ``s`` into lower-case runes. + runnableExamples: + doAssert toLower("ABΓ") == "abγ" convertRune(s, toLower) -proc swapCase*(s: string): string {.noSideEffect, procvar, +proc swapCase*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1".} = - ## Swaps the case of unicode characters in `s` + ## Swaps the case of runes in ``s``. ## - ## Returns a new string such that the cases of all unicode characters - ## are swapped if possible + ## Returns a new string such that the cases of all runes + ## are swapped if possible. + runnableExamples: + doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA" var i = 0 - lastIndex = 0 + resultIndex = 0 rune: Rune - result = newString(len(s)) - while i < len(s): - lastIndex = i - fastRuneAt(s, i, rune) - if rune.isUpper(): rune = rune.toLower() elif rune.isLower(): rune = rune.toUpper() + fastToUTF8Copy(rune, result, resultIndex, doInc = true) - rune.fastToUTF8Copy(result, lastIndex) - -proc capitalize*(s: string): string {.noSideEffect, procvar, +proc capitalize*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1".} = - ## Converts the first character of `s` into an upper-case unicode character. - if len(s) == 0: - return s + ## Converts the first character of ``s`` into an upper-case rune. + runnableExamples: + doAssert capitalize("βeta") == "Βeta" + if len(s) == 0: + return "" var rune: Rune i = 0 + fastRuneAt(s, i, rune, doInc = true) + result = $toUpper(rune) & substr(s.toOpenArray(i, s.high)) - fastRuneAt(s, i, rune, doInc=true) +when not defined(nimHasEffectsOf): + {.pragma: effectsOf.} - result = $toUpper(rune) & substr(s, i) - -proc translate*(s: string, replacements: proc(key: string): string): string {. - rtl, extern: "nuc$1".} = - ## Translates words in a string using the `replacements` proc to substitute - ## words inside `s` with their replacements +proc translate*(s: openArray[char], replacements: proc(key: string): string): string {. + rtl, extern: "nuc$1", effectsOf: replacements.} = + ## Translates words in a string using the ``replacements`` proc to substitute + ## words inside ``s`` with their replacements. ## - ## `replacements` is any proc that takes a word and returns + ## ``replacements`` is any proc that takes a word and returns ## a new word to fill it's place. + runnableExamples: + proc wordToNumber(s: string): string = + case s + of "one": "1" + of "two": "2" + else: s + let a = "one two three four" + doAssert a.translate(wordToNumber) == "1 2 three four" # Allocate memory for the new string based on the old one. # If the new string length is less than the old, no allocations # will be needed. If the new string length is greater than the # old, then maybe only one allocation is needed result = newStringOfCap(s.len) - var index = 0 lastIndex = 0 @@ -1552,17 +748,14 @@ proc translate*(s: string, replacements: proc(key: string): string): string {. while index < len(s): lastIndex = index - fastRuneAt(s, index, rune) - let whiteSpace = rune.isWhiteSpace() if whiteSpace and inWord: # If we've reached the end of a word - let word = s[wordStart ..< lastIndex] + let word = substr(s.toOpenArray(wordStart, lastIndex - 1)) result.add(replacements(word)) result.add($rune) - inWord = false elif not whiteSpace and not inWord: # If we've hit a non space character and @@ -1575,66 +768,37 @@ proc translate*(s: string, replacements: proc(key: string): string): string {. if wordStart < len(s) and inWord: # Get the trailing word at the end - let word = s[wordStart .. ^1] + let word = substr(s.toOpenArray(wordStart, s.high)) result.add(replacements(word)) -proc title*(s: string): string {.noSideEffect, procvar, +proc title*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1".} = - ## Converts `s` to a unicode title. + ## Converts ``s`` to a unicode title. ## ## Returns a new string such that the first character - ## in each word inside `s` is capitalized + ## in each word inside ``s`` is capitalized. + runnableExamples: + doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma" var i = 0 - lastIndex = 0 + resultIndex = 0 rune: Rune - result = newString(len(s)) - var firstRune = true while i < len(s): - lastIndex = i - fastRuneAt(s, i, rune) - if not rune.isWhiteSpace() and firstRune: rune = rune.toUpper() firstRune = false elif rune.isWhiteSpace(): firstRune = true + fastToUTF8Copy(rune, result, resultIndex, doInc = true) - rune.fastToUTF8Copy(result, lastIndex) - -proc isTitle*(s: string): bool {.noSideEffect, procvar, - rtl, extern: "nuc$1Str".}= - ## Checks whether or not `s` is a unicode title. - ## - ## Returns true if the first character in each word inside `s` - ## are upper case and there is at least one character in `s`. - if s.len() == 0: - return false - - result = true - var - i = 0 - rune: Rune - - var firstRune = true - - while i < len(s) and result: - fastRuneAt(s, i, rune, doInc=true) - - if not rune.isWhiteSpace() and firstRune: - result = rune.isUpper() and result - firstRune = false - elif rune.isWhiteSpace(): - firstRune = true - -iterator runes*(s: string): Rune = - ## Iterates over any unicode character of the string ``s`` returning runes +iterator runes*(s: openArray[char]): Rune = + ## Iterates over any rune of the string ``s`` returning runes. var i = 0 result: Rune @@ -1642,26 +806,39 @@ iterator runes*(s: string): Rune = fastRuneAt(s, i, result, true) yield result -iterator utf8*(s: string): string = - ## Iterates over any unicode character of the string ``s`` returning utf8 values +iterator utf8*(s: openArray[char]): string = + ## Iterates over any rune of the string ``s`` returning utf8 values. + ## + ## See also: + ## * `validateUtf8 proc <#validateUtf8,string>`_ + ## * `toUTF8 proc <#toUTF8,Rune>`_ + ## * `$ proc <#$,Rune>`_ alias for `toUTF8` + ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_ var o = 0 while o < s.len: let n = runeLenAt(s, o) - yield s[o.. (o+n-1)] + yield substr(s.toOpenArray(o, (o+n-1))) o += n -proc toRunes*(s: string): seq[Rune] = - ## Obtains a sequence containing the Runes in ``s`` +proc toRunes*(s: openArray[char]): seq[Rune] = + ## Obtains a sequence containing the Runes in ``s``. + ## + ## See also: + ## * `$ proc <#$,Rune>`_ for a reverse operation + runnableExamples: + let a = toRunes("aáä") + doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)] + result = newSeq[Rune]() for r in s.runes: result.add(r) -proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1", procvar.} = +proc cmpRunesIgnoreCase*(a, b: openArray[char]): int {.rtl, extern: "nuc$1".} = ## Compares two UTF-8 strings and ignores the case. Returns: ## - ## | 0 iff a == b - ## | < 0 iff a < b - ## | > 0 iff a > b + ## | `0` if a == b + ## | `< 0` if a < b + ## | `> 0` if a > b var i = 0 var j = 0 var ar, br: Rune @@ -1669,20 +846,25 @@ proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1", procvar.} = # slow path: fastRuneAt(a, i, ar) fastRuneAt(b, j, br) - result = RuneImpl(toLower(ar)) - RuneImpl(toLower(br)) + when sizeof(int) < 4: + const lo = low(int).int32 + const hi = high(int).int32 + result = clamp(RuneImpl(toLower(ar)) - RuneImpl(toLower(br)), lo, hi).int + else: + result = RuneImpl(toLower(ar)) - RuneImpl(toLower(br)) if result != 0: return result = a.len - b.len -proc reversed*(s: string): string = - ## Returns the reverse of ``s``, interpreting it as Unicode characters. - ## Unicode combining characters are correctly interpreted as well: - ## - ## .. code-block:: nim +proc reversed*(s: openArray[char]): string = + ## Returns the reverse of ``s``, interpreting it as runes. ## - ## assert reversed("Reverse this!") == "!siht esreveR" - ## assert reversed("先秦兩漢") == "漢兩秦先" - ## assert reversed("as⃝df̅") == "f̅ds⃝a" - ## assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞" + ## Unicode combining characters are correctly interpreted as well. + runnableExamples: + assert reversed("Reverse this!") == "!siht esreveR" + assert reversed("先秦兩漢") == "漢兩秦先" + assert reversed("as⃝df̅") == "f̅ds⃝a" + assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞" + var i = 0 lastI = 0 @@ -1708,9 +890,15 @@ proc reversed*(s: string): string = reverseUntil(len(s)) -proc graphemeLen*(s: string; i: Natural): Natural = - ## The number of bytes belonging to 's[i]' including following combining - ## characters. +proc graphemeLen*(s: openArray[char]; i: Natural): Natural = + ## The number of bytes belonging to byte index ``s[i]``, + ## including following combining code units. + runnableExamples: + let a = "añyóng" + doAssert a.graphemeLen(1) == 2 ## ñ + doAssert a.graphemeLen(2) == 1 + doAssert a.graphemeLen(4) == 2 ## ó + var j = i.int var r, r2: Rune if j < s.len: @@ -1721,180 +909,607 @@ proc graphemeLen*(s: string; i: Natural): Natural = if not isCombining(r2): break result = j-i -proc lastRune*(s: string; last: int): (Rune, int) = - ## length of the last rune in 's[0..last]'. Returns the rune and its length +proc lastRune*(s: openArray[char]; last: int): (Rune, int) = + ## Length of the last rune in ``s[0..last]``. Returns the rune and its length ## in bytes. if s[last] <= chr(127): result = (Rune(s[last]), 1) else: var L = 0 - while last-L >= 0 and ord(s[last-L]) shr 6 == 0b10: inc(L) + while last-L >= 0 and uint(s[last-L]) shr 6 == 0b10: inc(L) var r: Rune fastRuneAt(s, last-L, r, false) result = (r, L+1) -when isMainModule: +proc size*(r: Rune): int {.noSideEffect.} = + ## Returns the number of bytes the rune ``r`` takes. + runnableExamples: + let a = toRunes "aá" + doAssert size(a[0]) == 1 + doAssert size(a[1]) == 2 + + let v = r.uint32 + if v <= 0x007F'u32: result = 1 + elif v <= 0x07FF'u32: result = 2 + elif v <= 0xFFFF'u32: result = 3 + elif v <= 0x1FFFFF'u32: result = 4 + elif v <= 0x3FFFFFF'u32: result = 5 + elif v <= 0x7FFFFFFF'u32: result = 6 + else: result = 1 + +# --------- Private templates for different split separators ----------- +proc stringHasSep(s: openArray[char], index: int, seps: openArray[Rune]): bool = + var rune: Rune + fastRuneAt(s, index, rune, false) + return seps.contains(rune) + +proc stringHasSep(s: openArray[char], index: int, sep: Rune): bool = + var rune: Rune + fastRuneAt(s, index, rune, false) + return sep == rune + +template splitCommon(s, sep, maxsplit: untyped) = + ## Common code for split procedures. let - someString = "öÑ" - someRunes = @[runeAt(someString, 0), runeAt(someString, 2)] - compared = (someString == $someRunes) - doAssert compared == true - - proc test_replacements(word: string): string = - case word - of "two": - return "2" - of "foo": - return "BAR" - of "βeta": - return "beta" - of "alpha": - return "αlpha" - else: - return "12345" - - doAssert translate("two not alpha foo βeta", test_replacements) == "2 12345 αlpha BAR beta" - doAssert translate(" two not foo βeta ", test_replacements) == " 2 12345 BAR beta " - - doAssert title("foo bar") == "Foo Bar" - doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma" - doAssert title("") == "" - - doAssert capitalize("βeta") == "Βeta" - doAssert capitalize("foo") == "Foo" - doAssert capitalize("") == "" - - doAssert isTitle("Foo") - doAssert(not isTitle("Foo bar")) - doAssert(not isTitle("αlpha Βeta")) - doAssert(isTitle("Αlpha Βeta Γamma")) - doAssert(not isTitle("fFoo")) - - doAssert swapCase("FooBar") == "fOObAR" - doAssert swapCase(" ") == " " - doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA" - doAssert swapCase("a✓B") == "A✓b" - doAssert swapCase("") == "" - - doAssert isAlpha("r") - doAssert isAlpha("α") - doAssert(not isAlpha("$")) - doAssert(not isAlpha("")) - - doAssert isAlpha("Βeta") - doAssert isAlpha("Args") - doAssert(not isAlpha("$Foo✓")) - - doAssert isSpace("\t") - doAssert isSpace("\l") - doAssert(not isSpace("Β")) - doAssert(not isSpace("Βeta")) - - doAssert isSpace("\t\l \v\r\f") - doAssert isSpace(" ") - doAssert(not isSpace("")) - doAssert(not isSpace("ΑΓc \td")) - - doAssert(not isLower(' '.Rune)) - - doAssert isLower("a", false) - doAssert isLower("γ", true) - doAssert(not isLower("Γ", false)) - doAssert(not isLower("4", true)) - doAssert(not isLower("", false)) - doAssert isLower("abcdγ", false) - doAssert(not isLower("33aaΓ", false)) - doAssert(not isLower("a b", false)) - - doAssert(not isLower("abCDΓ", true)) - doAssert isLower("a b", true) - doAssert isLower("1, 2, 3 go!", true) - doAssert(not isLower(" ", true)) - doAssert(not isLower("(*&#@(^#$✓ ", true)) # None of the string runes are alphabets - - doAssert(not isUpper(' '.Rune)) - - doAssert isUpper("Γ", false) - doAssert(not isUpper("α", false)) - doAssert(not isUpper("", false)) - doAssert isUpper("ΑΒΓ", false) - doAssert(not isUpper("A#$β", false)) - doAssert(not isUpper("A B", false)) - - doAssert(not isUpper("b", true)) - doAssert(not isUpper("✓", true)) - doAssert(not isUpper("AAccβ", true)) - doAssert isUpper("A B", true) - doAssert isUpper("1, 2, 3 GO!", true) - doAssert(not isUpper(" ", true)) - doAssert(not isUpper("(*&#@(^#$✓ ", true)) # None of the string runes are alphabets - - doAssert toUpper("Γ") == "Γ" - doAssert toUpper("b") == "B" - doAssert toUpper("α") == "Α" - doAssert toUpper("✓") == "✓" - doAssert toUpper("") == "" - - doAssert toUpper("ΑΒΓ") == "ΑΒΓ" - doAssert toUpper("AAccβ") == "AACCΒ" - doAssert toUpper("A✓$β") == "A✓$Β" - - doAssert toLower("a") == "a" - doAssert toLower("γ") == "γ" - doAssert toLower("Γ") == "γ" - doAssert toLower("4") == "4" - doAssert toLower("") == "" - - doAssert toLower("abcdγ") == "abcdγ" - doAssert toLower("abCDΓ") == "abcdγ" - doAssert toLower("33aaΓ") == "33aaγ" - - doAssert reversed("Reverse this!") == "!siht esreveR" - doAssert reversed("先秦兩漢") == "漢兩秦先" - doAssert reversed("as⃝df̅") == "f̅ds⃝a" - doAssert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞" - doAssert len(toRunes("as⃝df̅")) == runeLen("as⃝df̅") - const test = "as⃝" - doAssert lastRune(test, test.len-1)[1] == 3 - doAssert graphemeLen("è", 0) == 2 - - # test for rune positioning and runeSubStr() - let s = "Hänsel ««: 10,00€" - - var t = "" - for c in s.utf8: - t.add c - - doAssert(s == t) - - doAssert(runeReverseOffset(s, 1) == (20, 18)) - doAssert(runeReverseOffset(s, 19) == (-1, 18)) - - doAssert(runeStrAtPos(s, 0) == "H") - doAssert(runeSubStr(s, 0, 1) == "H") - doAssert(runeStrAtPos(s, 10) == ":") - doAssert(runeSubStr(s, 10, 1) == ":") - doAssert(runeStrAtPos(s, 9) == "«") - doAssert(runeSubStr(s, 9, 1) == "«") - doAssert(runeStrAtPos(s, 17) == "€") - doAssert(runeSubStr(s, 17, 1) == "€") - # echo runeStrAtPos(s, 18) # index error - - doAssert(runeSubStr(s, 0) == "Hänsel ««: 10,00€") - doAssert(runeSubStr(s, -18) == "Hänsel ««: 10,00€") - doAssert(runeSubStr(s, 10) == ": 10,00€") - doAssert(runeSubStr(s, 18) == "") - doAssert(runeSubStr(s, 0, 10) == "Hänsel ««") - - doAssert(runeSubStr(s, 12) == "10,00€") - doAssert(runeSubStr(s, -6) == "10,00€") - - doAssert(runeSubStr(s, 12, 5) == "10,00") - doAssert(runeSubStr(s, 12, -1) == "10,00") - doAssert(runeSubStr(s, -6, 5) == "10,00") - doAssert(runeSubStr(s, -6, -1) == "10,00") - - doAssert(runeSubStr(s, 0, 100) == "Hänsel ««: 10,00€") - doAssert(runeSubStr(s, -100, 100) == "Hänsel ««: 10,00€") - doAssert(runeSubStr(s, 0, -100) == "") - doAssert(runeSubStr(s, 100, -100) == "") + sLen = len(s) + var + last = 0 + splits = maxsplit + if sLen > 0: + while last <= sLen: + var first = last + while last < sLen and not stringHasSep(s, last, sep): + inc(last, runeLenAt(s, last)) + if splits == 0: last = sLen + yield substr(s.toOpenArray(first, (last - 1))) + if splits == 0: break + dec(splits) + inc(last, if last < sLen: runeLenAt(s, last) else: 1) + +iterator split*(s: openArray[char], seps: openArray[Rune] = unicodeSpaces, + maxsplit: int = -1): string = + ## Splits the unicode string ``s`` into substrings using a group of separators. + ## + ## Substrings are separated by a substring containing only ``seps``. + runnableExamples: + import std/sequtils + + assert toSeq("hÃllo\lthis\lis an\texample\l是".split) == + @["hÃllo", "this", "is", "an", "example", "是"] + + # And the following code splits the same string using a sequence of Runes. + assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) == + @["añyóng", "hÃllo", "是", "example"] + + # example with a `Rune` separator and unused one `;`: + assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""] + + # Another example that splits a string containing a date. + let date = "2012-11-20T22:08:08.398990" + + assert toSeq(split(date, " -:T".toRunes)) == + @["2012", "11", "20", "22", "08", "08.398990"] + + splitCommon(s, seps, maxsplit) + +iterator splitWhitespace*(s: openArray[char]): string = + ## Splits a unicode string at whitespace runes. + splitCommon(s, unicodeSpaces, -1) + +template accResult(iter: untyped) = + result = @[] + for x in iter: add(result, x) + +proc splitWhitespace*(s: openArray[char]): seq[string] {.noSideEffect, + rtl, extern: "ncuSplitWhitespace".} = + ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_ + ## iterator, but is a proc that returns a sequence of substrings. + accResult(splitWhitespace(s)) + +iterator split*(s: openArray[char], sep: Rune, maxsplit: int = -1): string = + ## Splits the unicode string ``s`` into substrings using a single separator. + ## Substrings are separated by the rune ``sep``. + runnableExamples: + import std/sequtils + + assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) == + @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"] + + splitCommon(s, sep, maxsplit) + +proc split*(s: openArray[char], seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1): + seq[string] {.noSideEffect, rtl, extern: "nucSplitRunes".} = + ## The same as the `split iterator <#split.i,string,openArray[Rune],int>`_, + ## but is a proc that returns a sequence of substrings. + accResult(split(s, seps, maxsplit)) + +proc split*(s: openArray[char], sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, + rtl, extern: "nucSplitRune".} = + ## The same as the `split iterator <#split.i,string,Rune,int>`_, but is a proc + ## that returns a sequence of substrings. + accResult(split(s, sep, maxsplit)) + +proc strip*(s: openArray[char], leading = true, trailing = true, + runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect, + rtl, extern: "nucStrip".} = + ## Strips leading or trailing ``runes`` from ``s`` and returns + ## the resulting string. + ## + ## If ``leading`` is true (default), leading ``runes`` are stripped. + ## If ``trailing`` is true (default), trailing ``runes`` are stripped. + ## If both are false, the string is returned unchanged. + runnableExamples: + let a = "\táñyóng " + doAssert a.strip == "áñyóng" + doAssert a.strip(leading = false) == "\táñyóng" + doAssert a.strip(trailing = false) == "áñyóng " + + var + sI = 0 ## starting index into string ``s`` + eI = len(s) - 1 ## ending index into ``s``, where the last ``Rune`` starts + if leading: + var + i = 0 + xI: int ## value of ``sI`` at the beginning of the iteration + rune: Rune + while i < len(s): + xI = i + fastRuneAt(s, i, rune) + sI = i # Assume to start from next rune + if not runes.contains(rune): + sI = xI # Go back to where the current rune starts + break + if trailing: + var + i = eI + xI: int + rune: Rune + while i >= 0: + xI = i + fastRuneAt(s, xI, rune) + var yI = i - 1 + while yI >= 0: + var + yIend = yI + pRune: Rune + fastRuneAt(s, yIend, pRune) + if yIend < xI: break + i = yI + rune = pRune + dec(yI) + if not runes.contains(rune): + eI = xI - 1 + break + dec(i) + let newLen = eI - sI + 1 + result = newStringOfCap(newLen) + if newLen > 0: + result.add substr(s.toOpenArray(sI, eI)) + +proc repeat*(c: Rune, count: Natural): string {.noSideEffect, + rtl, extern: "nucRepeatRune".} = + ## Returns a string of ``count`` Runes ``c``. + ## + ## The returned string will have a rune-length of ``count``. + runnableExamples: + let a = "ñ".runeAt(0) + doAssert a.repeat(5) == "ñññññ" + + let s = $c + result = newStringOfCap(count * s.len) + for i in 0 ..< count: + result.add s + +proc align*(s: openArray[char], count: Natural, padding = ' '.Rune): string {. + noSideEffect, rtl, extern: "nucAlignString".} = + ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length + ## of ``count``. + ## + ## ``padding`` characters (by default spaces) are added before ``s`` resulting in + ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is + ## returned unchanged. If you need to left align a string use the `alignLeft + ## proc <#alignLeft,string,Natural>`_. + runnableExamples: + assert align("abc", 4) == " abc" + assert align("a", 0) == "a" + assert align("1232", 6) == " 1232" + assert align("1232", 6, '#'.Rune) == "##1232" + assert align("Åge", 5) == " Åge" + assert align("×", 4, '_'.Rune) == "___×" + + let sLen = s.runeLen + if sLen < count: + let padStr = $padding + result = newStringOfCap(padStr.len * count) + let spaces = count - sLen + for i in 0 ..< spaces: result.add padStr + result.add s + else: + result = s.substr + +proc alignLeft*(s: openArray[char], count: Natural, padding = ' '.Rune): string {. + noSideEffect.} = + ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a + ## rune-length of ``count``. + ## + ## ``padding`` characters (by default spaces) are added after ``s`` resulting in + ## left alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is + ## returned unchanged. If you need to right align a string use the `align + ## proc <#align,string,Natural>`_. + runnableExamples: + assert alignLeft("abc", 4) == "abc " + assert alignLeft("a", 0) == "a" + assert alignLeft("1232", 6) == "1232 " + assert alignLeft("1232", 6, '#'.Rune) == "1232##" + assert alignLeft("Åge", 5) == "Åge " + assert alignLeft("×", 4, '_'.Rune) == "×___" + let sLen = s.runeLen + if sLen < count: + let padStr = $padding + result = newStringOfCap(s.len + (count - sLen) * padStr.len) + result.add s + for i in sLen ..< count: + result.add padStr + else: + result = s.substr + + +proc runeLen*(s: string): int {.inline.} = + ## Returns the number of runes of the string ``s``. + runnableExamples: + let a = "añyóng" + doAssert a.runeLen == 6 + ## note: a.len == 8 + runeLen(toOa(s)) + +proc runeLenAt*(s: string, i: Natural): int {.inline.} = + ## Returns the number of bytes the rune starting at ``s[i]`` takes. + ## + ## See also: + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeLenAt(0) == 1 + doAssert a.runeLenAt(1) == 2 + runeLenAt(toOa(s), i) + +proc runeAt*(s: string, i: Natural): Rune {.inline.} = + ## Returns the rune in ``s`` at **byte index** ``i``. + ## + ## See also: + ## * `runeAtPos proc <#runeAtPos,string,int>`_ + ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeAt(1) == "ñ".runeAt(0) + doAssert a.runeAt(2) == "ñ".runeAt(1) + doAssert a.runeAt(3) == "y".runeAt(0) + fastRuneAt(s, i, result, false) + +proc validateUtf8*(s: string): int {.inline.} = + ## Returns the position of the invalid byte in ``s`` if the string ``s`` does + ## not hold valid UTF-8 data. Otherwise ``-1`` is returned. + ## + ## See also: + ## * `toUTF8 proc <#toUTF8,Rune>`_ + ## * `$ proc <#$,Rune>`_ alias for `toUTF8` + ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_ + validateUtf8(toOa(s)) + +proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int {.inline.} = + ## Returns the byte position of rune + ## at position ``pos`` in ``s`` with an optional start byte position. + ## Returns the special value -1 if it runs out of the string. + ## + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeOffset(1) == 1 + doAssert a.runeOffset(3) == 4 + doAssert a.runeOffset(4) == 6 + runeOffset(toOa(s), pos, start) + +proc runeReverseOffset*(s: string, rev: Positive): (int, int) {.inline.} = + ## Returns a tuple with the byte offset of the + ## rune at position ``rev`` in ``s``, counting + ## from the end (starting with 1) and the total + ## number of runes in the string. + ## + ## Returns a negative value for offset if there are too few runes in + ## the string to satisfy the request. + ## + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_ + runeReverseOffset(toOa(s), rev) + +proc runeAtPos*(s: string, pos: int): Rune {.inline.} = + ## Returns the rune at position ``pos``. + ## + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeAt proc <#runeAt,string,Natural>`_ + ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + fastRuneAt(toOa(s), runeOffset(s, pos), result, false) + +proc runeStrAtPos*(s: string, pos: Natural): string {.inline.} = + ## Returns the rune at position ``pos`` as UTF8 String. + ## + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeAt proc <#runeAt,string,Natural>`_ + ## * `runeAtPos proc <#runeAtPos,string,int>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + let o = runeOffset(s, pos) + substr(s.toOpenArray(o, (o+runeLenAt(s, o)-1))) + +proc runeSubStr*(s: string, pos: int, len: int = int.high): string {.inline.} = + ## Returns the UTF-8 substring starting at code point ``pos`` + ## with ``len`` code points. + ## + ## If ``pos`` or ``len`` is negative they count from + ## the end of the string. If ``len`` is not given it means the longest + ## possible string. + runnableExamples: + let s = "Hänsel ««: 10,00€" + doAssert(runeSubStr(s, 0, 2) == "Hä") + doAssert(runeSubStr(s, 10, 1) == ":") + doAssert(runeSubStr(s, -6) == "10,00€") + doAssert(runeSubStr(s, 10) == ": 10,00€") + doAssert(runeSubStr(s, 12, 5) == "10,00") + doAssert(runeSubStr(s, -6, 3) == "10,") + runeSubStr(toOa(s), pos, len) + + +proc isAlpha*(s: string): bool {.noSideEffect, inline.} = + ## Returns true if ``s`` contains all alphabetic runes. + runnableExamples: + let a = "añyóng" + doAssert a.isAlpha + isAlpha(toOa(s)) + +proc isSpace*(s: string): bool {.noSideEffect, inline.} = + ## Returns true if ``s`` contains all whitespace runes. + runnableExamples: + let a = "\t\l \v\r\f" + doAssert a.isSpace + isSpace(toOa(s)) + + +proc toUpper*(s: string): string {.noSideEffect, inline.} = + ## Converts ``s`` into upper-case runes. + runnableExamples: + doAssert toUpper("abγ") == "ABΓ" + toUpper(toOa(s)) + +proc toLower*(s: string): string {.noSideEffect, inline.} = + ## Converts ``s`` into lower-case runes. + runnableExamples: + doAssert toLower("ABΓ") == "abγ" + toLower(toOa(s)) + +proc swapCase*(s: string): string {.noSideEffect, inline.} = + ## Swaps the case of runes in ``s``. + ## + ## Returns a new string such that the cases of all runes + ## are swapped if possible. + runnableExamples: + doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA" + swapCase(toOa(s)) + +proc capitalize*(s: string): string {.noSideEffect.} = + ## Converts the first character of ``s`` into an upper-case rune. + runnableExamples: + doAssert capitalize("βeta") == "Βeta" + capitalize(toOa(s)) + + +proc translate*(s: string, replacements: proc(key: string): string): string {.effectsOf: replacements, inline.} = + ## Translates words in a string using the ``replacements`` proc to substitute + ## words inside ``s`` with their replacements. + ## + ## ``replacements`` is any proc that takes a word and returns + ## a new word to fill it's place. + runnableExamples: + proc wordToNumber(s: string): string = + case s + of "one": "1" + of "two": "2" + else: s + let a = "one two three four" + doAssert a.translate(wordToNumber) == "1 2 three four" + translate(toOa(s), replacements) + +proc title*(s: string): string {.noSideEffect, inline.} = + ## Converts ``s`` to a unicode title. + ## + ## Returns a new string such that the first character + ## in each word inside ``s`` is capitalized. + runnableExamples: + doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma" + title(toOa(s)) + + +iterator runes*(s: string): Rune = + ## Iterates over any rune of the string ``s`` returning runes. + for rune in runes(toOa(s)): + yield rune + +iterator utf8*(s: string): string = + ## Iterates over any rune of the string ``s`` returning utf8 values. + ## + ## See also: + ## * `validateUtf8 proc <#validateUtf8,string>`_ + ## * `toUTF8 proc <#toUTF8,Rune>`_ + ## * `$ proc <#$,Rune>`_ alias for `toUTF8` + ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_ + for str in utf8(toOa(s)): + yield str + +proc toRunes*(s: string): seq[Rune] {.inline.} = + ## Obtains a sequence containing the Runes in ``s``. + ## + ## See also: + ## * `$ proc <#$,Rune>`_ for a reverse operation + runnableExamples: + let a = toRunes("aáä") + doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)] + toRunes(toOa(s)) + +proc cmpRunesIgnoreCase*(a, b: string): int {.inline.} = + ## Compares two UTF-8 strings and ignores the case. Returns: + ## + ## | `0` if a == b + ## | `< 0` if a < b + ## | `> 0` if a > b + cmpRunesIgnoreCase(a.toOa(), b.toOa()) + +proc reversed*(s: string): string {.inline.} = + ## Returns the reverse of ``s``, interpreting it as runes. + ## + ## Unicode combining characters are correctly interpreted as well. + runnableExamples: + assert reversed("Reverse this!") == "!siht esreveR" + assert reversed("先秦兩漢") == "漢兩秦先" + assert reversed("as⃝df̅") == "f̅ds⃝a" + assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞" + reversed(toOa(s)) + +proc graphemeLen*(s: string; i: Natural): Natural {.inline.} = + ## The number of bytes belonging to byte index ``s[i]``, + ## including following combining code unit. + runnableExamples: + let a = "añyóng" + doAssert a.graphemeLen(1) == 2 ## ñ + doAssert a.graphemeLen(2) == 1 + doAssert a.graphemeLen(4) == 2 ## ó + graphemeLen(toOa(s), i) + +proc lastRune*(s: string; last: int): (Rune, int) {.inline.} = + ## Length of the last rune in ``s[0..last]``. Returns the rune and its length + ## in bytes. + lastRune(toOa(s), last) + +iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces, + maxsplit: int = -1): string = + ## Splits the unicode string ``s`` into substrings using a group of separators. + ## + ## Substrings are separated by a substring containing only ``seps``. + runnableExamples: + import std/sequtils + + assert toSeq("hÃllo\lthis\lis an\texample\l是".split) == + @["hÃllo", "this", "is", "an", "example", "是"] + + # And the following code splits the same string using a sequence of Runes. + assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) == + @["añyóng", "hÃllo", "是", "example"] + + # example with a `Rune` separator and unused one `;`: + assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""] + + # Another example that splits a string containing a date. + let date = "2012-11-20T22:08:08.398990" + + assert toSeq(split(date, " -:T".toRunes)) == + @["2012", "11", "20", "22", "08", "08.398990"] + + splitCommon(toOa(s), seps, maxsplit) + +iterator splitWhitespace*(s: string): string = + ## Splits a unicode string at whitespace runes. + splitCommon(s.toOa(), unicodeSpaces, -1) + + +proc splitWhitespace*(s: string): seq[string] {.noSideEffect, inline.}= + ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_ + ## iterator, but is a proc that returns a sequence of substrings. + accResult(splitWhitespace(toOa(s))) + +iterator split*(s: string, sep: Rune, maxsplit: int = -1): string = + ## Splits the unicode string ``s`` into substrings using a single separator. + ## Substrings are separated by the rune ``sep``. + runnableExamples: + import std/sequtils + + assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) == + @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"] + + splitCommon(toOa(s), sep, maxsplit) + +proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1): + seq[string] {.noSideEffect, inline.} = + ## The same as the `split iterator <#split.i,string,openArray[Rune],int>`_, + ## but is a proc that returns a sequence of substrings. + accResult(split(toOa(s), seps, maxsplit)) + +proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, inline.} = + ## The same as the `split iterator <#split.i,string,Rune,int>`_, but is a proc + ## that returns a sequence of substrings. + accResult(split(toOa(s), sep, maxsplit)) + +proc strip*(s: string, leading = true, trailing = true, + runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect, inline.} = + ## Strips leading or trailing ``runes`` from ``s`` and returns + ## the resulting string. + ## + ## If ``leading`` is true (default), leading ``runes`` are stripped. + ## If ``trailing`` is true (default), trailing ``runes`` are stripped. + ## If both are false, the string is returned unchanged. + runnableExamples: + let a = "\táñyóng " + doAssert a.strip == "áñyóng" + doAssert a.strip(leading = false) == "\táñyóng" + doAssert a.strip(trailing = false) == "áñyóng " + strip(toOa(s), leading, trailing, runes) + + +proc align*(s: string, count: Natural, padding = ' '.Rune): string {.noSideEffect, inline.} = + ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length + ## of ``count``. + ## + ## ``padding`` characters (by default spaces) are added before ``s`` resulting in + ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is + ## returned unchanged. If you need to left align a string use the `alignLeft + ## proc <#alignLeft,string,Natural>`_. + runnableExamples: + assert align("abc", 4) == " abc" + assert align("a", 0) == "a" + assert align("1232", 6) == " 1232" + assert align("1232", 6, '#'.Rune) == "##1232" + assert align("Åge", 5) == " Åge" + assert align("×", 4, '_'.Rune) == "___×" + align(toOa(s), count, padding) + +proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {.noSideEffect, inline.} = + ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a + ## rune-length of ``count``. + ## + ## ``padding`` characters (by default spaces) are added after ``s`` resulting in + ## left alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is + ## returned unchanged. If you need to right align a string use the `align + ## proc <#align,string,Natural>`_. + runnableExamples: + assert alignLeft("abc", 4) == "abc " + assert alignLeft("a", 0) == "a" + assert alignLeft("1232", 6) == "1232 " + assert alignLeft("1232", 6, '#'.Rune) == "1232##" + assert alignLeft("Åge", 5) == "Åge " + assert alignLeft("×", 4, '_'.Rune) == "×___" + alignLeft(toOa(s), count, padding) |