diff options
Diffstat (limited to 'lib/pure/unicode.nim')
-rw-r--r-- | lib/pure/unicode.nim | 884 |
1 files changed, 463 insertions, 421 deletions
diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim index 8d76cc787..8cbe117bb 100644 --- a/lib/pure/unicode.nim +++ b/lib/pure/unicode.nim @@ -20,10 +20,17 @@ ## * `unidecode module <unidecode.html>`_ ## * `encodings module <encodings.html>`_ - -{.deadCodeElim: on.} # dce option deprecated - include "system/inclrtl" +import std/strbasics +template toOa(s: string): auto = s.toOpenArray(0, s.high) + +proc substr(s: openArray[char] , first, last: int): string = + # Copied substr from system + let first = max(first, 0) + let L = max(min(last, high(s)) - first + 1, 0) + result = newString(L) + for i in 0 .. L-1: + result[i] = s[i+first] type RuneImpl = int32 # underlying type of Rune @@ -31,16 +38,18 @@ type ## Type that can hold a single Unicode code point. ## ## A Rune may be composed with other Runes to a character on the screen. + ## `RuneImpl` is the underlying type used to store Runes, currently `int32`. template ones(n: untyped): untyped = ((1 shl n)-1) -proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} = +proc runeLen*(s: openArray[char]): int {.rtl, extern: "nuc$1".} = ## Returns the number of runes of the string ``s``. runnableExamples: let a = "añyóng" doAssert a.runeLen == 6 ## note: a.len == 8 + result = 0 var i = 0 while i < len(s): if uint(s[i]) <= 127: inc(i) @@ -52,7 +61,7 @@ proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} = else: inc i inc(result) -proc runeLenAt*(s: string, i: Natural): int = +proc runeLenAt*(s: openArray[char], i: Natural): int = ## Returns the number of bytes the rune starting at ``s[i]`` takes. ## ## See also: @@ -72,7 +81,7 @@ proc runeLenAt*(s: string, i: Natural): int = const replRune = Rune(0xFFFD) -template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) = +template fastRuneAt*(s: openArray[char] or string, i: int, result: untyped, doInc = true) = ## Returns the rune ``s[i]`` in ``result``. 
## ## If ``doInc == true`` (default), ``i`` is incremented by the number @@ -150,7 +159,7 @@ template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) = result = Rune(uint(s[i])) when doInc: inc(i) -proc runeAt*(s: string, i: Natural): Rune = +proc runeAt*(s: openArray[char], i: Natural): Rune = ## Returns the rune in ``s`` at **byte index** ``i``. ## ## See also: @@ -164,7 +173,7 @@ proc runeAt*(s: string, i: Natural): Rune = doAssert a.runeAt(3) == "y".runeAt(0) fastRuneAt(s, i, result, false) -proc validateUtf8*(s: string): int = +proc validateUtf8*(s: openArray[char]): int = ## Returns the position of the invalid byte in ``s`` if the string ``s`` does ## not hold valid UTF-8 data. Otherwise ``-1`` is returned. ## @@ -301,7 +310,7 @@ proc `$`*(runes: seq[Rune]): string = for rune in runes: result.add rune -proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int = +proc runeOffset*(s: openArray[char], pos: Natural, start: Natural = 0): int = ## Returns the byte position of rune ## at position ``pos`` in ``s`` with an optional start byte position. ## Returns the special value -1 if it runs out of the string. @@ -328,13 +337,13 @@ proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int = inc i return o -proc runeReverseOffset*(s: string, rev: Positive): (int, int) = +proc runeReverseOffset*(s: openArray[char], rev: Positive): (int, int) = ## Returns a tuple with the byte offset of the ## rune at position ``rev`` in ``s``, counting ## from the end (starting with 1) and the total ## number of runes in the string. ## - ## Returns a negative value for offset if there are to few runes in + ## Returns a negative value for offset if there are too few runes in ## the string to satisfy the request. ## ## **Beware:** This can lead to unoptimized code and slow execution! 
@@ -347,18 +356,16 @@ proc runeReverseOffset*(s: string, rev: Positive): (int, int) = a = rev.int o = 0 x = 0 + let times = 2*rev.int-s.runeLen # transformed from rev.int - a < s.runeLen - rev.int while o < s.len: let r = runeLenAt(s, o) o += r - if a < 0: + if a > times: x += r dec a + result = if a > 0: (-a, rev.int-a) else: (x, -a+rev.int) - if a > 0: - return (-a, rev.int-a) - return (x, -a+rev.int) - -proc runeAtPos*(s: string, pos: int): Rune = +proc runeAtPos*(s: openArray[char], pos: int): Rune = ## Returns the rune at position ``pos``. ## ## **Beware:** This can lead to unoptimized code and slow execution! @@ -371,7 +378,7 @@ proc runeAtPos*(s: string, pos: int): Rune = ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ fastRuneAt(s, runeOffset(s, pos), result, false) -proc runeStrAtPos*(s: string, pos: Natural): string = +proc runeStrAtPos*(s: openArray[char], pos: Natural): string = ## Returns the rune at position ``pos`` as UTF8 String. ## ## **Beware:** This can lead to unoptimized code and slow execution! @@ -383,9 +390,9 @@ proc runeStrAtPos*(s: string, pos: Natural): string = ## * `runeAtPos proc <#runeAtPos,string,int>`_ ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ let o = runeOffset(s, pos) - s[o .. (o+runeLenAt(s, o)-1)] + substr(s.toOpenArray(o, (o+runeLenAt(s, o)-1))) -proc runeSubStr*(s: string, pos: int, len: int = int.high): string = +proc runeSubStr*(s: openArray[char], pos: int, len: int = int.high): string = ## Returns the UTF-8 substring starting at code point ``pos`` ## with ``len`` code points. 
## @@ -404,7 +411,7 @@ proc runeSubStr*(s: string, pos: int, len: int = int.high): string = if pos < 0: let (o, rl) = runeReverseOffset(s, -pos) if len >= rl: - result = s.substr(o, s.len-1) + result = s.substr(o, s.high) elif len < 0: let e = rl + len if e < 0: @@ -457,7 +464,7 @@ proc `==`*(a, b: Rune): bool = include "includes/unicode_ranges" -proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int = +proc binarySearch(c: RuneImpl, tab: openArray[int32], len, stride: int): int = var n = len var t = 0 while n > 1: @@ -472,7 +479,7 @@ proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int = return t return -1 -proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} = +proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1".} = ## Converts ``c`` into lower case. This works for any rune. ## ## If possible, prefer ``toLower`` over ``toUpper``. @@ -490,7 +497,7 @@ proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} = return Rune(c + toLowerSinglets[p+1] - 500) return Rune(c) -proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} = +proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1".} = ## Converts ``c`` into upper case. This works for any rune. ## ## If possible, prefer ``toLower`` over ``toUpper``. @@ -508,7 +515,7 @@ proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} = return Rune(c + toUpperSinglets[p+1] - 500) return Rune(c) -proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} = +proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1".} = ## Converts ``c`` to title case. ## ## See also: @@ -521,7 +528,7 @@ proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} = return Rune(c + toTitleSinglets[p+1] - 500) return Rune(c) -proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = +proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1".} = ## Returns true if ``c`` is a lower case rune. ## ## If possible, prefer ``isLower`` over ``isUpper``. 
@@ -539,7 +546,7 @@ proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = if p >= 0 and c == toUpperSinglets[p]: return true -proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = +proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1".} = ## Returns true if ``c`` is a upper case rune. ## ## If possible, prefer ``isLower`` over ``isUpper``. @@ -559,7 +566,7 @@ proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = if p >= 0 and c == toLowerSinglets[p]: return true -proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = +proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1".} = ## Returns true if ``c`` is an *alpha* rune (i.e., a letter). ## ## See also: @@ -578,7 +585,7 @@ proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = if p >= 0 and c == alphaSinglets[p]: return true -proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = +proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1".} = ## Returns true if ``c`` is a Unicode titlecase code point. ## ## See also: @@ -589,7 +596,7 @@ proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_ return isUpper(c) and isLower(c) -proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = +proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1".} = ## Returns true if ``c`` is a Unicode whitespace code point. ## ## See also: @@ -602,7 +609,7 @@ proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = if p >= 0 and c >= spaceRanges[p] and c <= spaceRanges[p+1]: return true -proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = +proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1".} = ## Returns true if ``c`` is a Unicode combining code unit. 
## ## See also: @@ -629,7 +636,7 @@ template runeCheck(s, runeProc) = fastRuneAt(s, i, rune, doInc = true) result = runeProc(rune) and result -proc isAlpha*(s: string): bool {.noSideEffect, procvar, +proc isAlpha*(s: openArray[char]): bool {.noSideEffect, rtl, extern: "nuc$1Str".} = ## Returns true if ``s`` contains all alphabetic runes. runnableExamples: @@ -637,7 +644,7 @@ proc isAlpha*(s: string): bool {.noSideEffect, procvar, doAssert a.isAlpha runeCheck(s, isAlpha) -proc isSpace*(s: string): bool {.noSideEffect, procvar, +proc isSpace*(s: openArray[char]): bool {.noSideEffect, rtl, extern: "nuc$1Str".} = ## Returns true if ``s`` contains all whitespace runes. runnableExamples: @@ -658,21 +665,21 @@ template convertRune(s, runeProc) = rune = runeProc(rune) fastToUTF8Copy(rune, result, resultIndex, doInc = true) -proc toUpper*(s: string): string {.noSideEffect, procvar, +proc toUpper*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1Str".} = ## Converts ``s`` into upper-case runes. runnableExamples: doAssert toUpper("abγ") == "ABΓ" convertRune(s, toUpper) -proc toLower*(s: string): string {.noSideEffect, procvar, +proc toLower*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1Str".} = ## Converts ``s`` into lower-case runes. runnableExamples: doAssert toLower("ABΓ") == "abγ" convertRune(s, toLower) -proc swapCase*(s: string): string {.noSideEffect, procvar, +proc swapCase*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1".} = ## Swaps the case of runes in ``s``. ## @@ -694,22 +701,25 @@ proc swapCase*(s: string): string {.noSideEffect, procvar, rune = rune.toUpper() fastToUTF8Copy(rune, result, resultIndex, doInc = true) -proc capitalize*(s: string): string {.noSideEffect, procvar, +proc capitalize*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1".} = ## Converts the first character of ``s`` into an upper-case rune. 
runnableExamples: doAssert capitalize("βeta") == "Βeta" if len(s) == 0: - return s + return "" var rune: Rune i = 0 fastRuneAt(s, i, rune, doInc = true) - result = $toUpper(rune) & substr(s, i) + result = $toUpper(rune) & substr(s.toOpenArray(i, s.high)) -proc translate*(s: string, replacements: proc(key: string): string): string {. - rtl, extern: "nuc$1".} = +when not defined(nimHasEffectsOf): + {.pragma: effectsOf.} + +proc translate*(s: openArray[char], replacements: proc(key: string): string): string {. + rtl, extern: "nuc$1", effectsOf: replacements.} = ## Translates words in a string using the ``replacements`` proc to substitute ## words inside ``s`` with their replacements. ## @@ -743,7 +753,7 @@ proc translate*(s: string, replacements: proc(key: string): string): string {. if whiteSpace and inWord: # If we've reached the end of a word - let word = s[wordStart ..< lastIndex] + let word = substr(s.toOpenArray(wordStart, lastIndex - 1)) result.add(replacements(word)) result.add($rune) inWord = false @@ -758,10 +768,10 @@ proc translate*(s: string, replacements: proc(key: string): string): string {. if wordStart < len(s) and inWord: # Get the trailing word at the end - let word = s[wordStart .. ^1] + let word = substr(s.toOpenArray(wordStart, s.high)) result.add(replacements(word)) -proc title*(s: string): string {.noSideEffect, procvar, +proc title*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1".} = ## Converts ``s`` to a unicode title. ## @@ -787,7 +797,7 @@ proc title*(s: string): string {.noSideEffect, procvar, fastToUTF8Copy(rune, result, resultIndex, doInc = true) -iterator runes*(s: string): Rune = +iterator runes*(s: openArray[char]): Rune = ## Iterates over any rune of the string ``s`` returning runes. 
var i = 0 @@ -796,7 +806,7 @@ iterator runes*(s: string): Rune = fastRuneAt(s, i, result, true) yield result -iterator utf8*(s: string): string = +iterator utf8*(s: openArray[char]): string = ## Iterates over any rune of the string ``s`` returning utf8 values. ## ## See also: @@ -807,14 +817,14 @@ iterator utf8*(s: string): string = var o = 0 while o < s.len: let n = runeLenAt(s, o) - yield s[o .. (o+n-1)] + yield substr(s.toOpenArray(o, (o+n-1))) o += n -proc toRunes*(s: string): seq[Rune] = +proc toRunes*(s: openArray[char]): seq[Rune] = ## Obtains a sequence containing the Runes in ``s``. ## ## See also: - ## * `$ proc <#$,seq[T][Rune]>`_ for a reverse operation + ## * `$ proc <#$,Rune>`_ for a reverse operation runnableExamples: let a = toRunes("aáä") doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)] @@ -823,12 +833,12 @@ proc toRunes*(s: string): seq[Rune] = for r in s.runes: result.add(r) -proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1", procvar.} = +proc cmpRunesIgnoreCase*(a, b: openArray[char]): int {.rtl, extern: "nuc$1".} = ## Compares two UTF-8 strings and ignores the case. Returns: ## - ## | 0 if a == b - ## | < 0 if a < b - ## | > 0 if a > b + ## | `0` if a == b + ## | `< 0` if a < b + ## | `> 0` if a > b var i = 0 var j = 0 var ar, br: Rune @@ -836,11 +846,16 @@ proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1", procvar.} = # slow path: fastRuneAt(a, i, ar) fastRuneAt(b, j, br) - result = RuneImpl(toLower(ar)) - RuneImpl(toLower(br)) + when sizeof(int) < 4: + const lo = low(int).int32 + const hi = high(int).int32 + result = clamp(RuneImpl(toLower(ar)) - RuneImpl(toLower(br)), lo, hi).int + else: + result = RuneImpl(toLower(ar)) - RuneImpl(toLower(br)) if result != 0: return result = a.len - b.len -proc reversed*(s: string): string = +proc reversed*(s: openArray[char]): string = ## Returns the reverse of ``s``, interpreting it as runes. ## ## Unicode combining characters are correctly interpreted as well. 
@@ -875,9 +890,9 @@ proc reversed*(s: string): string = reverseUntil(len(s)) -proc graphemeLen*(s: string; i: Natural): Natural = +proc graphemeLen*(s: openArray[char]; i: Natural): Natural = ## The number of bytes belonging to byte index ``s[i]``, - ## including following combining code unit. + ## including following combining code units. runnableExamples: let a = "añyóng" doAssert a.graphemeLen(1) == 2 ## ñ @@ -894,7 +909,7 @@ proc graphemeLen*(s: string; i: Natural): Natural = if not isCombining(r2): break result = j-i -proc lastRune*(s: string; last: int): (Rune, int) = +proc lastRune*(s: openArray[char]; last: int): (Rune, int) = ## Length of the last rune in ``s[0..last]``. Returns the rune and its length ## in bytes. if s[last] <= chr(127): @@ -923,83 +938,61 @@ proc size*(r: Rune): int {.noSideEffect.} = else: result = 1 # --------- Private templates for different split separators ----------- -proc stringHasSep(s: string, index: int, seps: openArray[Rune]): bool = +proc stringHasSep(s: openArray[char], index: int, seps: openArray[Rune]): bool = var rune: Rune fastRuneAt(s, index, rune, false) return seps.contains(rune) -proc stringHasSep(s: string, index: int, sep: Rune): bool = +proc stringHasSep(s: openArray[char], index: int, sep: Rune): bool = var rune: Rune fastRuneAt(s, index, rune, false) return sep == rune -template splitCommon(s, sep, maxsplit: untyped, sepLen: int = -1) = +template splitCommon(s, sep, maxsplit: untyped) = ## Common code for split procedures. + let + sLen = len(s) var last = 0 splits = maxsplit - if len(s) > 0: - while last <= len(s): + if sLen > 0: + while last <= sLen: var first = last - while last < len(s) and not stringHasSep(s, last, sep): - when sep is Rune: - inc(last, sepLen) - else: - inc(last, runeLenAt(s, last)) - if splits == 0: last = len(s) - yield s[first .. 
(last - 1)] + while last < sLen and not stringHasSep(s, last, sep): + inc(last, runeLenAt(s, last)) + if splits == 0: last = sLen + yield substr(s.toOpenArray(first, (last - 1))) if splits == 0: break dec(splits) - when sep is Rune: - inc(last, sepLen) - else: - inc(last, if last < len(s): runeLenAt(s, last) else: 1) + inc(last, if last < sLen: runeLenAt(s, last) else: 1) -iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces, +iterator split*(s: openArray[char], seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1): string = ## Splits the unicode string ``s`` into substrings using a group of separators. ## ## Substrings are separated by a substring containing only ``seps``. - ## - ## .. code-block:: nim - ## for word in split("this\lis an\texample"): - ## writeLine(stdout, word) - ## - ## ...generates this output: - ## - ## .. code-block:: - ## "this" - ## "is" - ## "an" - ## "example" - ## - ## And the following code: - ## - ## .. code-block:: nim - ## for word in split("this:is;an$example", {';', ':', '$'}): - ## writeLine(stdout, word) - ## - ## ...produces the same output as the first example. The code: - ## - ## .. code-block:: nim - ## let date = "2012-11-20T22:08:08.398990" - ## let separators = {' ', '-', ':', 'T'} - ## for number in split(date, separators): - ## writeLine(stdout, number) - ## - ## ...results in: - ## - ## .. code-block:: - ## "2012" - ## "11" - ## "20" - ## "22" - ## "08" - ## "08.398990" - ## + runnableExamples: + import std/sequtils + + assert toSeq("hÃllo\lthis\lis an\texample\l是".split) == + @["hÃllo", "this", "is", "an", "example", "是"] + + # And the following code splits the same string using a sequence of Runes. + assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) == + @["añyóng", "hÃllo", "是", "example"] + + # example with a `Rune` separator and unused one `;`: + assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""] + + # Another example that splits a string containing a date. 
+ let date = "2012-11-20T22:08:08.398990" + + assert toSeq(split(date, " -:T".toRunes)) == + @["2012", "11", "20", "22", "08", "08.398990"] + splitCommon(s, seps, maxsplit) -iterator splitWhitespace*(s: string): string = +iterator splitWhitespace*(s: openArray[char]): string = ## Splits a unicode string at whitespace runes. splitCommon(s, unicodeSpaces, -1) @@ -1007,51 +1000,36 @@ template accResult(iter: untyped) = result = @[] for x in iter: add(result, x) -proc splitWhitespace*(s: string): seq[string] {.noSideEffect, +proc splitWhitespace*(s: openArray[char]): seq[string] {.noSideEffect, rtl, extern: "ncuSplitWhitespace".} = ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_ ## iterator, but is a proc that returns a sequence of substrings. accResult(splitWhitespace(s)) -iterator split*(s: string, sep: Rune, maxsplit: int = -1): string = +iterator split*(s: openArray[char], sep: Rune, maxsplit: int = -1): string = ## Splits the unicode string ``s`` into substrings using a single separator. - ## ## Substrings are separated by the rune ``sep``. - ## The code: - ## - ## .. code-block:: nim - ## for word in split(";;this;is;an;;example;;;", ';'): - ## writeLine(stdout, word) - ## - ## Results in: - ## - ## .. code-block:: - ## "" - ## "" - ## "this" - ## "is" - ## "an" - ## "" - ## "example" - ## "" - ## "" - ## "" - ## - splitCommon(s, sep, maxsplit, sep.size) + runnableExamples: + import std/sequtils -proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1): + assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) == + @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"] + + splitCommon(s, sep, maxsplit) + +proc split*(s: openArray[char], seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1): seq[string] {.noSideEffect, rtl, extern: "nucSplitRunes".} = ## The same as the `split iterator <#split.i,string,openArray[Rune],int>`_, ## but is a proc that returns a sequence of substrings. 
accResult(split(s, seps, maxsplit)) -proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, +proc split*(s: openArray[char], sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, rtl, extern: "nucSplitRune".} = ## The same as the `split iterator <#split.i,string,Rune,int>`_, but is a proc ## that returns a sequence of substrings. accResult(split(s, sep, maxsplit)) -proc strip*(s: string, leading = true, trailing = true, +proc strip*(s: openArray[char], leading = true, trailing = true, runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect, rtl, extern: "nucStrip".} = ## Strips leading or trailing ``runes`` from ``s`` and returns @@ -1106,7 +1084,7 @@ proc strip*(s: string, leading = true, trailing = true, let newLen = eI - sI + 1 result = newStringOfCap(newLen) if newLen > 0: - result.add s[sI .. eI] + result.add substr(s.toOpenArray(sI, eI)) proc repeat*(c: Rune, count: Natural): string {.noSideEffect, rtl, extern: "nucRepeatRune".} = @@ -1122,7 +1100,7 @@ proc repeat*(c: Rune, count: Natural): string {.noSideEffect, for i in 0 ..< count: result.add s -proc align*(s: string, count: Natural, padding = ' '.Rune): string {. +proc align*(s: openArray[char], count: Natural, padding = ' '.Rune): string {. noSideEffect, rtl, extern: "nucAlignString".} = ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length ## of ``count``. @@ -1147,9 +1125,9 @@ proc align*(s: string, count: Natural, padding = ' '.Rune): string {. for i in 0 ..< spaces: result.add padStr result.add s else: - result = s + result = s.substr -proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {. +proc alignLeft*(s: openArray[char], count: Natural, padding = ' '.Rune): string {. noSideEffect.} = ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a ## rune-length of ``count``. @@ -1173,301 +1151,365 @@ proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {. 
for i in sLen ..< count: result.add padStr else: - result = s + result = s.substr -# ----------------------------------------------------------------------------- -# deprecated -template runeCaseCheck(s, runeProc, skipNonAlpha) = - ## Common code for rune.isLower and rune.isUpper. - if len(s) == 0: return false - var - i = 0 - rune: Rune - hasAtleastOneAlphaRune = false - while i < len(s): - fastRuneAt(s, i, rune, doInc = true) - if skipNonAlpha: - var runeIsAlpha = isAlpha(rune) - if not hasAtleastOneAlphaRune: - hasAtleastOneAlphaRune = runeIsAlpha - if runeIsAlpha and (not runeProc(rune)): - return false - else: - if not runeProc(rune): - return false - return if skipNonAlpha: hasAtleastOneAlphaRune else: true +proc runeLen*(s: string): int {.inline.} = + ## Returns the number of runes of the string ``s``. + runnableExamples: + let a = "añyóng" + doAssert a.runeLen == 6 + ## note: a.len == 8 + runeLen(toOa(s)) + +proc runeLenAt*(s: string, i: Natural): int {.inline.} = + ## Returns the number of bytes the rune starting at ``s[i]`` takes. + ## + ## See also: + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeLenAt(0) == 1 + doAssert a.runeLenAt(1) == 2 + runeLenAt(toOa(s), i) -proc isLower*(s: string, skipNonAlpha: bool): bool {. - deprecated: "Deprecated since version 0.20 since its semantics are unclear".} = - ## **Deprecated since version 0.20 since its semantics are unclear** +proc runeAt*(s: string, i: Natural): Rune {.inline.} = + ## Returns the rune in ``s`` at **byte index** ``i``. ## - ## Checks whether ``s`` is lower case. 
+ ## See also: + ## * `runeAtPos proc <#runeAtPos,string,int>`_ + ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeAt(1) == "ñ".runeAt(0) + doAssert a.runeAt(2) == "ñ".runeAt(1) + doAssert a.runeAt(3) == "y".runeAt(0) + fastRuneAt(s, i, result, false) + +proc validateUtf8*(s: string): int {.inline.} = + ## Returns the position of the invalid byte in ``s`` if the string ``s`` does + ## not hold valid UTF-8 data. Otherwise ``-1`` is returned. ## - ## If ``skipNonAlpha`` is true, returns true if all alphabetical - ## runes in ``s`` are lower case. Returns false if none of the - ## runes in ``s`` are alphabetical. + ## See also: + ## * `toUTF8 proc <#toUTF8,Rune>`_ + ## * `$ proc <#$,Rune>`_ alias for `toUTF8` + ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_ + validateUtf8(toOa(s)) + +proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int {.inline.} = + ## Returns the byte position of rune + ## at position ``pos`` in ``s`` with an optional start byte position. + ## Returns the special value -1 if it runs out of the string. ## - ## If ``skipNonAlpha`` is false, returns true only if all runes in - ## ``s`` are alphabetical and lower case. + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. ## - ## For either value of ``skipNonAlpha``, returns false if ``s`` is - ## an empty string. - runeCaseCheck(s, isLower, skipNonAlpha) + ## See also: + ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeOffset(1) == 1 + doAssert a.runeOffset(3) == 4 + doAssert a.runeOffset(4) == 6 + runeOffset(toOa(s), pos, start) -proc isUpper*(s: string, skipNonAlpha: bool): bool {. 
- deprecated: "Deprecated since version 0.20 since its semantics are unclear".} = - ## **Deprecated since version 0.20 since its semantics are unclear** +proc runeReverseOffset*(s: string, rev: Positive): (int, int) {.inline.} = + ## Returns a tuple with the byte offset of the + ## rune at position ``rev`` in ``s``, counting + ## from the end (starting with 1) and the total + ## number of runes in the string. + ## + ## Returns a negative value for offset if there are too few runes in + ## the string to satisfy the request. ## - ## Checks whether ``s`` is upper case. + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. ## - ## If ``skipNonAlpha`` is true, returns true if all alphabetical - ## runes in ``s`` are upper case. Returns false if none of the - ## runes in ``s`` are alphabetical. + ## See also: + ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_ + runeReverseOffset(toOa(s), rev) + +proc runeAtPos*(s: string, pos: int): Rune {.inline.} = + ## Returns the rune at position ``pos``. ## - ## If ``skipNonAlpha`` is false, returns true only if all runes in - ## ``s`` are alphabetical and upper case. + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. ## - ## For either value of ``skipNonAlpha``, returns false if ``s`` is - ## an empty string. 
- runeCaseCheck(s, isUpper, skipNonAlpha) + ## See also: + ## * `runeAt proc <#runeAt,string,Natural>`_ + ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + fastRuneAt(toOa(s), runeOffset(s, pos), result, false) -proc isTitle*(s: string): bool {.noSideEffect, procvar, rtl, extern: "nuc$1Str", - deprecated: "Deprecated since version 0.20 since its semantics are unclear".} = - ## **Deprecated since version 0.20 since its semantics are unclear** +proc runeStrAtPos*(s: string, pos: Natural): string {.inline.} = + ## Returns the rune at position ``pos`` as UTF8 String. ## - ## Checks whether or not ``s`` is a unicode title. + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. ## - ## Returns true if the first character in each word inside ``s`` - ## are upper case and there is at least one character in ``s``. - if s.len == 0: - return false - result = true - var - i = 0 - rune: Rune - var firstRune = true + ## See also: + ## * `runeAt proc <#runeAt,string,Natural>`_ + ## * `runeAtPos proc <#runeAtPos,string,int>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + let o = runeOffset(s, pos) + substr(s.toOpenArray(o, (o+runeLenAt(s, o)-1))) - while i < len(s) and result: - fastRuneAt(s, i, rune, doInc = true) - if not rune.isWhiteSpace() and firstRune: - result = rune.isUpper() and result - firstRune = false - elif rune.isWhiteSpace(): - firstRune = true +proc runeSubStr*(s: string, pos: int, len: int = int.high): string {.inline.} = + ## Returns the UTF-8 substring starting at code point ``pos`` + ## with ``len`` code points. + ## + ## If ``pos`` or ``len`` is negative they count from + ## the end of the string. If ``len`` is not given it means the longest + ## possible string. 
+ runnableExamples: + let s = "Hänsel ««: 10,00€" + doAssert(runeSubStr(s, 0, 2) == "Hä") + doAssert(runeSubStr(s, 10, 1) == ":") + doAssert(runeSubStr(s, -6) == "10,00€") + doAssert(runeSubStr(s, 10) == ": 10,00€") + doAssert(runeSubStr(s, 12, 5) == "10,00") + doAssert(runeSubStr(s, -6, 3) == "10,") + runeSubStr(toOa(s), pos, len) +proc isAlpha*(s: string): bool {.noSideEffect, inline.} = + ## Returns true if ``s`` contains all alphabetic runes. + runnableExamples: + let a = "añyóng" + doAssert a.isAlpha + isAlpha(toOa(s)) -when isMainModule: +proc isSpace*(s: string): bool {.noSideEffect, inline.} = + ## Returns true if ``s`` contains all whitespace runes. + runnableExamples: + let a = "\t\l \v\r\f" + doAssert a.isSpace + isSpace(toOa(s)) - proc asRune(s: static[string]): Rune = - ## Compile-time conversion proc for converting string literals to a Rune - ## value. Returns the first Rune of the specified string. - ## - ## Shortcuts code like ``"å".runeAt(0)`` to ``"å".asRune`` and returns a - ## compile-time constant. 
- if s.len == 0: Rune(0) - else: s.runeAt(0) - let - someString = "öÑ" - someRunes = toRunes(someString) - compared = (someString == $someRunes) - doAssert compared == true - - proc testReplacements(word: string): string = - case word - of "two": - return "2" - of "foo": - return "BAR" - of "βeta": - return "beta" - of "alpha": - return "αlpha" - else: - return "12345" - - doAssert translate("two not alpha foo βeta", testReplacements) == "2 12345 αlpha BAR beta" - doAssert translate(" two not foo βeta ", testReplacements) == " 2 12345 BAR beta " - - doAssert title("foo bar") == "Foo Bar" - doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma" - doAssert title("") == "" - - doAssert capitalize("βeta") == "Βeta" - doAssert capitalize("foo") == "Foo" - doAssert capitalize("") == "" - - doAssert swapCase("FooBar") == "fOObAR" - doAssert swapCase(" ") == " " - doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA" - doAssert swapCase("a✓B") == "A✓b" - doAssert swapCase("Јамогујестистаклоитоминештети") == "јАМОГУЈЕСТИСТАКЛОИТОМИНЕШТЕТИ" - doAssert swapCase("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει") == "ὝΑΛΟΝΦΑΓΕῖΝΔΎΝΑΜΑΙΤΟῦΤΟΟὔΜΕΒΛΆΠΤΕΙ" - doAssert swapCase("Կրնամապակիուտեևինծիանհանգիստչըներ") == "կՐՆԱՄԱՊԱԿԻՈՒՏԵևԻՆԾԻԱՆՀԱՆԳԻՍՏՉԸՆԵՐ" - doAssert swapCase("") == "" - - doAssert isAlpha("r") - doAssert isAlpha("α") - doAssert isAlpha("ϙ") - doAssert isAlpha("ஶ") - doAssert(not isAlpha("$")) - doAssert(not isAlpha("")) - - doAssert isAlpha("Βeta") - doAssert isAlpha("Args") - doAssert isAlpha("𐌼𐌰𐌲𐌲𐌻𐌴𐍃𐍄𐌰𐌽") - doAssert isAlpha("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει") - doAssert isAlpha("Јамогујестистаклоитоминештети") - doAssert isAlpha("Կրնամապակիուտեևինծիանհանգիստչըներ") - doAssert(not isAlpha("$Foo✓")) - doAssert(not isAlpha("⠙⠕⠑⠎⠝⠞")) - - doAssert isSpace("\t") - doAssert isSpace("\l") - doAssert(not isSpace("Β")) - doAssert(not isSpace("Βeta")) - - doAssert isSpace("\t\l \v\r\f") - doAssert isSpace(" ") - doAssert(not isSpace("")) - doAssert(not isSpace("ΑΓc \td")) - - 
doAssert(not isLower(' '.Rune)) - - doAssert(not isUpper(' '.Rune)) - - doAssert toUpper("Γ") == "Γ" - doAssert toUpper("b") == "B" - doAssert toUpper("α") == "Α" - doAssert toUpper("✓") == "✓" - doAssert toUpper("ϙ") == "Ϙ" - doAssert toUpper("") == "" - - doAssert toUpper("ΑΒΓ") == "ΑΒΓ" - doAssert toUpper("AAccβ") == "AACCΒ" - doAssert toUpper("A✓$β") == "A✓$Β" - - doAssert toLower("a") == "a" - doAssert toLower("γ") == "γ" - doAssert toLower("Γ") == "γ" - doAssert toLower("4") == "4" - doAssert toLower("Ϙ") == "ϙ" - doAssert toLower("") == "" - - doAssert toLower("abcdγ") == "abcdγ" - doAssert toLower("abCDΓ") == "abcdγ" - doAssert toLower("33aaΓ") == "33aaγ" - - doAssert reversed("Reverse this!") == "!siht esreveR" - doAssert reversed("先秦兩漢") == "漢兩秦先" - doAssert reversed("as⃝df̅") == "f̅ds⃝a" - doAssert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞" - doAssert reversed("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει") == "ιετπάλβεμὔοοτῦοτιαμανύδνῖεγαϕνολαὕ" - doAssert reversed("Јамогујестистаклоитоминештети") == "итетшенимотиолкатситсејугомаЈ" - doAssert reversed("Կրնամապակիուտեևինծիանհանգիստչըներ") == "րենըչտսիգնահնաիծնիևետւոիկապամանրԿ" - doAssert len(toRunes("as⃝df̅")) == runeLen("as⃝df̅") - const test = "as⃝" - doAssert lastRune(test, test.len-1)[1] == 3 - doAssert graphemeLen("è", 0) == 2 - - # test for rune positioning and runeSubStr() - let s = "Hänsel ««: 10,00€" - - var t = "" - for c in s.utf8: - t.add c - - doAssert(s == t) - - doAssert(runeReverseOffset(s, 1) == (20, 18)) - doAssert(runeReverseOffset(s, 19) == (-1, 18)) - - doAssert(runeStrAtPos(s, 0) == "H") - doAssert(runeSubStr(s, 0, 1) == "H") - doAssert(runeStrAtPos(s, 10) == ":") - doAssert(runeSubStr(s, 10, 1) == ":") - doAssert(runeStrAtPos(s, 9) == "«") - doAssert(runeSubStr(s, 9, 1) == "«") - doAssert(runeStrAtPos(s, 17) == "€") - doAssert(runeSubStr(s, 17, 1) == "€") - # echo runeStrAtPos(s, 18) # index error - - doAssert(runeSubStr(s, 0) == "Hänsel ««: 10,00€") - doAssert(runeSubStr(s, -18) == "Hänsel ««: 10,00€") 
- doAssert(runeSubStr(s, 10) == ": 10,00€") - doAssert(runeSubStr(s, 18) == "") - doAssert(runeSubStr(s, 0, 10) == "Hänsel ««") - - doAssert(runeSubStr(s, 12) == "10,00€") - doAssert(runeSubStr(s, -6) == "10,00€") - - doAssert(runeSubStr(s, 12, 5) == "10,00") - doAssert(runeSubStr(s, 12, -1) == "10,00") - doAssert(runeSubStr(s, -6, 5) == "10,00") - doAssert(runeSubStr(s, -6, -1) == "10,00") - - doAssert(runeSubStr(s, 0, 100) == "Hänsel ««: 10,00€") - doAssert(runeSubStr(s, -100, 100) == "Hänsel ««: 10,00€") - doAssert(runeSubStr(s, 0, -100) == "") - doAssert(runeSubStr(s, 100, -100) == "") - - block splitTests: - let s = " this is an example " - let s2 = ":this;is;an:example;;" - let s3 = ":this×is×an:example××" - doAssert s.split() == @["", "this", "is", "an", "example", "", ""] - doAssert s2.split(seps = [':'.Rune, ';'.Rune]) == @["", "this", "is", "an", - "example", "", ""] - doAssert s3.split(seps = [':'.Rune, "×".asRune]) == @["", "this", "is", - "an", "example", "", ""] - doAssert s.split(maxsplit = 4) == @["", "this", "is", "an", "example "] - doAssert s.split(' '.Rune, maxsplit = 1) == @["", "this is an example "] - - block stripTests: - doAssert(strip("") == "") - doAssert(strip(" ") == "") - doAssert(strip("y") == "y") - doAssert(strip(" foofoofoo ") == "foofoofoo") - doAssert(strip("sfoofoofoos", runes = ['s'.Rune]) == "foofoofoo") - - block: - let stripTestRunes = ['b'.Rune, 'a'.Rune, 'r'.Rune] - doAssert(strip("barfoofoofoobar", runes = stripTestRunes) == "foofoofoo") - doAssert(strip("sfoofoofoos", leading = false, runes = ['s'.Rune]) == "sfoofoofoo") - doAssert(strip("sfoofoofoos", trailing = false, runes = ['s'.Rune]) == "foofoofoos") - - block: - let stripTestRunes = ["«".asRune, "»".asRune] - doAssert(strip("«TEXT»", runes = stripTestRunes) == "TEXT") - doAssert(strip("copyright©", leading = false, runes = ["©".asRune]) == "copyright") - doAssert(strip("¿Question?", trailing = false, runes = ["¿".asRune]) == "Question?") - doAssert(strip("×text×", 
leading = false, runes = ["×".asRune]) == "×text") - doAssert(strip("×text×", trailing = false, runes = ["×".asRune]) == "text×") - - block repeatTests: - doAssert repeat('c'.Rune, 5) == "ccccc" - doAssert repeat("×".asRune, 5) == "×××××" - - block alignTests: - doAssert align("abc", 4) == " abc" - doAssert align("a", 0) == "a" - doAssert align("1232", 6) == " 1232" - doAssert align("1232", 6, '#'.Rune) == "##1232" - doAssert align("1232", 6, "×".asRune) == "××1232" - doAssert alignLeft("abc", 4) == "abc " - doAssert alignLeft("a", 0) == "a" - doAssert alignLeft("1232", 6) == "1232 " - doAssert alignLeft("1232", 6, '#'.Rune) == "1232##" - doAssert alignLeft("1232", 6, "×".asRune) == "1232××" - - block differentSizes: - # upper and lower variants have different number of bytes - doAssert toLower("AẞC") == "aßc" - doAssert toLower("ȺẞCD") == "ⱥßcd" - doAssert toUpper("ⱥbc") == "ȺBC" - doAssert toUpper("rsⱦuv") == "RSȾUV" - doAssert swapCase("ⱥbCd") == "ȺBcD" - doAssert swapCase("XyꟆaB") == "xYᶎAb" - doAssert swapCase("aᵹcᲈd") == "AꝽCꙊD" +proc toUpper*(s: string): string {.noSideEffect, inline.} = + ## Converts ``s`` into upper-case runes. + runnableExamples: + doAssert toUpper("abγ") == "ABΓ" + toUpper(toOa(s)) + +proc toLower*(s: string): string {.noSideEffect, inline.} = + ## Converts ``s`` into lower-case runes. + runnableExamples: + doAssert toLower("ABΓ") == "abγ" + toLower(toOa(s)) + +proc swapCase*(s: string): string {.noSideEffect, inline.} = + ## Swaps the case of runes in ``s``. + ## + ## Returns a new string such that the cases of all runes + ## are swapped if possible. + runnableExamples: + doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA" + swapCase(toOa(s)) + +proc capitalize*(s: string): string {.noSideEffect.} = + ## Converts the first character of ``s`` into an upper-case rune. 
+ runnableExamples: + doAssert capitalize("βeta") == "Βeta" + capitalize(toOa(s)) + + +proc translate*(s: string, replacements: proc(key: string): string): string {.effectsOf: replacements, inline.} = + ## Translates words in a string using the ``replacements`` proc to substitute + ## words inside ``s`` with their replacements. + ## + ## ``replacements`` is any proc that takes a word and returns + ## a new word to fill its place. + runnableExamples: + proc wordToNumber(s: string): string = + case s + of "one": "1" + of "two": "2" + else: s + let a = "one two three four" + doAssert a.translate(wordToNumber) == "1 2 three four" + translate(toOa(s), replacements) + +proc title*(s: string): string {.noSideEffect, inline.} = + ## Converts ``s`` to a unicode title. + ## + ## Returns a new string such that the first character + ## in each word inside ``s`` is capitalized. + runnableExamples: + doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma" + title(toOa(s)) + + +iterator runes*(s: string): Rune = + ## Iterates over any rune of the string ``s`` returning runes. + for rune in runes(toOa(s)): + yield rune + +iterator utf8*(s: string): string = + ## Iterates over any rune of the string ``s`` returning utf8 values. + ## + ## See also: + ## * `validateUtf8 proc <#validateUtf8,string>`_ + ## * `toUTF8 proc <#toUTF8,Rune>`_ + ## * `$ proc <#$,Rune>`_ alias for `toUTF8` + ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_ + for str in utf8(toOa(s)): + yield str + +proc toRunes*(s: string): seq[Rune] {.inline.} = + ## Obtains a sequence containing the Runes in ``s``. + ## + ## See also: + ## * `$ proc <#$,Rune>`_ for a reverse operation + runnableExamples: + let a = toRunes("aáä") + doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)] + toRunes(toOa(s)) + +proc cmpRunesIgnoreCase*(a, b: string): int {.inline.} = + ## Compares two UTF-8 strings and ignores the case.
Returns: + ## + ## | `0` if a == b + ## | `< 0` if a < b + ## | `> 0` if a > b + cmpRunesIgnoreCase(a.toOa(), b.toOa()) + +proc reversed*(s: string): string {.inline.} = + ## Returns the reverse of ``s``, interpreting it as runes. + ## + ## Unicode combining characters are correctly interpreted as well. + runnableExamples: + assert reversed("Reverse this!") == "!siht esreveR" + assert reversed("先秦兩漢") == "漢兩秦先" + assert reversed("as⃝df̅") == "f̅ds⃝a" + assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞" + reversed(toOa(s)) + +proc graphemeLen*(s: string; i: Natural): Natural {.inline.} = + ## The number of bytes belonging to byte index ``s[i]``, + ## including following combining code unit. + runnableExamples: + let a = "añyóng" + doAssert a.graphemeLen(1) == 2 ## ñ + doAssert a.graphemeLen(2) == 1 + doAssert a.graphemeLen(4) == 2 ## ó + graphemeLen(toOa(s), i) + +proc lastRune*(s: string; last: int): (Rune, int) {.inline.} = + ## Length of the last rune in ``s[0..last]``. Returns the rune and its length + ## in bytes. + lastRune(toOa(s), last) + +iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces, + maxsplit: int = -1): string = + ## Splits the unicode string ``s`` into substrings using a group of separators. + ## + ## Substrings are separated by a substring containing only ``seps``. + runnableExamples: + import std/sequtils + + assert toSeq("hÃllo\lthis\lis an\texample\l是".split) == + @["hÃllo", "this", "is", "an", "example", "是"] + + # And the following code splits the same string using a sequence of Runes. + assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) == + @["añyóng", "hÃllo", "是", "example"] + + # example with a `Rune` separator and unused one `;`: + assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""] + + # Another example that splits a string containing a date. 
+ let date = "2012-11-20T22:08:08.398990" + + assert toSeq(split(date, " -:T".toRunes)) == + @["2012", "11", "20", "22", "08", "08.398990"] + + splitCommon(toOa(s), seps, maxsplit) + +iterator splitWhitespace*(s: string): string = + ## Splits a unicode string at whitespace runes. + splitCommon(s.toOa(), unicodeSpaces, -1) + + +proc splitWhitespace*(s: string): seq[string] {.noSideEffect, inline.}= + ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_ + ## iterator, but is a proc that returns a sequence of substrings. + accResult(splitWhitespace(toOa(s))) + +iterator split*(s: string, sep: Rune, maxsplit: int = -1): string = + ## Splits the unicode string ``s`` into substrings using a single separator. + ## Substrings are separated by the rune ``sep``. + runnableExamples: + import std/sequtils + + assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) == + @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"] + + splitCommon(toOa(s), sep, maxsplit) + +proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1): + seq[string] {.noSideEffect, inline.} = + ## The same as the `split iterator <#split.i,string,openArray[Rune],int>`_, + ## but is a proc that returns a sequence of substrings. + accResult(split(toOa(s), seps, maxsplit)) + +proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, inline.} = + ## The same as the `split iterator <#split.i,string,Rune,int>`_, but is a proc + ## that returns a sequence of substrings. + accResult(split(toOa(s), sep, maxsplit)) + +proc strip*(s: string, leading = true, trailing = true, + runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect, inline.} = + ## Strips leading or trailing ``runes`` from ``s`` and returns + ## the resulting string. + ## + ## If ``leading`` is true (default), leading ``runes`` are stripped. + ## If ``trailing`` is true (default), trailing ``runes`` are stripped. 
+ ## If both are false, the string is returned unchanged. + runnableExamples: + let a = "\táñyóng " + doAssert a.strip == "áñyóng" + doAssert a.strip(leading = false) == "\táñyóng" + doAssert a.strip(trailing = false) == "áñyóng " + strip(toOa(s), leading, trailing, runes) + + +proc align*(s: string, count: Natural, padding = ' '.Rune): string {.noSideEffect, inline.} = + ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length + ## of ``count``. + ## + ## ``padding`` characters (by default spaces) are added before ``s`` resulting in + ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is + ## returned unchanged. If you need to left align a string use the `alignLeft + ## proc <#alignLeft,string,Natural>`_. + runnableExamples: + assert align("abc", 4) == " abc" + assert align("a", 0) == "a" + assert align("1232", 6) == " 1232" + assert align("1232", 6, '#'.Rune) == "##1232" + assert align("Åge", 5) == " Åge" + assert align("×", 4, '_'.Rune) == "___×" + align(toOa(s), count, padding) + +proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {.noSideEffect, inline.} = + ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a + ## rune-length of ``count``. + ## + ## ``padding`` characters (by default spaces) are added after ``s`` resulting in + ## left alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is + ## returned unchanged. If you need to right align a string use the `align + ## proc <#align,string,Natural>`_. + runnableExamples: + assert alignLeft("abc", 4) == "abc " + assert alignLeft("a", 0) == "a" + assert alignLeft("1232", 6) == "1232 " + assert alignLeft("1232", 6, '#'.Rune) == "1232##" + assert alignLeft("Åge", 5) == "Åge " + assert alignLeft("×", 4, '_'.Rune) == "×___" + alignLeft(toOa(s), count, padding) |