diff options
author | Jason Beetham <beefers331@gmail.com> | 2022-10-27 06:16:41 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-27 14:16:41 +0200 |
commit | 0b1989d7369d8616f623aafb0658a2b6d8cb5f73 (patch) | |
tree | 1c368bbc6f7bb375a27ea953ced682ea389d3f6b | |
parent | aa6f9d490ffbb7750ffa6b89ed57de3c4fb88718 (diff) | |
download | Nim-0b1989d7369d8616f623aafb0658a2b6d8cb5f73.tar.gz |
Added openarray[char] overloads to std/unicode (#20648)
* Added openarray[char] overloads to std/unicode Call substr instead of index slice inside unicode Added substr overload for openarray for parity with string functionality Made style checker happies and fixed overloads for substr * Added update to changelog [skip ci] * Inline unicode string operations * Moved substr overload to unicode Co-authored-by: Andreas Rumpf <rumpf_a@web.de>
-rw-r--r-- | changelog.md | 1 | ||||
-rw-r--r-- | lib/pure/unicode.nim | 463 |
2 files changed, 418 insertions, 46 deletions
diff --git a/changelog.md b/changelog.md index 9560098d1..c66de940c 100644 --- a/changelog.md +++ b/changelog.md @@ -137,6 +137,7 @@ - Added [`jsre.hasIndices`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/hasIndices) - Added `capacity` for `string` and `seq` to return the current capacity, see https://github.com/nim-lang/RFCs/issues/460 - Added `openArray[char]` overloads for `std/parseutils` allowing more code reuse. +- Added `openArray[char]` overloads for `std/unicode` allowing more code reuse. - Added `safe` parameter to `base64.encodeMime`. [//]: # "Deprecations:" diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim index 660be7814..09233035f 100644 --- a/lib/pure/unicode.nim +++ b/lib/pure/unicode.nim @@ -21,6 +21,16 @@ ## * `encodings module <encodings.html>`_ include "system/inclrtl" +import std/strbasics +template toOa(s: string): auto = s.toOpenArray(0, s.high) + +proc substr(s: openArray[char] , first, last: int): string = + # Copied substr from system + let first = max(first, 0) + let L = max(min(last, high(s)) - first + 1, 0) + result = newString(L) + for i in 0 .. L-1: + result[i] = s[i+first] type RuneImpl = int32 # underlying type of Rune @@ -32,7 +42,7 @@ type template ones(n: untyped): untyped = ((1 shl n)-1) -proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} = +proc runeLen*(s: openArray[char]): int {.rtl, extern: "nuc$1".} = ## Returns the number of runes of the string ``s``. runnableExamples: let a = "añyóng" @@ -51,7 +61,7 @@ proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} = else: inc i inc(result) -proc runeLenAt*(s: string, i: Natural): int = +proc runeLenAt*(s: openArray[char], i: Natural): int = ## Returns the number of bytes the rune starting at ``s[i]`` takes. ## ## See also: @@ -71,7 +81,7 @@ proc runeLenAt*(s: string, i: Natural): int = const replRune = Rune(0xFFFD) -template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) = +template fastRuneAt*(s: openArray[char] or string, i: int, result: untyped, doInc = true) = ## Returns the rune ``s[i]`` in ``result``. ## ## If ``doInc == true`` (default), ``i`` is incremented by the number @@ -149,7 +159,7 @@ template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) = result = Rune(uint(s[i])) when doInc: inc(i) -proc runeAt*(s: string, i: Natural): Rune = +proc runeAt*(s: openArray[char], i: Natural): Rune = ## Returns the rune in ``s`` at **byte index** ``i``. ## ## See also: @@ -163,7 +173,7 @@ proc runeAt*(s: string, i: Natural): Rune = doAssert a.runeAt(3) == "y".runeAt(0) fastRuneAt(s, i, result, false) -proc validateUtf8*(s: string): int = +proc validateUtf8*(s: openArray[char]): int = ## Returns the position of the invalid byte in ``s`` if the string ``s`` does ## not hold valid UTF-8 data. Otherwise ``-1`` is returned. ## @@ -300,7 +310,7 @@ proc `$`*(runes: seq[Rune]): string = for rune in runes: result.add rune -proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int = +proc runeOffset*(s: openArray[char], pos: Natural, start: Natural = 0): int = ## Returns the byte position of rune ## at position ``pos`` in ``s`` with an optional start byte position. ## Returns the special value -1 if it runs out of the string. @@ -327,7 +337,7 @@ proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int = inc i return o -proc runeReverseOffset*(s: string, rev: Positive): (int, int) = +proc runeReverseOffset*(s: openArray[char], rev: Positive): (int, int) = ## Returns a tuple with the byte offset of the ## rune at position ``rev`` in ``s``, counting ## from the end (starting with 1) and the total @@ -355,7 +365,7 @@ proc runeReverseOffset*(s: string, rev: Positive): (int, int) = dec a result = if a > 0: (-a, rev.int-a) else: (x, -a+rev.int) -proc runeAtPos*(s: string, pos: int): Rune = +proc runeAtPos*(s: openArray[char], pos: int): Rune = ## Returns the rune at position ``pos``. ## ## **Beware:** This can lead to unoptimized code and slow execution! @@ -368,7 +378,7 @@ proc runeAtPos*(s: string, pos: int): Rune = ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ fastRuneAt(s, runeOffset(s, pos), result, false) -proc runeStrAtPos*(s: string, pos: Natural): string = +proc runeStrAtPos*(s: openArray[char], pos: Natural): string = ## Returns the rune at position ``pos`` as UTF8 String. ## ## **Beware:** This can lead to unoptimized code and slow execution! @@ -380,9 +390,9 @@ proc runeStrAtPos*(s: string, pos: Natural): string = ## * `runeAtPos proc <#runeAtPos,string,int>`_ ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ let o = runeOffset(s, pos) - s[o .. (o+runeLenAt(s, o)-1)] + substr(s.toOpenArray(o, (o+runeLenAt(s, o)-1))) -proc runeSubStr*(s: string, pos: int, len: int = int.high): string = +proc runeSubStr*(s: openArray[char], pos: int, len: int = int.high): string = ## Returns the UTF-8 substring starting at code point ``pos`` ## with ``len`` code points. ## @@ -401,7 +411,7 @@ proc runeSubStr*(s: string, pos: int, len: int = int.high): string = if pos < 0: let (o, rl) = runeReverseOffset(s, -pos) if len >= rl: - result = s.substr(o, s.len-1) + result = s.substr(o, s.high) elif len < 0: let e = rl + len if e < 0: @@ -626,7 +636,7 @@ template runeCheck(s, runeProc) = fastRuneAt(s, i, rune, doInc = true) result = runeProc(rune) and result -proc isAlpha*(s: string): bool {.noSideEffect, +proc isAlpha*(s: openArray[char]): bool {.noSideEffect, rtl, extern: "nuc$1Str".} = ## Returns true if ``s`` contains all alphabetic runes. runnableExamples: @@ -634,7 +644,7 @@ proc isAlpha*(s: string): bool {.noSideEffect, doAssert a.isAlpha runeCheck(s, isAlpha) -proc isSpace*(s: string): bool {.noSideEffect, +proc isSpace*(s: openArray[char]): bool {.noSideEffect, rtl, extern: "nuc$1Str".} = ## Returns true if ``s`` contains all whitespace runes. runnableExamples: @@ -655,21 +665,21 @@ template convertRune(s, runeProc) = rune = runeProc(rune) fastToUTF8Copy(rune, result, resultIndex, doInc = true) -proc toUpper*(s: string): string {.noSideEffect, +proc toUpper*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1Str".} = ## Converts ``s`` into upper-case runes. runnableExamples: doAssert toUpper("abγ") == "ABΓ" convertRune(s, toUpper) -proc toLower*(s: string): string {.noSideEffect, +proc toLower*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1Str".} = ## Converts ``s`` into lower-case runes. runnableExamples: doAssert toLower("ABΓ") == "abγ" convertRune(s, toLower) -proc swapCase*(s: string): string {.noSideEffect, +proc swapCase*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1".} = ## Swaps the case of runes in ``s``. ## @@ -691,7 +701,7 @@ proc swapCase*(s: string): string {.noSideEffect, rune = rune.toUpper() fastToUTF8Copy(rune, result, resultIndex, doInc = true) -proc capitalize*(s: string): string {.noSideEffect, +proc capitalize*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1".} = ## Converts the first character of ``s`` into an upper-case rune. runnableExamples: @@ -703,12 +713,12 @@ proc capitalize*(s: string): string {.noSideEffect, rune: Rune i = 0 fastRuneAt(s, i, rune, doInc = true) - result = $toUpper(rune) & substr(s, i) + result = $toUpper(rune) & substr(s.toOpenArray(i, s.high)) when not defined(nimHasEffectsOf): {.pragma: effectsOf.} -proc translate*(s: string, replacements: proc(key: string): string): string {. +proc translate*(s: openArray[char], replacements: proc(key: string): string): string {. rtl, extern: "nuc$1", effectsOf: replacements.} = ## Translates words in a string using the ``replacements`` proc to substitute ## words inside ``s`` with their replacements. @@ -743,7 +753,7 @@ proc translate*(s: string, replacements: proc(key: string): string): string {. if whiteSpace and inWord: # If we've reached the end of a word - let word = s[wordStart ..< lastIndex] + let word = substr(s.toOpenArray(wordStart, lastIndex - 1)) result.add(replacements(word)) result.add($rune) inWord = false @@ -758,10 +768,10 @@ proc translate*(s: string, replacements: proc(key: string): string): string {. if wordStart < len(s) and inWord: # Get the trailing word at the end - let word = s[wordStart .. ^1] + let word = substr(s.toOpenArray(wordStart, s.high)) result.add(replacements(word)) -proc title*(s: string): string {.noSideEffect, +proc title*(s: openArray[char]): string {.noSideEffect, rtl, extern: "nuc$1".} = ## Converts ``s`` to a unicode title. ## @@ -787,7 +797,7 @@ proc title*(s: string): string {.noSideEffect, fastToUTF8Copy(rune, result, resultIndex, doInc = true) -iterator runes*(s: string): Rune = +iterator runes*(s: openArray[char]): Rune = ## Iterates over any rune of the string ``s`` returning runes. var i = 0 @@ -796,7 +806,7 @@ iterator runes*(s: string): Rune = fastRuneAt(s, i, result, true) yield result -iterator utf8*(s: string): string = +iterator utf8*(s: openArray[char]): string = ## Iterates over any rune of the string ``s`` returning utf8 values. ## ## See also: @@ -807,10 +817,10 @@ iterator utf8*(s: string): string = var o = 0 while o < s.len: let n = runeLenAt(s, o) - yield s[o .. (o+n-1)] + yield substr(s.toOpenArray(o, (o+n-1))) o += n -proc toRunes*(s: string): seq[Rune] = +proc toRunes*(s: openArray[char]): seq[Rune] = ## Obtains a sequence containing the Runes in ``s``. ## ## See also: @@ -823,7 +833,7 @@ proc toRunes*(s: string): seq[Rune] = for r in s.runes: result.add(r) -proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1".} = +proc cmpRunesIgnoreCase*(a, b: openArray[char]): int {.rtl, extern: "nuc$1".} = ## Compares two UTF-8 strings and ignores the case. Returns: ## ## | 0 if a == b @@ -840,7 +850,7 @@ proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1".} = if result != 0: return result = a.len - b.len -proc reversed*(s: string): string = +proc reversed*(s: openArray[char]): string = ## Returns the reverse of ``s``, interpreting it as runes. ## ## Unicode combining characters are correctly interpreted as well. @@ -875,7 +885,7 @@ proc reversed*(s: string): string = reverseUntil(len(s)) -proc graphemeLen*(s: string; i: Natural): Natural = +proc graphemeLen*(s: openArray[char]; i: Natural): Natural = ## The number of bytes belonging to byte index ``s[i]``, ## including following combining code unit. runnableExamples: @@ -894,7 +904,7 @@ proc graphemeLen*(s: string; i: Natural): Natural = if not isCombining(r2): break result = j-i -proc lastRune*(s: string; last: int): (Rune, int) = +proc lastRune*(s: openArray[char]; last: int): (Rune, int) = ## Length of the last rune in ``s[0..last]``. Returns the rune and its length ## in bytes. if s[last] <= chr(127): @@ -923,12 +933,12 @@ proc size*(r: Rune): int {.noSideEffect.} = else: result = 1 # --------- Private templates for different split separators ----------- -proc stringHasSep(s: string, index: int, seps: openArray[Rune]): bool = +proc stringHasSep(s: openArray[char], index: int, seps: openArray[Rune]): bool = var rune: Rune fastRuneAt(s, index, rune, false) return seps.contains(rune) -proc stringHasSep(s: string, index: int, sep: Rune): bool = +proc stringHasSep(s: openArray[char], index: int, sep: Rune): bool = var rune: Rune fastRuneAt(s, index, rune, false) return sep == rune @@ -946,12 +956,12 @@ template splitCommon(s, sep, maxsplit: untyped) = while last < sLen and not stringHasSep(s, last, sep): inc(last, runeLenAt(s, last)) if splits == 0: last = sLen - yield s[first .. (last - 1)] + yield substr(s.toOpenArray(first, (last - 1))) if splits == 0: break dec(splits) inc(last, if last < sLen: runeLenAt(s, last) else: 1) -iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces, +iterator split*(s: openArray[char], seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1): string = ## Splits the unicode string ``s`` into substrings using a group of separators. ## @@ -977,7 +987,7 @@ iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces, splitCommon(s, seps, maxsplit) -iterator splitWhitespace*(s: string): string = +iterator splitWhitespace*(s: openArray[char]): string = ## Splits a unicode string at whitespace runes. splitCommon(s, unicodeSpaces, -1) @@ -985,13 +995,13 @@ template accResult(iter: untyped) = result = @[] for x in iter: add(result, x) -proc splitWhitespace*(s: string): seq[string] {.noSideEffect, +proc splitWhitespace*(s: openArray[char]): seq[string] {.noSideEffect, rtl, extern: "ncuSplitWhitespace".} = ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_ ## iterator, but is a proc that returns a sequence of substrings. accResult(splitWhitespace(s)) -iterator split*(s: string, sep: Rune, maxsplit: int = -1): string = +iterator split*(s: openArray[char], sep: Rune, maxsplit: int = -1): string = ## Splits the unicode string ``s`` into substrings using a single separator. ## Substrings are separated by the rune ``sep``. runnableExamples: @@ -1002,19 +1012,19 @@ iterator split*(s: string, sep: Rune, maxsplit: int = -1): string = splitCommon(s, sep, maxsplit) -proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1): +proc split*(s: openArray[char], seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1): seq[string] {.noSideEffect, rtl, extern: "nucSplitRunes".} = ## The same as the `split iterator <#split.i,string,openArray[Rune],int>`_, ## but is a proc that returns a sequence of substrings. accResult(split(s, seps, maxsplit)) -proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, +proc split*(s: openArray[char], sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, rtl, extern: "nucSplitRune".} = ## The same as the `split iterator <#split.i,string,Rune,int>`_, but is a proc ## that returns a sequence of substrings. accResult(split(s, sep, maxsplit)) -proc strip*(s: string, leading = true, trailing = true, +proc strip*(s: openArray[char], leading = true, trailing = true, runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect, rtl, extern: "nucStrip".} = ## Strips leading or trailing ``runes`` from ``s`` and returns @@ -1069,7 +1079,7 @@ proc strip*(s: string, leading = true, trailing = true, let newLen = eI - sI + 1 result = newStringOfCap(newLen) if newLen > 0: - result.add s[sI .. eI] + result.add substr(s.toOpenArray(sI, eI)) proc repeat*(c: Rune, count: Natural): string {.noSideEffect, rtl, extern: "nucRepeatRune".} = @@ -1085,7 +1095,7 @@ proc repeat*(c: Rune, count: Natural): string {.noSideEffect, for i in 0 ..< count: result.add s -proc align*(s: string, count: Natural, padding = ' '.Rune): string {. +proc align*(s: openArray[char], count: Natural, padding = ' '.Rune): string {. noSideEffect, rtl, extern: "nucAlignString".} = ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length ## of ``count``. @@ -1110,9 +1120,9 @@ proc align*(s: string, count: Natural, padding = ' '.Rune): string {. for i in 0 ..< spaces: result.add padStr result.add s else: - result = s + result = s.substr -proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {. +proc alignLeft*(s: openArray[char], count: Natural, padding = ' '.Rune): string {. noSideEffect.} = ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a ## rune-length of ``count``. @@ -1136,4 +1146,365 @@ proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {. for i in sLen ..< count: result.add padStr else: - result = s + result = s.substr + + +proc runeLen*(s: string): int {.inline.} = + ## Returns the number of runes of the string ``s``. + runnableExamples: + let a = "añyóng" + doAssert a.runeLen == 6 + ## note: a.len == 8 + runeLen(toOa(s)) + +proc runeLenAt*(s: string, i: Natural): int {.inline.} = + ## Returns the number of bytes the rune starting at ``s[i]`` takes. + ## + ## See also: + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeLenAt(0) == 1 + doAssert a.runeLenAt(1) == 2 + runeLenAt(toOa(s), i) + +proc runeAt*(s: string, i: Natural): Rune {.inline.} = + ## Returns the rune in ``s`` at **byte index** ``i``. + ## + ## See also: + ## * `runeAtPos proc <#runeAtPos,string,int>`_ + ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeAt(1) == "ñ".runeAt(0) + doAssert a.runeAt(2) == "ñ".runeAt(1) + doAssert a.runeAt(3) == "y".runeAt(0) + fastRuneAt(s, i, result, false) + +proc validateUtf8*(s: string): int {.inline.} = + ## Returns the position of the invalid byte in ``s`` if the string ``s`` does + ## not hold valid UTF-8 data. Otherwise ``-1`` is returned. + ## + ## See also: + ## * `toUTF8 proc <#toUTF8,Rune>`_ + ## * `$ proc <#$,Rune>`_ alias for `toUTF8` + ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_ + validateUtf8(toOa(s)) + +proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int {.inline.} = + ## Returns the byte position of rune + ## at position ``pos`` in ``s`` with an optional start byte position. + ## Returns the special value -1 if it runs out of the string. + ## + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_ + runnableExamples: + let a = "añyóng" + doAssert a.runeOffset(1) == 1 + doAssert a.runeOffset(3) == 4 + doAssert a.runeOffset(4) == 6 + runeOffset(toOa(s), pos, start) + +proc runeReverseOffset*(s: string, rev: Positive): (int, int) {.inline.} = + ## Returns a tuple with the byte offset of the + ## rune at position ``rev`` in ``s``, counting + ## from the end (starting with 1) and the total + ## number of runes in the string. + ## + ## Returns a negative value for offset if there are too few runes in + ## the string to satisfy the request. + ## + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_ + runeReverseOffset(toOa(s), rev) + +proc runeAtPos*(s: string, pos: int): Rune {.inline.} = + ## Returns the rune at position ``pos``. + ## + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeAt proc <#runeAt,string,Natural>`_ + ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + fastRuneAt(toOa(s), runeOffset(s, pos), result, false) + +proc runeStrAtPos*(s: string, pos: Natural): string {.inline.} = + ## Returns the rune at position ``pos`` as UTF8 String. + ## + ## **Beware:** This can lead to unoptimized code and slow execution! + ## Most problems can be solved more efficiently by using an iterator + ## or conversion to a seq of Rune. + ## + ## See also: + ## * `runeAt proc <#runeAt,string,Natural>`_ + ## * `runeAtPos proc <#runeAtPos,string,int>`_ + ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_ + let o = runeOffset(s, pos) + substr(s.toOpenArray(o, (o+runeLenAt(s, o)-1))) + +proc runeSubStr*(s: string, pos: int, len: int = int.high): string {.inline.} = + ## Returns the UTF-8 substring starting at code point ``pos`` + ## with ``len`` code points. + ## + ## If ``pos`` or ``len`` is negative they count from + ## the end of the string. If ``len`` is not given it means the longest + ## possible string. + runnableExamples: + let s = "Hänsel ««: 10,00€" + doAssert(runeSubStr(s, 0, 2) == "Hä") + doAssert(runeSubStr(s, 10, 1) == ":") + doAssert(runeSubStr(s, -6) == "10,00€") + doAssert(runeSubStr(s, 10) == ": 10,00€") + doAssert(runeSubStr(s, 12, 5) == "10,00") + doAssert(runeSubStr(s, -6, 3) == "10,") + runeSubStr(toOa(s), pos, len) + + +proc isAlpha*(s: string): bool {.noSideEffect, inline.} = + ## Returns true if ``s`` contains all alphabetic runes. + runnableExamples: + let a = "añyóng" + doAssert a.isAlpha + isAlpha(toOa(s)) + +proc isSpace*(s: string): bool {.noSideEffect, inline.} = + ## Returns true if ``s`` contains all whitespace runes. + runnableExamples: + let a = "\t\l \v\r\f" + doAssert a.isSpace + isSpace(toOa(s)) + + +proc toUpper*(s: string): string {.noSideEffect, inline.} = + ## Converts ``s`` into upper-case runes. + runnableExamples: + doAssert toUpper("abγ") == "ABΓ" + toUpper(toOa(s)) + +proc toLower*(s: string): string {.noSideEffect, inline.} = + ## Converts ``s`` into lower-case runes. + runnableExamples: + doAssert toLower("ABΓ") == "abγ" + toLower(toOa(s)) + +proc swapCase*(s: string): string {.noSideEffect, inline.} = + ## Swaps the case of runes in ``s``. + ## + ## Returns a new string such that the cases of all runes + ## are swapped if possible. + runnableExamples: + doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA" + swapCase(toOa(s)) + +proc capitalize*(s: string): string {.noSideEffect.} = + ## Converts the first character of ``s`` into an upper-case rune. + runnableExamples: + doAssert capitalize("βeta") == "Βeta" + capitalize(toOa(s)) + + +proc translate*(s: string, replacements: proc(key: string): string): string {.effectsOf: replacements, inline.} = + ## Translates words in a string using the ``replacements`` proc to substitute + ## words inside ``s`` with their replacements. + ## + ## ``replacements`` is any proc that takes a word and returns + ## a new word to fill it's place. + runnableExamples: + proc wordToNumber(s: string): string = + case s + of "one": "1" + of "two": "2" + else: s + let a = "one two three four" + doAssert a.translate(wordToNumber) == "1 2 three four" + translate(toOa(s), replacements) + +proc title*(s: string): string {.noSideEffect, inline.} = + ## Converts ``s`` to a unicode title. + ## + ## Returns a new string such that the first character + ## in each word inside ``s`` is capitalized. + runnableExamples: + doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma" + title(toOa(s)) + + +iterator runes*(s: string): Rune = + ## Iterates over any rune of the string ``s`` returning runes. + for rune in runes(toOa(s)): + yield rune + +iterator utf8*(s: string): string = + ## Iterates over any rune of the string ``s`` returning utf8 values. + ## + ## See also: + ## * `validateUtf8 proc <#validateUtf8,string>`_ + ## * `toUTF8 proc <#toUTF8,Rune>`_ + ## * `$ proc <#$,Rune>`_ alias for `toUTF8` + ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_ + for str in utf8(toOa(s)): + yield str + +proc toRunes*(s: string): seq[Rune] {.inline.} = + ## Obtains a sequence containing the Runes in ``s``. + ## + ## See also: + ## * `$ proc <#$,Rune>`_ for a reverse operation + runnableExamples: + let a = toRunes("aáä") + doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)] + toRunes(toOa(s)) + +proc cmpRunesIgnoreCase*(a, b: string): int {.inline.} = + ## Compares two UTF-8 strings and ignores the case. Returns: + ## + ## | 0 if a == b + ## | < 0 if a < b + ## | > 0 if a > b + cmpRunesIgnoreCase(a.toOa(), b.toOa()) + +proc reversed*(s: string): string {.inline.} = + ## Returns the reverse of ``s``, interpreting it as runes. + ## + ## Unicode combining characters are correctly interpreted as well. + runnableExamples: + assert reversed("Reverse this!") == "!siht esreveR" + assert reversed("先秦兩漢") == "漢兩秦先" + assert reversed("as⃝df̅") == "f̅ds⃝a" + assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞" + reversed(toOa(s)) + +proc graphemeLen*(s: string; i: Natural): Natural {.inline.} = + ## The number of bytes belonging to byte index ``s[i]``, + ## including following combining code unit. + runnableExamples: + let a = "añyóng" + doAssert a.graphemeLen(1) == 2 ## ñ + doAssert a.graphemeLen(2) == 1 + doAssert a.graphemeLen(4) == 2 ## ó + graphemeLen(toOa(s), i) + +proc lastRune*(s: string; last: int): (Rune, int) {.inline.} = + ## Length of the last rune in ``s[0..last]``. Returns the rune and its length + ## in bytes. + lastRune(toOa(s), last) + +iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces, + maxsplit: int = -1): string = + ## Splits the unicode string ``s`` into substrings using a group of separators. + ## + ## Substrings are separated by a substring containing only ``seps``. + runnableExamples: + import std/sequtils + + assert toSeq("hÃllo\lthis\lis an\texample\l是".split) == + @["hÃllo", "this", "is", "an", "example", "是"] + + # And the following code splits the same string using a sequence of Runes. + assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) == + @["añyóng", "hÃllo", "是", "example"] + + # example with a `Rune` separator and unused one `;`: + assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""] + + # Another example that splits a string containing a date. + let date = "2012-11-20T22:08:08.398990" + + assert toSeq(split(date, " -:T".toRunes)) == + @["2012", "11", "20", "22", "08", "08.398990"] + + splitCommon(toOa(s), seps, maxsplit) + +iterator splitWhitespace*(s: string): string = + ## Splits a unicode string at whitespace runes. + splitCommon(s.toOa(), unicodeSpaces, -1) + + +proc splitWhitespace*(s: string): seq[string] {.noSideEffect, inline.}= + ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_ + ## iterator, but is a proc that returns a sequence of substrings. + accResult(splitWhitespace(toOa(s))) + +iterator split*(s: string, sep: Rune, maxsplit: int = -1): string = + ## Splits the unicode string ``s`` into substrings using a single separator. + ## Substrings are separated by the rune ``sep``. + runnableExamples: + import std/sequtils + + assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) == + @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"] + + splitCommon(toOa(s), sep, maxsplit) + +proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1): + seq[string] {.noSideEffect, inline.} = + ## The same as the `split iterator <#split.i,string,openArray[Rune],int>`_, + ## but is a proc that returns a sequence of substrings. + accResult(split(toOa(s), seps, maxsplit)) + +proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, inline.} = + ## The same as the `split iterator <#split.i,string,Rune,int>`_, but is a proc + ## that returns a sequence of substrings. + accResult(split(toOa(s), sep, maxsplit)) + +proc strip*(s: string, leading = true, trailing = true, + runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect, inline.} = + ## Strips leading or trailing ``runes`` from ``s`` and returns + ## the resulting string. + ## + ## If ``leading`` is true (default), leading ``runes`` are stripped. + ## If ``trailing`` is true (default), trailing ``runes`` are stripped. + ## If both are false, the string is returned unchanged. + runnableExamples: + let a = "\táñyóng " + doAssert a.strip == "áñyóng" + doAssert a.strip(leading = false) == "\táñyóng" + doAssert a.strip(trailing = false) == "áñyóng " + strip(toOa(s), leading, trailing, runes) + + +proc align*(s: string, count: Natural, padding = ' '.Rune): string {.noSideEffect, inline.} = + ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length + ## of ``count``. + ## + ## ``padding`` characters (by default spaces) are added before ``s`` resulting in + ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is + ## returned unchanged. If you need to left align a string use the `alignLeft + ## proc <#alignLeft,string,Natural>`_. + runnableExamples: + assert align("abc", 4) == " abc" + assert align("a", 0) == "a" + assert align("1232", 6) == " 1232" + assert align("1232", 6, '#'.Rune) == "##1232" + assert align("Åge", 5) == " Åge" + assert align("×", 4, '_'.Rune) == "___×" + align(toOa(s), count, padding) + +proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {.noSideEffect, inline.} = + ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a + ## rune-length of ``count``. + ## + ## ``padding`` characters (by default spaces) are added after ``s`` resulting in + ## left alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is + ## returned unchanged. If you need to right align a string use the `align + ## proc <#align,string,Natural>`_. + runnableExamples: + assert alignLeft("abc", 4) == "abc " + assert alignLeft("a", 0) == "a" + assert alignLeft("1232", 6) == "1232 " + assert alignLeft("1232", 6, '#'.Rune) == "1232##" + assert alignLeft("Åge", 5) == "Åge " + assert alignLeft("×", 4, '_'.Rune) == "×___" + alignLeft(toOa(s), count, padding) |