diff options
author | Andreas Rumpf <rumpf_a@web.de> | 2016-06-14 09:38:45 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-06-14 09:38:45 +0200 |
commit | c9c4b6c41d9465599f4a627c09fe626cab727270 (patch) | |
tree | ce1068c16a5cd5a0824b24ea7392eb62c55c7392 /lib | |
parent | 93424420d48929965929901c2d5698de189c1a34 (diff) | |
parent | 84d9081e21783151b40c0320ca50df4845043559 (diff) | |
download | Nim-c9c4b6c41d9465599f4a627c09fe626cab727270.tar.gz |
Merge pull request #4276 from jyapayne/devel
Add extra string functions to strutils to satisfy part of #4218 and #4251
Diffstat (limited to 'lib')
-rw-r--r-- | lib/pure/strmisc.nim | 56 | ||||
-rw-r--r-- | lib/pure/strutils.nim | 266 | ||||
-rw-r--r-- | lib/pure/unicode.nim | 238 |
3 files changed, 526 insertions, 34 deletions
diff --git a/lib/pure/strmisc.nim b/lib/pure/strmisc.nim new file mode 100644 index 000000000..359014d8c --- /dev/null +++ b/lib/pure/strmisc.nim @@ -0,0 +1,56 @@ +# +# +# Nim's Runtime Library +# (c) Copyright 2016 Joey Payne +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module contains various string utility routines that are uncommonly +## used in comparison to `strutils <strutils.html>`_. + +import strutils + +{.deadCodeElim: on.} + +proc partition*(s: string, sep: string, + right: bool = false): (string, string, string) + {.noSideEffect, procvar.} = + ## Split the string at the first or last occurrence of `sep` into a 3-tuple + ## + ## Returns a 3 string tuple of (beforeSep, `sep`, afterSep) or + ## (`s`, "", "") if `sep` is not found and `right` is false or + ## ("", "", `s`) if `sep` is not found and `right` is true + + let position = if right: s.rfind(sep) else: s.find(sep) + + if position != -1: + let + beforeSep = s[0 ..< position] + afterSep = s[position + sep.len ..< s.len] + + return (beforeSep, sep, afterSep) + + return if right: ("", "", s) else: (s, "", "") + +proc rpartition*(s: string, sep: string): (string, string, string) + {.noSideEffect, procvar.} = + ## Split the string at the last occurrence of `sep` into a 3-tuple + ## + ## Returns a 3 string tuple of (beforeSep, `sep`, afterSep) or + ## ("", "", `s`) if `sep` is not found + return partition(s, sep, right = true) + +when isMainModule: + doAssert partition("foo:bar", ":") == ("foo", ":", "bar") + doAssert partition("foobarbar", "bar") == ("foo", "bar", "bar") + doAssert partition("foobarbar", "bank") == ("foobarbar", "", "") + doAssert partition("foobarbar", "foo") == ("", "foo", "barbar") + doAssert partition("foofoobar", "bar") == ("foofoo", "bar", "") + + doAssert rpartition("foo:bar", ":") == ("foo", ":", "bar") + doAssert rpartition("foobarbar", "bar") == ("foobar", "bar", "") + doAssert rpartition("foobarbar", "bank") == ("", "", "foobarbar") + doAssert rpartition("foobarbar", "foo") == ("", "foo", "barbar") + doAssert rpartition("foofoobar", "bar") == ("foofoo", "bar", "") diff --git a/lib/pure/strutils.nim b/lib/pure/strutils.nim index 86af6ac88..708f9ed4b 100644 --- a/lib/pure/strutils.nim +++ b/lib/pure/strutils.nim @@ -15,6 +15,7 @@ import parseutils from math import pow, round, floor, log10 +from algorithm import reverse {.deadCodeElim: on.} @@ -325,7 +326,8 @@ proc toOctal*(c: char): string {.noSideEffect, rtl, extern: "nsuToOctal".} = result[i] = chr(val mod 8 + ord('0')) val = val div 8 -iterator split*(s: string, seps: set[char] = Whitespace, maxsplit: int = -1): string = +iterator split*(s: string, seps: set[char] = Whitespace, + maxsplit: int = -1): string = ## Splits the string `s` into substrings using a group of separators. ## ## Substrings are separated by a substring containing only `seps`. Note @@ -422,10 +424,13 @@ iterator split*(s: string, sep: char, maxsplit: int = -1): string = dec(splits) inc(last) -proc substrEq(s: string, a, L: int, x: string): bool = +proc substrEq(s: string, pos: int, substr: string): bool = var i = 0 - while i < L and s[a+i] == x[i]: inc i - result = i == L + var length = substr.len + while i < length and s[pos+i] == substr[i]: + inc i + + return i == length iterator split*(s: string, sep: string, maxsplit: int = -1): string = ## Splits the string `s` into substrings using a string separator. @@ -433,10 +438,11 @@ iterator split*(s: string, sep: string, maxsplit: int = -1): string = ## Substrings are separated by the string `sep`. var last = 0 var splits = maxsplit + if len(s) > 0: while last <= len(s): var first = last - while last < len(s) and not s.substrEq(last, sep.len, sep): + while last < len(s) and not s.substrEq(last, sep): inc(last) if splits == 0: last = len(s) yield substr(s, first, last-1) @@ -444,6 +450,108 @@ iterator split*(s: string, sep: string, maxsplit: int = -1): string = dec(splits) inc(last, sep.len) +# --------- Private templates for different rsplit separators ----------- + +template stringHasSep(s: string, index: int, seps: set[char]): bool = + s[index] in seps + +template stringHasSep(s: string, index: int, sep: char): bool = + s[index] == sep + +template stringHasSep(s: string, index: int, sep: string): bool = + s.substrEq(index, sep) + +template rsplitCommon(s, sep, maxsplit, sepLen) = + ## Common code for rsplit functions + var + last = s.len - 1 + first = last + splits = maxsplit + startPos = 0 + + if len(s) > 0: + # go to -1 in order to get separators at the beginning + while first >= -1: + while first >= 0 and not stringHasSep(s, first, sep): + dec(first) + + if splits == 0: + # No more splits means set first to the beginning + first = -1 + + if first == -1: + startPos = 0 + else: + startPos = first + sepLen + + yield substr(s, startPos, last) + + if splits == 0: + break + + dec(splits) + dec(first) + + last = first + +iterator rsplit*(s: string, seps: set[char] = Whitespace, + maxsplit: int = -1): string = + ## Splits the string `s` into substrings from the right using a + ## string separator. Works exactly the same as `split iterator + ## <#split.i,string,char>`_ except in reverse order. + ## + ## .. code-block:: nim + ## for piece in "foo bar".rsplit(WhiteSpace): + ## echo piece + ## + ## Results in: + ## + ## .. code-block:: nim + ## "bar" + ## "foo" + ## + ## Substrings are separated from the right by the set of chars `seps` + + rsplitCommon(s, seps, maxsplit, 1) + +iterator rsplit*(s: string, sep: char, + maxsplit: int = -1): string = + ## Splits the string `s` into substrings from the right using a + ## string separator. Works exactly the same as `split iterator + ## <#split.i,string,char>`_ except in reverse order. + ## + ## .. code-block:: nim + ## for piece in "foo:bar".rsplit(':'): + ## echo piece + ## + ## Results in: + ## + ## .. code-block:: nim + ## "bar" + ## "foo" + ## + ## Substrings are separated from the right by the char `sep` + rsplitCommon(s, sep, maxsplit, 1) + +iterator rsplit*(s: string, sep: string, maxsplit: int = -1, + keepSeparators: bool = false): string = + ## Splits the string `s` into substrings from the right using a + ## string separator. Works exactly the same as `split iterator + ## <#split.i,string,string>`_ except in reverse order. + ## + ## .. code-block:: nim + ## for piece in "foothebar".rsplit("the"): + ## echo piece + ## + ## Results in: + ## + ## .. code-block:: nim + ## "bar" + ## "foo" + ## + ## Substrings are separated from the right by the string `sep` + rsplitCommon(s, sep, maxsplit, sep.len) + iterator splitLines*(s: string): string = ## Splits the string `s` into its containing lines. ## @@ -531,6 +639,73 @@ proc split*(s: string, sep: string, maxsplit: int = -1): seq[string] {.noSideEff ## `split iterator <#split.i,string,string>`_. accumulateResult(split(s, sep, maxsplit)) +proc rsplit*(s: string, seps: set[char] = Whitespace, + maxsplit: int = -1): seq[string] + {.noSideEffect, rtl, extern: "nsuRSplitCharSet".} = + ## The same as the `rsplit iterator <#rsplit.i,string,set[char]>`_, but is a + ## proc that returns a sequence of substrings. + ## + ## A possible common use case for `rsplit` is path manipulation, + ## particularly on systems that don't use a common delimiter. + ## + ## For example, if a system had `#` as a delimiter, you could + ## do the following to get the tail of the path: + ## + ## .. code-block:: nim + ## var tailSplit = rsplit("Root#Object#Method#Index", {'#'}, maxsplit=1) + ## + ## Results in `tailSplit` containing: + ## + ## .. code-block:: nim + ## @["Root#Object#Method", "Index"] + ## + accumulateResult(rsplit(s, seps, maxsplit)) + result.reverse() + +proc rsplit*(s: string, sep: char, maxsplit: int = -1): seq[string] + {.noSideEffect, rtl, extern: "nsuRSplitChar".} = + ## The same as the `split iterator <#rsplit.i,string,char>`_, but is a proc + ## that returns a sequence of substrings. + ## + ## A possible common use case for `rsplit` is path manipulation, + ## particularly on systems that don't use a common delimiter. + ## + ## For example, if a system had `#` as a delimiter, you could + ## do the following to get the tail of the path: + ## + ## .. code-block:: nim + ## var tailSplit = rsplit("Root#Object#Method#Index", '#', maxsplit=1) + ## + ## Results in `tailSplit` containing: + ## + ## .. code-block:: nim + ## @["Root#Object#Method", "Index"] + ## + accumulateResult(rsplit(s, sep, maxsplit)) + result.reverse() + +proc rsplit*(s: string, sep: string, maxsplit: int = -1): seq[string] + {.noSideEffect, rtl, extern: "nsuRSplitString".} = + ## The same as the `split iterator <#rsplit.i,string,string>`_, but is a proc + ## that returns a sequence of substrings. + ## + ## A possible common use case for `rsplit` is path manipulation, + ## particularly on systems that don't use a common delimiter. + ## + ## For example, if a system had `#` as a delimiter, you could + ## do the following to get the tail of the path: + ## + ## .. code-block:: nim + ## var tailSplit = rsplit("Root#Object#Method#Index", "#", maxsplit=1) + ## + ## Results in `tailSplit` containing: + ## + ## .. code-block:: nim + ## @["Root#Object#Method", "Index"] + ## + accumulateResult(rsplit(s, sep, maxsplit)) + result.reverse() + proc toHex*(x: BiggestInt, len: Positive): string {.noSideEffect, rtl, extern: "nsuToHex".} = ## Converts `x` to its hexadecimal representation. @@ -1035,6 +1210,34 @@ proc rfind*(s: string, sub: char, start: int = -1): int {.noSideEffect, if sub == s[i]: return i return -1 +proc center*(s: string, width: int, fillChar: char = ' '): string {. + noSideEffect, rtl, extern: "nsuCenterString".} = + ## Return the contents of `s` centered in a string `width` long using + ## `fillChar` as padding. + ## + ## The original string is returned if `width` is less than or equal + ## to `s.len`. + if width <= s.len: + return s + + result = newString(width) + + # Left padding will be one fillChar + # smaller if there are an odd number + # of characters + let + charsLeft = (width - s.len) + leftPadding = charsLeft div 2 + + for i in 0 ..< width: + if i >= leftPadding and i < leftPadding + s.len: + # we are where the string should be located + result[i] = s[i-leftPadding] + else: + # we are either before or after where + # the string s should go + result[i] = fillChar + proc count*(s: string, sub: string, overlapping: bool = false): int {. noSideEffect, rtl, extern: "nsuCountString".} = ## Count the occurrences of a substring `sub` in the string `s`. @@ -1116,6 +1319,38 @@ proc replace*(s: string, sub, by: char): string {.noSideEffect, else: result[i] = s[i] inc(i) +proc expandTabs*(s: string, tabSize: int = 8): string {.noSideEffect, + procvar, rtl, extern: "nsuExpandTabsStr".} = + ## Expand tab characters in `s` by `tabSize` spaces + + if len(s) == 0: + return s + + result = newStringOfCap(s.len + s.len shr 2) + + var pos = 0 + + template addSpaces(n) = + for j in 0 ..< n: + result.add(' ') + pos += 1 + + for i in 0 ..< len(s): + let c = s[i] + + if c == '\t': + let + denominator = if tabSize > 0: tabSize else: 1 + numSpaces = tabSize - pos mod denominator + + addSpaces(numSpaces) + else: + result.add(c) + pos += 1 + + if c == '\l': + pos = 0 + proc replaceWord*(s, sub: string, by = ""): string {.noSideEffect, rtl, extern: "nsuReplaceWord".} = ## Replaces `sub` in `s` by the string `by`. @@ -1899,6 +2134,11 @@ when isMainModule: doAssert parseEnum("invalid enum value", enC) == enC + doAssert center("foo", 13) == " foo " + doAssert center("foo", 0) == "foo" + doAssert center("foo", 3, fillChar = 'a') == "foo" + doAssert center("foo", 10, fillChar = '\t') == "\t\t\tfoo\t\t\t\t" + doAssert count("foofoofoo", "foofoo") == 1 doAssert count("foofoofoo", "foofoo", overlapping = true) == 2 doAssert count("foofoofoo", 'f') == 3 @@ -1967,6 +2207,22 @@ when isMainModule: doAssert(not isUpper("AAcc")) doAssert(not isUpper("A#$")) + doAssert expandTabs("\t", 4) == " " + doAssert expandTabs("\tfoo\t", 4) == " foo " + doAssert expandTabs("\tfoo\tbar", 4) == " foo bar" + doAssert expandTabs("\tfoo\tbar\t", 4) == " foo bar " + doAssert expandTabs("", 4) == "" + doAssert expandTabs("", 0) == "" + doAssert expandTabs("\t\t\t", 0) == "" + + doAssert rsplit("foo bar", seps=Whitespace) == @["foo", "bar"] + doAssert rsplit(" foo bar", seps=Whitespace, maxsplit=1) == @[" foo", "bar"] + doAssert rsplit(" foo bar ", seps=Whitespace, maxsplit=1) == @[" foo bar", ""] + doAssert rsplit(":foo:bar", sep=':') == @["", "foo", "bar"] + doAssert rsplit(":foo:bar", sep=':', maxsplit=2) == @["", "foo", "bar"] + doAssert rsplit(":foo:bar", sep=':', maxsplit=3) == @["", "foo", "bar"] + doAssert rsplit("foothebar", sep="the") == @["foo", "bar"] + doAssert(unescape(r"\x013", "", "") == "\x013") doAssert join(["foo", "bar", "baz"]) == "foobarbaz" diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim index 5d302c9dc..ac25dccef 100644 --- a/lib/pure/unicode.nim +++ b/lib/pure/unicode.nim @@ -135,45 +135,62 @@ proc runeAt*(s: string, i: Natural): Rune = ## Returns the unicode character in ``s`` at byte index ``i`` fastRuneAt(s, i, result, false) -proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} = - ## Converts a rune into its UTF-8 representation +template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) = + ## Copies UTF-8 representation of `c` into the preallocated string `s` + ## starting at position `pos`. If `doInc == true`, `pos` is incremented + ## by the number of bytes that have been processed. + ## + ## To be the most efficient, make sure `s` is preallocated + ## with an additional amount equal to the byte length of + ## `c`. var i = RuneImpl(c) if i <=% 127: - result = newString(1) - result[0] = chr(i) + s.setLen(pos+1) + s[pos+0] = chr(i) + when doInc: inc(pos) elif i <=% 0x07FF: - result = newString(2) - result[0] = chr((i shr 6) or 0b110_00000) - result[1] = chr((i and ones(6)) or 0b10_0000_00) + s.setLen(pos+2) + s[pos+0] = chr((i shr 6) or 0b110_00000) + s[pos+1] = chr((i and ones(6)) or 0b10_0000_00) + when doInc: inc(pos, 2) elif i <=% 0xFFFF: - result = newString(3) - result[0] = chr(i shr 12 or 0b1110_0000) - result[1] = chr(i shr 6 and ones(6) or 0b10_0000_00) - result[2] = chr(i and ones(6) or 0b10_0000_00) + s.setLen(pos+3) + s[pos+0] = chr(i shr 12 or 0b1110_0000) + s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i and ones(6) or 0b10_0000_00) + when doInc: inc(pos, 3) elif i <=% 0x001FFFFF: - result = newString(4) - result[0] = chr(i shr 18 or 0b1111_0000) - result[1] = chr(i shr 12 and ones(6) or 0b10_0000_00) - result[2] = chr(i shr 6 and ones(6) or 0b10_0000_00) - result[3] = chr(i and ones(6) or 0b10_0000_00) + s.setLen(pos+4) + s[pos+0] = chr(i shr 18 or 0b1111_0000) + s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+3] = chr(i and ones(6) or 0b10_0000_00) + when doInc: inc(pos, 4) elif i <=% 0x03FFFFFF: - result = newString(5) - result[0] = chr(i shr 24 or 0b111110_00) - result[1] = chr(i shr 18 and ones(6) or 0b10_0000_00) - result[2] = chr(i shr 12 and ones(6) or 0b10_0000_00) - result[3] = chr(i shr 6 and ones(6) or 0b10_0000_00) - result[4] = chr(i and ones(6) or 0b10_0000_00) + s.setLen(pos+5) + s[pos+0] = chr(i shr 24 or 0b111110_00) + s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00) + s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+4] = chr(i and ones(6) or 0b10_0000_00) + when doInc: inc(pos, 5) elif i <=% 0x7FFFFFFF: - result = newString(6) - result[0] = chr(i shr 30 or 0b1111110_0) - result[1] = chr(i shr 24 and ones(6) or 0b10_0000_00) - result[2] = chr(i shr 18 and ones(6) or 0b10_0000_00) - result[3] = chr(i shr 12 and ones(6) or 0b10_0000_00) - result[4] = chr(i shr 6 and ones(6) or 0b10_0000_00) - result[5] = chr(i and ones(6) or 0b10_0000_00) + s.setLen(pos+6) + s[pos+0] = chr(i shr 30 or 0b1111110_0) + s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00) + s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00) + s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+5] = chr(i and ones(6) or 0b10_0000_00) + when doInc: inc(pos, 6) else: discard # error, exception? +proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} = + ## Converts a rune into its UTF-8 representation + result = "" + fastToUTF8Copy(c, result, 0, false) + proc `$`*(rune: Rune): string = ## Converts a Rune to a string rune.toUTF8 @@ -1352,6 +1369,136 @@ proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} = (c >= 0x20d0 and c <= 0x20ff) or (c >= 0xfe20 and c <= 0xfe2f)) +proc swapCase*(s: string): string {.noSideEffect, procvar, + rtl, extern: "nuc$1".} = + ## Swaps the case of unicode characters in `s` + ## + ## Returns a new string such that the cases of all unicode characters + ## are swapped if possible + + var + i = 0 + lastIndex = 0 + rune: Rune + + result = newString(len(s)) + + while i < len(s): + lastIndex = i + + fastRuneAt(s, i, rune) + + if rune.isUpper(): + rune = rune.toLower() + elif rune.isLower(): + rune = rune.toUpper() + + rune.fastToUTF8Copy(result, lastIndex) + +proc translate*(s: string, replacements: proc(key: string): string): string {. + rtl, extern: "nuc$1".} = + ## Translates words in a string using the `replacements` proc to substitute + ## words inside `s` with their replacements + ## + ## `replacements` is any proc that takes a word and returns + ## a new word to fill it's place. + + # Allocate memory for the new string based on the old one. + # If the new string length is less than the old, no allocations + # will be needed. If the new string length is greater than the + # old, then maybe only one allocation is needed + result = newStringOfCap(s.len) + + var + index = 0 + lastIndex = 0 + wordStart = 0 + inWord = false + rune: Rune + + while index < len(s): + lastIndex = index + + fastRuneAt(s, index, rune) + + let whiteSpace = rune.isWhiteSpace() + + if whiteSpace and inWord: + # If we've reached the end of a word + let word = s[wordStart ..< lastIndex] + result.add(replacements(word)) + result.add($rune) + + inWord = false + elif not whiteSpace and not inWord: + # If we've hit a non space character and + # are not currently in a word, track + # the starting index of the word + inWord = true + wordStart = lastIndex + elif whiteSpace: + result.add($rune) + + if wordStart < len(s) and inWord: + # Get the trailing word at the end + let word = s[wordStart .. ^1] + result.add(replacements(word)) + +proc title*(s: string): string {.noSideEffect, procvar, + rtl, extern: "nuc$1".} = + ## Converts `s` to a unicode title. + ## + ## Returns a new string such that the first character + ## in each word inside `s` is capitalized + + var + i = 0 + lastIndex = 0 + rune: Rune + + result = newString(len(s)) + + var firstRune = true + + while i < len(s): + lastIndex = i + + fastRuneAt(s, i, rune) + + if not rune.isWhiteSpace() and firstRune: + rune = rune.toUpper() + firstRune = false + elif rune.isWhiteSpace(): + firstRune = true + + rune.fastToUTF8Copy(result, lastIndex) + +proc isTitle*(s: string): bool {.noSideEffect, procvar, + rtl, extern: "nuc$1Str".}= + ## Checks whether or not `s` is a unicode title. + ## + ## Returns true if the first character in each word inside `s` + ## are upper case and there is at least one character in `s`. + if s.len() == 0: + return false + + result = true + + var + i = 0 + rune: Rune + + var firstRune = true + + while i < len(s) and result: + fastRuneAt(s, i, rune, doInc=true) + + if not rune.isWhiteSpace() and firstRune: + result = rune.isUpper() and result + firstRune = false + elif rune.isWhiteSpace(): + firstRune = true + iterator runes*(s: string): Rune = ## Iterates over any unicode character of the string ``s`` var @@ -1451,6 +1598,39 @@ when isMainModule: compared = (someString == $someRunes) doAssert compared == true + proc test_replacements(word: string): string = + case word + of "two": + return "2" + of "foo": + return "BAR" + of "βeta": + return "beta" + of "alpha": + return "αlpha" + else: + return "12345" + + doAssert translate("two not alpha foo βeta", test_replacements) == "2 12345 αlpha BAR beta" + doAssert translate(" two not foo βeta ", test_replacements) == " 2 12345 BAR beta " + + doAssert title("foo bar") == "Foo Bar" + doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma" + doAssert title("") == "" + + doAssert isTitle("Foo") + doAssert(not isTitle("Foo bar")) + doAssert(not isTitle("αlpha Βeta")) + doAssert(isTitle("Αlpha Βeta Γamma")) + doAssert(not isTitle("fFoo")) + + doAssert swapCase("FooBar") == "fOObAR" + doAssert swapCase(" ") == " " + doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA" + doAssert swapCase("a✓B") == "A✓b" + doAssert swapCase("") == "" + + doAssert reversed("Reverse this!") == "!siht esreveR" doAssert reversed("先秦兩漢") == "漢兩秦先" doAssert reversed("as⃝df̅") == "f̅ds⃝a" |