diff options
Diffstat (limited to 'lib/pure/strutils.nim')
-rw-r--r-- | lib/pure/strutils.nim | 439 |
1 files changed, 307 insertions, 132 deletions
diff --git a/lib/pure/strutils.nim b/lib/pure/strutils.nim index b6edb834c..757268cd1 100644 --- a/lib/pure/strutils.nim +++ b/lib/pure/strutils.nim @@ -26,6 +26,12 @@ include "system/inclrtl" {.pop.} +# Support old split with set[char] +when defined(nimOldSplit): + {.pragma: deprecatedSplit, deprecated.} +else: + {.pragma: deprecatedSplit.} + type CharSet* {.deprecated.} = set[char] # for compatibility with Nim {.deprecated: [TCharSet: CharSet].} @@ -64,8 +70,8 @@ const ## doAssert "01234".find(invalid) == -1 ## doAssert "01A34".find(invalid) == 2 -proc isAlpha*(c: char): bool {.noSideEffect, procvar, - rtl, extern: "nsuIsAlphaChar".}= +proc isAlphaAscii*(c: char): bool {.noSideEffect, procvar, + rtl, extern: "nsuIsAlphaAsciiChar".}= ## Checks whether or not `c` is alphabetical. ## ## This checks a-z, A-Z ASCII characters only. @@ -85,27 +91,27 @@ proc isDigit*(c: char): bool {.noSideEffect, procvar, ## This checks 0-9 ASCII characters only. return c in Digits -proc isSpace*(c: char): bool {.noSideEffect, procvar, - rtl, extern: "nsuIsSpaceChar".}= +proc isSpaceAscii*(c: char): bool {.noSideEffect, procvar, + rtl, extern: "nsuIsSpaceAsciiChar".}= ## Checks whether or not `c` is a whitespace character. return c in Whitespace -proc isLower*(c: char): bool {.noSideEffect, procvar, - rtl, extern: "nsuIsLowerChar".}= +proc isLowerAscii*(c: char): bool {.noSideEffect, procvar, + rtl, extern: "nsuIsLowerAsciiChar".}= ## Checks whether or not `c` is a lower case character. ## ## This checks ASCII characters only. return c in {'a'..'z'} -proc isUpper*(c: char): bool {.noSideEffect, procvar, - rtl, extern: "nsuIsUpperChar".}= +proc isUpperAscii*(c: char): bool {.noSideEffect, procvar, + rtl, extern: "nsuIsUpperAsciiChar".}= ## Checks whether or not `c` is an upper case character. ## ## This checks ASCII characters only. return c in {'A'..'Z'} -proc isAlpha*(s: string): bool {.noSideEffect, procvar, - rtl, extern: "nsuIsAlphaStr".}= +proc isAlphaAscii*(s: string): bool {.noSideEffect, procvar, + rtl, extern: "nsuIsAlphaAsciiStr".}= ## Checks whether or not `s` is alphabetical. ## ## This checks a-z, A-Z ASCII characters only. @@ -117,7 +123,7 @@ proc isAlpha*(s: string): bool {.noSideEffect, procvar, result = true for c in s: - result = c.isAlpha() and result + result = c.isAlphaAscii() and result proc isAlphaNumeric*(s: string): bool {.noSideEffect, procvar, rtl, extern: "nsuIsAlphaNumericStr".}= @@ -149,8 +155,8 @@ proc isDigit*(s: string): bool {.noSideEffect, procvar, for c in s: result = c.isDigit() and result -proc isSpace*(s: string): bool {.noSideEffect, procvar, - rtl, extern: "nsuIsSpaceStr".}= +proc isSpaceAscii*(s: string): bool {.noSideEffect, procvar, + rtl, extern: "nsuIsSpaceAsciiStr".}= ## Checks whether or not `s` is completely whitespace. ## ## Returns true if all characters in `s` are whitespace @@ -160,10 +166,11 @@ proc isSpace*(s: string): bool {.noSideEffect, procvar, result = true for c in s: - result = c.isSpace() and result + if not c.isSpaceAscii(): + return false -proc isLower*(s: string): bool {.noSideEffect, procvar, - rtl, extern: "nsuIsLowerStr".}= +proc isLowerAscii*(s: string): bool {.noSideEffect, procvar, + rtl, extern: "nsuIsLowerAsciiStr".}= ## Checks whether or not `s` contains all lower case characters. ## ## This checks ASCII characters only. @@ -174,10 +181,10 @@ proc isLower*(s: string): bool {.noSideEffect, procvar, result = true for c in s: - result = c.isLower() and result + result = c.isLowerAscii() and result -proc isUpper*(s: string): bool {.noSideEffect, procvar, - rtl, extern: "nsuIsUpperStr".}= +proc isUpperAscii*(s: string): bool {.noSideEffect, procvar, + rtl, extern: "nsuIsUpperAsciiStr".}= ## Checks whether or not `s` contains all upper case characters. ## ## This checks ASCII characters only. @@ -188,10 +195,10 @@ proc isUpper*(s: string): bool {.noSideEffect, procvar, result = true for c in s: - result = c.isUpper() and result + result = c.isUpperAscii() and result -proc toLower*(c: char): char {.noSideEffect, procvar, - rtl, extern: "nsuToLowerChar".} = +proc toLowerAscii*(c: char): char {.noSideEffect, procvar, + rtl, extern: "nsuToLowerAsciiChar".} = ## Converts `c` into lower case. ## ## This works only for the letters ``A-Z``. See `unicode.toLower @@ -202,8 +209,8 @@ proc toLower*(c: char): char {.noSideEffect, procvar, else: result = c -proc toLower*(s: string): string {.noSideEffect, procvar, - rtl, extern: "nsuToLowerStr".} = +proc toLowerAscii*(s: string): string {.noSideEffect, procvar, + rtl, extern: "nsuToLowerAsciiStr".} = ## Converts `s` into lower case. ## ## This works only for the letters ``A-Z``. See `unicode.toLower @@ -211,10 +218,10 @@ proc toLower*(s: string): string {.noSideEffect, procvar, ## character. result = newString(len(s)) for i in 0..len(s) - 1: - result[i] = toLower(s[i]) + result[i] = toLowerAscii(s[i]) -proc toUpper*(c: char): char {.noSideEffect, procvar, - rtl, extern: "nsuToUpperChar".} = +proc toUpperAscii*(c: char): char {.noSideEffect, procvar, + rtl, extern: "nsuToUpperAsciiChar".} = ## Converts `c` into upper case. ## ## This works only for the letters ``A-Z``. See `unicode.toUpper @@ -225,8 +232,8 @@ proc toUpper*(c: char): char {.noSideEffect, procvar, else: result = c -proc toUpper*(s: string): string {.noSideEffect, procvar, - rtl, extern: "nsuToUpperStr".} = +proc toUpperAscii*(s: string): string {.noSideEffect, procvar, + rtl, extern: "nsuToUpperAsciiStr".} = ## Converts `s` into upper case. ## ## This works only for the letters ``A-Z``. See `unicode.toUpper @@ -234,14 +241,145 @@ proc toUpper*(s: string): string {.noSideEffect, procvar, ## character. result = newString(len(s)) for i in 0..len(s) - 1: - result[i] = toUpper(s[i]) + result[i] = toUpperAscii(s[i]) + +proc capitalizeAscii*(s: string): string {.noSideEffect, procvar, + rtl, extern: "nsuCapitalizeAscii".} = + ## Converts the first character of `s` into upper case. + ## + ## This works only for the letters ``A-Z``. + result = toUpperAscii(s[0]) & substr(s, 1) + +proc isSpace*(c: char): bool {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuIsSpaceChar".}= + ## Checks whether or not `c` is a whitespace character. + ## + ## **Deprecated since version 0.15.0**: use ``isSpaceAscii`` instead. + isSpaceAscii(c) + +proc isLower*(c: char): bool {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuIsLowerChar".}= + ## Checks whether or not `c` is a lower case character. + ## + ## This checks ASCII characters only. + ## + ## **Deprecated since version 0.15.0**: use ``isLowerAscii`` instead. + isLowerAscii(c) + +proc isUpper*(c: char): bool {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuIsUpperChar".}= + ## Checks whether or not `c` is an upper case character. + ## + ## This checks ASCII characters only. + ## + ## **Deprecated since version 0.15.0**: use ``isUpperAscii`` instead. + isUpperAscii(c) + +proc isAlpha*(c: char): bool {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuIsAlphaChar".}= + ## Checks whether or not `c` is alphabetical. + ## + ## This checks a-z, A-Z ASCII characters only. + ## + ## **Deprecated since version 0.15.0**: use ``isAlphaAscii`` instead. + isAlphaAscii(c) + +proc isAlpha*(s: string): bool {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuIsAlphaStr".}= + ## Checks whether or not `s` is alphabetical. + ## + ## This checks a-z, A-Z ASCII characters only. + ## Returns true if all characters in `s` are + ## alphabetic and there is at least one character + ## in `s`. + ## + ## **Deprecated since version 0.15.0**: use ``isAlphaAscii`` instead. + isAlphaAscii(s) + +proc isSpace*(s: string): bool {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuIsSpaceStr".}= + ## Checks whether or not `s` is completely whitespace. + ## + ## Returns true if all characters in `s` are whitespace + ## characters and there is at least one character in `s`. + ## + ## **Deprecated since version 0.15.0**: use ``isSpaceAscii`` instead. + isSpaceAscii(s) + +proc isLower*(s: string): bool {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuIsLowerStr".}= + ## Checks whether or not `s` contains all lower case characters. + ## + ## This checks ASCII characters only. + ## Returns true if all characters in `s` are lower case + ## and there is at least one character in `s`. + ## + ## **Deprecated since version 0.15.0**: use ``isLowerAscii`` instead. + isLowerAscii(s) + +proc isUpper*(s: string): bool {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuIsUpperStr".}= + ## Checks whether or not `s` contains all upper case characters. + ## + ## This checks ASCII characters only. + ## Returns true if all characters in `s` are upper case + ## and there is at least one character in `s`. + ## + ## **Deprecated since version 0.15.0**: use ``isUpperAscii`` instead. + isUpperAscii(s) + +proc toLower*(c: char): char {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuToLowerChar".} = + ## Converts `c` into lower case. + ## + ## This works only for the letters ``A-Z``. See `unicode.toLower + ## <unicode.html#toLower>`_ for a version that works for any Unicode + ## character. + ## + ## **Deprecated since version 0.15.0**: use ``toLowerAscii`` instead. + toLowerAscii(c) + +proc toLower*(s: string): string {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuToLowerStr".} = + ## Converts `s` into lower case. + ## + ## This works only for the letters ``A-Z``. See `unicode.toLower + ## <unicode.html#toLower>`_ for a version that works for any Unicode + ## character. + ## + ## **Deprecated since version 0.15.0**: use ``toLowerAscii`` instead. + toLowerAscii(s) + +proc toUpper*(c: char): char {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuToUpperChar".} = + ## Converts `c` into upper case. + ## + ## This works only for the letters ``A-Z``. See `unicode.toUpper + ## <unicode.html#toUpper>`_ for a version that works for any Unicode + ## character. + ## + ## **Deprecated since version 0.15.0**: use ``toUpperAscii`` instead. + toUpperAscii(c) + +proc toUpper*(s: string): string {.noSideEffect, procvar, + rtl, deprecated, extern: "nsuToUpperStr".} = + ## Converts `s` into upper case. + ## + ## This works only for the letters ``A-Z``. See `unicode.toUpper + ## <unicode.html#toUpper>`_ for a version that works for any Unicode + ## character. + ## + ## **Deprecated since version 0.15.0**: use ``toUpperAscii`` instead. + toUpperAscii(s) proc capitalize*(s: string): string {.noSideEffect, procvar, - rtl, extern: "nsuCapitalize".} = + rtl, deprecated, extern: "nsuCapitalize".} = ## Converts the first character of `s` into upper case. ## ## This works only for the letters ``A-Z``. - result = toUpper(s[0]) & substr(s, 1) + ## + ## **Deprecated since version 0.15.0**: use ``capitalizeAscii`` instead. + capitalizeAscii(s) proc normalize*(s: string): string {.noSideEffect, procvar, rtl, extern: "nsuNormalize".} = @@ -270,7 +408,7 @@ proc cmpIgnoreCase*(a, b: string): int {.noSideEffect, var i = 0 var m = min(a.len, b.len) while i < m: - result = ord(toLower(a[i])) - ord(toLower(b[i])) + result = ord(toLowerAscii(a[i])) - ord(toLowerAscii(b[i])) if result != 0: return inc(i) result = a.len - b.len @@ -291,8 +429,8 @@ proc cmpIgnoreStyle*(a, b: string): int {.noSideEffect, while true: while a[i] == '_': inc(i) while b[j] == '_': inc(j) # BUGFIX: typo - var aa = toLower(a[i]) - var bb = toLower(b[j]) + var aa = toLowerAscii(a[i]) + var bb = toLowerAscii(b[j]) result = ord(aa) - ord(bb) if result != 0 or aa == '\0': break inc(i) @@ -326,17 +464,79 @@ proc toOctal*(c: char): string {.noSideEffect, rtl, extern: "nsuToOctal".} = result[i] = chr(val mod 8 + ord('0')) val = val div 8 +proc isNilOrEmpty*(s: string): bool {.noSideEffect, procvar, rtl, extern: "nsuIsNilOrEmpty".} = + ## Checks if `s` is nil or empty. + result = len(s) == 0 + +proc isNilOrWhitespace*(s: string): bool {.noSideEffect, procvar, rtl, extern: "nsuIsNilOrWhitespace".} = + ## Checks if `s` is nil or consists entirely of whitespace characters. + if len(s) == 0: + return true + + result = true + for c in s: + if not c.isSpace(): + return false + +proc substrEq(s: string, pos: int, substr: string): bool = + var i = 0 + var length = substr.len + while i < length and s[pos+i] == substr[i]: + inc i + + return i == length + +# --------- Private templates for different split separators ----------- + +template stringHasSep(s: string, index: int, seps: set[char]): bool = + s[index] in seps + +template stringHasSep(s: string, index: int, sep: char): bool = + s[index] == sep + +template stringHasSep(s: string, index: int, sep: string): bool = + s.substrEq(index, sep) + +template splitCommon(s, sep, maxsplit, sepLen) = + ## Common code for split procedures + var last = 0 + var splits = maxsplit + + if len(s) > 0: + while last <= len(s): + var first = last + while last < len(s) and not stringHasSep(s, last, sep): + inc(last) + if splits == 0: last = len(s) + yield substr(s, first, last-1) + if splits == 0: break + dec(splits) + inc(last, sepLen) + +when defined(nimOldSplit): + template oldSplit(s, seps, maxsplit) = + ## Deprecated split[char] for transition period + var last = 0 + var splits = maxsplit + assert(not ('\0' in seps)) + while last < len(s): + while s[last] in seps: inc(last) + var first = last + while last < len(s) and s[last] notin seps: inc(last) # BUGFIX! + if first <= last-1: + if splits == 0: last = len(s) + yield substr(s, first, last-1) + if splits == 0: break + dec(splits) + iterator split*(s: string, seps: set[char] = Whitespace, maxsplit: int = -1): string = ## Splits the string `s` into substrings using a group of separators. ## - ## Substrings are separated by a substring containing only `seps`. Note - ## that whole sequences of characters found in ``seps`` will be counted as - ## a single split point and leading/trailing separators will be ignored. - ## The following example: + ## Substrings are separated by a substring containing only `seps`. ## ## .. code-block:: nim - ## for word in split(" this is an example "): + ## for word in split("this\lis an\texample"): ## writeLine(stdout, word) ## ## ...generates this output: @@ -350,7 +550,7 @@ iterator split*(s: string, seps: set[char] = Whitespace, ## And the following code: ## ## .. code-block:: nim - ## for word in split(";;this;is;an;;example;;;", {';'}): + ## for word in split("this:is;an$example", {';', ':', '$'}): ## writeLine(stdout, word) ## ## ...produces the same output as the first example. The code: @@ -371,26 +571,16 @@ iterator split*(s: string, seps: set[char] = Whitespace, ## "08" ## "08.398990" ## - var last = 0 - var splits = maxsplit - assert(not ('\0' in seps)) - while last < len(s): - while s[last] in seps: inc(last) - var first = last - while last < len(s) and s[last] notin seps: inc(last) # BUGFIX! - if first <= last-1: - if splits == 0: last = len(s) - yield substr(s, first, last-1) - if splits == 0: break - dec(splits) + when defined(nimOldSplit): + oldSplit(s, seps, maxsplit) + else: + splitCommon(s, seps, maxsplit, 1) iterator split*(s: string, sep: char, maxsplit: int = -1): string = ## Splits the string `s` into substrings using a single separator. ## ## Substrings are separated by the character `sep`. - ## Unlike the version of the iterator which accepts a set of separator - ## characters, this proc will not coalesce groups of the - ## separator, returning a string for each found character. The code: + ## The code: ## ## .. code-block:: nim ## for word in split(";;this;is;an;;example;;;", ';'): @@ -410,56 +600,27 @@ iterator split*(s: string, sep: char, maxsplit: int = -1): string = ## "" ## "" ## - var last = 0 - var splits = maxsplit - assert('\0' != sep) - if len(s) > 0: - # `<=` is correct here for the edge cases! - while last <= len(s): - var first = last - while last < len(s) and s[last] != sep: inc(last) - if splits == 0: last = len(s) - yield substr(s, first, last-1) - if splits == 0: break - dec(splits) - inc(last) - -proc substrEq(s: string, pos: int, substr: string): bool = - var i = 0 - var length = substr.len - while i < length and s[pos+i] == substr[i]: - inc i - - return i == length + splitCommon(s, sep, maxsplit, 1) iterator split*(s: string, sep: string, maxsplit: int = -1): string = ## Splits the string `s` into substrings using a string separator. ## ## Substrings are separated by the string `sep`. - var last = 0 - var splits = maxsplit - - if len(s) > 0: - while last <= len(s): - var first = last - while last < len(s) and not s.substrEq(last, sep): - inc(last) - if splits == 0: last = len(s) - yield substr(s, first, last-1) - if splits == 0: break - dec(splits) - inc(last, sep.len) - -# --------- Private templates for different rsplit separators ----------- - -template stringHasSep(s: string, index: int, seps: set[char]): bool = - s[index] in seps - -template stringHasSep(s: string, index: int, sep: char): bool = - s[index] == sep + ## The code: + ## + ## .. code-block:: nim + ## for word in split("thisDATAisDATAcorrupted", "DATA"): + ## writeLine(stdout, word) + ## + ## Results in: + ## + ## .. code-block:: + ## "this" + ## "is" + ## "corrupted" + ## -template stringHasSep(s: string, index: int, sep: string): bool = - s.substrEq(index, sep) + splitCommon(s, sep, maxsplit, sep.len) template rsplitCommon(s, sep, maxsplit, sepLen) = ## Common code for rsplit functions @@ -2123,13 +2284,13 @@ when isMainModule: doAssert " foo\n bar".indent(4, "Q") == "QQQQ foo\nQQQQ bar" - doAssert isAlpha('r') - doAssert isAlpha('A') - doAssert(not isAlpha('$')) + doAssert isAlphaAscii('r') + doAssert isAlphaAscii('A') + doAssert(not isAlphaAscii('$')) - doAssert isAlpha("Rasp") - doAssert isAlpha("Args") - doAssert(not isAlpha("$Tomato")) + doAssert isAlphaAscii("Rasp") + doAssert isAlphaAscii("Args") + doAssert(not isAlphaAscii("$Tomato")) doAssert isAlphaNumeric('3') doAssert isAlphaNumeric('R') @@ -2148,32 +2309,43 @@ when isMainModule: doAssert(not isDigit("12.33")) doAssert(not isDigit("A45b")) - doAssert isSpace('\t') - doAssert isSpace('\l') - doAssert(not isSpace('A')) + doAssert isSpaceAscii('\t') + doAssert isSpaceAscii('\l') + doAssert(not isSpaceAscii('A')) - doAssert isSpace("\t\l \v\r\f") - doAssert isSpace(" ") - doAssert(not isSpace("ABc \td")) + doAssert isSpaceAscii("\t\l \v\r\f") + doAssert isSpaceAscii(" ") + doAssert(not isSpaceAscii("ABc \td")) - doAssert isLower('a') - doAssert isLower('z') - doAssert(not isLower('A')) - doAssert(not isLower('5')) - doAssert(not isLower('&')) + doAssert(isNilOrEmpty("")) + doAssert(isNilOrEmpty(nil)) + doAssert(not isNilOrEmpty("test")) + doAssert(not isNilOrEmpty(" ")) - doAssert isLower("abcd") - doAssert(not isLower("abCD")) - doAssert(not isLower("33aa")) + doAssert(isNilOrWhitespace("")) + doAssert(isNilOrWhitespace(nil)) + doAssert(isNilOrWhitespace(" ")) + doAssert(isNilOrWhitespace("\t\l \v\r\f")) + doAssert(not isNilOrWhitespace("ABc \td")) - doAssert isUpper('A') - doAssert(not isUpper('b')) - doAssert(not isUpper('5')) - doAssert(not isUpper('%')) + doAssert isLowerAscii('a') + doAssert isLowerAscii('z') + doAssert(not isLowerAscii('A')) + doAssert(not isLowerAscii('5')) + doAssert(not isLowerAscii('&')) - doAssert isUpper("ABC") - doAssert(not isUpper("AAcc")) - doAssert(not isUpper("A#$")) + doAssert isLowerAscii("abcd") + doAssert(not isLowerAscii("abCD")) + doAssert(not isLowerAscii("33aa")) + + doAssert isUpperAscii('A') + doAssert(not isUpperAscii('b')) + doAssert(not isUpperAscii('5')) + doAssert(not isUpperAscii('%')) + + doAssert isUpperAscii("ABC") + doAssert(not isUpperAscii("AAcc")) + doAssert(not isUpperAscii("A#$")) doAssert rsplit("foo bar", seps=Whitespace) == @["foo", "bar"] doAssert rsplit(" foo bar", seps=Whitespace, maxsplit=1) == @[" foo", "bar"] @@ -2218,11 +2390,14 @@ bar bar """.unindent() == "foo\nfoo\nbar\n" - let s = " this is an example " - doAssert s.split() == @["this", "is", "an", "example"] - doAssert s.split(maxsplit=4) == @["this", "is", "an", "example"] - doAssert s.split(' ', maxsplit=4) == @["", "this", "", "", "is an example "] - doAssert s.split(" ", maxsplit=4) == @["", "this", "", "", "is an example "] + let s = " this is an example " + let s2 = ":this;is;an:example;;" + + doAssert s.split() == @["", "this", "is", "an", "example", "", ""] + doAssert s2.split(seps={':', ';'}) == @["", "this", "is", "an", "example", "", ""] + doAssert s.split(maxsplit=4) == @["", "this", "is", "an", "example "] + doAssert s.split(' ', maxsplit=1) == @["", "this is an example "] + doAssert s.split(" ", maxsplit=4) == @["", "this", "is", "an", "example "] block: # formatEng tests doAssert formatEng(0, 2, trim=false) == "0.00" |