# # # Nimrod's Runtime Library # (c) Copyright 2012 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. # ## This module contains various string utility routines. ## See the module `re `_ for regular expression support. ## See the module `pegs `_ for PEG support. ## This module is available for the `JavaScript target ## `_. import parseutils {.deadCodeElim: on.} {.push debugger:off .} # the user does not want to trace a part # of the standard library! include "system/inclrtl" type TCharSet* = set[char] # for compatibility with Nim const Whitespace* = {' ', '\t', '\v', '\r', '\l', '\f'} ## All the characters that count as whitespace. Letters* = {'A'..'Z', 'a'..'z'} ## the set of letters Digits* = {'0'..'9'} ## the set of digits HexDigits* = {'0'..'9', 'A'..'F', 'a'..'f'} ## the set of hexadecimal digits IdentChars* = {'a'..'z', 'A'..'Z', '0'..'9', '_'} ## the set of characters an identifier can consist of IdentStartChars* = {'a'..'z', 'A'..'Z', '_'} ## the set of characters an identifier can start with NewLines* = {'\13', '\10'} ## the set of characters a newline terminator can start with AllChars* = {'\x00'..'\xFF'} ## A set with all the possible characters. ## ## Not very useful by its own, you can use it to create *inverted* sets to ## make the `find() proc <#find,string,set[char],int>`_ find **invalid** ## characters in strings. Example: ## ## .. code-block:: nimrod ## let invalid = AllChars - Digits ## doAssert "01234".find(invalid) == -1 ## doAssert "01A34".find(invalid) == 2 proc toLower*(c: char): char {.noSideEffect, procvar, rtl, extern: "nsuToLowerChar".} = ## Converts `c` into lower case. ## ## This works only for the letters ``A-Z``. See `unicode.toLower ## `_ for a version that works for any Unicode ## character. if c in {'A'..'Z'}: result = chr(ord(c) + (ord('a') - ord('A'))) else: result = c proc toLower*(s: string): string {.noSideEffect, procvar, rtl, extern: "nsuToLowerStr".} = ## Converts `s` into lower case. ## ## This works only for the letters ``A-Z``. See `unicode.toLower ## `_ for a version that works for any Unicode ## character. result = newString(len(s)) for i in 0..len(s) - 1: result[i] = toLower(s[i]) proc toUpper*(c: char): char {.noSideEffect, procvar, rtl, extern: "nsuToUpperChar".} = ## Converts `c` into upper case. ## ## This works only for the letters ``A-Z``. See `unicode.toUpper ## `_ for a version that works for any Unicode ## character. if c in {'a'..'z'}: result = chr(ord(c) - (ord('a') - ord('A'))) else: result = c proc toUpper*(s: string): string {.noSideEffect, procvar, rtl, extern: "nsuToUpperStr".} = ## Converts `s` into upper case. ## ## This works only for the letters ``A-Z``. See `unicode.toUpper ## `_ for a version that works for any Unicode ## character. result = newString(len(s)) for i in 0..len(s) - 1: result[i] = toUpper(s[i]) proc capitalize*(s: string): string {.noSideEffect, procvar, rtl, extern: "nsuCapitalize".} = ## Converts the first character of `s` into upper case. ## ## This works only for the letters ``A-Z``. result = toUpper(s[0]) & substr(s, 1) proc normalize*(s: string): string {.noSideEffect, procvar, rtl, extern: "nsuNormalize".} = ## Normalizes the string `s`. ## ## That means to convert it to lower case and remove any '_'. This is needed ## for Nimrod identifiers for example. result = newString(s.len) var j = 0 for i in 0..len(s) - 1: if s[i] in {'A'..'Z'}: result[j] = chr(ord(s[i]) + (ord('a') - ord('A'))) inc j elif s[i] != '_': result[j] = s[i] inc j if j != s.len: setLen(result, j) proc cmpIgnoreCase*(a, b: string): int {.noSideEffect, rtl, extern: "nsuCmpIgnoreCase", procvar, operator: 4.} = ## Compares two strings in a case insensitive manner. Returns: ## ## | 0 iff a == b ## | < 0 iff a < b ## | > 0 iff a > b var i = 0 var m = min(a.len, b.len) while i < m: result = ord(toLower(a[i])) - ord(toLower(b[i])) if result != 0: return inc(i) result = a.len - b.len {.push checks: off, line_trace: off .} # this is a hot-spot in the compiler! # thus we compile without checks here proc cmpIgnoreStyle*(a, b: string): int {.noSideEffect, rtl, extern: "nsuCmpIgnoreStyle", procvar, operator: 3.} = ## Compares two strings normalized (i.e. case and ## underscores do not matter). Returns: ## ## | 0 iff a == b ## | < 0 iff a < b ## | > 0 iff a > b var i = 0 var j = 0 while true: while a[i] == '_': inc(i) while b[j] == '_': inc(j) # BUGFIX: typo var aa = toLower(a[i]) var bb = toLower(b[j]) result = ord(aa) - ord(bb) if result != 0 or aa == '\0': break inc(i) inc(j) {.pop.} proc strip*(s: string, leading = true, trailing = true): string {.noSideEffect, rtl, extern: "nsuStrip", operator: 5.} = ## Strips whitespace from `s` and returns the resulting string. ## ## If `leading` is true, leading whitespace is stripped. ## If `trailing` is true, trailing whitespace is stripped. const chars: set[char] = Whitespace var first = 0 last = len(s)-1 if leading: while s[first] in chars: inc(first) if trailing: while last >= 0 and s[last] in chars: dec(last) result = substr(s, first, last) proc toOctal*(c: char): string {.noSideEffect, rtl, extern: "nsuToOctal".} = ## Converts a character `c` to its octal representation. ## ## The resulting string may not have a leading zero. Its length is always ## exactly 3. result = newString(3) var val = ord(c) for i in countdown(2, 0): result[i] = chr(val mod 8 + ord('0')) val = val div 8 iterator split*(s: string, seps: set[char] = Whitespace): string = ## Splits the string `s` into substrings using a group of separators. ## ## Substrings are separated by a substring containing only `seps`. Note ## that whole sequences of characters found in ``seps`` will be counted as ## a single split point and leading/trailing separators will be ignored. ## The following example: ## ## .. code-block:: nimrod ## for word in split(" this is an example "): ## writeln(stdout, word) ## ## ...generates this output: ## ## .. code-block:: ## "this" ## "is" ## "an" ## "example" ## ## And the following code: ## ## .. code-block:: nimrod ## for word in split(";;this;is;an;;example;;;", {';'}): ## writeln(stdout, word) ## ## ...produces the same output as the first example. The code: ## ## .. code-block:: nimrod ## let date = "2012-11-20T22:08:08.398990" ## let separators = {' ', '-', ':', 'T'} ## for number in split(date, separators): ## writeln(stdout, number) ## ## ...results in: ## ## .. code-block:: ## "2012" ## "11" ## "20" ## "22" ## "08" ## "08.398990" ## var last = 0 assert(not ('\0' in seps)) while last < len(s): while s[last] in seps: inc(last) var first = last while last < len(s) and s[last] notin seps: inc(last) # BUGFIX! if first <= last-1: yield substr(s, first, last-1) iterator split*(s: string, sep: char): string = ## Splits the string `s` into substrings using a single separator. ## ## Substrings are separated by the character `sep`. ## Unlike the version of the iterator which accepts a set of separator ## characters, this proc will not coalesce groups of the ## separator, returning a string for each found character. The code: ## ## .. code-block:: nimrod ## for word in split(";;this;is;an;;example;;;", ';'): ## writeln(stdout, word) ## ## Results in: ## ## .. code-block:: ## "" ## "" ## "this" ## "is" ## "an" ## "" ## "example" ## "" ## "" ## "" ## var last = 0 assert('\0' != sep) if len(s) > 0: # `<=` is correct here for the edge cases! while last <= len(s): var first = last while last < len(s) and s[last] != sep: inc(last) yield substr(s, first, last-1) inc(last) iterator split*(s: string, sep: string): string = ## Splits the string `s` into substrings using a string separator. ## ## Substrings are separated by the string `sep`. var last = 0 if len(s) > 0: while last <= len(s): var first = last while last < len(s) and s.substr(last, last + `_ newline ## combination (CR, LF, CR-LF) is supported. The result strings contain no ## trailing ``\n``. ## ## Example: ## ## .. code-block:: nimrod ## for line in splitLines("\nthis\nis\nan\n\nexample\n"): ## writeln(stdout, line) ## ## Results in: ## ## .. code-block:: nimrod ## "" ## "this" ## "is" ## "an" ## "" ## "example" ## "" var first = 0 var last = 0 while true: while s[last] notin {'\0', '\c', '\l'}: inc(last) yield substr(s, first, last-1) # skip newlines: if s[last] == '\l': inc(last) elif s[last] == '\c': inc(last) if s[last] == '\l': inc(last) else: break # was '\0' first = last proc splitLines*(s: string): seq[string] {.noSideEffect, rtl, extern: "nsuSplitLines".} = ## The same as the `splitLines <#splitLines.i,string>`_ iterator, but is a ## proc that returns a sequence of substrings. accumulateResult(splitLines(s)) proc countLines*(s: string): int {.noSideEffect, rtl, extern: "nsuCountLines".} = ## Returns the number of new line separators in the string `s`. ## ## This is the same as ``len(splitLines(s))``, but much more efficient ## because it doesn't modify the string creating temporal objects. Every ## `character literal `_ newline combination ## (CR, LF, CR-LF) is supported. ## ## Despite its name this proc might not actually return the *number of lines* ## in `s` because the concept of what a line is can vary. For example, a ## string like ``Hello world`` is a line of text, but the proc will return a ## value of zero because there are no newline separators. Also, text editors ## usually don't count trailing newline characters in a text file as a new ## empty line, but this proc will. var i = 0 while i < s.len: case s[i] of '\c': if s[i+1] == '\l': inc i inc result of '\l': inc result else: discard inc i proc split*(s: string, seps: set[char] = Whitespace): seq[string] {. noSideEffect, rtl, extern: "nsuSplitCharSet".} = ## The same as the `split iterator <#split.i,string,set[char]>`_, but is a ## proc that returns a sequence of substrings. accumulateResult(split(s, seps)) proc split*(s: string, sep: char): seq[string] {.noSideEffect, rtl, extern: "nsuSplitChar".} = ## The same as the `split iterator <#split.i,string,char>`_, but is a proc ## that returns a sequence of substrings. accumulateResult(split(s, sep)) proc split*(s: string, sep: string): seq[string] {.noSideEffect, rtl, extern: "nsuSplitString".} = ## Splits the string `s` into substrings using a string separator. ## ## Substrings are separated by the string `sep`. This is a wrapper around the ## `split iterator <#split.i,string,string>`_. accumulateResult(split(s, sep)) proc toHex*(x: BiggestInt, len: int): string {.noSideEffect, rtl, extern: "nsuToHex".} = ## Converts `x` to its hexadecimal representation. ## ## The resulting string will be exactly `len` characters long. No prefix like ## ``0x`` is generated. `x` is treated as an unsigned value. const HexChars = "0123456789ABCDEF" var shift: BiggestInt result = newString(len) for j in countdown(len-1, 0): result[j] = HexChars[toU32(x shr shift) and 0xF'i32] shift = shift + 4 proc intToStr*(x: int, minchars: int = 1): string {.noSideEffect, rtl, extern: "nsuIntToStr".} = ## Converts `x` to its decimal representation. ## ## The resulting string will be minimally `minchars` characters long. This is ## achieved by adding leading zeros. result = $abs(x) for i in 1 .. minchars - len(result): result = '0' & result if x < 0: result = '-' & result proc parseInt*(s: string): int {.noSideEffect, procvar, rtl, extern: "nsuParseInt".} = ## Parses a decimal integer value contained in `s`. ## ## If `s` is not a valid integer, `EInvalidValue` is raised. var L = parseutils.parseInt(s, result, 0) if L != s.len or L == 0: raise newException(EInvalidValue, "invalid integer: " & s) proc parseBiggestInt*(s: string): BiggestInt {.noSideEffect, procvar, rtl, extern: "nsuParseBiggestInt".} = ## Parses a decimal integer value contained in `s`. ## ## If `s` is not a valid integer, `EInvalidValue` is raised. var L = parseutils.parseBiggestInt(s, result, 0) if L != s.len or L == 0: raise newException(EInvalidValue, "invalid integer: " & s) proc parseFloat*(s: string): float {.noSideEffect, procvar, rtl, extern: "nsuParseFloat".} = ## Parses a decimal floating point value contained in `s`. If `s` is not ## a valid floating point number, `EInvalidValue` is raised. ``NAN``, ## ``INF``, ``-INF`` are also supported (case insensitive comparison). var L = parseutils.parseFloat(s, result, 0) if L != s.len or L == 0: raise newException(EInvalidValue, "invalid float: " & s) proc parseHexInt*(s: string): int {.noSideEffect, procvar, rtl, extern: "nsuParseHexInt".} = ## Parses a hexadecimal integer value contained in `s`. ## ## If `s` is not a valid integer, `EInvalidValue` is raised. `s` can have one ## of the following optional prefixes: ``0x``, ``0X``, ``#``. Underscores ## within `s` are ignored. var i = 0 if s[i] == '0' and (s[i+1] == 'x' or s[i+1] == 'X'): inc(i, 2) elif s[i] == '#': inc(i) while true: case s[i] of '_': inc(i) of '0'..'9': result = result shl 4 or (ord(s[i]) - ord('0')) inc(i) of 'a'..'f': result = result shl 4 or (ord(s[i]) - ord('a') + 10) inc(i) of 'A'..'F': result = result shl 4 or (ord(s[i]) - ord('A') + 10) inc(i) of '\0': break else: raise newException(EInvalidValue, "invalid integer: " & s) proc parseBool*(s: string): bool = ## Parses a value into a `bool`. ## ## If ``s`` is one of the following values: ``y, yes, true, 1, on``, then ## returns `true`. If ``s`` is one of the following values: ``n, no, false, ## 0, off``, then returns `false`. If ``s`` is something else a ## ``EInvalidValue`` exception is raised. case normalize(s) of "y", "yes", "true", "1", "on": result = true of "n", "no", "false", "0", "off": result = false else: raise newException(EInvalidValue, "cannot interpret as a bool: " & s) proc parseEnum*[T: enum](s: string): T = ## Parses an enum ``T``. ## ## Raises ``EInvalidValue`` for an invalid value in `s`. The comparison is ## done in a style insensitive way. for e in low(T)..high(T): if cmpIgnoreStyle(s, $e) == 0: return e raise newException(EInvalidValue, "invalid enum value: " & s) proc parseEnum*[T: enum](s: string, default: T): T = ## Parses an enum ``T``. ## ## Uses `default` for an invalid value in `s`. The comparison is done in a ## style insensitive way. for e in low(T)..high(T): if cmpIgnoreStyle(s, $e) == 0: return e result = default proc repeatChar*(count: int, c: char = ' '): string {.noSideEffect, rtl, extern: "nsuRepeatChar".} = ## Returns a string of length `count` consisting only of ## the character `c`. You can use this proc to left align strings. Example: ## ## .. code-block:: nimrod ## let ## width = 15 ## text1 = "Hello user!" ## text2 = "This is a very long string" ## echo text1 & repeatChar(max(0, width - text1.len)) & "|" ## echo text2 & repeatChar(max(0, width - text2.len)) & "|" result = newString(count) for i in 0..count-1: result[i] = c proc repeatStr*(count: int, s: string): string {.noSideEffect, rtl, extern: "nsuRepeatStr".} = ## Returns `s` concatenated `count` times. result = newStringOfCap(count*s.len) for i in 0..count-1: result.add(s) proc align*(s: string, count: int, padding = ' '): string {. noSideEffect, rtl, extern: "nsuAlignString".} = ## Aligns a string `s` with `padding`, so that is of length `count`. ## ## `padding` characters (by default spaces) are added before `s` resulting in ## right alignment. If ``s.len >= count``, no spaces are added and `s` is ## returned unchanged. If you need to left align a string use the `repeatChar ## proc <#repeatChar>`_. Example: ## ## .. code-block:: nimrod ## assert align("abc", 4) == " abc" ## assert align("a", 0) == "a" ## assert align("1232", 6) == " 1232" ## assert align("1232", 6, '#') == "##1232" if s.len < count: result = newString(count) let spaces = count - s.len for i in 0..spaces-1: result[i] = padding for i in spaces..count-1: result[i] = s[i-spaces] else: result = s iterator tokenize*(s: string, seps: set[char] = Whitespace): tuple[ token: string, isSep: bool] = ## Tokenizes the string `s` into substrings. ## ## Substrings are separated by a substring containing only `seps`. ## Examples: ## ## .. code-block:: nimrod ## for word in tokenize(" this is an example "): ## writeln(stdout, word) ## ## Results in: ## ## .. code-block:: nimrod ## (" ", true) ## ("this", false) ## (" ", true) ## ("is", false) ## (" ", true) ## ("an", false) ## (" ", true) ## ("example", false) ## (" ", true) var i = 0 while true: var j = i var isSep = s[j] in seps while j < s.len and (s[j] in seps) == isSep: inc(j) if j > i: yield (substr(s, i, j-1), isSep) else: break i = j proc wordWrap*(s: string, maxLineWidth = 80, splitLongWords = true, seps: set[char] = Whitespace, newLine = "\n"): string {. noSideEffect, rtl, extern: "nsuWordWrap".} = ## Word wraps `s`. result = newStringOfCap(s.len + s.len shr 6) var spaceLeft = maxLineWidth var lastSep = "" for word, isSep in tokenize(s, seps): if isSep: lastSep = word spaceLeft = spaceLeft - len(word) continue if len(word) > spaceLeft: if splitLongWords and len(word) > maxLineWidth: result.add(substr(word, 0, spaceLeft-1)) var w = spaceLeft+1 var wordLeft = len(word) - spaceLeft while wordLeft > 0: result.add(newLine) var L = min(maxLineWidth, wordLeft) spaceLeft = maxLineWidth - L result.add(substr(word, w, w+L-1)) inc(w, L) dec(wordLeft, L) else: spaceLeft = maxLineWidth - len(word) result.add(newLine) result.add(word) else: spaceLeft = spaceLeft - len(word) result.add(lastSep & word) lastSep.setLen(0) proc unindent*(s: string, eatAllIndent = false): string {. noSideEffect, rtl, extern: "nsuUnindent".} = ## Unindents `s`. result = newStringOfCap(s.len) var i = 0 var pattern = true var indent = 0 while s[i] == ' ': inc i var level = if i == 0: -1 else: i while i < s.len: if s[i] == ' ': if i > 0 and s[i-1] in {'\l', '\c'}: pattern = true indent = 0 if pattern: inc(indent) if indent > level and not eatAllIndent: result.add(s[i]) if level < 0: level = indent else: # a space somewhere: do not delete result.add(s[i]) else: pattern = false result.add(s[i]) inc i proc startsWith*(s, prefix: string): bool {.noSideEffect, rtl, extern: "nsuStartsWith".} = ## Returns true iff ``s`` starts with ``prefix``. ## ## If ``prefix == ""`` true is returned. var i = 0 while true: if prefix[i] == '\0': return true if s[i] != prefix[i]: return false inc(i) proc endsWith*(s, suffix: string): bool {.noSideEffect, rtl, extern: "nsuEndsWith".} = ## Returns true iff ``s`` ends with ``suffix``. ## ## If ``suffix == ""`` true is returned. var i = 0 var j = len(s) - len(suffix) while i+j <% s.len: if s[i+j] != suffix[i]: return false inc(i) if suffix[i] == '\0': return true proc continuesWith*(s, substr: string, start: int): bool {.noSideEffect, rtl, extern: "nsuContinuesWith".} = ## Returns true iff ``s`` continues with ``substr`` at position ``start``. ## ## If ``substr == ""`` true is returned. var i = 0 while true: if substr[i] == '\0': return true if s[i+start] != substr[i]: return false inc(i) proc addSep*(dest: var string, sep = ", ", startLen = 0) {.noSideEffect, inline.} = ## Adds a separator to `dest` only if its length is bigger than `startLen`. ## ## A shorthand for: ## ## .. code-block:: nimrod ## if dest.len > startLen: add(dest, sep) ## ## This is often useful for generating some code where the items need to ## be *separated* by `sep`. `sep` is only added if `dest` is longer than ## `startLen`. The following example creates a string describing ## an array of integers: ## ## .. code-block:: nimrod ## var arr = "[" ## for x in items([2, 3, 5, 7, 11]): ## addSep(arr, startLen=len("[")) ## add(arr, $x) ## add(arr, "]") if dest.len > startLen: add(dest, sep) proc allCharsInSet*(s: string, theSet: TCharSet): bool = ## Returns true iff each character of `s` is in the set `theSet`. for c in items(s): if c notin theSet: return false return true proc abbrev*(s: string, possibilities: openArray[string]): int = ## Returns the index of the first item in `possibilities` if not ambiguous. ## ## Returns -1 if no item has been found and -2 if multiple items match. result = -1 # none found for i in 0..possibilities.len-1: if possibilities[i].startsWith(s): if possibilities[i] == s: # special case: exact match shouldn't be ambiguous return i if result >= 0: return -2 # ambiguous result = i # --------------------------------------------------------------------------- proc join*(a: openArray[string], sep: string): string {. noSideEffect, rtl, extern: "nsuJoinSep".} = ## Concatenates all strings in `a` separating them with `sep`. if len(a) > 0: var L = sep.len * (a.len-1) for i in 0..high(a): inc(L, a[i].len) result = newStringOfCap(L) add(result, a[0]) for i in 1..high(a): add(result, sep) add(result, a[i]) else: result = "" proc join*(a: openArray[string]): string {. noSideEffect, rtl, extern: "nsuJoin".} = ## Concatenates all strings in `a`. if len(a) > 0: var L = 0 for i in 0..high(a): inc(L, a[i].len) result = newStringOfCap(L) for i in 0..high(a): add(result, a[i]) else: result = "" type TSkipTable = array[char, int] proc preprocessSub(sub: string, a: var TSkipTable) = var m = len(sub) for i in 0..0xff: a[chr(i)] = m+1 for i in 0..m-1: a[sub[i]] = m-i proc findAux(s, sub: string, start: int, a: TSkipTable): int = # Fast "quick search" algorithm: var m = len(sub) n = len(s) # search: var j = start while j <= n - m: block match: for k in 0..m-1: if sub[k] != s[k+j]: break match return j inc(j, a[s[j+m]]) return -1 proc find*(s, sub: string, start: int = 0): int {.noSideEffect, rtl, extern: "nsuFindStr", operator: 6.} = ## Searches for `sub` in `s` starting at position `start`. ## ## Searching is case-sensitive. If `sub` is not in `s`, -1 is returned. var a {.noinit.}: TSkipTable preprocessSub(sub, a) result = findAux(s, sub, start, a) proc find*(s: string, sub: char, start: int = 0): int {.noSideEffect, rtl, extern: "nsuFindChar".} = ## Searches for `sub` in `s` starting at position `start`. ## ## Searching is case-sensitive. If `sub` is not in `s`, -1 is returned. for i in start..len(s)-1: if sub == s[i]: return i return -1 proc find*(s: string, chars: set[char], start: int = 0): int {.noSideEffect, rtl, extern: "nsuFindCharSet".} = ## Searches for `chars` in `s` starting at position `start`. ## ## If `s` contains none of the characters in `chars`, -1 is returned. for i in start..s.len-1: if s[i] in chars: return i return -1 proc rfind*(s, sub: string, start: int = -1): int {.noSideEffect.} = ## Searches for `sub` in `s` in reverse, starting at `start` and going ## backwards to 0. ## ## Searching is case-sensitive. If `sub` is not in `s`, -1 is returned. let realStart = if start == -1: s.len else: start for i in countdown(realStart-sub.len, 0): for j in 0..sub.len-1: result = i if sub[j] != s[i+j]: result = -1 break if result != -1: return return -1 proc quoteIfContainsWhite*(s: string): string {.deprecated.} = ## Returns ``'"' & s & '"'`` if `s` contains a space and does not ## start with a quote, else returns `s`. ## ## **DEPRECATED** as it was confused for shell quoting function. For this ## application use `osproc.quoteShell `_. if find(s, {' ', '\t'}) >= 0 and s[0] != '"': result = '"' & s & '"' else: result = s proc contains*(s: string, c: char): bool {.noSideEffect.} = ## Same as ``find(s, c) >= 0``. return find(s, c) >= 0 proc contains*(s, sub: string): bool {.noSideEffect.} = ## Same as ``find(s, sub) >= 0``. return find(s, sub) >= 0 proc contains*(s: string, chars: set[char]): bool {.noSideEffect.} = ## Same as ``find(s, chars) >= 0``. return find(s, chars) >= 0 proc replace*(s, sub: string, by = ""): string {.noSideEffect, rtl, extern: "nsuReplaceStr", operator: 1.} = ## Replaces `sub` in `s` by the string `by`. var a {.noinit.}: TSkipTable result = "" preprocessSub(sub, a) var i = 0 while true: var j = findAux(s, sub, i, a) if j < 0: break add result, substr(s, i, j - 1) add result, by i = j + len(sub) # copy the rest: add result, substr(s, i) proc replace*(s: string, sub, by: char): string {.noSideEffect, rtl, extern: "nsuReplaceChar".} = ## Replaces `sub` in `s` by the character `by`. ## ## Optimized version of `replace <#replace,string,string>`_ for characters. result = newString(s.len) var i = 0 while i < s.len: if s[i] == sub: result[i] = by else: result[i] = s[i] inc(i) proc replaceWord*(s, sub: string, by = ""): string {.noSideEffect, rtl, extern: "nsuReplaceWord".} = ## Replaces `sub` in `s` by the string `by`. ## ## Each occurance of `sub` has to be surrounded by word boundaries ## (comparable to ``\\w`` in regular expressions), otherwise it is not ## replaced. const wordChars = {'a'..'z', 'A'..'Z', '0'..'9', '_', '\128'..'\255'} var a {.noinit.}: TSkipTable result = "" preprocessSub(sub, a) var i = 0 while true: var j = findAux(s, sub, i, a) if j < 0: break # word boundary? if (j == 0 or s[j-1] notin wordChars) and (j+sub.len >= s.len or s[j+sub.len] notin wordChars): add result, substr(s, i, j - 1) add result, by i = j + len(sub) else: add result, substr(s, i, j) i = j + 1 # copy the rest: add result, substr(s, i) proc delete*(s: var string, first, last: int) {.noSideEffect, rtl, extern: "nsuDelete".} = ## Deletes in `s` the characters at position `first` .. `last`. ## ## This modifies `s` itself, it does not return a copy. var i = first var j = last+1 var newLen = len(s)-j+i while i < newLen: s[i] = s[j] inc(i) inc(j) setLen(s, newLen) proc parseOctInt*(s: string): int {.noSideEffect, rtl, extern: "nsuParseOctInt".} = ## Parses an octal integer value contained in `s`. ## ## If `s` is not a valid integer, `EInvalidValue` is raised. `s` can have one ## of the following optional prefixes: ``0o``, ``0O``. Underscores within ## `s` are ignored. var i = 0 if s[i] == '0' and (s[i+1] == 'o' or s[i+1] == 'O'): inc(i, 2) while true: case s[i] of '_': inc(i) of '0'..'7': result = result shl 3 or (ord(s[i]) - ord('0')) inc(i) of '\0': break else: raise newException(EInvalidValue, "invalid integer: " & s) proc toOct*(x: BiggestInt, len: int): string {.noSideEffect, rtl, extern: "nsuToOct".} = ## Converts `x` into its octal representation. ## ## The resulting string is always `len` characters long. No leading ``0o`` ## prefix is generated. var mask: BiggestInt = 7 shift: BiggestInt = 0 assert(len > 0) result = newString(len) for j in countdown(len-1, 0): result[j] = chr(int((x and mask) shr shift) + ord('0')) shift = shift + 3 mask = mask shl 3 proc toBin*(x: BiggestInt, len: int): string {.noSideEffect, rtl, extern: "nsuToBin".} = ## Converts `x` into its binary representation. ## ## The resulting string is always `len` characters long. No leading ``0b`` ## prefix is generated. var mask: BiggestInt = 1 shift: BiggestInt = 0 assert(len > 0) result = newString(len) for j in countdown(len-1, 0): result[j] = chr(int((x and mask) shr shift) + ord('0')) shift = shift + 1 mask = mask shl 1 proc insertSep*(s: string, sep = '_', digits = 3): string {.noSideEffect, rtl, extern: "nsuInsertSep".} = ## Inserts the separator `sep` after `digits` digits from right to left. ## ## Even though the algorithm works with any string `s`, it is only useful ## if `s` contains a number. ## Example: ``insertSep("1000000") == "1_000_000"`` var L = (s.len-1) div digits + s.len result = newString(L) var j = 0 dec(L) for i in countdown(len(s)-1, 0): if j == digits: result[L] = sep dec(L) j = 0 result[L] = s[i] inc(j) dec(L) proc escape*(s: string, prefix = "\"", suffix = "\""): string {.noSideEffect, rtl, extern: "nsuEscape".} = ## Escapes a string `s`. ## ## This does these operations (at the same time): ## * replaces any ``\`` by ``\\`` ## * replaces any ``'`` by ``\'`` ## * replaces any ``"`` by ``\"`` ## * replaces any other character in the set ``{'\0'..'\31', '\128'..'\255'}`` ## by ``\xHH`` where ``HH`` is its hexadecimal value. ## The procedure has been designed so that its output is usable for many ## different common syntaxes. The resulting string is prefixed with ## `prefix` and suffixed with `suffix`. Both may be empty strings. result = newStringOfCap(s.len + s.len shr 2) result.add(prefix) for c in items(s): case c of '\0'..'\31', '\128'..'\255': add(result, "\\x") add(result, toHex(ord(c), 2)) of '\\': add(result, "\\\\") of '\'': add(result, "\\'") of '\"': add(result, "\\\"") else: add(result, c) add(result, suffix) proc unescape*(s: string, prefix = "\"", suffix = "\""): string {.noSideEffect, rtl, extern: "nsuUnescape".} = ## Unescapes a string `s`. ## ## This complements `escape <#escape>`_ as it performs the opposite ## operations. ## ## If `s` does not begin with ``prefix`` and end with ``suffix`` a ## EInvalidValue exception will be raised. result = newStringOfCap(s.len) var i = 0 if s[0 .. prefix.len-1] != prefix: raise newException(EInvalidValue, "String does not start with a prefix of: " & prefix) i.inc() while true: if i == s.len-suffix.len: break case s[i] of '\\': case s[i+1]: of 'x': let j = parseHexInt(s[i+2 .. i+3]) result.add(chr(j)) inc(i, 2) of '\\': result.add('\\') of '\'': result.add('\'') of '\"': result.add('\"') else: result.add("\\" & s[i+1]) inc(i) of '\0': break else: result.add(s[i]) i.inc() if s[i .. -1] != suffix: raise newException(EInvalidValue, "String does not end with a suffix of: " & suffix) proc validIdentifier*(s: string): bool {.noSideEffect, rtl, extern: "nsuValidIdentifier".} = ## Returns true if `s` is a valid identifier. ## ## A valid identifier starts with a character of the set `IdentStartChars` ## and is followed by any number of characters of the set `IdentChars`. if s[0] in IdentStartChars: for i in 1..s.len-1: if s[i] notin IdentChars: return false return true proc editDistance*(a, b: string): int {.noSideEffect, rtl, extern: "nsuEditDistance".} = ## Returns the edit distance between `a` and `b`. ## ## This uses the `Levenshtein`:idx: distance algorithm with only a linear ## memory overhead. This implementation is highly optimized! var len1 = a.len var len2 = b.len if len1 > len2: # make `b` the longer string return editDistance(b, a) # strip common prefix: var s = 0 while a[s] == b[s] and a[s] != '\0': inc(s) dec(len1) dec(len2) # strip common suffix: while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]: dec(len1) dec(len2) # trivial cases: if len1 == 0: return len2 if len2 == 0: return len1 # another special case: if len1 == 1: for j in s..len2-1: if a[s] == b[j]: return len2 - 1 return len2 inc(len1) inc(len2) var half = len1 shr 1 # initalize first row: #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int))) var row: seq[int] newSeq(row, len2) var e = s + len2 - 1 # end marker for i in 1..len2 - half - 1: row[i] = i row[0] = len1 - half - 1 for i in 1 .. len1 - 1: var char1 = a[i + s - 1] var char2p: int var D, x: int var p: int if i >= len1 - half: # skip the upper triangle: var offset = i - len1 + half char2p = offset p = offset var c3 = row[p] + ord(char1 != b[s + char2p]) inc(p) inc(char2p) x = row[p] + 1 D = x if x > c3: x = c3 row[p] = x inc(p) else: p = 1 char2p = 0 D = i x = i if i <= half + 1: # skip the lower triangle: e = len2 + i - half - 2 # main: while p <= e: dec(D) var c3 = D + ord(char1 != b[char2p + s]) inc(char2p) inc(x) if x > c3: x = c3 D = row[p] + 1 if x > D: x = D row[p] = x inc(p) # lower triangle sentinel: if i <= half: dec(D) var c3 = D + ord(char1 != b[char2p + s]) inc(x) if x > c3: x = c3 row[p] = x result = row[e] #dealloc(row) # floating point formating: proc c_sprintf(buf, frmt: cstring) {.header: "", importc: "sprintf", varargs, noSideEffect.} type TFloatFormat* = enum ## the different modes of floating point formating ffDefault, ## use the shorter floating point notation ffDecimal, ## use decimal floating point notation ffScientific ## use scientific notation (using ``e`` character) proc formatBiggestFloat*(f: BiggestFloat, format: TFloatFormat = ffDefault, precision: range[0..32] = 16): string {. noSideEffect, operator: 2, rtl, extern: "nsu$1".} = ## Converts a floating point value `f` to a string. ## ## If ``format == ffDecimal`` then precision is the number of digits to ## be printed after the decimal point. ## If ``format == ffScientific`` then precision is the maximum number ## of significant digits to be printed. ## `precision`'s default value is the maximum number of meaningful digits ## after the decimal point for Nimrod's ``biggestFloat`` type. ## ## If ``precision == 0``, it tries to format it nicely. const floatFormatToChar: array[TFloatFormat, char] = ['g', 'f', 'e'] var frmtstr {.noinit.}: array[0..5, char] buf {.noinit.}: array[0..2500, char] frmtstr[0] = '%' if precision > 0: frmtstr[1] = '#' frmtstr[2] = '.' frmtstr[3] = '*' frmtstr[4] = floatFormatToChar[format] frmtstr[5] = '\0' c_sprintf(buf, frmtstr, precision, f) else: frmtstr[1] = floatFormatToChar[format] frmtstr[2] = '\0' c_sprintf(buf, frmtstr, f) result = $buf proc formatFloat*(f: float, format: TFloatFormat = ffDefault, precision: range[0..32] = 16): string {. noSideEffect, operator: 2, rtl, extern: "nsu$1".} = ## Converts a floating point value `f` to a string. ## ## If ``format == ffDecimal`` then precision is the number of digits to ## be printed after the decimal point. ## If ``format == ffScientific`` then precision is the maximum number ## of significant digits to be printed. ## `precision`'s default value is the maximum number of meaningful digits ## after the decimal point for Nimrod's ``float`` type. result = formatBiggestFloat(f, format, precision) proc formatSize*(bytes: BiggestInt, decimalSep = '.'): string = ## Rounds and formats `bytes`. Examples: ## ## .. code-block:: nimrod ## ## formatSize(1'i64 shl 31 + 300'i64) == "2.204GB" ## formatSize(4096) == "4KB" ## template frmt(a, b, c: expr): expr = let bs = $b insertSep($a) & decimalSep & bs.substr(0, 2) & c let gigabytes = bytes shr 30 let megabytes = bytes shr 20 let kilobytes = bytes shr 10 if gigabytes != 0: result = frmt(gigabytes, megabytes, "GB") elif megabytes != 0: result = frmt(megabytes, kilobytes, "MB") elif kilobytes != 0: result = frmt(kilobytes, bytes, "KB") else: result = insertSep($bytes) & "B" proc findNormalized(x: string, inArray: openArray[string]): int = var i = 0 while i < high(inArray): if cmpIgnoreStyle(x, inArray[i]) == 0: return i inc(i, 2) # incrementing by 1 would probably lead to a # security hole... return -1 proc invalidFormatString() {.noinline.} = raise newException(EInvalidValue, "invalid format string") proc addf*(s: var string, formatstr: string, a: varargs[string, `$`]) {. noSideEffect, rtl, extern: "nsuAddf".} = ## The same as ``add(s, formatstr % a)``, but more efficient. const PatternChars = {'a'..'z', 'A'..'Z', '0'..'9', '\128'..'\255', '_'} var i = 0 var num = 0 while i < len(formatstr): if formatstr[i] == '$': case formatstr[i+1] # again we use the fact that strings # are zero-terminated here of '#': if num >% a.high: invalidFormatString() add s, a[num] inc i, 2 inc num of '$': add s, '$' inc(i, 2) of '1'..'9', '-': var j = 0 inc(i) # skip $ var negative = formatstr[i] == '-' if negative: inc i while formatstr[i] in Digits: j = j * 10 + ord(formatstr[i]) - ord('0') inc(i) let idx = if not negative: j-1 else: a.len-j if idx >% a.high: invalidFormatString() add s, a[idx] of '{': var j = i+1 while formatstr[j] notin {'\0', '}'}: inc(j) var x = findNormalized(substr(formatstr, i+2, j-1), a) if x >= 0 and x < high(a): add s, a[x+1] else: invalidFormatString() i = j+1 of 'a'..'z', 'A'..'Z', '\128'..'\255', '_': var j = i+1 while formatstr[j] in PatternChars: inc(j) var x = findNormalized(substr(formatstr, i+1, j-1), a) if x >= 0 and x < high(a): add s, a[x+1] else: invalidFormatString() i = j else: invalidFormatString() else: add s, formatstr[i] inc(i) proc `%` *(formatstr: string, a: openArray[string]): string {.noSideEffect, rtl, extern: "nsuFormatOpenArray".} = ## Interpolates a format string with the values from `a`. ## ## The `substitution`:idx: operator performs string substitutions in ## `formatstr` and returns a modified `formatstr`. This is often called ## `string interpolation`:idx:. ## ## This is best explained by an example: ## ## .. code-block:: nimrod ## "$1 eats $2." % ["The cat", "fish"] ## ## Results in: ## ## .. code-block:: nimrod ## "The cat eats fish." ## ## The substitution variables (the thing after the ``$``) are enumerated ## from 1 to ``a.len``. ## To produce a verbatim ``$``, use ``$$``. ## The notation ``$#`` can be used to refer to the next substitution ## variable: ## ## .. code-block:: nimrod ## "$# eats $#." % ["The cat", "fish"] ## ## Substitution variables can also be words (that is ## ``[A-Za-z_]+[A-Za-z0-9_]*``) in which case the arguments in `a` with even ## indices are keys and with odd indices are the corresponding values. ## An example: ## ## .. code-block:: nimrod ## "$animal eats $food." % ["animal", "The cat", "food", "fish"] ## ## Results in: ## ## .. code-block:: nimrod ## "The cat eats fish." ## ## The variables are compared with `cmpIgnoreStyle`. `EInvalidValue` is ## raised if an ill-formed format string has been passed to the `%` operator. result = newStringOfCap(formatstr.len + a.len shl 4) addf(result, formatstr, a) proc `%` *(formatstr, a: string): string {.noSideEffect, rtl, extern: "nsuFormatSingleElem".} = ## This is the same as ``formatstr % [a]``. result = newStringOfCap(formatstr.len + a.len) addf(result, formatstr, [a]) proc format*(formatstr: string, a: varargs[string, `$`]): string {.noSideEffect, rtl, extern: "nsuFormatVarargs".} = ## This is the same as ``formatstr % a`` except that it supports ## auto stringification. result = newStringOfCap(formatstr.len + a.len) addf(result, formatstr, a) {.pop.} when isMainModule: doAssert align("abc", 4) == " abc" doAssert align("a", 0) == "a" doAssert align("1232", 6) == " 1232" doAssert align("1232", 6, '#') == "##1232" echo wordWrap(""" this is a long text -- muchlongerthan10chars and here it goes""", 10, false) doAssert formatBiggestFloat(0.00000000001, ffDecimal, 11) == "0.00000000001" doAssert formatBiggestFloat(0.00000000001, ffScientific, 1) == "1.0e-11" doAssert "$# $3 $# $#" % ["a", "b", "c"] == "a c b c" echo formatSize(1'i64 shl 31 + 300'i64) # == "4,GB" echo formatSize(1'i64 shl 31) doAssert "$animal eats $food." % ["animal", "The cat", "food", "fish"] == "The cat eats fish." doAssert "-ld a-ldz -ld".replaceWord("-ld") == " a-ldz " doAssert "-lda-ldz -ld abc".replaceWord("-ld") == "-lda-ldz abc" type TMyEnum = enum enA, enB, enC, enuD, enE doAssert parseEnum[TMyEnum]("enu_D") == enuD doAssert parseEnum("invalid enum value", enC) == enC