diff options
author | Andreas Rumpf <rumpf_a@web.de> | 2009-06-08 08:06:25 +0200 |
---|---|---|
committer | Andreas Rumpf <rumpf_a@web.de> | 2009-06-08 08:06:25 +0200 |
commit | 4d4b3b1c04d41868ebb58bd9ccba7b303007e900 (patch) | |
tree | 909ed0aad0b145733521f4ac2bfb938dd4b43785 /lib/pure/strutils.nim | |
parent | ce88dc3e67436939b03f97e624c11ca6058fedce (diff) | |
download | Nim-4d4b3b1c04d41868ebb58bd9ccba7b303007e900.tar.gz |
version0.7.10
Diffstat (limited to 'lib/pure/strutils.nim')
-rw-r--r-- | lib/pure/strutils.nim | 975 |
1 files changed, 975 insertions, 0 deletions
diff --git a/lib/pure/strutils.nim b/lib/pure/strutils.nim new file mode 100644 index 000000000..a4895110b --- /dev/null +++ b/lib/pure/strutils.nim @@ -0,0 +1,975 @@ +# +# +# Nimrod's Runtime Library +# (c) Copyright 2009 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module contains various string utility routines. +## See the module `regexprs` for regular expression support. +## All the routines here are avaiable for the EMCAScript target too! + +{.deadCodeElim: on.} + +{.push debugger:off .} # the user does not want to trace a part + # of the standard library! + +# copied from excpt.nim, because I don't want to make this template public +template newException(exceptn, message: expr): expr = + block: # open a new scope + var + e: ref exceptn + new(e) + e.msg = message + e + + +type + TCharSet* = set[char] # for compability for Nim + +const + Whitespace* = {' ', '\t', '\v', '\r', '\l', '\f'} + ## All the characters that count as whitespace. + + Letters* = {'A'..'Z', 'a'..'z'} + ## the set of letters + + Digits* = {'0'..'9'} + ## the set of digits + + IdentChars* = {'a'..'z', 'A'..'Z', '0'..'9', '_'} + ## the set of characters an identifier can consist of + + IdentStartChars* = {'a'..'z', 'A'..'Z', '_'} + ## the set of characters an identifier can start with + + strStart* = 0 # this is only for bootstraping + # XXX: remove this someday + nl* = "\n" # this is only for bootstraping XXX: remove this somehow + +proc `%` *(formatstr: string, a: openarray[string]): string {.noSideEffect.} + ## The `substitution`:idx: operator performs string substitutions in + ## `formatstr` and returns a modified `formatstr`. This is often called + ## `string interpolation`:idx:. + ## + ## This is best explained by an example: + ## + ## .. code-block:: nimrod + ## "$1 eats $2." % ["The cat", "fish"] + ## + ## Results in: + ## + ## .. code-block:: nimrod + ## "The cat eats fish." + ## + ## The substitution variables (the thing after the ``$``) + ## are enumerated from 1 to 9. + ## Substitution variables can also be words (that is + ## ``[A-Za-z_]+[A-Za-z0-9_]*``) in which case the arguments in `a` with even + ## indices are keys and with odd indices are the corresponding values. + ## An example: + ## + ## .. code-block:: nimrod + ## "$animal eats $food." % ["animal", "The cat", "food", "fish"] + ## + ## Results in: + ## + ## .. code-block:: nimrod + ## "The cat eats fish." + ## + ## The variables are compared with `cmpIgnoreStyle`. `EInvalidValue` is + ## raised if an ill-formed format string has been passed to the `%` operator. + +proc `%` *(formatstr, a: string): string {.noSideEffect.} + ## This is the same as ``formatstr % [a]``. + +proc addf*(s: var string, formatstr: string, a: openarray[string]) + ## The same as ``add(s, formatstr % a)``, but more efficient. + +proc strip*(s: string, leading = true, trailing = true): string {.noSideEffect.} + ## Strips whitespace from `s` and returns the resulting string. + ## If `leading` is true, leading whitespace is stripped. + ## If `trailing` is true, trailing whitespace is stripped. + +proc toLower*(s: string): string {.noSideEffect.} + ## Converts `s` into lower case. This works only for the letters A-Z. + ## See `unicode.toLower` for a version that works for any Unicode character. + +proc toLower*(c: Char): Char {.noSideEffect.} + ## Converts `c` into lower case. This works only for the letters A-Z. + ## See `unicode.toLower` for a version that works for any Unicode character. + +proc toUpper*(s: string): string {.noSideEffect.} + ## Converts `s` into upper case. This works only for the letters a-z. + ## See `unicode.toUpper` for a version that works for any Unicode character. + +proc toUpper*(c: Char): Char {.noSideEffect.} + ## Converts `c` into upper case. This works only for the letters a-z. + ## See `unicode.toUpper` for a version that works for any Unicode character. + +proc capitalize*(s: string): string {.noSideEffect.} + ## Converts the first character of `s` into upper case. + ## This works only for the letters a-z. + +proc normalize*(s: string): string {.noSideEffect.} + ## Normalizes the string `s`. That means to convert it to lower case and + ## remove any '_'. This is needed for Nimrod identifiers for example. + +proc findSubStr*(sub, s: string, start: int = 0): int {. + noSideEffect, deprecated.} + ## Searches for `sub` in `s` starting at position `start`. Searching is + ## case-sensitive. If `sub` is not in `s`, -1 is returned. + ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that + ## this has a different parameter order. + +proc findSubStr*(sub: char, s: string, start: int = 0): int {. + noSideEffect, deprecated.} + ## Searches for `sub` in `s` starting at position `start`. Searching is + ## case-sensitive. If `sub` is not in `s`, -1 is returned. + ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that + ## this has a different parameter order. + +proc findChars*(chars: set[char], s: string, start: int = 0): int {. + noSideEffect, deprecated.} + ## Searches for `chars` in `s` starting at position `start`. If `s` contains + ## none of the characters in `chars`, -1 is returned. + ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that + ## this has a different parameter order. + +proc find*(s, sub: string, start: int = 0): int {.noSideEffect.} + ## Searches for `sub` in `s` starting at position `start`. Searching is + ## case-sensitive. If `sub` is not in `s`, -1 is returned. + +proc find*(s: string, sub: char, start: int = 0): int {.noSideEffect.} + ## Searches for `sub` in `s` starting at position `start`. Searching is + ## case-sensitive. If `sub` is not in `s`, -1 is returned. + +proc find*(s: string, chars: set[char], start: int = 0): int {.noSideEffect.} + ## Searches for `chars` in `s` starting at position `start`. If `s` contains + ## none of the characters in `chars`, -1 is returned. + +proc replaceStr*(s, sub, by: string): string {.noSideEffect.} + ## Replaces `sub` in `s` by the string `by`. + +proc replaceStr*(s: string, sub, by: char): string {.noSideEffect.} + ## optimized version for characters. + +proc deleteStr*(s: var string, first, last: int) + ## Deletes in `s` the characters at position `first`..`last`. This modifies + ## `s` itself, it does not return a copy. + +proc toOctal*(c: char): string + ## Converts a character `c` to its octal representation. The resulting + ## string may not have a leading zero. Its length is always exactly 3. + +iterator split*(s: string, seps: set[char] = Whitespace): string = + ## Splits the string `s` into substrings. + ## + ## Substrings are separated by a substring containing only `seps`. + ## Examples: + ## + ## .. code-block:: nimrod + ## for word in split(" this is an example "): + ## writeln(stdout, word) + ## + ## Results in: + ## + ## .. code-block:: nimrod + ## "this" + ## "is" + ## "an" + ## "example" + ## + ## for word in split(";;this;is;an;;example;;;", {';'}): + ## writeln(stdout, word) + ## + ## produces in the same output. + var + first: int = 0 + last: int = 0 + assert(not ('\0' in seps)) + while last < len(s): + while s[last] in seps: inc(last) + first = last + while last < len(s) and s[last] not_in seps: inc(last) # BUGFIX! + yield copy(s, first, last-1) + +iterator split*(s: string, sep: char): string = + ## Splits the string `s` into substrings. + ## + ## Substrings are separated by the character `sep`. + ## Example: + ## + ## .. code-block:: nimrod + ## for word in split(";;this;is;an;;example;;;", ';'): + ## writeln(stdout, word) + ## + ## Results in: + ## + ## .. code-block:: nimrod + ## "" + ## "" + ## "this" + ## "is" + ## "an" + ## "" + ## "example" + ## "" + ## "" + ## "" + ## + var last = 0 + assert('\0' != sep) + if len(s) > 0: + # `<=` is correct here for the edge cases! + while last <= len(s): + var first = last + while last < len(s) and s[last] != sep: inc(last) + yield copy(s, first, last-1) + inc(last) + +iterator splitLines*(s: string): string = + ## Splits the string `s` into its containing lines. Each newline + ## combination (CR, LF, CR-LF) is supported. The result strings contain + ## no trailing ``\n``. + ## + ## Example: + ## + ## .. code-block:: nimrod + ## for line in lines("\nthis\nis\nan\n\nexample\n"): + ## writeln(stdout, line) + ## + ## Results in: + ## + ## .. code-block:: nimrod + ## "" + ## "this" + ## "is" + ## "an" + ## "" + ## "example" + ## "" + var first = 0 + var last = 0 + while true: + while s[last] notin {'\0', '\c', '\l'}: inc(last) + yield copy(s, first, last-1) + # skip newlines: + if s[last] == '\l': inc(last) + elif s[last] == '\c': + inc(last) + if s[last] == '\l': inc(last) + else: break # was '\0' + first = last + +proc splitLinesSeq*(s: string): seq[string] {.noSideEffect.} = + ## The same as `split`, but is a proc that returns a sequence of substrings. + result = @[] + for line in splitLines(s): add(result, line) + +proc splitSeq*(s: string, seps: set[char] = Whitespace): seq[string] {. + noSideEffect.} + ## The same as `split`, but is a proc that returns a sequence of substrings. + +proc splitSeq*(s: string, sep: char): seq[string] {.noSideEffect.} = + ## The same as `split`, but is a proc that returns a sequence of substrings. + result = @[] + for sub in split(s, sep): add(result, sub) + +proc cmpIgnoreCase*(a, b: string): int {.noSideEffect.} + ## Compares two strings in a case insensitive manner. Returns: + ## + ## | 0 iff a == b + ## | < 0 iff a < b + ## | > 0 iff a > b + +proc cmpIgnoreStyle*(a, b: string): int {.noSideEffect.} + ## Compares two strings normalized (i.e. case and + ## underscores do not matter). Returns: + ## + ## | 0 iff a == b + ## | < 0 iff a < b + ## | > 0 iff a > b + +proc contains*(s: string, c: char): bool {.noSideEffect.} + ## Same as ``findSubStr(c, s) >= 0``. + +proc contains*(s, sub: string): bool {.noSideEffect.} + ## Same as ``findSubStr(sub, s) >= 0``. + +proc contains*(s: string, chars: set[char]): bool {.noSideEffect.} + ## Same as ``findChars(s, chars) >= 0``. + +proc toHex*(x: BiggestInt, len: int): string {.noSideEffect.} + ## Converts `x` to its hexadecimal representation. The resulting string + ## will be exactly `len` characters long. No prefix like ``0x`` + ## is generated. `x` is treated as unsigned value. + +proc intToStr*(x: int, minchars: int = 1): string + ## Converts `x` to its decimal representation. The resulting string + ## will be minimally `minchars` characters long. This is achieved by + ## adding leading zeros. + +proc ParseInt*(s: string): int {.noSideEffect.} + ## Parses a decimal integer value contained in `s`. If `s` is not + ## a valid integer, `EInvalidValue` is raised. + # XXX: make this biggestint! + +proc ParseBiggestInt*(s: string): biggestInt {.noSideEffect.} + ## Parses a decimal integer value contained in `s`. If `s` is not + ## a valid integer, `EInvalidValue` is raised. + +proc ParseFloat*(s: string, start = 0): float {.noSideEffect.} + ## Parses a decimal floating point value contained in `s`. If `s` is not + ## a valid floating point number, `EInvalidValue` is raised. ``NAN``, + ## ``INF``, ``-INF`` are also supported (case insensitive comparison). + # XXX: make this biggestfloat. + +# the stringify and format operators: +proc toString*[Ty](x: Ty): string + ## This generic proc is the same as the stringify operator `$`. + +proc repeatChar*(count: int, c: Char = ' '): string + ## Returns a string of length `count` consisting only of + ## the character `c`. + +proc startsWith*(s, prefix: string): bool {.noSideEffect.} + ## Returns true iff ``s`` starts with ``prefix``. + ## If ``prefix == ""`` true is returned. + +proc endsWith*(s, suffix: string): bool {.noSideEffect.} + ## Returns true iff ``s`` ends with ``suffix``. + ## If ``suffix == ""`` true is returned. + +proc addSep*(dest: var string, sep = ", ", startLen = 0) {.noSideEffect, + inline.} = + ## A shorthand for: + ## + ## .. code-block:: nimrod + ## if dest.len > startLen: add(dest, sep) + ## + ## This is often useful for generating some code where the items need to + ## be *separated* by `sep`. `sep` is only added if `dest` is longer than + ## `startLen`. The following example creates a string describing + ## an array of integers: + ## + ## .. code-block:: nimrod + ## var arr = "[" + ## for x in items([2, 3, 5, 7, 11]): + ## addSep(arr, startLen=len("[")) + ## add(arr, $x) + ## add(arr, "]") + if dest.len > startLen: add(dest, sep) + +proc allCharsInSet*(s: string, theSet: TCharSet): bool = + ## returns true iff each character of `s` is in the set `theSet`. + for c in items(s): + if not (c in theSet): return false + return true + +proc quoteIfContainsWhite*(s: string): string = + ## returns ``'"' & s & '"'`` if `s` contains a space and does not + ## start with a quote, else returns `s` + if find(s, {' ', '\t'}) >= 0 and s[0] != '"': + result = '"' & s & '"' + else: + result = s + +proc startsWith(s, prefix: string): bool = + var i = 0 + while true: + if prefix[i] == '\0': return true + if s[i] != prefix[i]: return false + inc(i) + +proc endsWith(s, suffix: string): bool = + var + i = 0 + j = len(s) - len(suffix) + while true: + if suffix[i] == '\0': return true + if s[i+j] != suffix[i]: return false + inc(i) + +when false: + proc abbrev(s: string, possibilities: openarray[string]): int = + ## returns the index of the first item in `possibilities` if not + ## ambigious; -1 if no item has been found; -2 if multiple items + ## match. + result = -1 # none found + for i in 0..possibilities.len-1: + if possibilities[i].startsWith(s): + if result >= 0: return -2 # ambigious + result = i + +proc repeatChar(count: int, c: Char = ' '): string = + result = newString(count) + for i in 0..count-1: + result[i] = c + +proc intToStr(x: int, minchars: int = 1): string = + result = $abs(x) + for i in 1 .. minchars - len(result): + result = '0' & result + if x < 0: + result = '-' & result + +proc toString[Ty](x: Ty): string = return $x + +proc toOctal(c: char): string = + result = newString(3) + var val = ord(c) + for i in countdown(2, 0): + result[i] = Chr(val mod 8 + ord('0')) + val = val div 8 + +proc `%`(formatstr: string, a: string): string = + return formatstr % [a] + +proc findNormalized(x: string, inArray: openarray[string]): int = + var i = 0 + while i < high(inArray): + if cmpIgnoreStyle(x, inArray[i]) == 0: return i + inc(i, 2) # incrementing by 1 would probably result in a + # security whole ... + return -1 + +proc addf(s: var string, formatstr: string, a: openarray[string]) = + const PatternChars = {'a'..'z', 'A'..'Z', '0'..'9', '\128'..'\255', '_'} + var i = 0 + while i < len(formatstr): + if formatstr[i] == '$': + case formatstr[i+1] # again we use the fact that strings + # are zero-terminated here + of '$': + add s, '$' + inc(i, 2) + of '1'..'9': + var j = 0 + inc(i) # skip $ + while formatstr[i] in {'0'..'9'}: + j = j * 10 + ord(formatstr[i]) - ord('0') + inc(i) + add s, a[j - 1] + of '{': + var j = i+1 + while formatstr[j] notin {'\0', '}'}: inc(j) + var x = findNormalized(copy(formatstr, i+2, j-1), a) + if x >= 0 and x < high(a): add s, a[x+1] + else: raise newException(EInvalidValue, "invalid format string") + i = j+1 + of 'a'..'z', 'A'..'Z', '\128'..'\255', '_': + var j = i+1 + while formatstr[j] in PatternChars: inc(j) + var x = findNormalized(copy(formatstr, i+1, j-1), a) + if x >= 0 and x < high(a): add s, a[x+1] + else: raise newException(EInvalidValue, "invalid format string") + i = j + else: raise newException(EInvalidValue, "invalid format string") + else: + add s, formatstr[i] + inc(i) + +proc `%`(formatstr: string, a: openarray[string]): string = + result = "" + addf(result, formatstr, a) + +proc cmpIgnoreCase(a, b: string): int = + # makes usage of the fact that strings are zero-terminated + for i in 0..len(a)-1: + var aa = toLower(a[i]) + var bb = toLower(b[i]) + result = ord(aa) - ord(bb) + if result != 0: break + +{.push checks: off, line_trace: off .} # this is a hot-spot in the compiler! + # thus we compile without checks here + +proc cmpIgnoreStyle(a, b: string): int = + var i = 0 + var j = 0 + while True: + while a[i] == '_': inc(i) + while b[j] == '_': inc(j) # BUGFIX: typo + var aa = toLower(a[i]) + var bb = toLower(b[j]) + result = ord(aa) - ord(bb) + if result != 0 or aa == '\0': break + inc(i) + inc(j) + +{.pop.} + +# ---------- splitting ----------------------------------------------------- + +proc splitSeq(s: string, seps: set[char]): seq[string] = + result = @[] + for sub in split(s, seps): add result, sub + +# --------------------------------------------------------------------------- + +proc strip(s: string, leading = true, trailing = true): string = + const + chars: set[Char] = Whitespace + var + first = 0 + last = len(s)-1 + if leading: + while s[first] in chars: inc(first) + if trailing: + while last >= 0 and s[last] in chars: dec(last) + result = copy(s, first, last) + +proc toLower(c: Char): Char = + if c in {'A'..'Z'}: + result = chr(ord(c) + (ord('a') - ord('A'))) + else: + result = c + +proc toLower(s: string): string = + result = newString(len(s)) + for i in 0..len(s) - 1: + result[i] = toLower(s[i]) + +proc toUpper(c: Char): Char = + if c in {'a'..'z'}: + result = Chr(Ord(c) - (Ord('a') - Ord('A'))) + else: + result = c + +proc toUpper(s: string): string = + result = newString(len(s)) + for i in 0..len(s) - 1: + result[i] = toUpper(s[i]) + +proc capitalize(s: string): string = + result = toUpper(s[0]) & copy(s, 1) + +proc normalize(s: string): string = + result = "" + for i in 0..len(s) - 1: + if s[i] in {'A'..'Z'}: + add result, Chr(Ord(s[i]) + (Ord('a') - Ord('A'))) + elif s[i] != '_': + add result, s[i] + +type + TSkipTable = array[Char, int] + +proc preprocessSub(sub: string, a: var TSkipTable) = + var m = len(sub) + for i in 0..0xff: a[chr(i)] = m+1 + for i in 0..m-1: a[sub[i]] = m-i + +proc findSubStrAux(s, sub: string, start: int, a: TSkipTable): int = + # fast "quick search" algorithm: + var + m = len(sub) + n = len(s) + # search: + var j = start + while j <= n - m: + block match: + for k in 0..m-1: + if sub[k] != s[k+j]: break match + return j + inc(j, a[s[j+m]]) + return -1 + +proc findSubStr(sub, s: string, start: int = 0): int = + var a: TSkipTable + preprocessSub(sub, a) + result = findSubStrAux(s, sub, start, a) + # slow linear search: + #var + # i, j, M, N: int + #M = len(sub) + #N = len(s) + #i = start + #j = 0 + #if i >= N: + # result = -1 + #else: + # while True: + # if s[i] == sub[j]: + # Inc(i) + # Inc(j) + # else: + # i = i - j + 1 + # j = 0 + # if (j >= M): + # return i - M + # elif (i >= N): + # return -1 + +proc find(s, sub: string, start: int = 0): int = + var a: TSkipTable + preprocessSub(sub, a) + result = findSubStrAux(s, sub, start, a) + +proc find(s: string, sub: char, start: int = 0): int = + for i in start..len(s)-1: + if sub == s[i]: return i + return -1 + +proc find(s: string, chars: set[char], start: int = 0): int = + for i in start..s.len-1: + if s[i] in chars: return i + return -1 + +proc findSubStr(sub: char, s: string, start: int = 0): int = + for i in start..len(s)-1: + if sub == s[i]: return i + return -1 + +proc findChars(chars: set[char], s: string, start: int = 0): int = + for i in start..s.len-1: + if s[i] in chars: return i + return -1 + +proc contains(s: string, chars: set[char]): bool = + return find(s, chars) >= 0 + +proc contains(s: string, c: char): bool = + return find(s, c) >= 0 + +proc contains(s, sub: string): bool = + return find(s, sub) >= 0 + +proc replaceStr(s, sub, by: string): string = + var a: TSkipTable + result = "" + preprocessSub(sub, a) + var i = 0 + while true: + var j = findSubStrAux(s, sub, i, a) + if j < 0: break + add result, copy(s, i, j - 1) + add result, by + i = j + len(sub) + # copy the rest: + add result, copy(s, i) + +proc replaceStr(s: string, sub, by: char): string = + result = newString(s.len) + var i = 0 + while i < s.len: + if s[i] == sub: result[i] = by + else: result[i] = s[i] + inc(i) + +proc deleteStr(s: var string, first, last: int) = + # example: "abc___uvwxyz\0" (___ is to be deleted) + # --> first == 3, last == 5 + # s[first..] = s[last+1..] + var + i = first + while last+i+1 < len(s): + s[i] = s[last+i+1] + inc(i) + setlen(s, len(s)-(last-first+1)) + +# parsing numbers: + +proc toHex(x: BiggestInt, len: int): string = + const + HexChars = "0123456789ABCDEF" + var + shift: BiggestInt + result = newString(len) + for j in countdown(len-1, 0): + result[j] = HexChars[toU32(x shr shift) and 0xF'i32] + shift = shift + 4 + +{.push overflowChecks: on.} +# this must be compiled with overflow checking turned on: +proc rawParseInt(s: string, index: var int): BiggestInt = + # index contains the start position at proc entry; end position will be + # an index before the proc returns; index = -1 on error (no number at all) + # the problem here is that integers have an asymmetrical range: there is + # one more valid negative than prositive integer. Thus we perform the + # computation as a negative number and then change the sign at the end. + var + i: int = index # a local i is more efficient than accessing a var parameter + sign: BiggestInt = -1 + if s[i] == '+': + inc(i) + elif s[i] == '-': + inc(i) + sign = 1 + if s[i] in {'0'..'9'}: + result = 0 + while s[i] in {'0'..'9'}: + result = result * 10 - (ord(s[i]) - ord('0')) + inc(i) + while s[i] == '_': + inc(i) # underscores are allowed and ignored + result = result * sign + if s[i] == '\0': + index = i # store index back + else: + index = -1 # BUGFIX: error! + else: + index = -1 + +{.pop.} # overflowChecks + +proc parseInt(s: string): int = + var + index: int = 0 + res = rawParseInt(s, index) + if index == -1: + raise newException(EInvalidValue, "invalid integer: " & s) + elif (sizeof(int) <= 4) and + ((res < low(int)) or (res > high(int))): + raise newException(EOverflow, "overflow") + else: + result = int(res) # convert to smaller integer type + +proc ParseBiggestInt(s: string): biggestInt = + var index = 0 + result = rawParseInt(s, index) + if index == -1: + raise newException(EInvalidValue, "invalid integer: " & s) + +proc ParseFloat(s: string, start = 0): float = + var + esign = 1.0 + sign = 1.0 + i = start + exponent: int + flags: int + result = 0.0 + if s[i] == '+': inc(i) + elif s[i] == '-': + sign = -1.0 + inc(i) + if s[i] == 'N' or s[i] == 'n': + if s[i+1] == 'A' or s[i+1] == 'a': + if s[i+2] == 'N' or s[i+2] == 'n': + if s[i+3] == '\0': return NaN + raise newException(EInvalidValue, "invalid float: " & s) + if s[i] == 'I' or s[i] == 'i': + if s[i+1] == 'N' or s[i+1] == 'n': + if s[i+2] == 'F' or s[i+2] == 'f': + if s[i+3] == '\0': return Inf*sign + raise newException(EInvalidValue, "invalid float: " & s) + while s[i] in {'0'..'9'}: + # Read integer part + flags = flags or 1 + result = result * 10.0 + toFloat(ord(s[i]) - ord('0')) + inc(i) + while s[i] == '_': inc(i) + # Decimal? + if s[i] == '.': + var hd = 1.0 + inc(i) + while s[i] in {'0'..'9'}: + # Read fractional part + flags = flags or 2 + result = result * 10.0 + toFloat(ord(s[i]) - ord('0')) + hd = hd * 10.0 + inc(i) + while s[i] == '_': inc(i) + result = result / hd # this complicated way preserves precision + # Again, read integer and fractional part + if flags == 0: + raise newException(EInvalidValue, "invalid float: " & s) + # Exponent? + if s[i] in {'e', 'E'}: + inc(i) + if s[i] == '+': + inc(i) + elif s[i] == '-': + esign = -1.0 + inc(i) + if s[i] notin {'0'..'9'}: + raise newException(EInvalidValue, "invalid float: " & s) + while s[i] in {'0'..'9'}: + exponent = exponent * 10 + ord(s[i]) - ord('0') + inc(i) + while s[i] == '_': inc(i) + # Calculate Exponent + var hd = 1.0 + for j in 1..exponent: + hd = hd * 10.0 + if esign > 0.0: result = result * hd + else: result = result / hd + # Not all characters are read? + if s[i] != '\0': raise newException(EInvalidValue, "invalid float: " & s) + # evaluate sign + result = result * sign + +proc toOct*(x: BiggestInt, len: int): string = + ## converts `x` into its octal representation. The resulting string is + ## always `len` characters long. No leading ``0o`` prefix is generated. + var + mask: BiggestInt = 7 + shift: BiggestInt = 0 + assert(len > 0) + result = newString(len) + for j in countdown(len-1, 0): + result[j] = chr(int((x and mask) shr shift) + ord('0')) + shift = shift + 3 + mask = mask shl 3 + +proc toBin*(x: BiggestInt, len: int): string = + ## converts `x` into its binary representation. The resulting string is + ## always `len` characters long. No leading ``0b`` prefix is generated. + var + mask: BiggestInt = 1 + shift: BiggestInt = 0 + assert(len > 0) + result = newString(len) + for j in countdown(len-1, 0): + result[j] = chr(int((x and mask) shr shift) + ord('0')) + shift = shift + 1 + mask = mask shl 1 + +proc escape*(s: string, prefix = "\"", suffix = "\""): string = + ## Escapes a string `s`. This does these operations (at the same time): + ## * replaces any ``\`` by ``\\`` + ## * replaces any ``'`` by ``\'`` + ## * replaces any ``"`` by ``\"`` + ## * replaces any other character in the set ``{'\0'..'\31', '\128'..'\255'}`` + ## by ``\xHH`` where ``HH`` is its hexadecimal value. + ## The procedure has been designed so that its output is usable for many + ## different common syntaxes. The resulting string is prefixed with + ## ``prefix`` and suffixed with ``suffix``. Both may be empty strings. + result = prefix + for c in items(s): + case c + of '\0'..'\31', '\128'..'\255': + add(result, '\\') + add(result, toHex(ord(c), 2)) + of '\\': add(result, "\\\\") + of '\'': add(result, "\\'") + of '\"': add(result, "\\\"") + else: add(result, c) + add(result, suffix) + +proc validEmailAddress*(s: string): bool = + ## returns true if `s` seems to be a valid e-mail address. + ## The checking also uses a domain list. + const + chars = Letters + Digits + {'!','#','$','%','&', + '\'','*','+','/','=','?','^','_','`','{','}','|','~','-','.'} + var i = 0 + if s[i] notin chars or s[i] == '.': return false + while s[i] in chars: + if s[i] == '.' and s[i+1] == '.': return false + inc(i) + if s[i] != '@': return false + var j = len(s)-1 + if s[j] notin letters: return false + while j >= i and s[j] in letters: dec(j) + inc(i) # skip '@' + while s[i] in {'0'..'9', 'a'..'z', '-', '.'}: inc(i) + if s[i] != '\0': return false + + var x = copy(s, j+1) + if len(x) == 2 and x[0] in Letters and x[1] in Letters: return true + case toLower(x) + of "com", "org", "net", "gov", "mil", "biz", "info", "mobi", "name", + "aero", "jobs", "museum": return true + return false + +proc validIdentifier*(s: string): bool = + ## returns true if `s` is a valid identifier. A valid identifier starts + ## with a character of the set `IdentStartChars` and is followed by any + ## number of characters of the set `IdentChars`. + if s[0] in IdentStartChars: + for i in 1..s.len-1: + if s[i] notin IdentChars: return false + return true + +proc editDistance*(a, b: string): int = + ## returns the edit distance between `a` and `b`. This uses the Levenshtein + ## distance algorithm with only a linear memory overhead. This implementation + ## is highly optimized! + var len1 = a.len + var len2 = b.len + if len1 > len2: + # make `b` the longer string + return editDistance(b, a) + + # strip common prefix: + var s = 0 + while a[s] == b[s] and a[s] != '\0': + inc(s) + dec(len1) + dec(len2) + # strip common suffix: + while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]: + dec(len1) + dec(len2) + # trivial cases: + if len1 == 0: return len2 + if len2 == 0: return len1 + + # another special case: + if len1 == 1: + for j in s..len2-1: + if a[s] == b[j]: return len2 - 1 + return len2 + + inc(len1) + inc(len2) + var half = len1 shr 1 + # initalize first row: + #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2 * sizeof(int))) + var row: seq[int] + newSeq(row, len2) + var e = s + len2 - 1 # end marker + for i in 1..len2 - half - 1: row[i] = i + row[0] = len1 - half - 1 + for i in 1 .. len1 - 1: + var char1 = a[i + s - 1] + var char2p: int + var D, x: int + var p: int + if i >= len1 - half: + # skip the upper triangle: + var offset = i - len1 + half + char2p = offset + p = offset + var c3 = row[p] + ord(char1 != b[s + char2p]) + inc(p) + inc(char2p) + x = row[p] + 1 + D = x + if x > c3: x = c3 + row[p] = x + inc(p) + else: + p = 1 + char2p = 0 + D = i + x = i + if i <= half + 1: + # skip the lower triangle: + e = len2 + i - half - 2 + # main: + while p <= e: + dec(D) + var c3 = D + ord(char1 != b[char2p + s]) + inc(char2p) + inc(x) + if x > c3: x = c3 + D = row[p] + 1 + if x > D: x = D + row[p] = x + inc(p) + # lower triangle sentinel: + if i <= half: + dec(D) + var c3 = D + ord(char1 != b[char2p + s]) + inc(x) + if x > c3: x = c3 + row[p] = x + result = row[e] + #dealloc(row) + +{.pop.} |