summary refs log tree commit diff stats
path: root/lib/strutils.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/strutils.nim')
-rw-r--r-- lib/strutils.nim 312
1 files changed, 227 insertions, 85 deletions
diff --git a/lib/strutils.nim b/lib/strutils.nim
index 6189c6a88..e3a412053 100644
--- a/lib/strutils.nim
+++ b/lib/strutils.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2006 Andreas Rumpf
+#        (c) Copyright 2009 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -9,8 +9,7 @@
 
 ## This module contains various string utility routines.
 ## See the module `regexprs` for regular expression support.
-## All the routines here are avaiable for the EMCAScript target
-## too!
+## All the routines here are available for the ECMAScript target too!
 
 {.deadCodeElim: on.}
 
@@ -33,13 +32,66 @@ type
 const
   Whitespace* = {' ', '\t', '\v', '\r', '\l', '\f'}
     ## All the characters that count as whitespace.
+    
+  Letters* = {'A'..'Z', 'a'..'z'}
+    ## the set of letters
+  
+  Digits* = {'0'..'9'}
+    ## the set of digits
+  
+  IdentChars* = {'a'..'z', 'A'..'Z', '0'..'9', '_'}
+    ## the set of characters an identifier can consist of
+  
+  IdentStartChars* = {'a'..'z', 'A'..'Z', '_'}
+    ## the set of characters an identifier can start with
 
   strStart* = 0 # this is only for bootstraping
                 # XXX: remove this someday
   nl* = "\n"    # this is only for bootstraping XXX: remove this somehow
 
-proc strip*(s: string): string {.noSideEffect.}
-  ## Strips leading and trailing whitespace from `s`.
+proc `%` *(formatstr: string, a: openarray[string]): string {.noSideEffect.}
+  ## The `substitution`:idx: operator performs string substitutions in
+  ## `formatstr` and returns a modified `formatstr`. This is often called
+  ## `string interpolation`:idx:.
+  ##
+  ## This is best explained by an example:
+  ##
+  ## .. code-block:: nimrod
+  ##   "$1 eats $2." % ["The cat", "fish"]
+  ##
+  ## Results in:
+  ##
+  ## .. code-block:: nimrod
+  ##   "The cat eats fish."
+  ##
+  ## The substitution variables (the thing after the ``$``)
+  ## are enumerated from 1 to 9.
+  ## Substitution variables can also be words (that is
+  ## ``[A-Za-z_]+[A-Za-z0-9_]*``) in which case the arguments in `a` with even
+  ## indices are keys and with odd indices are the corresponding values.
+  ## An example:
+  ##
+  ## .. code-block:: nimrod
+  ##   "$animal eats $food." % ["animal", "The cat", "food", "fish"]
+  ##
+  ## Results in:
+  ##
+  ## .. code-block:: nimrod
+  ##   "The cat eats fish."
+  ##
+  ## The variables are compared with `cmpIgnoreStyle`. `EInvalidValue` is
+  ## raised if an ill-formed format string has been passed to the `%` operator.
+
+proc `%` *(formatstr, a: string): string {.noSideEffect.}
+  ## This is the same as ``formatstr % [a]``.
+
+proc addf*(s: var string, formatstr: string, a: openarray[string])
+  ## The same as ``add(s, formatstr % a)``, but more efficient.
+
+proc strip*(s: string, leading = true, trailing = true): string {.noSideEffect.}
+  ## Strips whitespace from `s` and returns the resulting string.
+  ## If `leading` is true, leading whitespace is stripped.
+  ## If `trailing` is true, trailing whitespace is stripped.
 
 proc toLower*(s: string): string {.noSideEffect.}
   ## Converts `s` into lower case. This works only for the letters A-Z.
@@ -65,15 +117,36 @@ proc normalize*(s: string): string {.noSideEffect.}
   ## Normalizes the string `s`. That means to convert it to lower case and
   ## remove any '_'. This is needed for Nimrod identifiers for example.
 
-proc findSubStr*(sub, s: string, start: int = 0): int {.noSideEffect.}
+proc findSubStr*(sub, s: string, start: int = 0): int {.
+  noSideEffect, deprecated.}
+  ## Searches for `sub` in `s` starting at position `start`. Searching is
+  ## case-sensitive. If `sub` is not in `s`, -1 is returned.
+  ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that
+  ## this has a different parameter order.
+
+proc findSubStr*(sub: char, s: string, start: int = 0): int {.
+  noSideEffect, deprecated.}
+  ## Searches for `sub` in `s` starting at position `start`. Searching is
+  ## case-sensitive. If `sub` is not in `s`, -1 is returned.
+  ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that
+  ## this has a different parameter order.
+
+proc findChars*(chars: set[char], s: string, start: int = 0): int {.
+  noSideEffect, deprecated.}
+  ## Searches for `chars` in `s` starting at position `start`. If `s` contains
+  ## none of the characters in `chars`, -1 is returned.
+  ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that
+  ## this has a different parameter order.
+
+proc find*(s, sub: string, start: int = 0): int {.noSideEffect.}
   ## Searches for `sub` in `s` starting at position `start`. Searching is
   ## case-sensitive. If `sub` is not in `s`, -1 is returned.
 
-proc findSubStr*(sub: char, s: string, start: int = 0): int {.noSideEffect.}
+proc find*(s: string, sub: char, start: int = 0): int {.noSideEffect.}
   ## Searches for `sub` in `s` starting at position `start`. Searching is
   ## case-sensitive. If `sub` is not in `s`, -1 is returned.
 
-proc findChars*(chars: set[char], s: string, start: int = 0): int {.noSideEffect.}
+proc find*(s: string, chars: set[char], start: int = 0): int {.noSideEffect.}
   ## Searches for `chars` in `s` starting at position `start`. If `s` contains
   ## none of the characters in `chars`, -1 is returned.
 
@@ -95,15 +168,15 @@ iterator split*(s: string, seps: set[char] = Whitespace): string =
   ## Splits the string `s` into substrings.
   ##
   ## Substrings are separated by a substring containing only `seps`.
-  ## The seperator substrings are not returned in `sub`, nor are they part
-  ## of `sub`.
-  ## Examples::
+  ## Examples:
   ##
+  ## .. code-block:: nimrod
   ##   for word in split("  this is an  example  "):
   ##     writeln(stdout, word)
   ##
-  ## Results in::
+  ## Results in:
   ##
+  ## .. code-block:: nimrod
   ##   "this"
   ##   "is"
   ##   "an"
@@ -123,18 +196,54 @@ iterator split*(s: string, seps: set[char] = Whitespace): string =
     while last < len(s) and s[last] not_in seps: inc(last) # BUGFIX!
     yield copy(s, first, last-1)
 
+iterator split*(s: string, sep: char): string =
+  ## Splits the string `s` into substrings.
+  ##
+  ## Every `sep` ends a substring, so leading, trailing and adjacent
+  ## separators yield empty strings; an empty `s` yields nothing. Example:
+  ##
+  ## .. code-block:: nimrod
+  ##   for word in split(";;this;is;an;;example;;;", ';'):
+  ##     writeln(stdout, word)
+  ##
+  ## Results in:
+  ##
+  ## .. code-block:: nimrod
+  ##   ""
+  ##   ""
+  ##   "this"
+  ##   "is"
+  ##   "an"
+  ##   ""
+  ##   "example"
+  ##   ""
+  ##   ""
+  ##   ""
+  ##
+  var last = 0 # index at which the next substring starts
+  assert('\0' != sep) # NOTE(review): presumably because '\0' ends strings
+  if len(s) > 0:
+    # `<=` (not `<`): a trailing `sep` must still yield a final "".
+    while last <= len(s):
+      var first = last
+      while last < len(s) and s[last] != sep: inc(last)
+      yield copy(s, first, last-1) # `last` is on a `sep` or at the end
+      inc(last) # step over the separator
+
 iterator splitLines*(s: string): string =
   ## Splits the string `s` into its containing lines. Each newline
   ## combination (CR, LF, CR-LF) is supported. The result strings contain
   ## no trailing ``\n``.
   ##
-  ## Example::
+  ## Example:
   ##
+  ## .. code-block:: nimrod
   ##   for line in lines("\nthis\nis\nan\n\nexample\n"):
   ##     writeln(stdout, line)
   ##
-  ## Results in::
+  ## Results in:
   ##
+  ## .. code-block:: nimrod
   ##   ""
   ##   "this"
   ##   "is"
@@ -164,6 +273,11 @@ proc splitSeq*(s: string, seps: set[char] = Whitespace): seq[string] {.
   noSideEffect.}
   ## The same as `split`, but is a proc that returns a sequence of substrings.
 
+proc splitSeq*(s: string, sep: char): seq[string] {.noSideEffect.} =
+  ## Collects the substrings yielded by `split(s, sep)` into a sequence.
+  result = @[]
+  for piece in split(s, sep): result.add(piece)
+
 proc cmpIgnoreCase*(a, b: string): int {.noSideEffect.}
   ## Compares two strings in a case insensitive manner. Returns:
   ##
@@ -207,7 +321,7 @@ proc ParseBiggestInt*(s: string): biggestInt {.noSideEffect.}
   ## Parses a decimal integer value contained in `s`. If `s` is not
   ## a valid integer, `EInvalidValue` is raised.
 
-proc ParseFloat*(s: string): float {.noSideEffect.}
+proc ParseFloat*(s: string, start = 0): float {.noSideEffect.}
   ## Parses a decimal floating point value contained in `s`. If `s` is not
   ## a valid floating point number, `EInvalidValue` is raised. ``NAN``,
   ## ``INF``, ``-INF`` are also supported (case insensitive comparison).
@@ -217,37 +331,6 @@ proc ParseFloat*(s: string): float {.noSideEffect.}
 proc toString*[Ty](x: Ty): string
   ## This generic proc is the same as the stringify operator `$`.
 
-proc `%` *(formatstr: string, a: openarray[string]): string {.noSideEffect.}
-  ## The substitution operator performs string substitutions in `formatstr`
-  ## and returns the modified `formatstr`.
-  ##
-  ## This is best explained by an example::
-  ##
-  ##   "$1 eats $2." % ["The cat", "fish"]
-  ##
-  ## Results in::
-  ##
-  ##   "The cat eats fish."
-  ##
-  ## The substitution variables (the thing after the ``$``)
-  ## are enumerated from 1 to 9.
-  ## Substitution variables can also be words (that is
-  ## ``[A-Za-z_]+[A-Za-z0-9_]*``) in which case the arguments in `a` with even
-  ## indices are keys and with odd indices are the corresponding values. Again
-  ## an example::
-  ##
-  ##   "$animal eats $food." % ["animal", "The cat", "food", "fish"]
-  ##
-  ## Results in::
-  ##
-  ##   "The cat eats fish."
-  ##
-  ## The variables are compared with `cmpIgnoreStyle`. `EInvalidValue` is
-  ## raised if an ill-formed format string has been passed to the `%` operator.
-
-proc `%` *(formatstr, a: string): string {.noSideEffect.}
-  ## This is the same as `formatstr % [a]`.
-
 proc repeatChar*(count: int, c: Char = ' '): string
   ## Returns a string of length `count` consisting only of
   ## the character `c`.
@@ -260,7 +343,25 @@ proc endsWith*(s, suffix: string): bool {.noSideEffect.}
   ## Returns true iff ``s`` ends with ``suffix``.
   ## If ``suffix == ""`` true is returned.
 
-# implementation
+proc addSep*(dest: var string, sep = ", ", startLen = 0) {.noSideEffect,
+                                                           inline.} =
+  ## Appends `sep` to `dest`, but only if `dest` is already longer than
+  ## `startLen`. It is equivalent to:
+  ##
+  ## .. code-block:: nimrod
+  ##   if dest.len > startLen: add(dest, sep)
+  ##
+  ## This helps when building a list whose items have to be *separated* by
+  ## `sep`: call it before adding each item and no separator is emitted in
+  ## front of the first one. This example builds ``"[2, 3, 5, 7, 11]"``:
+  ##
+  ## .. code-block:: nimrod
+  ##   var arr = "["
+  ##   for x in items([2, 3, 5, 7, 11]):
+  ##     addSep(arr, startLen=len("["))
+  ##     add(arr, $x)
+  ##   add(arr, "]")
+  if len(dest) > startLen: dest.add(sep)
 
 proc allCharsInSet*(s: string, theSet: TCharSet): bool =
   ## returns true iff each character of `s` is in the set `theSet`.
@@ -271,7 +372,7 @@ proc allCharsInSet*(s: string, theSet: TCharSet): bool =
 proc quoteIfContainsWhite*(s: string): string =
   ## returns ``'"' & s & '"'`` if `s` contains a space and does not
   ## start with a quote, else returns `s`
-  if findChars({' ', '\t'}, s) >= 0 and s[0] != '"':
+  if find(s, {' ', '\t'}) >= 0 and s[0] != '"':
     result = '"' & s & '"'
   else:
     result = s
@@ -307,10 +408,8 @@ proc intToStr(x: int, minchars: int = 1): string =
 proc toString[Ty](x: Ty): string = return $x
 
 proc toOctal(c: char): string =
-  var
-    val: int
   result = newString(3)
-  val = ord(c)
+  var val = ord(c)
   for i in countdown(2, 0):
     result[i] = Chr(val mod 8 + ord('0'))
     val = val div 8
@@ -326,18 +425,15 @@ proc findNormalized(x: string, inArray: openarray[string]): int =
               # security whole ...
   return -1
 
-proc `%`(formatstr: string, a: openarray[string]): string =
-  # the format operator
-  const
-    PatternChars = {'a'..'z', 'A'..'Z', '0'..'9', '\128'..'\255', '_'}
-  result = ""
+proc addf(s: var string, formatstr: string, a: openarray[string]) =
+  const PatternChars = {'a'..'z', 'A'..'Z', '0'..'9', '\128'..'\255', '_'}
   var i = 0
   while i < len(formatstr):
     if formatstr[i] == '$':
       case formatstr[i+1] # again we use the fact that strings
                           # are zero-terminated here
       of '$':
-        add result, '$'
+        add s, '$'
         inc(i, 2)
       of '1'..'9':
         var j = 0
@@ -345,25 +441,29 @@ proc `%`(formatstr: string, a: openarray[string]): string =
         while formatstr[i] in {'0'..'9'}:
           j = j * 10 + ord(formatstr[i]) - ord('0')
           inc(i)
-        add result, a[j - 1]
+        add s, a[j - 1]
       of '{':
         var j = i+1
         while formatstr[j] notin {'\0', '}'}: inc(j)
         var x = findNormalized(copy(formatstr, i+2, j-1), a)
-        if x >= 0 and x < high(a): add result, a[x+1]
+        if x >= 0 and x < high(a): add s, a[x+1]
         else: raise newException(EInvalidValue, "invalid format string")
         i = j+1
       of 'a'..'z', 'A'..'Z', '\128'..'\255', '_':
         var j = i+1
         while formatstr[j] in PatternChars: inc(j)
         var x = findNormalized(copy(formatstr, i+1, j-1), a)
-        if x >= 0 and x < high(a): add result, a[x+1]
+        if x >= 0 and x < high(a): add s, a[x+1]
         else: raise newException(EInvalidValue, "invalid format string")
         i = j
       else: raise newException(EInvalidValue, "invalid format string")
     else:
-      add result, formatstr[i]
+      add s, formatstr[i]
       inc(i)
+  
+proc `%`(formatstr: string, a: openarray[string]): string =
+  result = ""                 # start empty; `addf` only appends
+  addf(result, formatstr, a)  # expand `formatstr` with `a` into `result`
 
 proc cmpIgnoreCase(a, b: string): int =
   # makes usage of the fact that strings are zero-terminated
@@ -377,9 +477,8 @@ proc cmpIgnoreCase(a, b: string): int =
                                        # thus we compile without checks here
 
 proc cmpIgnoreStyle(a, b: string): int =
-  var
-    i = 0
-    j = 0
+  var i = 0
+  var j = 0
   while True:
     while a[i] == '_': inc(i)
     while b[j] == '_': inc(j) # BUGFIX: typo
@@ -400,14 +499,16 @@ proc splitSeq(s: string, seps: set[char]): seq[string] =
 
 # ---------------------------------------------------------------------------
 
-proc strip(s: string): string =
+proc strip(s: string, leading = true, trailing = true): string =
   const
     chars: set[Char] = Whitespace
   var
     first = 0
     last = len(s)-1
-  while s[first] in chars: inc(first)
-  while last >= 0 and s[last] in chars: dec(last)
+  if leading: 
+    while s[first] in chars: inc(first)
+  if trailing:
+    while last >= 0 and s[last] in chars: dec(last)
   result = copy(s, first, last)
 
 proc toLower(c: Char): Char =
@@ -451,7 +552,7 @@ proc preprocessSub(sub: string, a: var TSkipTable) =
   for i in 0..0xff: a[chr(i)] = m+1
   for i in 0..m-1: a[sub[i]] = m-i
 
-proc findSubStrAux(sub, s: string, start: int, a: TSkipTable): int =
+proc findSubStrAux(s, sub: string, start: int, a: TSkipTable): int =
   # fast "quick search" algorithm:
   var
     m = len(sub)
@@ -469,7 +570,7 @@ proc findSubStrAux(sub, s: string, start: int, a: TSkipTable): int =
 proc findSubStr(sub, s: string, start: int = 0): int =
   var a: TSkipTable
   preprocessSub(sub, a)
-  result = findSubStrAux(sub, s, start, a)
+  result = findSubStrAux(s, sub, start, a)
   # slow linear search:
   #var
   #  i, j, M, N: int
@@ -492,6 +593,20 @@ proc findSubStr(sub, s: string, start: int = 0): int =
   #    elif (i >= N):
   #      return -1
 
+proc find(s, sub: string, start: int = 0): int =
+  var a: TSkipTable      # skip table, filled in from `sub` below
+  preprocessSub(sub, a)
+  result = findSubStrAux(s, sub, start, a)  # the actual search in `s`
+
+proc find(s: string, sub: char, start: int = 0): int =
+  for pos in start..len(s)-1:
+    if s[pos] == sub: return pos  # first match at or after `start`
+  result = -1                     # no occurrence of `sub`
+ 
+proc find(s: string, chars: set[char], start: int = 0): int =
+  for pos in start..len(s)-1:
+    if s[pos] in chars: return pos  # first member of `chars` found
+  result = -1                       # `s` contains none of `chars`
 
 proc findSubStr(sub: char, s: string, start: int = 0): int =
   for i in start..len(s)-1:
@@ -504,23 +619,21 @@ proc findChars(chars: set[char], s: string, start: int = 0): int =
   return -1
   
 proc contains(s: string, chars: set[char]): bool =
-  return findChars(chars, s) >= 0
+  return find(s, chars) >= 0
 
 proc contains(s: string, c: char): bool =
-  return findSubStr(c, s) >= 0
+  return find(s, c) >= 0
 
 proc contains(s, sub: string): bool =
-  return findSubStr(sub, s) >= 0
+  return find(s, sub) >= 0
 
 proc replaceStr(s, sub, by: string): string =
-  var
-    i, j: int
-    a: TSkipTable
+  var a: TSkipTable
   result = ""
   preprocessSub(sub, a)
-  i = 0
+  var i = 0
   while true:
-    j = findSubStrAux(sub, s, i, a)
+    var j = findSubStrAux(s, sub, i, a)
     if j < 0: break
     add result, copy(s, i, j - 1)
     add result, by
@@ -583,7 +696,10 @@ proc rawParseInt(s: string, index: var int): BiggestInt =
       while s[i] == '_':
         inc(i)               # underscores are allowed and ignored
     result = result * sign
-    index = i                # store index back
+    if s[i] == '\0':
+      index = i              # store index back
+    else:
+      index = -1 # BUGFIX: error!
   else:
     index = -1
 
@@ -602,17 +718,17 @@ proc parseInt(s: string): int =
     result = int(res) # convert to smaller integer type
 
 proc ParseBiggestInt(s: string): biggestInt =
-  var
-    index: int = 0
+  var index = 0
   result = rawParseInt(s, index)
   if index == -1:
     raise newException(EInvalidValue, "invalid integer: " & s)
 
-proc ParseFloat(s: string): float =
+proc ParseFloat(s: string, start = 0): float =
   var
     esign = 1.0
     sign = 1.0
-    exponent, i: int
+    i = start
+    exponent: int
     flags: int
   result = 0.0
   if s[i] == '+': inc(i)
@@ -677,7 +793,7 @@ proc ParseFloat(s: string): float =
 
 proc toOct*(x: BiggestInt, len: int): string =
   ## converts `x` into its octal representation. The resulting string is
-  ## always `len` characters long. No leading ``0c`` prefix is generated.
+  ## always `len` characters long. No leading ``0o`` prefix is generated.
   var
     mask: BiggestInt = 7
     shift: BiggestInt = 0
@@ -701,7 +817,7 @@ proc toBin*(x: BiggestInt, len: int): string =
     shift = shift + 1
     mask = mask shl 1
 
-proc escape*(s: string, prefix, suffix = "\""): string =
+proc escape*(s: string, prefix = "\"", suffix = "\""): string =
   ## Escapes a string `s`. This does these operations (at the same time):
   ## * replaces any ``\`` by ``\\``
   ## * replaces any ``'`` by ``\'``
@@ -723,8 +839,34 @@ proc escape*(s: string, prefix, suffix = "\""): string =
     else: add(result, c)
   add(result, suffix)
 
+proc validEmailAddress*(s: string): bool = 
+  ## returns true if `s` seems to be a valid e-mail address: a local part,
+  ## ``@`` and a domain (any case) ending in 2 letters or a listed domain.
+  const
+    chars = Letters + Digits + {'!','#','$','%','&',
+      '\'','*','+','/','=','?','^','_','`','{','}','|','~','-','.'}
+  var i = 0
+  if s[i] notin chars or s[i] == '.': return false # no leading '.'
+  while s[i] in chars: 
+    if s[i] == '.' and s[i+1] == '.': return false # no ".." in local part
+    inc(i)
+  if s[i] != '@': return false
+  var j = len(s)-1
+  if s[j] notin letters: return false
+  while j >= i and s[j] in letters: dec(j) # j+1: start of trailing letters
+  inc(i) # skip '@'
+  while s[i] in {'0'..'9', 'a'..'z', 'A'..'Z', '-', '.'}: inc(i) # BUGFIX: case
+  if s[i] != '\0': return false # stray character inside the domain
+  
+  var x = copy(s, j+1) # trailing run of letters, checked as the domain suffix
+  if len(x) == 2 and x[0] in Letters and x[1] in Letters: return true
+  case toLower(x)
+  of "com", "org", "net", "gov", "mil", "biz", "info", "mobi", "name",
+     "aero", "jobs", "museum": return true
+  return false
+  
 proc editDistance*(a, b: string): int =
-  ## returns the edit distance between `s` and `t`. This uses the Levenshtein
+  ## returns the edit distance between `a` and `b`. This uses the Levenshtein
   ## distance algorithm with only a linear memory overhead. This implementation
   ## is highly optimized!
   var len1 = a.len