summary refs log tree commit diff stats
path: root/lib/strutils.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/strutils.nim')
-rw-r--r-- lib/strutils.nim 312
1 files changed, 227 insertions, 85 deletions
diff --git a/lib/strutils.nim b/lib/strutils.nim
index 6189c6a88..e3a412053 100644
--- a/lib/strutils.nim
+++ b/lib/strutils.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2006 Andreas Rumpf
+#        (c) Copyright 2009 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -9,8 +9,7 @@
 
 ## This module contains various string utility routines.
 ## See the module `regexprs` for regular expression support.
-## All the routines here are avaiable for the EMCAScript target
-## too!
+## All the routines here are available for the ECMAScript target too!
 
 {.deadCodeElim: on.}
 
@@ -33,13 +32,66 @@ type
 const
   Whitespace* = {' ', '\t', '\v', '\r', '\l', '\f'}
     ## All the characters that count as whitespace.
+    
+  Letters* = {'A'..'Z', 'a'..'z'}
+    ## the set of letters
+  
+  Digits* = {'0'..'9'}
+    ## the set of digits
+  
+  IdentChars* = {'a'..'z', 'A'..'Z', '0'..'9', '_'}
+    ## the set of characters an identifier can consist of
+  
+  IdentStartChars* = {'a'..'z', 'A'..'Z', '_'}
+    ## the set of characters an identifier can start with
 
   strStart* = 0 # this is only for bootstraping
                 # XXX: remove this someday
   nl* = "\n"    # this is only for bootstraping XXX: remove this somehow
 
-proc strip*(s: string): string {.noSideEffect.}
-  ## Strips leading and trailing whitespace from `s`.
+proc `%` *(formatstr: string, a: openarray[string]): string {.noSideEffect.}
+  ## The `substitution`:idx: operator performs string substitutions in
+  ## `formatstr` and returns a modified `formatstr`. This is often called
+  ## `string interpolation`:idx:.
+  ##
+  ## This is best explained by an example:
+  ##
+  ## .. code-block:: nimrod
+  ##   "$1 eats $2." % ["The cat", "fish"]
+  ##
+  ## Results in:
+  ##
+  ## .. code-block:: nimrod
+  ##   "The cat eats fish."
+  ##
+  ## The substitution variables (the thing after the ``$``)
+  ## are enumerated from 1 to 9.
+  ## Substitution variables can also be words (that is
+  ## ``[A-Za-z_]+[A-Za-z0-9_]*``) in which case the arguments in `a` with even
+  ## indices are keys and with odd indices are the corresponding values.
+  ## An example:
+  ##
+  ## .. code-block:: nimrod
+  ##   "$animal eats $food." % ["animal", "The cat", "food", "fish"]
+  ##
+  ## Results in:
+  ##
+  ## .. code-block:: nimrod
+  ##   "The cat eats fish."
+  ##
+  ## The variables are compared with `cmpIgnoreStyle`. `EInvalidValue` is
+  ## raised if an ill-formed format string has been passed to the `%` operator.
+
+proc `%` *(formatstr, a: string): string {.noSideEffect.}
+  ## This is the same as ``formatstr % [a]``.
+
+proc addf*(s: var string, formatstr: string, a: openarray[string])
+  ## The same as ``add(s, formatstr % a)``, but more efficient.
+
+proc strip*(s: string, leading = true, trailing = true): string {.noSideEffect.}
+  ## Strips whitespace from `s` and returns the resulting string.
+  ## If `leading` is true, leading whitespace is stripped.
+  ## If `trailing` is true, trailing whitespace is stripped.
 
 proc toLower*(s: string): string {.noSideEffect.}
   ## Converts `s` into lower case. This works only for the letters A-Z.
@@ -65,15 +117,36 @@ proc normalize*(s: string): string {.noSideEffect.}
   ## Normalizes the string `s`. That means to convert it to lower case and
   ## remove any '_'. This is needed for Nimrod identifiers for example.
 
-proc findSubStr*(sub, s: string, start: int = 0): int {.noSideEffect.}
+proc findSubStr*(sub, s: string, start: int = 0): int {.
+  noSideEffect, deprecated.}
+  ## Searches for `sub` in `s` starting at position `start`. Searching is
+  ## case-sensitive. If `sub` is not in `s`, -1 is returned.
+  ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that
+  ## this has a different parameter order.
+
+proc findSubStr*(sub: char, s: string, start: int = 0): int {.
+  noSideEffect, deprecated.}
+  ## Searches for `sub` in `s` starting at position `start`. Searching is
+  ## case-sensitive. If `sub` is not in `s`, -1 is returned.
+  ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that
+  ## this has a different parameter order.
+
+proc findChars*(chars: set[char], s: string, start: int = 0): int {.
+  noSideEffect, deprecated.}
+  ## Searches for `chars` in `s` starting at position `start`. If `s` contains
+  ## none of the characters in `chars`, -1 is returned.
+  ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that
+  ## this has a different parameter order.
+
+proc find*(s, sub: string, start: int = 0): int {.noSideEffect.}
   ## Searches for `sub` in `s` starting at position `start`. Searching is
   ## case-sensitive. If `sub` is not in `s`, -1 is returned.
 
-proc findSubStr*(sub: char, s: string, start: int = 0): int {.noSideEffect.}
+proc find*(s: string, sub: char, start: int = 0): int {.noSideEffect.}
   ## Searches for `sub` in `s` starting at position `start`. Searching is
   ## case-sensitive. If `sub` is not in `s`, -1 is returned.
 
-proc findChars*(chars: set[char], s: string, start: int = 0): int {.noSideEffect.}
+proc find*(s: string, chars: set[char], start: int = 0): int {.noSideEffect.}
   ## Searches for `chars` in `s` starting at position `start`. If `s` contains
   ## none of the characters in `chars`, -1 is returned.
 
@@ -95,15 +168,15 @@ iterator split*(s: string, seps: set[char] = Whitespace): string =
   ## Splits the string `s` into substrings.
   ##
   ## Substrings are separated by a substring containing only `seps`.
-  ## The seperator substrings are not returned in `sub`, nor are they part
-  ## of `sub`.
-  ## Examples::
+  ## Examples:
   ##
+  ## .. code-block:: nimrod
   ##   for word in split("  this is an  example  "):
   ##     writeln(stdout, word)
   ##
-  ## Results in::
+  ## Results in:
   ##
+  ## .. code-block:: nimrod
   ##   "this"
   ##   "is"
   ##   "an"
@@ -123,18 +196,54 @@ iterator split*(s: string, seps: set[char] = Whitespace): string =
     while last < len(s) and s[last] not_in seps: inc(last) # BUGFIX!
     yield copy(s, first, last-1)
 
+iterator split*(s: string, sep: char): string =
+  ## Splits the string `s` into substrings.
+  ##
+  ## Substrings are separated by the character `sep`.
+  ## Example:
+  ##
+  ## .. code-block:: nimrod
+  ##   for word in split(";;this;is;an;;example;;;", ';'):
+  ##     writeln(stdout, word)
+  ##
+  ## Results in:
+  ##
+  ## .. code-block:: nimrod
+  ##   ""
+  ##   ""
+  ##   "this"
+  ##   "is"
+  ##   "an"
+  ##   ""
+  ##   "example"
+  ##   ""
+  ##   ""
+  ##   ""
+  ##
+  var last = 0
+  assert('\0' != sep)
+  if len(s) > 0:
+    # `<=` is correct here for the edge cases!
+    while last <= len(s):
+      var first = last
+      while last < len(s) and s[last] != sep: inc(last)
+      yield copy(s, first, last-1)
+      inc(last)
+
 iterator splitLines*(s: string): string =
   ## Splits the string `s` into its containing lines. Each newline
   ## combination (CR, LF, CR-LF) is supported. The result strings contain
   ## no trailing ``\n``.
   ##
-  ## Example::
+  ## Example:
   ##
+  ## .. code-block:: nimrod
   ##   for line in lines("\nthis\nis\nan\n\nexample\n"):
   ##     writeln(stdout, line)
   ##
-  ## Results in::
+  ## Results in:
   ##
+  ## .. code-block:: nimrod
   ##   ""
   ##   "this"
   ##   "is"
@@ -164,6 +273,11 @@ proc splitSeq*(s: string, seps: set[char] = Whitespace): seq[string] {.
   noSideEffect.}
   ## The same as `split`, but is a proc that returns a sequence of substrings.
 
+proc splitSeq*(s: string, sep: char): seq[string] {.noSideEffect.} =
+  ## The same as `split`, but is a proc that returns a sequence of substrings.
+  result = @[]
+  for sub in split(s, sep): add(result, sub)
+
 proc cmpIgnoreCase*(a, b: string): int {.noSideEffect.}
   ## Compares two strings in a case insensitive manner. Returns:
   ##
@@ -207,7 +321,7 @@ proc ParseBiggestInt*(s: string): biggestInt {.noSideEffect.}
   ## Parses a decimal integer value contained in `s`. If `s` is not
   ## a valid integer, `EInvalidValue` is raised.
 
-proc ParseFloat*(s: string): float {.noSideEffect.}
+proc ParseFloat*(s: string, start = 0): float {.noSideEffect.}
   ## Parses a decimal floating point value contained in `s`. If `s` is not
   ## a valid floating point number, `EInvalidValue` is raised. ``NAN``,
   ## ``INF``, ``-INF`` are also supported (case insensitive comparison).
@@ -217,37 +331,6 @@ proc ParseFloat*(s: string): float {.noSideEffect.}
 proc toString*[Ty](x: Ty): string
   ## This generic proc is the same as the stringify operator `$`.
 
-proc `%` *(formatstr: string, a: openarray[string]): string {.noSideEffect.}
-  ## The substitution operator performs string substitutions in `formatstr`
-  ## and returns the modified `formatstr`.
-  ##
-  ## This is best explained by an example::
-  ##
-  ##   "$1 eats $2." % ["The cat", "fish"]
-  ##
-  ## Results in::
-  ##
-  ##   "The cat eats fish."
-  ##
-  ## The substitution variables (the thing after the ``$``)
-  ## are enumerated from 1 to 9.
-  ## Substitution variables can also be words (that is
-  ## ``[A-Za-z_]+[A-Za-z0-9_]*``) in which case the arguments in `a` with even
-  ## indices are keys and with odd indices are the corresponding values. Again
-  ## an example::
-  ##
-  ##   "$animal eats $food." % ["animal", "The cat", "food", "fish"]
-  ##
-  ## Results in::
-  ##
-  ##   "The cat eats fish."
-  ##
-  ## The variables are compared with `cmpIgnoreStyle`. `EInvalidValue` is
-  ## raised if an ill-formed format string has been passed to the `%` operator.
-
-proc `%` *(formatstr, a: string): string {.noSideEffect.}
-  ## This is the same as `formatstr % [a]`.
-
 proc repeatChar*(count: int, c: Char = ' '): string
   ## Returns a string of length `count` consisting only of
   ## the character `c`.
@@ -260,7 +343,25 @@ proc endsWith*(s, suffix: string): bool {.noSideEffect.}
   ## Returns true iff ``s`` ends with ``suffix``.
   ## If ``suffix == ""`` true is returned.
 
-# implementation
+proc addSep*(dest: var string, sep = ", ", startLen = 0) {.noSideEffect,
+                                                           inline.} = 
+  ## A shorthand for: 
+  ## 
+  ## .. code-block:: nimrod
+  ##   if dest.len > startLen: add(dest, sep)
+  ## 
+  ## This is often useful for generating some code where the items need to
+  ## be *separated* by `sep`. `sep` is only added if `dest` is longer than
+  ## `startLen`. The following example creates a string describing
+  ## an array of integers:  
+  ## 
+  ## .. code-block:: nimrod
+  ##   var arr = "["
+  ##   for x in items([2, 3, 5, 7, 11]):
+  ##     addSep(arr, startLen=len("["))
+  ##     add(arr, $x)
+  ##   add(arr, "]")
+  if dest.len > startLen: add(dest, sep)
 
 proc allCharsInSet*(s: string, theSet: TCharSet): bool =
   ## returns true iff each character of `s` is in the set `theSet`.
@@ -271,7 +372,7 @@ proc allCharsInSet*(s: string, theSet: TCharSet): bool =
 proc quoteIfContainsWhite*(s: string): string =
   ## returns ``'"' & s & '"'`` if `s` contains a space and does not
   ## start with a quote, else returns `s`
-  if findChars({' ', '\t'}, s) >= 0 and s[0] != '"':
+  if find(s, {' ', '\t'}) >= 0 and s[0] != '"':
     result = '"' & s & '"'
   else:
     result = s
@@ -307,10 +408,8 @@ proc intToStr(x: int, minchars: int = 1): string =
 proc toString[Ty](x: Ty): string = return $x
 
 proc toOctal(c: char): string =
-  var
-    val: int
   result = newString(3)
-  val = ord(c)
+  var val = ord(c)
   for i in countdown(2, 0):
     result[i] = Chr(val mod 8 + ord('0'))
     val = val div 8
@@ -326,18 +425,15 @@ proc findNormalized(x: string, inArray: openarray[string]): int =
               # security whole ...
   return -1
 
-proc `%`(formatstr: string, a: openarray[string]): string =
-  # the format operator
-  const
-    PatternChars = {'a'..'z', 'A'..'Z', '0'..'9', '\128'..'\255', '_'}
-  result = ""
+proc addf(s: var string, formatstr: string, a: openarray[string]) =
+  const PatternChars = {'a'..'z', 'A'..'Z', '0'..'9', '\128'..'\255', '_'}
   var i = 0
   while i < len(formatstr):
     if formatstr[i] == '$':
       case formatstr[i+1] # again we use the fact that strings
                           # are zero-terminated here
       of '$':
-        add result, '$'
+        add s, '$'
         inc(i, 2)
       of '1'..'9':
         var j = 0
@@ -345,25 +441,29 @@ proc `%`(formatstr: string, a: openarray[string]): string =
         while formatstr[i] in {'0'..'9'}:
           j = j * 10 + ord(formatstr[i]) - ord('0')
           inc(i)
-        add result, a[j - 1]
+        add s, a[j - 1]
       of '{':
         var j = i+1
         while formatstr[j] notin {'\0', '}'}: inc(j)
         var x = findNormalized(copy(formatstr, i+2, j-1), a)
-        if x >= 0 and x < high(a): add result, a[x+1]
+        if x >= 0 and x < high(a): add s, a[x+1]
         else: raise newException(EInvalidValue, "invalid format string")
         i = j+1
       of 'a'..'z', 'A'..'Z', '\128'..'\255', '_':
         var j = i+1
         while formatstr[j] in PatternChars: inc(j)
         var x = findNormalized(copy(formatstr, i+1, j-1), a)
-        if x >= 0 and x < high(a): add result, a[x+1]
+        if x >= 0 and x < high(a): add s, a[x+1]
         else: raise newException(EInvalidValue, "invalid format string")
         i = j
       else: raise newException(EInvalidValue, "invalid format string")
     else:
-      add result, formatstr[i]
+      add s, formatstr[i]
       inc(i)
+  
+proc `%`(formatstr: string, a: openarray[string]): string =
+  result = ""
+  addf(result, formatstr, a)
 
 proc cmpIgnoreCase(a, b: string): int =
   # makes usage of the fact that strings are zero-terminated
@@ -377,9 +477,8 @@ proc cmpIgnoreCase(a, b: string): int =
                                        # thus we compile without checks here
 
 proc cmpIgnoreStyle(a, b: string): int =
-  var
-    i = 0
-    j = 0
+  var i = 0
+  var j = 0
   while True:
     while a[i] == '_': inc(i)
     while b[j] == '_': inc(j) # BUGFIX: typo
@@ -400,14 +499,16 @@ proc splitSeq(s: string, seps: set[char]): seq[string] =
 
 # ---------------------------------------------------------------------------
 
-proc strip(s: string): string =
+proc strip(s: string, leading = true, trailing = true): string =
   const
     chars: set[Char] = Whitespace
   var
     first = 0
     last = len(s)-1
-  while s[first] in chars: inc(first)
-  while last >= 0 and s[last] in chars: dec(last)
+  if leading: 
+    while s[first] in chars: inc(first)
+  if trailing:
+    while last >= 0 and s[last] in chars: dec(last)
   result = copy(s, first, last)
 
 proc toLower(c: Char): Char =
@@ -451,7 +552,7 @@ proc preprocessSub(sub: string, a: var TSkipTable) =
   for i in 0..0xff: a[chr(i)] = m+1
   for i in 0..m-1: a[sub[i]] = m-i
 
-proc findSubStrAux(sub, s: string, start: int, a: TSkipTable): int =
+proc findSubStrAux(s, sub: string, start: int, a: TSkipTable): int =
   # fast "quick search" algorithm:
   var
     m = len(sub)
@@ -469,7 +570,7 @@ proc findSubStrAux(sub, s: string, start: int, a: TSkipTable): int =
 proc findSubStr(sub, s: string, start: int = 0): int =
   var a: TSkipTable
   preprocessSub(sub, a)
-  result = findSubStrAux(sub, s, start, a)
+  result = findSubStrAux(s, sub, start, a)
   # slow linear search:
   #var
   #  i, j, M, N: int
@@ -492,6 +593,20 @@ proc findSubStr(sub, s: string, start: int = 0): int =
   #    elif (i >= N):
   #      return -1
 
+proc find(s, sub: string, start: int = 0): int =
+  var a: TSkipTable
+  preprocessSub(sub, a)
+  result = findSubStrAux(s, sub, start, a)
+
+proc find(s: string, sub: char, start: int = 0): int =
+  for i in start..len(s)-1:
+    if sub == s[i]: return i
+  return -1
+ 
+proc find(s: string, chars: set[char], start: int = 0): int =
+  for i in start..s.len-1:
+    if s[i] in chars: return i
+  return -1 
 
 proc findSubStr(sub: char, s: string, start: int = 0): int =
   for i in start..len(s)-1:
@@ -504,23 +619,21 @@ proc findChars(chars: set[char], s: string, start: int = 0): int =
   return -1
   
 proc contains(s: string, chars: set[char]): bool =
-  return findChars(chars, s) >= 0
+  return find(s, chars) >= 0
 
 proc contains(s: string, c: char): bool =
-  return findSubStr(c, s) >= 0
+  return find(s, c) >= 0
 
 proc contains(s, sub: string): bool =
-  return findSubStr(sub, s) >= 0
+  return find(s, sub) >= 0
 
 proc replaceStr(s, sub, by: string): string =
-  var
-    i, j: int
-    a: TSkipTable
+  var a: TSkipTable
   result = ""
   preprocessSub(sub, a)
-  i = 0
+  var i = 0
   while true:
-    j = findSubStrAux(sub, s, i, a)
+    var j = findSubStrAux(s, sub, i, a)
     if j < 0: break
     add result, copy(s, i, j - 1)
     add result, by
@@ -583,7 +696,10 @@ proc rawParseInt(s: string, index: var int): BiggestInt =
       while s[i] == '_':
         inc(i)               # underscores are allowed and ignored
     result = result * sign
-    index = i                # store index back
+    if s[i] == '\0':
+      index = i              # store index back
+    else:
+      index = -1 # BUGFIX: error!
   else:
     index = -1
 
@@ -602,17 +718,17 @@ proc parseInt(s: string): int =
     result = int(res) # convert to smaller integer type
 
 proc ParseBiggestInt(s: string): biggestInt =
-  var
-    index: int = 0
+  var index = 0
   result = rawParseInt(s, index)
   if index == -1:
     raise newException(EInvalidValue, "invalid integer: " & s)
 
-proc ParseFloat(s: string): float =
+proc ParseFloat(s: string, start = 0): float =
   var
     esign = 1.0
     sign = 1.0
-    exponent, i: int
+    i = start
+    exponent: int
     flags: int
   result = 0.0
   if s[i] == '+': inc(i)
@@ -677,7 +793,7 @@ proc ParseFloat(s: string): float =
 
 proc toOct*(x: BiggestInt, len: int): string =
   ## converts `x` into its octal representation. The resulting string is
-  ## always `len` characters long. No leading ``0c`` prefix is generated.
+  ## always `len` characters long. No leading ``0o`` prefix is generated.
   var
     mask: BiggestInt = 7
     shift: BiggestInt = 0
@@ -701,7 +817,7 @@ proc toBin*(x: BiggestInt, len: int): string =
     shift = shift + 1
     mask = mask shl 1
 
-proc escape*(s: string, prefix, suffix = "\""): string =
+proc escape*(s: string, prefix = "\"", suffix = "\""): string =
   ## Escapes a string `s`. This does these operations (at the same time):
   ## * replaces any ``\`` by ``\\``
   ## * replaces any ``'`` by ``\'``
@@ -723,8 +839,34 @@ proc escape*(s: string, prefix, suffix = "\""): string =
     else: add(result, c)
   add(result, suffix)
 
+proc validEmailAddress*(s: string): bool = 
+  ## returns true if `s` seems to be a valid e-mail address. 
+  ## The checking also uses a domain list.
+  const
+    chars = Letters + Digits + {'!','#','$','%','&',
+      '\'','*','+','/','=','?','^','_','`','{','}','|','~','-','.'}
+  var i = 0
+  if s[i] notin chars or s[i] == '.': return false
+  while s[i] in chars: 
+    if s[i] == '.' and s[i+1] == '.': return false
+    inc(i)
+  if s[i] != '@': return false
+  var j = len(s)-1
+  if s[j] notin letters: return false
+  while j >= i and s[j] in letters: dec(j)
+  inc(i) # skip '@'
+  while s[i] in {'0'..'9', 'a'..'z', '-', '.'}: inc(i) 
+  if s[i] != '\0': return false
+  
+  var x = copy(s, j+1)
+  if len(x) == 2 and x[0] in Letters and x[1] in Letters: return true
+  case toLower(x)
+  of "com", "org", "net", "gov", "mil", "biz", "info", "mobi", "name",
+     "aero", "jobs", "museum": return true
+  return false
+  
 proc editDistance*(a, b: string): int =
-  ## returns the edit distance between `s` and `t`. This uses the Levenshtein
+  ## returns the edit distance between `a` and `b`. This uses the Levenshtein
   ## distance algorithm with only a linear memory overhead. This implementation
   ## is highly optimized!
   var len1 = a.len
le='Blame the previous revision' href='/acidbong/suckless/dwm/blame/dwm.1?h=4.5&id=6458d72572a30d2cc4af3385755033b2cca93057'>^
df74b26 ^


0c3544d ^

dc5c070 ^

4bb89e2 ^
60b3dce ^
3e06ede ^
4bb89e2 ^




3e06ede ^
5ef6ef1 ^

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150