# # # Nimrod's Runtime Library # (c) Copyright 2009 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. # ## This module contains various string utility routines. ## See the module `regexprs` for regular expression support. ## All the routines here are avaiable for the EMCAScript target too! {.deadCodeElim: on.} {.push debugger:off .} # the user does not want to trace a part # of the standard library! # copied from excpt.nim, because I don't want to make this template public template newException(exceptn, message: expr): expr = block: # open a new scope var e: ref exceptn new(e) e.msg = message e type TCharSet* = set[char] # for compability for Nim const Whitespace* = {' ', '\t', '\v', '\r', '\l', '\f'} ## All the characters that count as whitespace. Letters* = {'A'..'Z', 'a'..'z'} ## the set of letters Digits* = {'0'..'9'} ## the set of digits IdentChars* = {'a'..'z', 'A'..'Z', '0'..'9', '_'} ## the set of characters an identifier can consist of IdentStartChars* = {'a'..'z', 'A'..'Z', '_'} ## the set of characters an identifier can start with strStart* = 0 # this is only for bootstraping # XXX: remove this someday nl* = "\n" # this is only for bootstraping XXX: remove this somehow proc `%` *(formatstr: string, a: openarray[string]): string {.noSideEffect.} ## The `substitution`:idx: operator performs string substitutions in ## `formatstr` and returns a modified `formatstr`. This is often called ## `string interpolation`:idx:. ## ## This is best explained by an example: ## ## .. code-block:: nimrod ## "$1 eats $2." % ["The cat", "fish"] ## ## Results in: ## ## .. code-block:: nimrod ## "The cat eats fish." ## ## The substitution variables (the thing after the ``$``) ## are enumerated from 1 to 9. ## Substitution variables can also be words (that is ## ``[A-Za-z_]+[A-Za-z0-9_]*``) in which case the arguments in `a` with even ## indices are keys and with odd indices are the corresponding values. ## An example: ## ## .. code-block:: nimrod ## "$animal eats $food." % ["animal", "The cat", "food", "fish"] ## ## Results in: ## ## .. code-block:: nimrod ## "The cat eats fish." ## ## The variables are compared with `cmpIgnoreStyle`. `EInvalidValue` is ## raised if an ill-formed format string has been passed to the `%` operator. proc `%` *(formatstr, a: string): string {.noSideEffect.} ## This is the same as ``formatstr % [a]``. proc addf*(s: var string, formatstr: string, a: openarray[string]) ## The same as ``add(s, formatstr % a)``, but more efficient. proc strip*(s: string, leading = true, trailing = true): string {.noSideEffect.} ## Strips whitespace from `s` and returns the resulting string. ## If `leading` is true, leading whitespace is stripped. ## If `trailing` is true, trailing whitespace is stripped. proc toLower*(s: string): string {.noSideEffect.} ## Converts `s` into lower case. This works only for the letters A-Z. ## See `unicode.toLower` for a version that works for any Unicode character. proc toLower*(c: Char): Char {.noSideEffect.} ## Converts `c` into lower case. This works only for the letters A-Z. ## See `unicode.toLower` for a version that works for any Unicode character. proc toUpper*(s: string): string {.noSideEffect.} ## Converts `s` into upper case. This works only for the letters a-z. ## See `unicode.toUpper` for a version that works for any Unicode character. proc toUpper*(c: Char): Char {.noSideEffect.} ## Converts `c` into upper case. This works only for the letters a-z. ## See `unicode.toUpper` for a version that works for any Unicode character. proc capitalize*(s: string): string {.noSideEffect.} ## Converts the first character of `s` into upper case. ## This works only for the letters a-z. proc normalize*(s: string): string {.noSideEffect.} ## Normalizes the string `s`. That means to convert it to lower case and ## remove any '_'. This is needed for Nimrod identifiers for example. proc findSubStr*(sub, s: string, start: int = 0): int {. noSideEffect, deprecated.} ## Searches for `sub` in `s` starting at position `start`. Searching is ## case-sensitive. If `sub` is not in `s`, -1 is returned. ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that ## this has a different parameter order. proc findSubStr*(sub: char, s: string, start: int = 0): int {. noSideEffect, deprecated.} ## Searches for `sub` in `s` starting at position `start`. Searching is ## case-sensitive. If `sub` is not in `s`, -1 is returned. ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that ## this has a different parameter order. proc findChars*(chars: set[char], s: string, start: int = 0): int {. noSideEffect, deprecated.} ## Searches for `chars` in `s` starting at position `start`. If `s` contains ## none of the characters in `chars`, -1 is returned. ## **Deprecated since version 0.7.6**: Use `find` instead, but beware that ## this has a different parameter order. proc find*(s, sub: string, start: int = 0): int {.noSideEffect.} ## Searches for `sub` in `s` starting at position `start`. Searching is ## case-sensitive. If `sub` is not in `s`, -1 is returned. proc find*(s: string, sub: char, start: int = 0): int {.noSideEffect.} ## Searches for `sub` in `s` starting at position `start`. Searching is ## case-sensitive. If `sub` is not in `s`, -1 is returned. proc find*(s: string, chars: set[char], start: int = 0): int {.noSideEffect.} ## Searches for `chars` in `s` starting at position `start`. If `s` contains ## none of the characters in `chars`, -1 is returned. proc replaceStr*(s, sub, by: string): string {.noSideEffect.} ## Replaces `sub` in `s` by the string `by`. proc replaceStr*(s: string, sub, by: char): string {.noSideEffect.} ## optimized version for characters. proc deleteStr*(s: var string, first, last: int) ## Deletes in `s` the characters at position `first`..`last`. This modifies ## `s` itself, it does not return a copy. proc toOctal*(c: char): string ## Converts a character `c` to its octal representation. The resulting ## string may not have a leading zero. Its length is always exactly 3. iterator split*(s: string, seps: set[char] = Whitespace): string = ## Splits the string `s` into substrings. ## ## Substrings are separated by a substring containing only `seps`. ## Examples: ## ## .. code-block:: nimrod ## for word in split(" this is an example "): ## writeln(stdout, word) ## ## Results in: ## ## .. code-block:: nimrod ## "this" ## "is" ## "an" ## "example" ## ## for word in split(";;this;is;an;;example;;;", {';'}): ## writeln(stdout, word) ## ## produces in the same output. var first: int = 0 last: int = 0 assert(not ('\0' in seps)) while last < len(s): while s[last] in seps: inc(last) first = last while last < len(s) and s[last] not_in seps: inc(last) # BUGFIX! yield copy(s, first, last-1) iterator split*(s: string, sep: char): string = ## Splits the string `s` into substrings. ## ## Substrings are separated by the character `sep`. ## Example: ## ## .. code-block:: nimrod ## for word in split(";;this;is;an;;example;;;", ';'): ## writeln(stdout, word) ## ## Results in: ## ## .. code-block:: nimrod ## "" ## "" ## "this" ## "is" ## "an" ## "" ## "example" ## "" ## "" ## "" ## var last = 0 assert('\0' != sep) if len(s) > 0: # `<=` is correct here for the edge cases! while last <= len(s): var first = last while last < len(s) and s[last] != sep: inc(last) yield copy(s, first, last-1) inc(last) iterator splitLines*(s: string): string = ## Splits the string `s` into its containing lines. Each newline ## combination (CR, LF, CR-LF) is supported. The result strings contain ## no trailing ``\n``. ## ## Example: ## ## .. code-block:: nimrod ## for line in lines("\nthis\nis\nan\n\nexample\n"): ## writeln(stdout, line) ## ## Results in: ## ## .. code-block:: nimrod ## "" ## "this" ## "is" ## "an" ## "" ## "example" ## "" var first = 0 var last = 0 while true: while s[last] notin {'\0', '\c', '\l'}: inc(last) yield copy(s, first, last-1) # skip newlines: if s[last] == '\l': inc(last) elif s[last] == '\c': inc(last) if s[last] == '\l': inc(last) else: break # was '\0' first = last proc splitLinesSeq*(s: string): seq[string] {.noSideEffect.} = ## The same as `split`, but is a proc that returns a sequence of substrings. result = @[] for line in splitLines(s): add(result, line) proc splitSeq*(s: string, seps: set[char] = Whitespace): seq[string] {. noSideEffect.} ## The same as `split`, but is a proc that returns a sequence of substrings. proc splitSeq*(s: string, sep: char): seq[string] {.noSideEffect.} = ## The same as `split`, but is a proc that returns a sequence of substrings. result = @[] for sub in split(s, sep): add(result, sub) proc cmpIgnoreCase*(a, b: string): int {.noSideEffect.} ## Compares two strings in a case insensitive manner. Returns: ## ## | 0 iff a == b ## | < 0 iff a < b ## | > 0 iff a > b proc cmpIgnoreStyle*(a, b: string): int {.noSideEffect.} ## Compares two strings normalized (i.e. case and ## underscores do not matter). Returns: ## ## | 0 iff a == b ## | < 0 iff a < b ## | > 0 iff a > b proc contains*(s: string, c: char): bool {.noSideEffect.} ## Same as ``findSubStr(c, s) >= 0``. proc contains*(s, sub: string): bool {.noSideEffect.} ## Same as ``findSubStr(sub, s) >= 0``. proc contains*(s: string, chars: set[char]): bool {.noSideEffect.} ## Same as ``findChars(s, chars) >= 0``. proc toHex*(x: BiggestInt, len: int): string {.noSideEffect.} ## Converts `x` to its hexadecimal representation. The resulting string ## will be exactly `len` characters long. No prefix like ``0x`` ## is generated. `x` is treated as unsigned value. proc intToStr*(x: int, minchars: int = 1): string ## Converts `x` to its decimal representation. The resulting string ## will be minimally `minchars` characters long. This is achieved by ## adding leading zeros. proc ParseInt*(s: string): int {.noSideEffect.} ## Parses a decimal integer value contained in `s`. If `s` is not ## a valid integer, `EInvalidValue` is raised. # XXX: make this biggestint! proc ParseBiggestInt*(s: string): biggestInt {.noSideEffect.} ## Parses a decimal integer value contained in `s`. If `s` is not ## a valid integer, `EInvalidValue` is raised. proc ParseFloat*(s: string, start = 0): float {.noSideEffect.} ## Parses a decimal floating point value contained in `s`. If `s` is not ## a valid floating point number, `EInvalidValue` is raised. ``NAN``, ## ``INF``, ``-INF`` are also supported (case insensitive comparison). # XXX: make this biggestfloat. # the stringify and format operators: proc toString*[Ty](x: Ty): string ## This generic proc is the same as the stringify operator `$`. proc repeatChar*(count: int, c: Char = ' '): string ## Returns a string of length `count` consisting only of ## the character `c`. proc startsWith*(s, prefix: string): bool {.noSideEffect.} ## Returns true iff ``s`` starts with ``prefix``. ## If ``prefix == ""`` true is returned. proc endsWith*(s, suffix: string): bool {.noSideEffect.} ## Returns true iff ``s`` ends with ``suffix``. ## If ``suffix == ""`` true is returned. proc addSep*(dest: var string, sep = ", ", startLen = 0) {.noSideEffect, inline.} = ## A shorthand for: ## ## .. code-block:: nimrod ## if dest.len > startLen: add(dest, sep) ## ## This is often useful for generating some code where the items need to ## be *separated* by `sep`. `sep` is only added if `dest` is longer than ## `startLen`. The following example creates a string describing ## an array of integers: ## ## .. code-block:: nimrod ## var arr = "[" ## for x in items([2, 3, 5, 7, 11]): ## addSep(arr, startLen=len("[")) ## add(arr, $x) ## add(arr, "]") if dest.len > startLen: add(dest, sep) proc allCharsInSet*(s: string, theSet: TCharSet): bool = ## returns true iff each character of `s` is in the set `theSet`. for c in items(s): if not (c in theSet): return false return true proc quoteIfContainsWhite*(s: string): string = ## returns ``'"' & s & '"'`` if `s` contains a space and does not ## start with a quote, else returns `s` if find(s, {' ', '\t'}) >= 0 and s[0] != '"': result = '"' & s & '"' else: result = s proc startsWith(s, prefix: string): bool = var i = 0 while true: if prefix[i] == '\0': return true if s[i] != prefix[i]: return false inc(i) proc endsWith(s, suffix: string): bool = var i = 0 j = len(s) - len(suffix) while true: if suffix[i] == '\0': return true if s[i+j] != suffix[i]: return false inc(i) proc repeatChar(count: int, c: Char = ' '): string = result = newString(count) for i in 0..count-1: result[i] = c proc intToStr(x: int, minchars: int = 1): string = result = $abs(x) for i in 1 .. minchars - len(result): result = '0' & result if x < 0: result = '-' & result proc toString[Ty](x: Ty): string = return $x proc toOctal(c: char): string = result = newString(3) var val = ord(c) for i in countdown(2, 0): result[i] = Chr(val mod 8 + ord('0')) val = val div 8 proc `%`(formatstr: string, a: string): string = return formatstr % [a] proc findNormalized(x: string, inArray: openarray[string]): int = var i = 0 while i < high(inArray): if cmpIgnoreStyle(x, inArray[i]) == 0: return i inc(i, 2) # incrementing by 1 would probably result in a # security whole ... return -1 proc addf(s: var string, formatstr: string, a: openarray[string]) = const PatternChars = {'a'..'z', 'A'..'Z', '0'..'9', '\128'..'\255', '_'} var i = 0 while i < len(formatstr): if formatstr[i] == '$': case formatstr[i+1] # again we use the fact that strings # are zero-terminated here of '$': add s, '$' inc(i, 2) of '1'..'9': var j = 0 inc(i) # skip $ while formatstr[i] in {'0'..'9'}: j = j * 10 + ord(formatstr[i]) - ord('0') inc(i) add s, a[j - 1] of '{': var j = i+1 while formatstr[j] notin {'\0', '}'}: inc(j) var x = findNormalized(copy(formatstr, i+2, j-1), a) if x >= 0 and x < high(a): add s, a[x+1] else: raise newException(EInvalidValue, "invalid format string") i = j+1 of 'a'..'z', 'A'..'Z', '\128'..'\255', '_': var j = i+1 while formatstr[j] in PatternChars: inc(j) var x = findNormalized(copy(formatstr, i+1, j-1), a) if x >= 0 and x < high(a): add s, a[x+1] else: raise newException(EInvalidValue, "invalid format string") i = j else: raise newException(EInvalidValue, "invalid format string") else: add s, formatstr[i] inc(i) proc `%`(formatstr: string, a: openarray[string]): string = result = "" addf(result, formatstr, a) proc cmpIgnoreCase(a, b: string): int = # makes usage of the fact that strings are zero-terminated for i in 0..len(a)-1: var aa = toLower(a[i]) var bb = toLower(b[i]) result = ord(aa) - ord(bb) if result != 0: break {.push checks: off, line_trace: off .} # this is a hot-spot in the compiler! # thus we compile without checks here proc cmpIgnoreStyle(a, b: string): int = var i = 0 var j = 0 while True: while a[i] == '_': inc(i) while b[j] == '_': inc(j) # BUGFIX: typo var aa = toLower(a[i]) var bb = toLower(b[j]) result = ord(aa) - ord(bb) if result != 0 or aa == '\0': break inc(i) inc(j) {.pop.} # ---------- splitting ----------------------------------------------------- proc splitSeq(s: string, seps: set[char]): seq[string] = result = @[] for sub in split(s, seps): add result, sub # --------------------------------------------------------------------------- proc strip(s: string, leading = true, trailing = true): string = const chars: set[Char] = Whitespace var first = 0 last = len(s)-1 if leading: while s[first] in chars: inc(first) if trailing: while last >= 0 and s[last] in chars: dec(last) result = copy(s, first, last) proc toLower(c: Char): Char = if c in {'A'..'Z'}: result = chr(ord(c) + (ord('a') - ord('A'))) else: result = c proc toLower(s: string): string = result = newString(len(s)) for i in 0..len(s) - 1: result[i] = toLower(s[i]) proc toUpper(c: Char): Char = if c in {'a'..'z'}: result = Chr(Ord(c) - (Ord('a') - Ord('A'))) else: result = c proc toUpper(s: string): string = result = newString(len(s)) for i in 0..len(s) - 1: result[i] = toUpper(s[i]) proc capitalize(s: string): string = result = toUpper(s[0]) & copy(s, 1) proc normalize(s: string): string = result = "" for i in 0..len(s) - 1: if s[i] in {'A'..'Z'}: add result, Chr(Ord(s[i]) + (Ord('a') - Ord('A'))) elif s[i] != '_': add result, s[i] type TSkipTable = array[Char, int] proc preprocessSub(sub: string, a: var TSkipTable) = var m = len(sub) for i in 0..0xff: a[chr(i)] = m+1 for i in 0..m-1: a[sub[i]] = m-i proc findSubStrAux(s, sub: string, start: int, a: TSkipTable): int = # fast "quick search" algorithm: var m = len(sub) n = len(s) # search: var j = start while j <= n - m: block match: for k in 0..m-1: if sub[k] != s[k+j]: break match return j inc(j, a[s[j+m]]) return -1 proc findSubStr(sub, s: string, start: int = 0): int = var a: TSkipTable preprocessSub(sub, a) result = findSubStrAux(s, sub, start, a) # slow linear search: #var # i, j, M, N: int #M = len(sub) #N = len(s) #i = start #j = 0 #if i >= N: # result = -1 #else: # while True: # if s[i] == sub[j]: # Inc(i) # Inc(j) # else: # i = i - j + 1 # j = 0 # if (j >= M): # return i - M # elif (i >= N): # return -1 proc find(s, sub: string, start: int = 0): int = var a: TSkipTable preprocessSub(sub, a) result = findSubStrAux(s, sub, start, a) proc find(s: string, sub: char, start: int = 0): int = for i in start..len(s)-1: if sub == s[i]: return i return -1 proc find(s: string, chars: set[char], start: int = 0): int = for i in start..s.len-1: if s[i] in chars: return i return -1 proc findSubStr(sub: char, s: string, start: int = 0): int = for i in start..len(s)-1: if sub == s[i]: return i return -1 proc findChars(chars: set[char], s: string, start: int = 0): int = for i in start..s.len-1: if s[i] in chars: return i return -1 proc contains(s: string, chars: set[char]): bool = return find(s, chars) >= 0 proc contains(s: string, c: char): bool = return find(s, c) >= 0 proc contains(s, sub: string): bool = return find(s, sub) >= 0 proc replaceStr(s, sub, by: string): string = var a: TSkipTable result = "" preprocessSub(sub, a) var i = 0 while true: var j = findSubStrAux(s, sub, i, a) if j < 0: break add result, copy(s, i, j - 1) add result, by i = j + len(sub) # copy the rest: add result, copy(s, i) proc replaceStr(s: string, sub, by: char): string = result = newString(s.len) var i = 0 while i < s.len: if s[i] == sub: result[i] = by else: result[i] = s[i] inc(i) proc deleteStr(s: var string, first, last: int) = # example: "abc___uvwxyz\0" (___ is to be deleted) # --> first == 3, last == 5 # s[first..] = s[last+1..] var i = first while last+i+1 < len(s): s[i] = s[last+i+1] inc(i) setlen(s, len(s)-(last-first+1)) # parsing numbers: proc toHex(x: BiggestInt, len: int): string = const HexChars = "0123456789ABCDEF" var shift: BiggestInt result = newString(len) for j in countdown(len-1, 0): result[j] = HexChars[toU32(x shr shift) and 0xF'i32] shift = shift + 4 {.push overflowChecks: on.} # this must be compiled with overflow checking turned on: proc rawParseInt(s: string, index: var int): BiggestInt = # index contains the start position at proc entry; end position will be # an index before the proc returns; index = -1 on error (no number at all) # the problem here is that integers have an asymmetrical range: there is # one more valid negative than prositive integer. Thus we perform the # computation as a negative number and then change the sign at the end. var i: int = index # a local i is more efficient than accessing a var parameter sign: BiggestInt = -1 if s[i] == '+': inc(i) elif s[i] == '-': inc(i) sign = 1 if s[i] in {'0'..'9'}: result = 0 while s[i] in {'0'..'9'}: result = result * 10 - (ord(s[i]) - ord('0')) inc(i) while s[i] == '_': inc(i) # underscores are allowed and ignored result = result * sign if s[i] == '\0': index = i # store index back else: index = -1 # BUGFIX: error! else: index = -1 {.pop.} # overflowChecks proc parseInt(s: string): int = var index: int = 0 res = rawParseInt(s, index) if index == -1: raise newException(EInvalidValue, "invalid integer: " & s) elif (sizeof(int) <= 4) and ((res < low(int)) or (res > high(int))): raise newException(EOverflow, "overflow") else: result = int(res) # convert to smaller integer type proc ParseBiggestInt(s: string): biggestInt = var index = 0 result = rawParseInt(s, index) if index == -1: raise newException(EInvalidValue, "invalid integer: " & s) proc ParseFloat(s: string, start = 0): float = var esign = 1.0 sign = 1.0 i = start exponent: int flags: int result = 0.0 if s[i] == '+': inc(i) elif s[i] == '-': sign = -1.0 inc(i) if s[i] == 'N' or s[i] == 'n': if s[i+1] == 'A' or s[i+1] == 'a': if s[i+2] == 'N' or s[i+2] == 'n': if s[i+3] == '\0': return NaN raise newException(EInvalidValue, "invalid float: " & s) if s[i] == 'I' or s[i] == 'i': if s[i+1] == 'N' or s[i+1] == 'n': if s[i+2] == 'F' or s[i+2] == 'f': if s[i+3] == '\0': return Inf*sign raise newException(EInvalidValue, "invalid float: " & s) while s[i] in {'0'..'9'}: # Read integer part flags = flags or 1 result = result * 10.0 + toFloat(ord(s[i]) - ord('0')) inc(i) while s[i] == '_': inc(i) # Decimal? if s[i] == '.': var hd = 1.0 inc(i) while s[i] in {'0'..'9'}: # Read fractional part flags = flags or 2 result = result * 10.0 + toFloat(ord(s[i]) - ord('0')) hd = hd * 10.0 inc(i) while s[i] == '_': inc(i) result = result / hd # this complicated way preserves precision # Again, read integer and fractional part if flags == 0: raise newException(EInvalidValue, "invalid float: " & s) # Exponent? if s[i] in {'e', 'E'}: inc(i) if s[i] == '+': inc(i) elif s[i] == '-': esign = -1.0 inc(i) if s[i] notin {'0'..'9'}: raise newException(EInvalidValue, "invalid float: " & s) while s[i] in {'0'..'9'}: exponent = exponent * 10 + ord(s[i]) - ord('0') inc(i) while s[i] == '_': inc(i) # Calculate Exponent var hd = 1.0 for j in 1..exponent: hd = hd * 10.0 if esign > 0.0: result = result * hd else: result = result / hd # Not all characters are read? if s[i] != '\0': raise newException(EInvalidValue, "invalid float: " & s) # evaluate sign result = result * sign proc toOct*(x: BiggestInt, len: int): string = ## converts `x` into its octal representation. The resulting string is ## always `len` characters long. No leading ``0o`` prefix is generated. var mask: BiggestInt = 7 shift: BiggestInt = 0 assert(len > 0) result = newString(len) for j in countdown(len-1, 0): result[j] = chr(int((x and mask) shr shift) + ord('0')) shift = shift + 3 mask = mask shl 3 proc toBin*(x: BiggestInt, len: int): string = ## converts `x` into its binary representation. The resulting string is ## always `len` characters long. No leading ``0b`` prefix is generated. var mask: BiggestInt = 1 shift: BiggestInt = 0 assert(len > 0) result = newString(len) for j in countdown(len-1, 0): result[j] = chr(int((x and mask) shr shift) + ord('0')) shift = shift + 1 mask = mask shl 1 proc escape*(s: string, prefix = "\"", suffix = "\""): string = ## Escapes a string `s`. This does these operations (at the same time): ## * replaces any ``\`` by ``\\`` ## * replaces any ``'`` by ``\'`` ## * replaces any ``"`` by ``\"`` ## * replaces any other character in the set ``{'\0'..'\31', '\128'..'\255'}`` ## by ``\xHH`` where ``HH`` is its hexadecimal value. ## The procedure has been designed so that its output is usable for many ## different common syntaxes. The resulting string is prefixed with ## ``prefix`` and suffixed with ``suffix``. Both may be empty strings. result = prefix for c in items(s): case c of '\0'..'\31', '\128'..'\255': add(result, '\\') add(result, toHex(ord(c), 2)) of '\\': add(result, "\\\\") of '\'': add(result, "\\'") of '\"': add(result, "\\\"") else: add(result, c) add(result, suffix) proc validEmailAddress*(s: string): bool = ## returns true if `s` seems to be a valid e-mail address. ## The checking also uses a domain list. const chars = Letters + Digits + {'!','#','$','%','&', '\'','*','+','/','=','?','^','_','`','{','}','|','~','-','.'} var i = 0 if s[i] notin chars or s[i] == '.': return false while s[i] in chars: if s[i] == '.' and s[i+1] == '.': return false inc(i) if s[i] != '@': return false var j = len(s)-1 if s[j] notin letters: return false while j >= i and s[j] in letters: dec(j) inc(i) # skip '@' while s[i] in {'0'..'9', 'a'..'z', '-', '.'}: inc(i) if s[i] != '\0': return false var x = copy(s, j+1) if len(x) == 2 and x[0] in Letters and x[1] in Letters: return true case toLower(x) of "com", "org", "net", "gov", "mil", "biz", "info", "mobi", "name", "aero", "jobs", "museum": return true return false proc editDistance*(a, b: string): int = ## returns the edit distance between `a` and `b`. This uses the Levenshtein ## distance algorithm with only a linear memory overhead. This implementation ## is highly optimized! var len1 = a.len var len2 = b.len if len1 > len2: # make `b` the longer string return editDistance(b, a) # strip common prefix: var s = 0 while a[s] == b[s] and a[s] != '\0': inc(s) dec(len1) dec(len2) # strip common suffix: while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]: dec(len1) dec(len2) # trivial cases: if len1 == 0: return len2 if len2 == 0: return len1 # another special case: if len1 == 1: for j in s..len2-1: if a[s] == b[j]: return len2 - 1 return len2 inc(len1) inc(len2) var half = len1 shr 1 # initalize first row: #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2 * sizeof(int))) var row: seq[int] newSeq(row, len2) var e = s + len2 - 1 # end marker for i in 1..len2 - half - 1: row[i] = i row[0] = len1 - half - 1 for i in 1 .. len1 - 1: var char1 = a[i + s - 1] var char2p: int var D, x: int var p: int if i >= len1 - half: # skip the upper triangle: var offset = i - len1 + half char2p = offset p = offset var c3 = row[p] + ord(char1 != b[s + char2p]) inc(p) inc(char2p) x = row[p] + 1 D = x if x > c3: x = c3 row[p] = x inc(p) else: p = 1 char2p = 0 D = i x = i if i <= half + 1: # skip the lower triangle: e = len2 + i - half - 2 # main: while p <= e: dec(D) var c3 = D + ord(char1 != b[char2p + s]) inc(char2p) inc(x) if x > c3: x = c3 D = row[p] + 1 if x > D: x = D row[p] = x inc(p) # lower triangle sentinel: if i <= half: dec(D) var c3 = D + ord(char1 != b[char2p + s]) inc(x) if x > c3: x = c3 row[p] = x result = row[e] #dealloc(row) {.pop.}