summary refs log tree commit diff stats
path: root/lib/std/editdistance.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/std/editdistance.nim')
-rw-r--r--lib/std/editdistance.nim266
1 files changed, 266 insertions, 0 deletions
diff --git a/lib/std/editdistance.nim b/lib/std/editdistance.nim
new file mode 100644
index 000000000..40c0017ae
--- /dev/null
+++ b/lib/std/editdistance.nim
@@ -0,0 +1,266 @@
+#
+#
+#            Nim's Runtime Library
+#        (c) Copyright 2018 Nim contributors
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements an algorithm to compute the
+## `edit distance`:idx: between two Unicode strings.
+
+import std/unicode
+
+proc editDistance*(a, b: string): int {.noSideEffect.} =
+  ## Returns the **unicode-rune** edit distance between `a` and `b`.
+  ##
+  ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
+  ## memory overhead.
+  runnableExamples: static: doAssert editdistance("Kitten", "Bitten") == 1
+  if runeLen(a) > runeLen(b):
+    # make `b` the longer string
+    return editDistance(b, a)
+  # strip common prefix
+  var
+    iStart = 0 ## The character starting index of the first rune in both strings `a` and `b`
+    iNextA = 0
+    iNextB = 0
+    runeA, runeB: Rune
+    lenRunesA = 0 ## The number of relevant runes in string `a`.
+    lenRunesB = 0 ## The number of relevant runes in string `b`.
+  block commonPrefix:
+    # `a` is the shorter string
+    while iStart < len(a):
+      iNextA = iStart
+      a.fastRuneAt(iNextA, runeA, doInc = true)
+      iNextB = iStart
+      b.fastRuneAt(iNextB, runeB, doInc = true)
+      if runeA != runeB:
+        inc(lenRunesA)
+        inc(lenRunesB)
+        break
+      iStart = iNextA
+  var
+    # we know that we are either at the start of the strings
+    # or that the current value of runeA is not equal to runeB
+    # => start search for common suffix after the current rune (`i_next_*`)
+    iEndA = iNextA ## The exclusive upper index bound of string `a`.
+    iEndB = iNextB ## The exclusive upper index bound of string `b`.
+    iCurrentA = iNextA
+    iCurrentB = iNextB
+  block commonSuffix:
+    var
+      addRunesA = 0
+      addRunesB = 0
+    while iCurrentA < len(a) and iCurrentB < len(b):
+      iNextA = iCurrentA
+      a.fastRuneAt(iNextA, runeA)
+      iNextB = iCurrentB
+      b.fastRuneAt(iNextB, runeB)
+      inc(addRunesA)
+      inc(addRunesB)
+      if runeA != runeB:
+        iEndA = iNextA
+        iEndB = iNextB
+        inc(lenRunesA, addRunesA)
+        inc(lenRunesB, addRunesB)
+        addRunesA = 0
+        addRunesB = 0
+      iCurrentA = iNextA
+      iCurrentB = iNextB
+    if iCurrentA >= len(a): # `a` exhausted
+      if iCurrentB < len(b): # `b` not exhausted
+        iEndA = iCurrentA
+        iEndB = iCurrentB
+        inc(lenRunesA, addRunesA)
+        inc(lenRunesB, addRunesB)
+        while true:
+          b.fastRuneAt(iEndB, runeB)
+          inc(lenRunesB)
+          if iEndB >= len(b): break
+    elif iCurrentB >= len(b): # `b` exhausted and `a` not exhausted
+      iEndA = iCurrentA
+      iEndB = iCurrentB
+      inc(lenRunesA, addRunesA)
+      inc(lenRunesB, addRunesB)
+      while true:
+        a.fastRuneAt(iEndA, runeA)
+        inc(lenRunesA)
+        if iEndA >= len(a): break
+  block specialCases:
+    # trivial cases:
+    if lenRunesA == 0: return lenRunesB
+    if lenRunesB == 0: return lenRunesA
+    # another special case:
+    if lenRunesA == 1:
+      a.fastRuneAt(iStart, runeA, doInc = false)
+      var iCurrentB = iStart
+      while iCurrentB < iEndB:
+        b.fastRuneAt(iCurrentB, runeB, doInc = true)
+        if runeA == runeB: return lenRunesB - 1
+      return lenRunesB
+  # common case:
+  var
+    len1 = lenRunesA + 1
+    len2 = lenRunesB + 1
+    row: seq[int]
+  let half = lenRunesA div 2
+  newSeq(row, len2)
+  var e = iStart + len2 - 1 # end marker
+  # initialize first row:
+  for i in 1 .. (len2 - half - 1): row[i] = i
+  row[0] = len1 - half - 1
+  iCurrentA = iStart
+  var
+    char2pI = -1
+    char2pPrev: int
+  for i in 1 .. (len1 - 1):
+    iNextA = iCurrentA
+    a.fastRuneAt(iNextA, runeA)
+    var
+      char2p: int
+      diff, x: int
+      p: int
+    if i >= (len1 - half):
+      # skip the upper triangle:
+      let offset = i + half - len1
+      if char2pI == i:
+        b.fastRuneAt(char2pPrev, runeB)
+        char2p = char2pPrev
+        char2pI = i + 1
+      else:
+        char2p = iStart
+        for j in 0 ..< offset:
+          runeB = b.runeAt(char2p)
+          inc(char2p, runeB.size)
+        char2pI = i + 1
+        char2pPrev = char2p
+      p = offset
+      runeB = b.runeAt(char2p)
+      var c3 = row[p] + (if runeA != runeB: 1 else: 0)
+      inc(char2p, runeB.size)
+      inc(p)
+      x = row[p] + 1
+      diff = x
+      if x > c3: x = c3
+      row[p] = x
+      inc(p)
+    else:
+      p = 1
+      char2p = iStart
+      diff = i
+      x = i
+    if i <= (half + 1):
+      # skip the lower triangle:
+      e = len2 + i - half - 2
+    # main:
+    while p <= e:
+      dec(diff)
+      runeB = b.runeAt(char2p)
+      var c3 = diff + (if runeA != runeB: 1 else: 0)
+      inc(char2p, runeB.size)
+      inc(x)
+      if x > c3: x = c3
+      diff = row[p] + 1
+      if x > diff: x = diff
+      row[p] = x
+      inc(p)
+    # lower triangle sentinel:
+    if i <= half:
+      dec(diff)
+      runeB = b.runeAt(char2p)
+      var c3 = diff + (if runeA != runeB: 1 else: 0)
+      inc(x)
+      if x > c3: x = c3
+      row[p] = x
+    iCurrentA = iNextA
+  result = row[e]
+
+proc editDistanceAscii*(a, b: string): int {.noSideEffect.} =
+  ## Returns the edit distance between `a` and `b`.
+  ##
+  ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
+  ## memory overhead.
+  runnableExamples: static: doAssert editDistanceAscii("Kitten", "Bitten") == 1
+  var len1 = a.len
+  var len2 = b.len
+  if len1 > len2:
+    # make `b` the longer string
+    return editDistanceAscii(b, a)
+
+  # strip common prefix:
+  var s = 0
+  while s < len1 and a[s] == b[s]:
+    inc(s)
+    dec(len1)
+    dec(len2)
+  # strip common suffix:
+  while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]:
+    dec(len1)
+    dec(len2)
+  # trivial cases:
+  if len1 == 0: return len2
+  if len2 == 0: return len1
+
+  # another special case:
+  if len1 == 1:
+    for j in s..s+len2-1:
+      if a[s] == b[j]: return len2 - 1
+    return len2
+
+  inc(len1)
+  inc(len2)
+  var half = len1 shr 1
+  # initialize first row:
+  #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int)))
+  var row: seq[int]
+  newSeq(row, len2)
+  var e = s + len2 - 1 # end marker
+  for i in 1..len2 - half - 1: row[i] = i
+  row[0] = len1 - half - 1
+  for i in 1 .. len1 - 1:
+    var char1 = a[i + s - 1]
+    var char2p: int
+    var diff, x: int
+    var p: int
+    if i >= len1 - half:
+      # skip the upper triangle:
+      var offset = i - len1 + half
+      char2p = offset
+      p = offset
+      var c3 = row[p] + ord(char1 != b[s + char2p])
+      inc(p)
+      inc(char2p)
+      x = row[p] + 1
+      diff = x
+      if x > c3: x = c3
+      row[p] = x
+      inc(p)
+    else:
+      p = 1
+      char2p = 0
+      diff = i
+      x = i
+    if i <= half + 1:
+      # skip the lower triangle:
+      e = len2 + i - half - 2
+    # main:
+    while p <= e:
+      dec(diff)
+      var c3 = diff + ord(char1 != b[char2p + s])
+      inc(char2p)
+      inc(x)
+      if x > c3: x = c3
+      diff = row[p] + 1
+      if x > diff: x = diff
+      row[p] = x
+      inc(p)
+    # lower triangle sentinel:
+    if i <= half:
+      dec(diff)
+      var c3 = diff + ord(char1 != b[char2p + s])
+      inc(x)
+      if x > c3: x = c3
+      row[p] = x
+  result = row[e]