#
#
# Nim's Runtime Library
# (c) Copyright 2018 Nim contributors
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## This module implements an algorithm to compute the
## `edit distance`:idx: between two Unicode strings.
import unicode
proc editDistance*(a, b: string): int {.noSideEffect.} =
## Returns the **unicode-rune** edit distance between `a` and `b`.
##
## This uses the `Levenshtein`:idx: distance algorithm with only a linear
## memory overhead.
runnableExamples: static: doAssert editdistance("Kitten", "Bitten") == 1
if len(a) > len(b):
# make `b` the longer string
return editDistance(b, a)
# strip common prefix
var
iStart = 0 ## The character starting index of the first rune in both strings `a` and `b`
iNextA = 0
iNextB = 0
runeA, runeB: Rune
lenRunesA = 0 ## The number of relevant runes in string `a`.
lenRunesB = 0 ## The number of relevant runes in string `b`.
block commonPrefix:
# `a` is the shorter string
while iStart < len(a):
iNextA = iStart
a.fastRuneAt(iNextA, runeA, doInc = true)
iNextB = iStart
b.fastRuneAt(iNextB, runeB, doInc = true)
if runeA != runeB:
inc(lenRunesA)
inc(lenRunesB)
break
iStart = iNextA
var
# we know that we are either at the start of the strings
# or that the current value of runeA is not equal to runeB
# => start search for common suffix after the current rune (`i_next_*`)
iEndA = iNextA ## The exclusive upper index bound of string `a`.
iEndB = iNextB ## The exclusive upper index bound of string `b`.
iCurrentA = iNextA
iCurrentB = iNextB
block commonSuffix:
var
addRunesA = 0
addRunesB = 0
while iCurrentA < len(a) and iCurrentB < len(b):
iNextA = iCurrentA
a.fastRuneAt(iNextA, runeA)
iNextB = iCurrentB
b.fastRuneAt(iNextB, runeB)
inc(addRunesA)
inc(addRunesB)
if runeA != runeB:
iEndA = iNextA
iEndB = iNextB
inc(lenRunesA, addRunesA)
inc(lenRunesB, addRunesB)
addRunesA = 0
addRunesB = 0
iCurrentA = iNextA
iCurrentB = iNextB
if iCurrentA >= len(a): # `a` exhausted
if iCurrentB < len(b): # `b` not exhausted
iEndA = iCurrentA
iEndB = iCurrentB
inc(lenRunesA, addRunesA)
inc(lenRunesB, addRunesB)
while true:
b.fastRuneAt(iEndB, runeB)
inc(lenRunesB)
if iEndB >= len(b): break
elif iCurrentB >= len(b): # `b` exhausted and `a` not exhausted
iEndA = iCurrentA
iEndB = iCurrentB
inc(lenRunesA, addRunesA)
inc(lenRunesB, addRunesB)
while true:
a.fastRuneAt(iEndA, runeA)
inc(lenRunesA)
if iEndA >= len(a): break
block specialCases:
# trivial cases:
if lenRunesA == 0: return lenRunesB
if lenRunesB == 0: return lenRunesA
# another special case:
if lenRunesA == 1:
a.fastRuneAt(iStart, runeA, doInc = false)
var iCurrentB = iStart
while iCurrentB < iEndB:
b.fastRuneAt(iCurrentB, runeB, doInc = true)
if runeA == runeB: return lenRunesB - 1
return lenRunesB
# common case:
var
len1 = lenRunesA + 1
len2 = lenRunesB + 1
row: seq[int]
let half = lenRunesA div 2
newSeq(row, len2)
var e = iStart + len2 - 1 # end marker
# initialize first row:
for i in 1 .. (len2 - half - 1): row[i] = i
row[0] = len1 - half - 1
iCurrentA = iStart
var
char2pI = -1
char2pPrev: int
for i in 1 .. (len1 - 1):
iNextA = iCurrentA
a.fastRuneAt(iNextA, runeA)
var
char2p: int
diff, x: int
p: int
if i >= (len1 - half):
# skip the upper triangle:
let offset = i + half - len1
if char2pI == i:
b.fastRuneAt(char2pPrev, runeB)
char2p = char2pPrev
char2pI = i + 1
else:
char2p = iStart
for j in 0 ..< offset:
runeB = b.runeAt(char2p)
inc(char2p, runeB.size)
char2pI = i + 1
char2pPrev = char2p
p = offset
runeB = b.runeAt(char2p)
var c3 = row[p] + (if runeA != runeB: 1 else: 0)
inc(char2p, runeB.size)
inc(p)
x = row[p] + 1
diff = x
if x > c3: x = c3
row[p] = x
inc(p)
else:
p = 1
char2p = iStart
diff = i
x = i
if i <= (half + 1):
# skip the lower triangle:
e = len2 + i - half - 2
# main:
while p <= e:
dec(diff)
runeB = b.runeAt(char2p)
var c3 = diff + (if runeA != runeB: 1 else: 0)
inc(char2p, runeB.size)
inc(x)
if x > c3: x = c3
diff = row[p] + 1
if x > diff: x = diff
row[p] = x
inc(p)
# lower triangle sentinel:
if i <= half:
dec(diff)
runeB = b.runeAt(char2p)
var c3 = diff + (if runeA != runeB: 1 else: 0)
inc(x)
if x > c3: x = c3
row[p] = x
iCurrentA = iNextA
result = row[e]
proc editDistanceAscii*(a, b: string): int {.noSideEffect.} =
## Returns the edit distance between `a` and `b`.
##
## This uses the `Levenshtein`:idx: distance algorithm with only a linear
## memory overhead.
runnableExamples: static: doAssert editDistanceAscii("Kitten", "Bitten") == 1
var len1 = a.len
var len2 = b.len
if len1 > len2:
# make `b` the longer string
return editDistanceAscii(b, a)
# strip common prefix:
var s = 0
while s < len1 and a[s] == b[s]:
inc(s)
dec(len1)
dec(len2)
# strip common suffix:
while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]:
dec(len1)
dec(len2)
# trivial cases:
if len1 == 0: return len2
if len2 == 0: return len1
# another special case:
if len1 == 1:
for j in s..s+len2-1:
if a[s] == b[j]: return len2 - 1
return len2
inc(len1)
inc(len2)
var half = len1 shr 1
# initialize first row:
#var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int)))
var row: seq[int]
newSeq(row, len2)
var e = s + len2 - 1 # end marker
for i in 1..len2 - half - 1: row[i] = i
row[0] = len1 - half - 1
for i in 1 .. len1 - 1:
var char1 = a[i + s - 1]
var char2p: int
var diff, x: int
var p: int
if i >= len1 - half:
# skip the upper triangle:
var offset = i - len1 + half
char2p = offset
p = offset
var c3 = row[p] + ord(char1 != b[s + char2p])
inc(p)
inc(char2p)
x = row[p] + 1
diff = x
if x > c3: x = c3
row[p] = x
inc(p)
else:
p = 1
char2p = 0
diff = i
x = i
if i <= half + 1:
# skip the lower triangle:
e = len2 + i - half - 2
# main:
while p <= e:
dec(diff)
var c3 = diff + ord(char1 != b[char2p + s])
inc(char2p)
inc(x)
if x > c3: x = c3
diff = row[p] + 1
if x > diff: x = diff
row[p] = x
inc(p)
# lower triangle sentinel:
if i <= half:
dec(diff)
var c3 = diff + ord(char1 != b[char2p + s])
inc(x)
if x > c3: x = c3
row[p] = x
result = row[e]