diff options
Diffstat (limited to 'lib/std/editdistance.nim')
-rw-r--r-- | lib/std/editdistance.nim | 266 |
1 files changed, 266 insertions, 0 deletions
diff --git a/lib/std/editdistance.nim b/lib/std/editdistance.nim new file mode 100644 index 000000000..40c0017ae --- /dev/null +++ b/lib/std/editdistance.nim @@ -0,0 +1,266 @@ +# +# +# Nim's Runtime Library +# (c) Copyright 2018 Nim contributors +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module implements an algorithm to compute the +## `edit distance`:idx: between two Unicode strings. + +import std/unicode + +proc editDistance*(a, b: string): int {.noSideEffect.} = + ## Returns the **unicode-rune** edit distance between `a` and `b`. + ## + ## This uses the `Levenshtein`:idx: distance algorithm with only a linear + ## memory overhead. + runnableExamples: static: doAssert editdistance("Kitten", "Bitten") == 1 + if runeLen(a) > runeLen(b): + # make `b` the longer string + return editDistance(b, a) + # strip common prefix + var + iStart = 0 ## The character starting index of the first rune in both strings `a` and `b` + iNextA = 0 + iNextB = 0 + runeA, runeB: Rune + lenRunesA = 0 ## The number of relevant runes in string `a`. + lenRunesB = 0 ## The number of relevant runes in string `b`. + block commonPrefix: + # `a` is the shorter string + while iStart < len(a): + iNextA = iStart + a.fastRuneAt(iNextA, runeA, doInc = true) + iNextB = iStart + b.fastRuneAt(iNextB, runeB, doInc = true) + if runeA != runeB: + inc(lenRunesA) + inc(lenRunesB) + break + iStart = iNextA + var + # we know that we are either at the start of the strings + # or that the current value of runeA is not equal to runeB + # => start search for common suffix after the current rune (`i_next_*`) + iEndA = iNextA ## The exclusive upper index bound of string `a`. + iEndB = iNextB ## The exclusive upper index bound of string `b`. + iCurrentA = iNextA + iCurrentB = iNextB + block commonSuffix: + var + addRunesA = 0 + addRunesB = 0 + while iCurrentA < len(a) and iCurrentB < len(b): + iNextA = iCurrentA + a.fastRuneAt(iNextA, runeA) + iNextB = iCurrentB + b.fastRuneAt(iNextB, runeB) + inc(addRunesA) + inc(addRunesB) + if runeA != runeB: + iEndA = iNextA + iEndB = iNextB + inc(lenRunesA, addRunesA) + inc(lenRunesB, addRunesB) + addRunesA = 0 + addRunesB = 0 + iCurrentA = iNextA + iCurrentB = iNextB + if iCurrentA >= len(a): # `a` exhausted + if iCurrentB < len(b): # `b` not exhausted + iEndA = iCurrentA + iEndB = iCurrentB + inc(lenRunesA, addRunesA) + inc(lenRunesB, addRunesB) + while true: + b.fastRuneAt(iEndB, runeB) + inc(lenRunesB) + if iEndB >= len(b): break + elif iCurrentB >= len(b): # `b` exhausted and `a` not exhausted + iEndA = iCurrentA + iEndB = iCurrentB + inc(lenRunesA, addRunesA) + inc(lenRunesB, addRunesB) + while true: + a.fastRuneAt(iEndA, runeA) + inc(lenRunesA) + if iEndA >= len(a): break + block specialCases: + # trivial cases: + if lenRunesA == 0: return lenRunesB + if lenRunesB == 0: return lenRunesA + # another special case: + if lenRunesA == 1: + a.fastRuneAt(iStart, runeA, doInc = false) + var iCurrentB = iStart + while iCurrentB < iEndB: + b.fastRuneAt(iCurrentB, runeB, doInc = true) + if runeA == runeB: return lenRunesB - 1 + return lenRunesB + # common case: + var + len1 = lenRunesA + 1 + len2 = lenRunesB + 1 + row: seq[int] + let half = lenRunesA div 2 + newSeq(row, len2) + var e = iStart + len2 - 1 # end marker + # initialize first row: + for i in 1 .. (len2 - half - 1): row[i] = i + row[0] = len1 - half - 1 + iCurrentA = iStart + var + char2pI = -1 + char2pPrev: int + for i in 1 .. (len1 - 1): + iNextA = iCurrentA + a.fastRuneAt(iNextA, runeA) + var + char2p: int + diff, x: int + p: int + if i >= (len1 - half): + # skip the upper triangle: + let offset = i + half - len1 + if char2pI == i: + b.fastRuneAt(char2pPrev, runeB) + char2p = char2pPrev + char2pI = i + 1 + else: + char2p = iStart + for j in 0 ..< offset: + runeB = b.runeAt(char2p) + inc(char2p, runeB.size) + char2pI = i + 1 + char2pPrev = char2p + p = offset + runeB = b.runeAt(char2p) + var c3 = row[p] + (if runeA != runeB: 1 else: 0) + inc(char2p, runeB.size) + inc(p) + x = row[p] + 1 + diff = x + if x > c3: x = c3 + row[p] = x + inc(p) + else: + p = 1 + char2p = iStart + diff = i + x = i + if i <= (half + 1): + # skip the lower triangle: + e = len2 + i - half - 2 + # main: + while p <= e: + dec(diff) + runeB = b.runeAt(char2p) + var c3 = diff + (if runeA != runeB: 1 else: 0) + inc(char2p, runeB.size) + inc(x) + if x > c3: x = c3 + diff = row[p] + 1 + if x > diff: x = diff + row[p] = x + inc(p) + # lower triangle sentinel: + if i <= half: + dec(diff) + runeB = b.runeAt(char2p) + var c3 = diff + (if runeA != runeB: 1 else: 0) + inc(x) + if x > c3: x = c3 + row[p] = x + iCurrentA = iNextA + result = row[e] + +proc editDistanceAscii*(a, b: string): int {.noSideEffect.} = + ## Returns the edit distance between `a` and `b`. + ## + ## This uses the `Levenshtein`:idx: distance algorithm with only a linear + ## memory overhead. + runnableExamples: static: doAssert editDistanceAscii("Kitten", "Bitten") == 1 + var len1 = a.len + var len2 = b.len + if len1 > len2: + # make `b` the longer string + return editDistanceAscii(b, a) + + # strip common prefix: + var s = 0 + while s < len1 and a[s] == b[s]: + inc(s) + dec(len1) + dec(len2) + # strip common suffix: + while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]: + dec(len1) + dec(len2) + # trivial cases: + if len1 == 0: return len2 + if len2 == 0: return len1 + + # another special case: + if len1 == 1: + for j in s..s+len2-1: + if a[s] == b[j]: return len2 - 1 + return len2 + + inc(len1) + inc(len2) + var half = len1 shr 1 + # initialize first row: + #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int))) + var row: seq[int] + newSeq(row, len2) + var e = s + len2 - 1 # end marker + for i in 1..len2 - half - 1: row[i] = i + row[0] = len1 - half - 1 + for i in 1 .. len1 - 1: + var char1 = a[i + s - 1] + var char2p: int + var diff, x: int + var p: int + if i >= len1 - half: + # skip the upper triangle: + var offset = i - len1 + half + char2p = offset + p = offset + var c3 = row[p] + ord(char1 != b[s + char2p]) + inc(p) + inc(char2p) + x = row[p] + 1 + diff = x + if x > c3: x = c3 + row[p] = x + inc(p) + else: + p = 1 + char2p = 0 + diff = i + x = i + if i <= half + 1: + # skip the lower triangle: + e = len2 + i - half - 2 + # main: + while p <= e: + dec(diff) + var c3 = diff + ord(char1 != b[char2p + s]) + inc(char2p) + inc(x) + if x > c3: x = c3 + diff = row[p] + 1 + if x > diff: x = diff + row[p] = x + inc(p) + # lower triangle sentinel: + if i <= half: + dec(diff) + var c3 = diff + ord(char1 != b[char2p + s]) + inc(x) + if x > c3: x = c3 + row[p] = x + result = row[e] |