# # # Nim's Runtime Library # (c) Copyright 2018 Nim contributors # # See the file "copying.txt", included in this # distribution, for details about the copyright. # ## This module implements an algorithm to compute the ## `edit distance`:idx: between two Unicode strings. import unicode proc editDistance*(a, b: string): int {.noSideEffect.} = ## Returns the **unicode-rune** edit distance between ``a`` and ``b``. ## ## This uses the `Levenshtein`:idx: distance algorithm with only a linear ## memory overhead. runnableExamples: static: doAssert editdistance("Kitten", "Bitten") == 1 if len(a) > len(b): # make ``b`` the longer string return editDistance(b, a) # strip common prefix var iStart = 0 ## The character starting index of the first rune in both strings ``a`` and ``b`` iNextA = 0 iNextB = 0 runeA, runeB: Rune lenRunesA = 0 ## The number of relevant runes in string ``a``. lenRunesB = 0 ## The number of relevant runes in string ``b``. block commonPrefix: # ``a`` is the shorter string while iStart < len(a): iNextA = iStart a.fastRuneAt(iNextA, runeA, doInc = true) iNextB = iStart b.fastRuneAt(iNextB, runeB, doInc = true) if runeA != runeB: inc(lenRunesA) inc(lenRunesB) break iStart = iNextA var # we know that we are either at the start of the strings # or that the current value of runeA is not equal to runeB # => start search for common suffix after the current rune (``i_next_*``) iEndA = iNextA ## The exclusive upper index bound of string ``a``. iEndB = iNextB ## The exclusive upper index bound of string ``b``. iCurrentA = iNextA iCurrentB = iNextB block commonSuffix: var addRunesA = 0 addRunesB = 0 while iCurrentA < len(a) and iCurrentB < len(b): iNextA = iCurrentA a.fastRuneAt(iNextA, runeA) iNextB = iCurrentB b.fastRuneAt(iNextB, runeB) inc(addRunesA) inc(addRunesB) if runeA != runeB: iEndA = iNextA iEndB = iNextB inc(lenRunesA, addRunesA) inc(lenRunesB, addRunesB) addRunesA = 0 addRunesB = 0 iCurrentA = iNextA iCurrentB = iNextB if iCurrentA >= len(a): # ``a`` exhausted if iCurrentB < len(b): # ``b`` not exhausted iEndA = iCurrentA iEndB = iCurrentB inc(lenRunesA, addRunesA) inc(lenRunesB, addRunesB) while true: b.fastRuneAt(iEndB, runeB) inc(lenRunesB) if iEndB >= len(b): break elif iCurrentB >= len(b): # ``b`` exhausted and ``a`` not exhausted iEndA = iCurrentA iEndB = iCurrentB inc(lenRunesA, addRunesA) inc(lenRunesB, addRunesB) while true: a.fastRuneAt(iEndA, runeA) inc(lenRunesA) if iEndA >= len(a): break block specialCases: # trivial cases: if lenRunesA == 0: return lenRunesB if lenRunesB == 0: return lenRunesA # another special case: if lenRunesA == 1: a.fastRuneAt(iStart, runeA, doInc = false) var iCurrentB = iStart while iCurrentB < iEndB: b.fastRuneAt(iCurrentB, runeB, doInc = true) if runeA == runeB: return lenRunesB - 1 return lenRunesB # common case: var len1 = lenRunesA + 1 len2 = lenRunesB + 1 row: seq[int] let half = lenRunesA div 2 newSeq(row, len2) var e = iStart + len2 - 1 # end marker # initialize first row: for i in 1 .. (len2 - half - 1): row[i] = i row[0] = len1 - half - 1 iCurrentA = iStart var char2pI = -1 char2pPrev: int for i in 1 .. (len1 - 1): iNextA = iCurrentA a.fastRuneAt(iNextA, runeA) var char2p: int diff, x: int p: int if i >= (len1 - half): # skip the upper triangle: let offset = i + half - len1 if char2pI == i: b.fastRuneAt(char2pPrev, runeB) char2p = char2pPrev char2pI = i + 1 else: char2p = iStart for j in 0 ..< offset: runeB = b.runeAt(char2p) inc(char2p, runeB.size) char2pI = i + 1 char2pPrev = char2p p = offset runeB = b.runeAt(char2p) var c3 = row[p] + (if runeA != runeB: 1 else: 0) inc(char2p, runeB.size) inc(p) x = row[p] + 1 diff = x if x > c3: x = c3 row[p] = x inc(p) else: p = 1 char2p = iStart diff = i x = i if i <= (half + 1): # skip the lower triangle: e = len2 + i - half - 2 # main: while p <= e: dec(diff) runeB = b.runeAt(char2p) var c3 = diff + (if runeA != runeB: 1 else: 0) inc(char2p, runeB.size) inc(x) if x > c3: x = c3 diff = row[p] + 1 if x > diff: x = diff row[p] = x inc(p) # lower triangle sentinel: if i <= half: dec(diff) runeB = b.runeAt(char2p) var c3 = diff + (if runeA != runeB: 1 else: 0) inc(x) if x > c3: x = c3 row[p] = x iCurrentA = iNextA result = row[e] proc editDistanceAscii*(a, b: string): int {.noSideEffect.} = ## Returns the edit distance between `a` and `b`. ## ## This uses the `Levenshtein`:idx: distance algorithm with only a linear ## memory overhead. runnableExamples: static: doAssert editDistanceAscii("Kitten", "Bitten") == 1 var len1 = a.len var len2 = b.len if len1 > len2: # make `b` the longer string return editDistanceAscii(b, a) # strip common prefix: var s = 0 while s < len1 and a[s] == b[s]: inc(s) dec(len1) dec(len2) # strip common suffix: while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]: dec(len1) dec(len2) # trivial cases: if len1 == 0: return len2 if len2 == 0: return len1 # another special case: if len1 == 1: for j in s..s+len2-1: if a[s] == b[j]: return len2 - 1 return len2 inc(len1) inc(len2) var half = len1 shr 1 # initialize first row: #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int))) var row: seq[int] newSeq(row, len2) var e = s + len2 - 1 # end marker for i in 1..len2 - half - 1: row[i] = i row[0] = len1 - half - 1 for i in 1 .. len1 - 1: var char1 = a[i + s - 1] var char2p: int var diff, x: int var p: int if i >= len1 - half: # skip the upper triangle: var offset = i - len1 + half char2p = offset p = offset var c3 = row[p] + ord(char1 != b[s + char2p]) inc(p) inc(char2p) x = row[p] + 1 diff = x if x > c3: x = c3 row[p] = x inc(p) else: p = 1 char2p = 0 diff = i x = i if i <= half + 1: # skip the lower triangle: e = len2 + i - half - 2 # main: while p <= e: dec(diff) var c3 = diff + ord(char1 != b[char2p + s]) inc(char2p) inc(x) if x > c3: x = c3 diff = row[p] + 1 if x > diff: x = diff row[p] = x inc(p) # lower triangle sentinel: if i <= half: dec(diff) var c3 = diff + ord(char1 != b[char2p + s]) inc(x) if x > c3: x = c3 row[p] = x result = row[e] when isMainModule: doAssert editDistance("", "") == 0 doAssert editDistance("kitten", "sitting") == 3 # from Wikipedia doAssert editDistance("flaw", "lawn") == 2 # from Wikipedia doAssert editDistance("привет", "превет") == 1 doAssert editDistance("Åge", "Age") == 1 # editDistance, one string is longer in bytes, but shorter in rune length # first string: 4 bytes, second: 6 bytes, but only 3 runes doAssert editDistance("aaaa", "×××") == 4 block veryLongStringEditDistanceTest: const cap = 256 var s1 = newStringOfCap(cap) s2 = newStringOfCap(cap) while len(s1) < cap: s1.add 'a' while len(s2) < cap: s2.add 'b' doAssert editDistance(s1, s2) == cap block combiningCodePointsEditDistanceTest: const s = "A\xCC\x8Age" doAssert editDistance(s, "Age") == 1 doAssert editDistanceAscii("", "") == 0 doAssert editDistanceAscii("kitten", "sitting") == 3 # from Wikipedia doAssert editDistanceAscii("flaw", "lawn") == 2 # from Wikipedia assert(editDistance("prefix__hallo_suffix", "prefix__hallo_suffix") == 0) assert(editDistance("prefix__hallo_suffix", "prefix__hallo_suffi1") == 1) assert(editDistance("prefix__hallo_suffix", "prefix__HALLO_suffix") == 5) assert(editDistance("prefix__hallo_suffix", "prefix__ha_suffix") == 3) assert(editDistance("prefix__hallo_suffix", "prefix") == 14) assert(editDistance("prefix__hallo_suffix", "suffix") == 14) assert(editDistance("prefix__hallo_suffix", "prefix__hao_suffix") == 2) assert(editDistance("main", "malign") == 2)