diff options
Diffstat (limited to 'lib/experimental/diff.nim')
-rw-r--r-- | lib/experimental/diff.nim | 339 |
1 files changed, 339 insertions, 0 deletions
diff --git a/lib/experimental/diff.nim b/lib/experimental/diff.nim new file mode 100644 index 000000000..669e9f613 --- /dev/null +++ b/lib/experimental/diff.nim @@ -0,0 +1,339 @@ +# +# +# Nim's Runtime Library +# (c) Copyright 2018 Nim contributors +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module implements an algorithm to compute the +## `diff`:idx: between two sequences of lines. +## +## - To learn more see `Diff on Wikipedia. <http://wikipedia.org/wiki/Diff>`_ + +runnableExamples: + assert diffInt( + [0, 1, 2, 3, 4, 5, 6, 7, 8], + [-1, 1, 2, 3, 4, 5, 666, 7, 42]) == + @[Item(startA: 0, startB: 0, deletedA: 1, insertedB: 1), + Item(startA: 6, startB: 6, deletedA: 1, insertedB: 1), + Item(startA: 8, startB: 8, deletedA: 1, insertedB: 1)] + +runnableExamples: + # 2 samples of text (from "The Call of Cthulhu" by Lovecraft) + let txt0 = """ +abc +def ghi +jkl2""" + let txt1 = """ +bacx +abc +def ghi +jkl""" + assert diffText(txt0, txt1) == + @[Item(startA: 0, startB: 0, deletedA: 0, insertedB: 1), + Item(startA: 2, startB: 3, deletedA: 1, insertedB: 1)] + +# code owner: Arne Döring +# +# This is based on C# code written by Matthias Hertel, http://www.mathertel.de +# +# This Class implements the Difference Algorithm published in +# "An O(ND) Difference Algorithm and its Variations" by Eugene Myers +# Algorithmica Vol. 1 No. 2, 1986, p 251. + +import std/[tables, strutils] + +when defined(nimPreviewSlimSystem): + import std/assertions + +type + Item* = object ## An Item in the list of differences. + startA*: int ## Start Line number in Data A. + startB*: int ## Start Line number in Data B. + deletedA*: int ## Number of changes in Data A. + insertedB*: int ## Number of changes in Data B. + + DiffData = object ## Data on one input file being compared. + data: seq[int] ## Buffer of numbers that will be compared. + modified: seq[bool] ## Array of booleans that flag for modified + ## data. This is the result of the diff. + ## This means deletedA in the first Data or + ## inserted in the second Data. + + Smsrd = object + x, y: int + +# template to avoid a seq copy. Required until `sink` parameters are ready. +template newDiffData(initData: seq[int]; L: int): DiffData = + DiffData( + data: initData, + modified: newSeq[bool](L + 2) + ) + +proc len(d: DiffData): int {.inline.} = d.data.len + +proc diffCodes(aText: string; h: var Table[string, int]): DiffData = + ## This function converts all textlines of the text into unique numbers for every unique textline + ## so further work can work only with simple numbers. + ## `aText` the input text + ## `h` This extern initialized hashtable is used for storing all ever used textlines. + ## `trimSpace` ignore leading and trailing space characters + ## Returns a array of integers. + var lastUsedCode = h.len + result.data = newSeq[int]() + for s in aText.splitLines: + if h.contains s: + result.data.add h[s] + else: + inc lastUsedCode + h[s] = lastUsedCode + result.data.add lastUsedCode + result.modified = newSeq[bool](result.data.len + 2) + +proc optimize(data: var DiffData) = + ## If a sequence of modified lines starts with a line that contains the same content + ## as the line that appends the changes, the difference sequence is modified so that the + ## appended line and not the starting line is marked as modified. + ## This leads to more readable diff sequences when comparing text files. + var startPos = 0 + while startPos < data.len: + while startPos < data.len and not data.modified[startPos]: + inc startPos + var endPos = startPos + while endPos < data.len and data.modified[endPos]: + inc endPos + + if endPos < data.len and data.data[startPos] == data.data[endPos]: + data.modified[startPos] = false + data.modified[endPos] = true + else: + startPos = endPos + +proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int; + downVector, upVector: var openArray[int]): Smsrd = + ## This is the algorithm to find the Shortest Middle Snake (sms). + ## `dataA` sequence A + ## `lowerA` lower bound of the actual range in dataA + ## `upperA` upper bound of the actual range in dataA (exclusive) + ## `dataB` sequence B + ## `lowerB` lower bound of the actual range in dataB + ## `upperB` upper bound of the actual range in dataB (exclusive) + ## `downVector` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons. + ## `upVector` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons. + ## Returns a MiddleSnakeData record containing x,y and u,v. + + let max = dataA.len + dataB.len + 1 + + let downK = lowerA - lowerB # the k-line to start the forward search + let upK = upperA - upperB # the k-line to start the reverse search + + let delta = (upperA - lowerA) - (upperB - lowerB) + let oddDelta = (delta and 1) != 0 + + # The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based + # and are access using a specific offset: upOffset upVector and downOffset for downVector + let downOffset = max - downK + let upOffset = max - upK + + let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1 + + downVector[downOffset + downK + 1] = lowerA + upVector[upOffset + upK - 1] = upperA + + for D in 0 .. maxD: + # Extend the forward path. + for k in countup(downK - D, downK + D, 2): + # find the only or better starting point + var x: int + if k == downK - D: + x = downVector[downOffset + k + 1] # down + else: + x = downVector[downOffset + k - 1] + 1 # a step to the right + if k < downK + D and downVector[downOffset + k + 1] >= x: + x = downVector[downOffset + k + 1] # down + + var y = x - k + + # find the end of the furthest reaching forward D-path in diagonal k. + while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]: + inc x + inc y + + downVector[downOffset + k] = x + + # overlap ? + if oddDelta and upK - D < k and k < upK + D: + if upVector[upOffset + k] <= downVector[downOffset + k]: + return Smsrd(x: downVector[downOffset + k], + y: downVector[downOffset + k] - k) + + # Extend the reverse path. + for k in countup(upK - D, upK + D, 2): + # find the only or better starting point + var x: int + if k == upK + D: + x = upVector[upOffset + k - 1] # up + else: + x = upVector[upOffset + k + 1] - 1 # left + if k > upK - D and upVector[upOffset + k - 1] < x: + x = upVector[upOffset + k - 1] # up + + var y = x - k + while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]: + dec x + dec y + + upVector[upOffset + k] = x + + # overlap ? + if not oddDelta and downK-D <= k and k <= downK+D: + if upVector[upOffset + k] <= downVector[downOffset + k]: + return Smsrd(x: downVector[downOffset + k], + y: downVector[downOffset + k] - k) + + assert false, "the algorithm should never come here." + +proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int; + downVector, upVector: var openArray[int]) = + ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs) + ## algorithm. + ## The published algorithm passes recursively parts of the A and B sequences. + ## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant. + ## `dataA` sequence A + ## `lowerA` lower bound of the actual range in dataA + ## `upperA` upper bound of the actual range in dataA (exclusive) + ## `dataB` sequence B + ## `lowerB` lower bound of the actual range in dataB + ## `upperB` upper bound of the actual range in dataB (exclusive) + ## `downVector` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons. + ## `upVector` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons. + + # make mutable copy: + var lowerA = lowerA + var lowerB = lowerB + var upperA = upperA + var upperB = upperB + + # Fast walkthrough equal lines at the start + while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]: + inc lowerA + inc lowerB + + # Fast walkthrough equal lines at the end + while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]: + dec upperA + dec upperB + + if lowerA == upperA: + # mark as inserted lines. + while lowerB < upperB: + dataB.modified[lowerB] = true + inc lowerB + + elif lowerB == upperB: + # mark as deleted lines. + while lowerA < upperA: + dataA.modified[lowerA] = true + inc lowerA + + else: + # Find the middle snake and length of an optimal path for A and B + let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector) + # Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y)) + + # The path is from LowerX to (x,y) and (x,y) to UpperX + lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector) + lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector) # 2002.09.20: no need for 2 points + +proc createDiffs(dataA, dataB: DiffData): seq[Item] = + ## Scan the tables of which lines are inserted and deleted, + ## producing an edit script in forward order. + var startA = 0 + var startB = 0 + var lineA = 0 + var lineB = 0 + while lineA < dataA.len or lineB < dataB.len: + if lineA < dataA.len and not dataA.modified[lineA] and + lineB < dataB.len and not dataB.modified[lineB]: + # equal lines + inc lineA + inc lineB + else: + # maybe deleted and/or inserted lines + startA = lineA + startB = lineB + + while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]): + inc lineA + + while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]): + inc lineB + + if (startA < lineA) or (startB < lineB): + result.add Item(startA: startA, + startB: startB, + deletedA: lineA - startA, + insertedB: lineB - startB) + + +proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] = + ## Find the difference in 2 arrays of integers. + ## + ## `arrayA` A-version of the numbers (usually the old one) + ## + ## `arrayB` B-version of the numbers (usually the new one) + ## + ## Returns a sequence of Items that describe the differences. + + # The A-Version of the data (original data) to be compared. + var dataA = newDiffData(@arrayA, arrayA.len) + + # The B-Version of the data (modified data) to be compared. + var dataB = newDiffData(@arrayB, arrayB.len) + + let max = dataA.len + dataB.len + 1 + # vector for the (0,0) to (x,y) search + var downVector = newSeq[int](2 * max + 2) + # vector for the (u,v) to (N,M) search + var upVector = newSeq[int](2 * max + 2) + + lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector) + result = createDiffs(dataA, dataB) + +proc diffText*(textA, textB: string): seq[Item] = + ## Find the difference in 2 text documents, comparing by textlines. + ## + ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents + ## each line is converted into a (hash) number. This hash-value is computed by storing all + ## textlines into a common hashtable so i can find duplicates in there, and generating a + ## new number each time a new textline is inserted. + ## + ## `textA` A-version of the text (usually the old one) + ## + ## `textB` B-version of the text (usually the new one) + ## + ## Returns a seq of Items that describe the differences. + # See also `gitutils.diffStrings`. + # prepare the input-text and convert to comparable numbers. + var h = initTable[string, int]() # TextA.len + TextB.len <- probably wrong initial size + # The A-Version of the data (original data) to be compared. + var dataA = diffCodes(textA, h) + + # The B-Version of the data (modified data) to be compared. + var dataB = diffCodes(textB, h) + + h.clear # free up hashtable memory (maybe) + + let max = dataA.len + dataB.len + 1 + # vector for the (0,0) to (x,y) search + var downVector = newSeq[int](2 * max + 2) + # vector for the (u,v) to (N,M) search + var upVector = newSeq[int](2 * max + 2) + + lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector) + + optimize(dataA) + optimize(dataB) + result = createDiffs(dataA, dataB) |