From 43ea41bdf4626831116568b0b19e665a41c3f98b Mon Sep 17 00:00:00 2001 From: Arne Döring Date: Fri, 7 Dec 2018 16:21:50 +0100 Subject: added lib/std/diff.nim --- lib/std/diff.nim | 463 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 463 insertions(+) create mode 100644 lib/std/diff.nim (limited to 'lib') diff --git a/lib/std/diff.nim b/lib/std/diff.nim new file mode 100644 index 000000000..940b5888d --- /dev/null +++ b/lib/std/diff.nim @@ -0,0 +1,463 @@ + +## This Class implements the Difference Algorithm published in +## "An O(ND) Difference Algorithm and its Variations" by Eugene Myers +## Algorithmica Vol. 1 No. 2, 1986, p 251. +## +## There are many C, Java, Lisp implementations public available but they all seem to come +## from the same source (diffutils) that is under the (unfree) GNU public License +## and cannot be reused as a sourcecode for a commercial application. +## There are very old C implementations that use other (worse) algorithms. +## Microsoft also published sourcecode of a diff-tool (windiff) that uses some tree data. +## Also, a direct transfer from a C source to C# is not easy because there is a lot of pointer +## arithmetic in the typical C solutions and i need a managed solution. +## These are the reasons why I implemented the original published algorithm from the scratch and +## make it avaliable without the GNU license limitations. +## I do not need a high performance diff tool because it is used only sometimes. +## I will do some performace tweaking when needed. +## +## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents +## each line is converted into a (hash) number. See diffText(). +## +## Some chages to the original algorithm: +## The original algorithm was described using a recursive approach and comparing zero indexed arrays. +## Extracting sub-arrays and rejoining them is very performance and memory intensive so the same +## (readonly) data arrays are passed arround together with their lower and upper bounds. +## This circumstance makes the lcs and sms functions more complicate. +## I added some code to the lcs function to get a fast response on sub-arrays that are identical, +## completely deleted or inserted. +## +## The result from a comparisation is stored in 2 arrays that flag for modified (deleted or inserted) +## lines in the 2 data arrays. These bits are then analysed to produce a array of Item objects. +## +## Further possible optimizations: +## (first rule: don't do it; second: don't do it yet) +## The arrays dataA and dataB are passed as parameters, but are never changed after the creation +## so they can be members of the class to avoid the paramter overhead. +## In sms is a lot of boundary arithmetic in the for-D and for-k loops that can be done by increment +## and decrement of local variables. +## The downVector and upVector arrays are alywas created and destroyed each time the sms gets called. +## It is possible to reuse tehm when transfering them to members of the class. +## See TODO: hints. +## +## diff.cs: A port of the algorythm to C# +## Copyright (c) by Matthias Hertel, http://www.mathertel.de +## This work is licensed under a BSD style license. See http://www.mathertel.de/License.aspx +## +## Changes: +## 2002.09.20 There was a "hang" in some situations. +## Now I undestand a little bit more of the sms algorithm. +## There have been overlapping boxes; that where analyzed partial differently. +## One return-point is enough. +## A assertion was added in createDiffs when in debug-mode, that counts the number of equal (no modified) lines in both arrays. +## They must be identical. +## +## 2003.02.07 Out of bounds error in the Up/Down vector arrays in some situations. +## The two vetors are now accessed using different offsets that are adjusted using the start k-Line. +## A test case is added. +## +## 2006.03.05 Some documentation and a direct Diff entry point. +## +## 2006.03.08 Refactored the API to static methods on the Diff class to make usage simpler. +## 2006.03.10 using the standard Debug class for self-test now. +## compile with: csc /target:exe /out:diffTest.exe /d:DEBUG /d:TRACE /d:SELFTEST Diff.cs +## 2007.01.06 license agreement changed to a BSD style license. +## 2007.06.03 added the optimize method. +## 2007.09.23 upVector and downVector optimization by Jan Stoklasa (). +## 2008.05.31 Adjusted the testing code that failed because of the optimize method (not a bug in the diff algorithm). +## 2008.10.08 Fixing a test case and adding a new test case. +## 2018.12.07 Port to Nim. + +import tables, strutils + +type + Item* = object + startA*: int ## Start Line number in Data A. + startB*: int ## Start Line number in Data B. + deletedA*: int ## Number of changes in Data A. + insertedB*: int ## Number of changes in Data B. + + DiffData = object + ## Data on one input file being compared. + len: int ## Number of elements (lines). + data: seq[int] ## Buffer of numbers that will be compared. + modified: seq[bool] ## Array of booleans that flag for modified + ## data. This is the result of the diff. + ## This means deletedA in the first Data or + ## inserted in the second Data. + + SMSRD = object + x,y: int + +proc newDiffData(initData: seq[int]): DiffData = + ## Initialize the Diff-Data buffer. + result.len = initData.len + result.data = initData + result.modified = newSeq[bool](result.len + 2) + +proc diffCodes(aText: string; h: var Table[string,int]): seq[int] = + ## This function converts all textlines of the text into unique numbers for every unique textline + ## so further work can work only with simple numbers. + ## ``aText`` the input text + ## ``h`` This extern initialized hashtable is used for storing all ever used textlines. + ## ``trimSpace`` ignore leading and trailing space characters + ## Returns a array of integers. + + # get all codes of the text + var lastUsedCode = h.len + var codes = newSeq[int]() + for s in aText.splitLines: + if h.contains s: + codes.add h[s] + else: + inc lastUsedCode + h[s] = lastUsedCode + codes.add lastUsedCode + + return codes + +proc optimize(data: var DiffData): void = + ## If a sequence of modified lines starts with a line that contains the same content + ## as the line that appends the changes, the difference sequence is modified so that the + ## appended line and not the starting line is marked as modified. + ## This leads to more readable diff sequences when comparing text files. + ## ``data`` A Diff data buffer containing the identified changes. + + var startPos, endPos: int + startPos = 0 + + while startPos < data.len: + while (startPos < data.len) and (data.modified[startPos] == false): + inc startPos + endPos = startPos + while (endPos < data.len) and (data.modified[endPos] == true): + inc endPos + + if (endPos < data.len) and (data.data[startPos] == data.data[endPos]): + data.modified[startPos] = false + data.modified[endPos] = true + else: + startPos = endPos + +proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int; + downVector, upVector: var openArray[int]): SMSRD = + ## This is the algorithm to find the Shortest Middle Snake (sms). + ## ``dataA`` sequence A + ## ``lowerA`` lower bound of the actual range in dataA + ## ``upperA`` upper bound of the actual range in dataA (exclusive) + ## ``dataB`` sequence B + ## ``lowerB`` lower bound of the actual range in dataB + ## ``upperB`` upper bound of the actual range in dataB (exclusive) + ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons. + ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons. + ## Returns a MiddleSnakeData record containing x,y and u,v. + + let max: int = dataA.len + dataB.len + 1 + + let downK = lowerA - lowerB; # the k-line to start the forward search + let upK = upperA - upperB; # the k-line to start the reverse search + + let delta = (upperA - lowerA) - (upperB - lowerB) + let oddDelta = (delta and 1) != 0 + + # The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based + # and are access using a specific offset: upOffset upVector and downOffset for downVector + let downOffset = max - downK + let upOffset = max - upK + + let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1 + + # Debug.Write(2, "sms", String.Format("Search the box: A[{0}-{1}] to B[{2}-{3}]", lowerA, upperA, lowerB, upperB)) + + # init vectors + downVector[downOffset + downK + 1] = lowerA + upVector[upOffset + upK - 1] = upperA + + for D in 0 .. maxD: + + # Extend the forward path. + + for k in countUp(downK - D, downK + D, 2): + # Debug.Write(0, "sms", "extend forward path " + k.ToString()) + + # find the only or better starting point + var x, y: int + if k == downK - D: + x = downVector[downOffset + k + 1]; # down + else: + x = downVector[downOffset + k - 1] + 1; # a step to the right + if (k < downK + D) and (downVector[downOffset + k + 1] >= x): + x = downVector[downOffset + k + 1]; # down + + y = x - k + + # find the end of the furthest reaching forward D-path in diagonal k. + while (x < upperA) and (y < upperB) and (dataA.data[x] == dataB.data[y]): + inc x + inc y + + downVector[downOffset + k] = x + + # overlap ? + if oddDelta and (upK - D < k) and (k < upK + D): + if upVector[upOffset + k] <= downVector[downOffset + k]: + result.x = downVector[downOffset + k] + result.y = downVector[downOffset + k] - k + return + + # Extend the reverse path. + for k in countUp(upK - D, upK + D, 2): + # Debug.Write(0, "sms", "extend reverse path " + k.ToString()) + + # find the only or better starting point + var x, y: int + if k == upK + D: + x = upVector[upOffset + k - 1]; # up + else: + x = upVector[upOffset + k + 1] - 1; # left + if (k > upK - D) and (upVector[upOffset + k - 1] < x): + x = upVector[upOffset + k - 1]; # up + # if + y = x - k + + while (x > lowerA) and (y > lowerB) and (dataA.data[x - 1] == dataB.data[y - 1]): + dec x + dec y + + upVector[upOffset + k] = x + + # overlap ? + if (not oddDelta) and (downK - D <= k) and (k <= downK + D): + if upVector[upOffset + k] <= downVector[downOffset + k]: + result.x = downVector[downOffset + k] + result.y = downVector[downOffset + k] - k + return + + assert false, "the algorithm should never come here." + +proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int; downVector, upVector: var openArray[int]): void = + ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs) + ## algorithm. + ## The published algorithm passes recursively parts of the A and B sequences. + ## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant. + ## ``dataA`` sequence A + ## ``lowerA`` lower bound of the actual range in dataA + ## ``upperA`` upper bound of the actual range in dataA (exclusive) + ## ``dataB`` sequence B + ## ``lowerB`` lower bound of the actual range in dataB + ## ``upperB`` upper bound of the actual range in dataB (exclusive) + ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons. + ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons. + # make mutable copy + var lowerA = lowerA + var lowerB = lowerB + var upperA = upperA + var upperB = upperB + + # Debug.Write(2, "lcs", String.Format("Analyse the box: A[{0}-{1}] to B[{2}-{3}]", lowerA, upperA, lowerB, upperB)) + + # Fast walkthrough equal lines at the start + while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]: + inc lowerA + inc lowerB + + # Fast walkthrough equal lines at the end + while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]: + dec upperA + dec upperB + + if lowerA == upperA: + # mark as inserted lines. + while lowerB < upperB: + dataB.modified[lowerB] = true + inc lowerB + + elif lowerB == upperB: + # mark as deleted lines. + while lowerA < upperA: + dataA.modified[lowerA] = true + inc lowerA + + else: + # Find the middle snakea and length of an optimal path for A and B + let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector) + # Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y)) + + # The path is from LowerX to (x,y) and (x,y) to UpperX + lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector) + lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector) # 2002.09.20: no need for 2 points + +# lcs() + +proc createDiffs(dataA, dataB: DiffData): seq[Item] = + ## Scan the tables of which lines are inserted and deleted, + ## producing an edit script in forward order. + + var startA: int + var startB: int + var lineA: int + var lineB: int + + while lineA < dataA.len or lineB < dataB.len: + if (lineA < dataA.len) and (not dataA.modified[lineA]) and + (lineB < dataB.len) and (not dataB.modified[lineB]): + # equal lines + inc lineA + inc lineB + else: + # maybe deleted and/or inserted lines + startA = lineA + startB = lineB + + while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]): + # while (lineA < dataA.len and dataA.modified[lineA]) + inc lineA + + while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]): + # while (lineB < dataB.len and dataB.modified[lineB]) + inc lineB + + if (startA < lineA) or (startB < lineB): + # store a new difference-item + var aItem: Item + aItem.startA = startA + aItem.startB = startB + aItem.deletedA = lineA - startA + aItem.insertedB = lineB - startB + result.add aItem + + +proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] = + ## Find the difference in 2 arrays of integers. + ## ``arrayA`` A-version of the numbers (usualy the old one) + ## ``arrayB`` B-version of the numbers (usualy the new one) + ## Returns a array of Items that describe the differences. + + # The A-Version of the data (original data) to be compared. + var dataA = newDiffData(@arrayA) + + # The B-Version of the data (modified data) to be compared. + var dataB = newDiffData(@arrayB) + + let max = dataA.len + dataB.len + 1 + ## vector for the (0,0) to (x,y) search + var downVector = newSeq[int](2 * max + 2) + ## vector for the (u,v) to (N,M) search + var upVector = newSeq[int](2 * max + 2) + + lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector) + return createDiffs(dataA, dataB) + +proc diffText*(textA,textB: string): seq[Item] = + ## Find the difference in 2 text documents, comparing by textlines. + ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents + ## each line is converted into a (hash) number. This hash-value is computed by storing all + ## textlines into a common hashtable so i can find dublicates in there, and generating a + ## new number each time a new textline is inserted. + ## ``TextA`` A-version of the text (usualy the old one) + ## ``TextB`` B-version of the text (usualy the new one) + ## ``trimSpace`` When set to true, all leading and trailing whitespace characters are stripped out before the comparation is done. + ## ``ignoreSpace`` When set to true, all whitespace characters are converted to a single space character before the comparation is done. + ## ``ignoreCase`` When set to true, all characters are converted to their lowercase equivivalence before the comparation is done. + ## Returns a seq of Items that describe the differences. + + # prepare the input-text and convert to comparable numbers. + var h = initTable[string,int]() # TextA.len + TextB.len <- probably wrong initial size + # The A-Version of the data (original data) to be compared. + var dataA = newDiffData(diffCodes(textA, h)) + + # The B-Version of the data (modified data) to be compared. + var dataB = newDiffData(diffCodes(textB, h)) + + h.clear # free up hashtable memory (maybe) + + let max = dataA.len + dataB.len + 1 + ## vector for the (0,0) to (x,y) search + var downVector = newSeq[int](2 * max + 2) + ## vector for the (u,v) to (N,M) search + var upVector = newSeq[int](2 * max + 2) + + lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector) + + optimize(dataA) + optimize(dataB) + return createDiffs(dataA, dataB) + +when isMainModule: + + proc testHelper(f: seq[Item]): string = + for it in f: + result.add( + $it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*" + ) + + proc main(): void = + var a, b: string + + stdout.writeLine("Diff Self Test...") + + # test all changes + a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n') + b = "0,1,2,3,4,5,6,7,8,9".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "12.10.0.0*", + "all-changes test failed.") + stdout.writeLine("all-changes test passed.") + # test all same + a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n') + b = a + assert(testHelper(diffText(a, b)) == + "", + "all-same test failed.") + stdout.writeLine("all-same test passed.") + + # test snake + a = "a,b,c,d,e,f".replace(',', '\n') + b = "b,c,d,e,f,x".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "1.0.0.0*0.1.6.5*", + "snake test failed.") + stdout.writeLine("snake test passed.") + + # 2002.09.20 - repro + a = "c1,a,c2,b,c,d,e,g,h,i,j,c3,k,l".replace(',', '\n') + b = "C1,a,C2,b,c,d,e,I1,e,g,h,i,j,C3,k,I2,l".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "1.1.0.0*1.1.2.2*0.2.7.7*1.1.11.13*0.1.13.15*", + "repro20020920 test failed.") + stdout.writeLine("repro20020920 test passed.") + + # 2003.02.07 - repro + a = "F".replace(',', '\n') + b = "0,F,1,2,3,4,5,6,7".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "0.1.0.0*0.7.1.2*", + "repro20030207 test failed.") + stdout.writeLine("repro20030207 test passed.") + + # Muegel - repro + a = "HELLO\nWORLD" + b = "\n\nhello\n\n\n\nworld\n" + assert(testHelper(diffText(a, b)) == + "2.8.0.0*", + "repro20030409 test failed.") + stdout.writeLine("repro20030409 test passed.") + + # test some differences + a = "a,b,-,c,d,e,f,f".replace(',', '\n') + b = "a,b,x,c,e,f".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "1.1.2.2*1.0.4.4*1.0.7.6*", + "some-changes test failed.") + stdout.writeLine("some-changes test passed.") + + # test one change within long chain of repeats + a = "a,a,a,a,a,a,a,a,a,a".replace(',', '\n') + b = "a,a,a,a,-,a,a,a,a,a".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "0.1.4.4*1.0.9.10*", + "long chain of repeats test failed.") + + stdout.writeLine("End.") + stdout.flushFile + + main() -- cgit 1.4.1-2-gfad0 From f080a15da1ef34994d38779d630a4ee7dc81715d Mon Sep 17 00:00:00 2001 From: Araq Date: Wed, 12 Dec 2018 16:17:35 +0100 Subject: diff.nim: massive code cleanups --- lib/std/diff.nim | 256 +++++++++++++++++++------------------------------------ 1 file changed, 90 insertions(+), 166 deletions(-) (limited to 'lib') diff --git a/lib/std/diff.nim b/lib/std/diff.nim index 940b5888d..bffce2803 100644 --- a/lib/std/diff.nim +++ b/lib/std/diff.nim @@ -1,145 +1,90 @@ - -## This Class implements the Difference Algorithm published in -## "An O(ND) Difference Algorithm and its Variations" by Eugene Myers -## Algorithmica Vol. 1 No. 2, 1986, p 251. -## -## There are many C, Java, Lisp implementations public available but they all seem to come -## from the same source (diffutils) that is under the (unfree) GNU public License -## and cannot be reused as a sourcecode for a commercial application. -## There are very old C implementations that use other (worse) algorithms. -## Microsoft also published sourcecode of a diff-tool (windiff) that uses some tree data. -## Also, a direct transfer from a C source to C# is not easy because there is a lot of pointer -## arithmetic in the typical C solutions and i need a managed solution. -## These are the reasons why I implemented the original published algorithm from the scratch and -## make it avaliable without the GNU license limitations. -## I do not need a high performance diff tool because it is used only sometimes. -## I will do some performace tweaking when needed. -## -## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents -## each line is converted into a (hash) number. See diffText(). -## -## Some chages to the original algorithm: -## The original algorithm was described using a recursive approach and comparing zero indexed arrays. -## Extracting sub-arrays and rejoining them is very performance and memory intensive so the same -## (readonly) data arrays are passed arround together with their lower and upper bounds. -## This circumstance makes the lcs and sms functions more complicate. -## I added some code to the lcs function to get a fast response on sub-arrays that are identical, -## completely deleted or inserted. -## -## The result from a comparisation is stored in 2 arrays that flag for modified (deleted or inserted) -## lines in the 2 data arrays. These bits are then analysed to produce a array of Item objects. -## -## Further possible optimizations: -## (first rule: don't do it; second: don't do it yet) -## The arrays dataA and dataB are passed as parameters, but are never changed after the creation -## so they can be members of the class to avoid the paramter overhead. -## In sms is a lot of boundary arithmetic in the for-D and for-k loops that can be done by increment -## and decrement of local variables. -## The downVector and upVector arrays are alywas created and destroyed each time the sms gets called. -## It is possible to reuse tehm when transfering them to members of the class. -## See TODO: hints. -## -## diff.cs: A port of the algorythm to C# -## Copyright (c) by Matthias Hertel, http://www.mathertel.de -## This work is licensed under a BSD style license. See http://www.mathertel.de/License.aspx -## -## Changes: -## 2002.09.20 There was a "hang" in some situations. -## Now I undestand a little bit more of the sms algorithm. -## There have been overlapping boxes; that where analyzed partial differently. -## One return-point is enough. -## A assertion was added in createDiffs when in debug-mode, that counts the number of equal (no modified) lines in both arrays. -## They must be identical. -## -## 2003.02.07 Out of bounds error in the Up/Down vector arrays in some situations. -## The two vetors are now accessed using different offsets that are adjusted using the start k-Line. -## A test case is added. -## -## 2006.03.05 Some documentation and a direct Diff entry point. -## -## 2006.03.08 Refactored the API to static methods on the Diff class to make usage simpler. -## 2006.03.10 using the standard Debug class for self-test now. -## compile with: csc /target:exe /out:diffTest.exe /d:DEBUG /d:TRACE /d:SELFTEST Diff.cs -## 2007.01.06 license agreement changed to a BSD style license. -## 2007.06.03 added the optimize method. -## 2007.09.23 upVector and downVector optimization by Jan Stoklasa (). -## 2008.05.31 Adjusted the testing code that failed because of the optimize method (not a bug in the diff algorithm). -## 2008.10.08 Fixing a test case and adding a new test case. -## 2018.12.07 Port to Nim. +# +# +# Nim's Runtime Library +# (c) Copyright 2018 Nim contributors +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module implements an algorithm to compute the +## `diff`:idx: between two sequences of lines. + +# code owner: Arne Döring +# +# This is based on C# code written by Matthias Hertel, http://www.mathertel.de +# +# This Class implements the Difference Algorithm published in +# "An O(ND) Difference Algorithm and its Variations" by Eugene Myers +# Algorithmica Vol. 1 No. 2, 1986, p 251. import tables, strutils type - Item* = object + Item* = object ## An Item in the list of differences. startA*: int ## Start Line number in Data A. startB*: int ## Start Line number in Data B. deletedA*: int ## Number of changes in Data A. insertedB*: int ## Number of changes in Data B. - DiffData = object - ## Data on one input file being compared. - len: int ## Number of elements (lines). + DiffData = object ## Data on one input file being compared. data: seq[int] ## Buffer of numbers that will be compared. modified: seq[bool] ## Array of booleans that flag for modified - ## data. This is the result of the diff. + ## data. This is the result of the diff. ## This means deletedA in the first Data or ## inserted in the second Data. - SMSRD = object - x,y: int + Smsrd = object + x, y: int + +# template to avoid a seq copy. Required until ``sink`` parameters are ready. +template newDiffData(initData: seq[int]; L: int): DiffData = + DiffData( + data: initData, + modified: newSeq[bool](L + 2) + ) -proc newDiffData(initData: seq[int]): DiffData = - ## Initialize the Diff-Data buffer. - result.len = initData.len - result.data = initData - result.modified = newSeq[bool](result.len + 2) +proc len(d: DiffData): int {.inline.} = d.data.len -proc diffCodes(aText: string; h: var Table[string,int]): seq[int] = +proc diffCodes(aText: string; h: var Table[string, int]): DiffData = ## This function converts all textlines of the text into unique numbers for every unique textline ## so further work can work only with simple numbers. ## ``aText`` the input text ## ``h`` This extern initialized hashtable is used for storing all ever used textlines. ## ``trimSpace`` ignore leading and trailing space characters ## Returns a array of integers. - - # get all codes of the text var lastUsedCode = h.len - var codes = newSeq[int]() + result.data = newSeq[int]() for s in aText.splitLines: if h.contains s: - codes.add h[s] + result.data.add h[s] else: inc lastUsedCode h[s] = lastUsedCode - codes.add lastUsedCode + result.data.add lastUsedCode + result.modified = newSeq[bool](result.data.len + 2) - return codes - -proc optimize(data: var DiffData): void = +proc optimize(data: var DiffData) = ## If a sequence of modified lines starts with a line that contains the same content ## as the line that appends the changes, the difference sequence is modified so that the ## appended line and not the starting line is marked as modified. ## This leads to more readable diff sequences when comparing text files. - ## ``data`` A Diff data buffer containing the identified changes. - - var startPos, endPos: int - startPos = 0 - + var startPos = 0 while startPos < data.len: - while (startPos < data.len) and (data.modified[startPos] == false): + while startPos < data.len and not data.modified[startPos]: inc startPos - endPos = startPos - while (endPos < data.len) and (data.modified[endPos] == true): + var endPos = startPos + while endPos < data.len and data.modified[endPos]: inc endPos - if (endPos < data.len) and (data.data[startPos] == data.data[endPos]): + if endPos < data.len and data.data[startPos] == data.data[endPos]: data.modified[startPos] = false data.modified[endPos] = true else: startPos = endPos proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int; - downVector, upVector: var openArray[int]): SMSRD = + downVector, upVector: var openArray[int]): Smsrd = ## This is the algorithm to find the Shortest Middle Snake (sms). ## ``dataA`` sequence A ## ``lowerA`` lower bound of the actual range in dataA @@ -151,10 +96,10 @@ proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, uppe ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons. ## Returns a MiddleSnakeData record containing x,y and u,v. - let max: int = dataA.len + dataB.len + 1 + let max = dataA.len + dataB.len + 1 - let downK = lowerA - lowerB; # the k-line to start the forward search - let upK = upperA - upperB; # the k-line to start the reverse search + let downK = lowerA - lowerB # the k-line to start the forward search + let upK = upperA - upperB # the k-line to start the reverse search let delta = (upperA - lowerA) - (upperB - lowerB) let oddDelta = (delta and 1) != 0 @@ -166,75 +111,64 @@ proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, uppe let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1 - # Debug.Write(2, "sms", String.Format("Search the box: A[{0}-{1}] to B[{2}-{3}]", lowerA, upperA, lowerB, upperB)) - - # init vectors downVector[downOffset + downK + 1] = lowerA upVector[upOffset + upK - 1] = upperA for D in 0 .. maxD: - # Extend the forward path. - for k in countUp(downK - D, downK + D, 2): - # Debug.Write(0, "sms", "extend forward path " + k.ToString()) - # find the only or better starting point - var x, y: int + var x: int if k == downK - D: - x = downVector[downOffset + k + 1]; # down + x = downVector[downOffset + k + 1] # down else: - x = downVector[downOffset + k - 1] + 1; # a step to the right - if (k < downK + D) and (downVector[downOffset + k + 1] >= x): - x = downVector[downOffset + k + 1]; # down + x = downVector[downOffset + k - 1] + 1 # a step to the right + if k < downK + D and downVector[downOffset + k + 1] >= x: + x = downVector[downOffset + k + 1] # down - y = x - k + var y = x - k # find the end of the furthest reaching forward D-path in diagonal k. - while (x < upperA) and (y < upperB) and (dataA.data[x] == dataB.data[y]): + while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]: inc x inc y downVector[downOffset + k] = x # overlap ? - if oddDelta and (upK - D < k) and (k < upK + D): + if oddDelta and upK - D < k and k < upK + D: if upVector[upOffset + k] <= downVector[downOffset + k]: - result.x = downVector[downOffset + k] - result.y = downVector[downOffset + k] - k - return + return Smsrd(x: downVector[downOffset + k], + y: downVector[downOffset + k] - k) # Extend the reverse path. for k in countUp(upK - D, upK + D, 2): - # Debug.Write(0, "sms", "extend reverse path " + k.ToString()) - # find the only or better starting point - var x, y: int + var x: int if k == upK + D: - x = upVector[upOffset + k - 1]; # up + x = upVector[upOffset + k - 1] # up else: - x = upVector[upOffset + k + 1] - 1; # left - if (k > upK - D) and (upVector[upOffset + k - 1] < x): - x = upVector[upOffset + k - 1]; # up - # if - y = x - k + x = upVector[upOffset + k + 1] - 1 # left + if k > upK - D and upVector[upOffset + k - 1] < x: + x = upVector[upOffset + k - 1] # up - while (x > lowerA) and (y > lowerB) and (dataA.data[x - 1] == dataB.data[y - 1]): + var y = x - k + while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]: dec x dec y upVector[upOffset + k] = x # overlap ? - if (not oddDelta) and (downK - D <= k) and (k <= downK + D): + if not oddDelta and downK-D <= k and k <= downK+D: if upVector[upOffset + k] <= downVector[downOffset + k]: - result.x = downVector[downOffset + k] - result.y = downVector[downOffset + k] - k - return + return Smsrd(x: downVector[downOffset + k], + y: downVector[downOffset + k] - k) assert false, "the algorithm should never come here." -proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int; downVector, upVector: var openArray[int]): void = +proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int; + downVector, upVector: var openArray[int]) = ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs) ## algorithm. ## The published algorithm passes recursively parts of the A and B sequences. @@ -247,14 +181,13 @@ proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, ## ``upperB`` upper bound of the actual range in dataB (exclusive) ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons. ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons. - # make mutable copy + + # make mutable copy: var lowerA = lowerA var lowerB = lowerB var upperA = upperA var upperB = upperB - # Debug.Write(2, "lcs", String.Format("Analyse the box: A[{0}-{1}] to B[{2}-{3}]", lowerA, upperA, lowerB, upperB)) - # Fast walkthrough equal lines at the start while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]: inc lowerA @@ -286,20 +219,16 @@ proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector) lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector) # 2002.09.20: no need for 2 points -# lcs() - proc createDiffs(dataA, dataB: DiffData): seq[Item] = ## Scan the tables of which lines are inserted and deleted, ## producing an edit script in forward order. - - var startA: int - var startB: int - var lineA: int - var lineB: int - + var startA = 0 + var startB = 0 + var lineA = 0 + var lineB = 0 while lineA < dataA.len or lineB < dataB.len: - if (lineA < dataA.len) and (not dataA.modified[lineA]) and - (lineB < dataB.len) and (not dataB.modified[lineB]): + if lineA < dataA.len and not dataA.modified[lineA] and + lineB < dataB.len and not dataB.modified[lineB]: # equal lines inc lineA inc lineB @@ -309,21 +238,16 @@ proc createDiffs(dataA, dataB: DiffData): seq[Item] = startB = lineB while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]): - # while (lineA < dataA.len and dataA.modified[lineA]) inc lineA while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]): - # while (lineB < dataB.len and dataB.modified[lineB]) inc lineB if (startA < lineA) or (startB < lineB): - # store a new difference-item - var aItem: Item - aItem.startA = startA - aItem.startB = startB - aItem.deletedA = lineA - startA - aItem.insertedB = lineB - startB - result.add aItem + result.add Item(startA: startA, + startB: startB, + deletedA: lineA - startA, + insertedB: lineB - startB) proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] = @@ -333,10 +257,10 @@ proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] = ## Returns a array of Items that describe the differences. # The A-Version of the data (original data) to be compared. - var dataA = newDiffData(@arrayA) + var dataA = newDiffData(@arrayA, arrayA.len) # The B-Version of the data (modified data) to be compared. - var dataB = newDiffData(@arrayB) + var dataB = newDiffData(@arrayB, arrayB.len) let max = dataA.len + dataB.len + 1 ## vector for the (0,0) to (x,y) search @@ -345,9 +269,9 @@ proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] = var upVector = newSeq[int](2 * max + 2) lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector) - return createDiffs(dataA, dataB) + result = createDiffs(dataA, dataB) -proc diffText*(textA,textB: string): seq[Item] = +proc diffText*(textA, textB: string): seq[Item] = ## Find the difference in 2 text documents, comparing by textlines. ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents ## each line is converted into a (hash) number. This hash-value is computed by storing all @@ -361,12 +285,12 @@ proc diffText*(textA,textB: string): seq[Item] = ## Returns a seq of Items that describe the differences. # prepare the input-text and convert to comparable numbers. - var h = initTable[string,int]() # TextA.len + TextB.len <- probably wrong initial size + var h = initTable[string, int]() # TextA.len + TextB.len <- probably wrong initial size # The A-Version of the data (original data) to be compared. - var dataA = newDiffData(diffCodes(textA, h)) + var dataA = diffCodes(textA, h) # The B-Version of the data (modified data) to be compared. - var dataB = newDiffData(diffCodes(textB, h)) + var dataB = diffCodes(textB, h) h.clear # free up hashtable memory (maybe) @@ -380,7 +304,7 @@ proc diffText*(textA,textB: string): seq[Item] = optimize(dataA) optimize(dataB) - return createDiffs(dataA, dataB) + result = createDiffs(dataA, dataB) when isMainModule: @@ -390,7 +314,7 @@ when isMainModule: $it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*" ) - proc main(): void = + proc main() = var a, b: string stdout.writeLine("Diff Self Test...") -- cgit 1.4.1-2-gfad0