From 43ea41bdf4626831116568b0b19e665a41c3f98b Mon Sep 17 00:00:00 2001
From: Arne Döring <arne.doering@gmx.net>
Date: Fri, 7 Dec 2018 16:21:50 +0100
Subject: added lib/std/diff.nim

---
 lib/std/diff.nim | 463 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 463 insertions(+)
 create mode 100644 lib/std/diff.nim

(limited to 'lib')

diff --git a/lib/std/diff.nim b/lib/std/diff.nim
new file mode 100644
index 000000000..940b5888d
--- /dev/null
+++ b/lib/std/diff.nim
@@ -0,0 +1,463 @@
+
+## This Class implements the Difference Algorithm published in
+## "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
+## Algorithmica Vol. 1 No. 2, 1986, p 251.
+##
+## There are many C, Java, Lisp implementations public available but they all seem to come
+## from the same source (diffutils) that is under the (unfree) GNU public License
+## and cannot be reused as a sourcecode for a commercial application.
+## There are very old C implementations that use other (worse) algorithms.
+## Microsoft also published sourcecode of a diff-tool (windiff) that uses some tree data.
+## Also, a direct transfer from a C source to C# is not easy because there is a lot of pointer
+## arithmetic in the typical C solutions and i need a managed solution.
+## These are the reasons why I implemented the original published algorithm from the scratch and
+## make it avaliable without the GNU license limitations.
+## I do not need a high performance diff tool because it is used only sometimes.
+## I will do some performace tweaking when needed.
+##
+## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
+## each line is converted into a (hash) number. See diffText().
+##
+## Some chages to the original algorithm:
+## The original algorithm was described using a recursive approach and comparing zero indexed arrays.
+## Extracting sub-arrays and rejoining them is very performance and memory intensive so the same
+## (readonly) data arrays are passed arround together with their lower and upper bounds.
+## This circumstance makes the lcs and sms functions more complicate.
+## I added some code to the lcs function to get a fast response on sub-arrays that are identical,
+## completely deleted or inserted.
+##
+## The result from a comparisation is stored in 2 arrays that flag for modified (deleted or inserted)
+## lines in the 2 data arrays. These bits are then analysed to produce a array of Item objects.
+##
+## Further possible optimizations:
+## (first rule: don't do it; second: don't do it yet)
+## The arrays dataA and dataB are passed as parameters, but are never changed after the creation
+## so they can be members of the class to avoid the paramter overhead.
+## In sms is a lot of boundary arithmetic in the for-D and for-k loops that can be done by increment
+## and decrement of local variables.
+## The downVector and upVector arrays are alywas created and destroyed each time the sms gets called.
+## It is possible to reuse tehm when transfering them to members of the class.
+## See TODO: hints.
+##
+## diff.cs: A port of the algorythm to C#
+## Copyright (c) by Matthias Hertel, http://www.mathertel.de
+## This work is licensed under a BSD style license. See http://www.mathertel.de/License.aspx
+##
+## Changes:
+## 2002.09.20 There was a "hang" in some situations.
+## Now I undestand a little bit more of the sms algorithm.
+## There have been overlapping boxes; that where analyzed partial differently.
+## One return-point is enough.
+## A assertion was added in createDiffs when in debug-mode, that counts the number of equal (no modified) lines in both arrays.
+## They must be identical.
+##
+## 2003.02.07 Out of bounds error in the Up/Down vector arrays in some situations.
+## The two vetors are now accessed using different offsets that are adjusted using the start k-Line.
+## A test case is added.
+##
+## 2006.03.05 Some documentation and a direct Diff entry point.
+##
+## 2006.03.08 Refactored the API to static methods on the Diff class to make usage simpler.
+## 2006.03.10 using the standard Debug class for self-test now.
+##            compile with: csc /target:exe /out:diffTest.exe /d:DEBUG /d:TRACE /d:SELFTEST Diff.cs
+## 2007.01.06 license agreement changed to a BSD style license.
+## 2007.06.03 added the optimize method.
+## 2007.09.23 upVector and downVector optimization by Jan Stoklasa ().
+## 2008.05.31 Adjusted the testing code that failed because of the optimize method (not a bug in the diff algorithm).
+## 2008.10.08 Fixing a test case and adding a new test case.
+## 2018.12.07 Port to Nim.
+
+import tables, strutils
+
+type
+  Item* = object
+    startA*: int    ## Start Line number in Data A.
+    startB*: int    ## Start Line number in Data B.
+    deletedA*: int  ## Number of changes in Data A.
+    insertedB*: int ## Number of changes in Data B.
+
+  DiffData = object
+    ## Data on one input file being compared.
+    len: int ## Number of elements (lines).
+    data: seq[int] ## Buffer of numbers that will be compared.
+    modified: seq[bool] ## Array of booleans that flag for modified
+                        ## data.  This is the result of the diff.
+                        ## This means deletedA in the first Data or
+                        ## inserted in the second Data.
+
+  SMSRD = object
+    x,y: int
+
+proc newDiffData(initData: seq[int]): DiffData =
+  ## Initialize the Diff-Data buffer.
+  result.len = initData.len
+  result.data = initData
+  result.modified = newSeq[bool](result.len + 2)
+
+proc diffCodes(aText: string; h: var Table[string,int]): seq[int] =
+  ## This function converts all textlines of the text into unique numbers for every unique textline
+  ## so further work can work only with simple numbers.
+  ## ``aText`` the input text
+  ## ``h`` This extern initialized hashtable is used for storing all ever used textlines.
+  ## ``trimSpace`` ignore leading and trailing space characters
+  ## Returns a array of integers.
+
+  # get all codes of the text
+  var lastUsedCode = h.len
+  var codes = newSeq[int]()
+  for s in aText.splitLines:
+    if h.contains s:
+      codes.add h[s]
+    else:
+      inc lastUsedCode
+      h[s] = lastUsedCode
+      codes.add lastUsedCode
+
+  return codes
+
+proc optimize(data: var DiffData): void =
+  ## If a sequence of modified lines starts with a line that contains the same content
+  ## as the line that appends the changes, the difference sequence is modified so that the
+  ## appended line and not the starting line is marked as modified.
+  ## This leads to more readable diff sequences when comparing text files.
+  ## ``data`` A Diff data buffer containing the identified changes.
+
+  var startPos, endPos: int
+  startPos = 0
+
+  while startPos < data.len:
+    while (startPos < data.len) and (data.modified[startPos] == false):
+      inc startPos
+    endPos = startPos
+    while (endPos < data.len) and (data.modified[endPos] == true):
+      inc endPos
+
+    if (endPos < data.len) and (data.data[startPos] == data.data[endPos]):
+      data.modified[startPos] = false
+      data.modified[endPos] = true
+    else:
+      startPos = endPos
+
+proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int;
+  downVector, upVector: var openArray[int]): SMSRD =
+  ## This is the algorithm to find the Shortest Middle Snake (sms).
+  ## ``dataA`` sequence A
+  ## ``lowerA`` lower bound of the actual range in dataA
+  ## ``upperA`` upper bound of the actual range in dataA (exclusive)
+  ## ``dataB`` sequence B
+  ## ``lowerB`` lower bound of the actual range in dataB
+  ## ``upperB`` upper bound of the actual range in dataB (exclusive)
+  ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
+  ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
+  ## Returns a MiddleSnakeData record containing x,y and u,v.
+
+  let max: int = dataA.len + dataB.len + 1
+
+  let downK = lowerA - lowerB; # the k-line to start the forward search
+  let upK = upperA - upperB; # the k-line to start the reverse search
+
+  let delta = (upperA - lowerA) - (upperB - lowerB)
+  let oddDelta = (delta and 1) != 0
+
+  # The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
+  # and are access using a specific offset: upOffset upVector and downOffset for downVector
+  let downOffset = max - downK
+  let upOffset = max - upK
+
+  let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1
+
+  # Debug.Write(2, "sms", String.Format("Search the box: A[{0}-{1}] to B[{2}-{3}]", lowerA, upperA, lowerB, upperB))
+
+  # init vectors
+  downVector[downOffset + downK + 1] = lowerA
+  upVector[upOffset + upK - 1] = upperA
+
+  for D in 0 .. maxD:
+
+    # Extend the forward path.
+
+    for k in countUp(downK - D, downK + D, 2):
+      # Debug.Write(0, "sms", "extend forward path " + k.ToString())
+
+      # find the only or better starting point
+      var x, y: int
+      if k == downK - D:
+        x = downVector[downOffset + k + 1]; # down
+      else:
+        x = downVector[downOffset + k - 1] + 1; # a step to the right
+        if (k < downK + D) and (downVector[downOffset + k + 1] >= x):
+          x = downVector[downOffset + k + 1]; # down
+
+      y = x - k
+
+      # find the end of the furthest reaching forward D-path in diagonal k.
+      while (x < upperA) and (y < upperB) and (dataA.data[x] == dataB.data[y]):
+        inc x
+        inc y
+
+      downVector[downOffset + k] = x
+
+      # overlap ?
+      if oddDelta and (upK - D < k) and (k < upK + D):
+        if upVector[upOffset + k] <= downVector[downOffset + k]:
+          result.x = downVector[downOffset + k]
+          result.y = downVector[downOffset + k] - k
+          return
+
+    # Extend the reverse path.
+    for k in countUp(upK - D, upK + D, 2):
+      # Debug.Write(0, "sms", "extend reverse path " + k.ToString())
+
+      # find the only or better starting point
+      var x, y: int
+      if k == upK + D:
+        x = upVector[upOffset + k - 1]; # up
+      else:
+        x = upVector[upOffset + k + 1] - 1; # left
+        if (k > upK - D) and (upVector[upOffset + k - 1] < x):
+          x = upVector[upOffset + k - 1]; # up
+      # if
+      y = x - k
+
+      while (x > lowerA) and (y > lowerB) and (dataA.data[x - 1] == dataB.data[y - 1]):
+        dec x
+        dec y
+
+      upVector[upOffset + k] = x
+
+      # overlap ?
+      if (not oddDelta) and (downK - D <= k) and (k <= downK + D):
+        if upVector[upOffset + k] <= downVector[downOffset + k]:
+          result.x = downVector[downOffset + k]
+          result.y = downVector[downOffset + k] - k
+          return
+
+  assert false, "the algorithm should never come here."
+
+proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int; downVector, upVector: var openArray[int]): void =
+  ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs)
+  ## algorithm.
+  ## The published algorithm passes recursively parts of the A and B sequences.
+  ## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant.
+  ## ``dataA`` sequence A
+  ## ``lowerA`` lower bound of the actual range in dataA
+  ## ``upperA`` upper bound of the actual range in dataA (exclusive)
+  ## ``dataB`` sequence B
+  ## ``lowerB`` lower bound of the actual range in dataB
+  ## ``upperB`` upper bound of the actual range in dataB (exclusive)
+  ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
+  ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
+  # make mutable copy
+  var lowerA = lowerA
+  var lowerB = lowerB
+  var upperA = upperA
+  var upperB = upperB
+
+  # Debug.Write(2, "lcs", String.Format("Analyse the box: A[{0}-{1}] to B[{2}-{3}]", lowerA, upperA, lowerB, upperB))
+
+  # Fast walkthrough equal lines at the start
+  while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]:
+    inc lowerA
+    inc lowerB
+
+  # Fast walkthrough equal lines at the end
+  while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]:
+    dec upperA
+    dec upperB
+
+  if lowerA == upperA:
+    # mark as inserted lines.
+    while lowerB < upperB:
+      dataB.modified[lowerB] = true
+      inc lowerB
+
+  elif lowerB == upperB:
+    # mark as deleted lines.
+    while lowerA < upperA:
+      dataA.modified[lowerA] = true
+      inc lowerA
+
+  else:
+    # Find the middle snakea and length of an optimal path for A and B
+    let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector)
+    # Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y))
+
+    # The path is from LowerX to (x,y) and (x,y) to UpperX
+    lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector)
+    lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector)  # 2002.09.20: no need for 2 points
+
+# lcs()
+
+proc createDiffs(dataA, dataB: DiffData): seq[Item] =
+  ## Scan the tables of which lines are inserted and deleted,
+  ## producing an edit script in forward order.
+
+  var startA: int
+  var startB: int
+  var lineA: int
+  var lineB: int
+
+  while lineA < dataA.len or lineB < dataB.len:
+    if (lineA < dataA.len) and (not dataA.modified[lineA]) and
+       (lineB < dataB.len) and (not dataB.modified[lineB]):
+      # equal lines
+      inc lineA
+      inc lineB
+    else:
+      # maybe deleted and/or inserted lines
+      startA = lineA
+      startB = lineB
+
+      while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]):
+        # while (lineA < dataA.len and dataA.modified[lineA])
+        inc lineA
+
+      while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]):
+        # while (lineB < dataB.len and dataB.modified[lineB])
+        inc lineB
+
+      if (startA < lineA) or (startB < lineB):
+        # store a new difference-item
+        var aItem: Item
+        aItem.startA = startA
+        aItem.startB = startB
+        aItem.deletedA = lineA - startA
+        aItem.insertedB = lineB - startB
+        result.add aItem
+
+
+proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
+  ## Find the difference in 2 arrays of integers.
+  ## ``arrayA`` A-version of the numbers (usualy the old one)
+  ## ``arrayB`` B-version of the numbers (usualy the new one)
+  ## Returns a array of Items that describe the differences.
+
+  # The A-Version of the data (original data) to be compared.
+  var dataA = newDiffData(@arrayA)
+
+  # The B-Version of the data (modified data) to be compared.
+  var dataB = newDiffData(@arrayB)
+
+  let max = dataA.len + dataB.len + 1
+  ## vector for the (0,0) to (x,y) search
+  var downVector = newSeq[int](2 * max + 2)
+  ## vector for the (u,v) to (N,M) search
+  var upVector = newSeq[int](2 * max + 2)
+
+  lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
+  return createDiffs(dataA, dataB)
+
+proc diffText*(textA,textB: string): seq[Item] =
+  ## Find the difference in 2 text documents, comparing by textlines.
+  ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
+  ## each line is converted into a (hash) number. This hash-value is computed by storing all
+  ## textlines into a common hashtable so i can find dublicates in there, and generating a
+  ## new number each time a new textline is inserted.
+  ## ``TextA`` A-version of the text (usualy the old one)
+  ## ``TextB`` B-version of the text (usualy the new one)
+  ## ``trimSpace`` When set to true, all leading and trailing whitespace characters are stripped out before the comparation is done.
+  ## ``ignoreSpace`` When set to true, all whitespace characters are converted to a single space character before the comparation is done.
+  ## ``ignoreCase`` When set to true, all characters are converted to their lowercase equivivalence before the comparation is done.
+  ## Returns a seq of Items that describe the differences.
+
+  # prepare the input-text and convert to comparable numbers.
+  var h = initTable[string,int]()  # TextA.len + TextB.len  <- probably wrong initial size
+  # The A-Version of the data (original data) to be compared.
+  var dataA = newDiffData(diffCodes(textA, h))
+
+  # The B-Version of the data (modified data) to be compared.
+  var dataB = newDiffData(diffCodes(textB, h))
+
+  h.clear # free up hashtable memory (maybe)
+
+  let max = dataA.len + dataB.len + 1
+  ## vector for the (0,0) to (x,y) search
+  var downVector = newSeq[int](2 * max + 2)
+  ## vector for the (u,v) to (N,M) search
+  var upVector = newSeq[int](2 * max + 2)
+
+  lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
+
+  optimize(dataA)
+  optimize(dataB)
+  return createDiffs(dataA, dataB)
+
+when isMainModule:
+
+  proc testHelper(f: seq[Item]): string =
+    for it in f:
+      result.add(
+        $it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*"
+      )
+
+  proc main(): void =
+    var a, b: string
+
+    stdout.writeLine("Diff Self Test...")
+
+    # test all changes
+    a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
+    b = "0,1,2,3,4,5,6,7,8,9".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "12.10.0.0*",
+      "all-changes test failed.")
+    stdout.writeLine("all-changes test passed.")
+    # test all same
+    a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
+    b = a
+    assert(testHelper(diffText(a, b)) ==
+      "",
+      "all-same test failed.")
+    stdout.writeLine("all-same test passed.")
+
+    # test snake
+    a = "a,b,c,d,e,f".replace(',', '\n')
+    b = "b,c,d,e,f,x".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "1.0.0.0*0.1.6.5*",
+      "snake test failed.")
+    stdout.writeLine("snake test passed.")
+
+    # 2002.09.20 - repro
+    a = "c1,a,c2,b,c,d,e,g,h,i,j,c3,k,l".replace(',', '\n')
+    b = "C1,a,C2,b,c,d,e,I1,e,g,h,i,j,C3,k,I2,l".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "1.1.0.0*1.1.2.2*0.2.7.7*1.1.11.13*0.1.13.15*",
+      "repro20020920 test failed.")
+    stdout.writeLine("repro20020920 test passed.")
+
+    # 2003.02.07 - repro
+    a = "F".replace(',', '\n')
+    b = "0,F,1,2,3,4,5,6,7".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "0.1.0.0*0.7.1.2*",
+      "repro20030207 test failed.")
+    stdout.writeLine("repro20030207 test passed.")
+
+    # Muegel - repro
+    a = "HELLO\nWORLD"
+    b = "\n\nhello\n\n\n\nworld\n"
+    assert(testHelper(diffText(a, b)) ==
+      "2.8.0.0*",
+      "repro20030409 test failed.")
+    stdout.writeLine("repro20030409 test passed.")
+
+    # test some differences
+    a = "a,b,-,c,d,e,f,f".replace(',', '\n')
+    b = "a,b,x,c,e,f".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "1.1.2.2*1.0.4.4*1.0.7.6*",
+      "some-changes test failed.")
+    stdout.writeLine("some-changes test passed.")
+
+    # test one change within long chain of repeats
+    a = "a,a,a,a,a,a,a,a,a,a".replace(',', '\n')
+    b = "a,a,a,a,-,a,a,a,a,a".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "0.1.4.4*1.0.9.10*",
+      "long chain of repeats test failed.")
+
+    stdout.writeLine("End.")
+    stdout.flushFile
+
+  main()
-- 
cgit 1.4.1-2-gfad0


From f080a15da1ef34994d38779d630a4ee7dc81715d Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Wed, 12 Dec 2018 16:17:35 +0100
Subject: diff.nim: massive code cleanups

---
 lib/std/diff.nim | 256 +++++++++++++++++++------------------------------------
 1 file changed, 90 insertions(+), 166 deletions(-)

(limited to 'lib')

diff --git a/lib/std/diff.nim b/lib/std/diff.nim
index 940b5888d..bffce2803 100644
--- a/lib/std/diff.nim
+++ b/lib/std/diff.nim
@@ -1,145 +1,90 @@
-
-## This Class implements the Difference Algorithm published in
-## "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
-## Algorithmica Vol. 1 No. 2, 1986, p 251.
-##
-## There are many C, Java, Lisp implementations public available but they all seem to come
-## from the same source (diffutils) that is under the (unfree) GNU public License
-## and cannot be reused as a sourcecode for a commercial application.
-## There are very old C implementations that use other (worse) algorithms.
-## Microsoft also published sourcecode of a diff-tool (windiff) that uses some tree data.
-## Also, a direct transfer from a C source to C# is not easy because there is a lot of pointer
-## arithmetic in the typical C solutions and i need a managed solution.
-## These are the reasons why I implemented the original published algorithm from the scratch and
-## make it avaliable without the GNU license limitations.
-## I do not need a high performance diff tool because it is used only sometimes.
-## I will do some performace tweaking when needed.
-##
-## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
-## each line is converted into a (hash) number. See diffText().
-##
-## Some chages to the original algorithm:
-## The original algorithm was described using a recursive approach and comparing zero indexed arrays.
-## Extracting sub-arrays and rejoining them is very performance and memory intensive so the same
-## (readonly) data arrays are passed arround together with their lower and upper bounds.
-## This circumstance makes the lcs and sms functions more complicate.
-## I added some code to the lcs function to get a fast response on sub-arrays that are identical,
-## completely deleted or inserted.
-##
-## The result from a comparisation is stored in 2 arrays that flag for modified (deleted or inserted)
-## lines in the 2 data arrays. These bits are then analysed to produce a array of Item objects.
-##
-## Further possible optimizations:
-## (first rule: don't do it; second: don't do it yet)
-## The arrays dataA and dataB are passed as parameters, but are never changed after the creation
-## so they can be members of the class to avoid the paramter overhead.
-## In sms is a lot of boundary arithmetic in the for-D and for-k loops that can be done by increment
-## and decrement of local variables.
-## The downVector and upVector arrays are alywas created and destroyed each time the sms gets called.
-## It is possible to reuse tehm when transfering them to members of the class.
-## See TODO: hints.
-##
-## diff.cs: A port of the algorythm to C#
-## Copyright (c) by Matthias Hertel, http://www.mathertel.de
-## This work is licensed under a BSD style license. See http://www.mathertel.de/License.aspx
-##
-## Changes:
-## 2002.09.20 There was a "hang" in some situations.
-## Now I undestand a little bit more of the sms algorithm.
-## There have been overlapping boxes; that where analyzed partial differently.
-## One return-point is enough.
-## A assertion was added in createDiffs when in debug-mode, that counts the number of equal (no modified) lines in both arrays.
-## They must be identical.
-##
-## 2003.02.07 Out of bounds error in the Up/Down vector arrays in some situations.
-## The two vetors are now accessed using different offsets that are adjusted using the start k-Line.
-## A test case is added.
-##
-## 2006.03.05 Some documentation and a direct Diff entry point.
-##
-## 2006.03.08 Refactored the API to static methods on the Diff class to make usage simpler.
-## 2006.03.10 using the standard Debug class for self-test now.
-##            compile with: csc /target:exe /out:diffTest.exe /d:DEBUG /d:TRACE /d:SELFTEST Diff.cs
-## 2007.01.06 license agreement changed to a BSD style license.
-## 2007.06.03 added the optimize method.
-## 2007.09.23 upVector and downVector optimization by Jan Stoklasa ().
-## 2008.05.31 Adjusted the testing code that failed because of the optimize method (not a bug in the diff algorithm).
-## 2008.10.08 Fixing a test case and adding a new test case.
-## 2018.12.07 Port to Nim.
+#
+#
+#            Nim's Runtime Library
+#        (c) Copyright 2018 Nim contributors
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements an algorithm to compute the
+## `diff`:idx: between two sequences of lines.
+
+# code owner: Arne Döring
+#
+# This is based on C# code written by Matthias Hertel, http://www.mathertel.de
+#
+# This Class implements the Difference Algorithm published in
+# "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
+# Algorithmica Vol. 1 No. 2, 1986, p 251.
 
 import tables, strutils
 
 type
-  Item* = object
+  Item* = object    ## An Item in the list of differences.
     startA*: int    ## Start Line number in Data A.
     startB*: int    ## Start Line number in Data B.
     deletedA*: int  ## Number of changes in Data A.
     insertedB*: int ## Number of changes in Data B.
 
-  DiffData = object
-    ## Data on one input file being compared.
-    len: int ## Number of elements (lines).
+  DiffData = object ## Data on one input file being compared.
     data: seq[int] ## Buffer of numbers that will be compared.
     modified: seq[bool] ## Array of booleans that flag for modified
-                        ## data.  This is the result of the diff.
+                        ## data. This is the result of the diff.
                         ## This means deletedA in the first Data or
                         ## inserted in the second Data.
 
-  SMSRD = object
-    x,y: int
+  Smsrd = object
+    x, y: int
+
+# template to avoid a seq copy. Required until ``sink`` parameters are ready.
+template newDiffData(initData: seq[int]; L: int): DiffData =
+  DiffData(
+    data: initData,
+    modified: newSeq[bool](L + 2)
+  )
 
-proc newDiffData(initData: seq[int]): DiffData =
-  ## Initialize the Diff-Data buffer.
-  result.len = initData.len
-  result.data = initData
-  result.modified = newSeq[bool](result.len + 2)
+proc len(d: DiffData): int {.inline.} = d.data.len
 
-proc diffCodes(aText: string; h: var Table[string,int]): seq[int] =
+proc diffCodes(aText: string; h: var Table[string, int]): DiffData =
   ## This function converts all textlines of the text into unique numbers for every unique textline
   ## so further work can work only with simple numbers.
   ## ``aText`` the input text
   ## ``h`` This extern initialized hashtable is used for storing all ever used textlines.
   ## ``trimSpace`` ignore leading and trailing space characters
   ## Returns a array of integers.
-
-  # get all codes of the text
   var lastUsedCode = h.len
-  var codes = newSeq[int]()
+  result.data = newSeq[int]()
   for s in aText.splitLines:
     if h.contains s:
-      codes.add h[s]
+      result.data.add h[s]
     else:
       inc lastUsedCode
       h[s] = lastUsedCode
-      codes.add lastUsedCode
+      result.data.add lastUsedCode
+  result.modified = newSeq[bool](result.data.len + 2)
 
-  return codes
-
-proc optimize(data: var DiffData): void =
+proc optimize(data: var DiffData) =
   ## If a sequence of modified lines starts with a line that contains the same content
   ## as the line that appends the changes, the difference sequence is modified so that the
   ## appended line and not the starting line is marked as modified.
   ## This leads to more readable diff sequences when comparing text files.
-  ## ``data`` A Diff data buffer containing the identified changes.
-
-  var startPos, endPos: int
-  startPos = 0
-
+  var startPos = 0
   while startPos < data.len:
-    while (startPos < data.len) and (data.modified[startPos] == false):
+    while startPos < data.len and not data.modified[startPos]:
       inc startPos
-    endPos = startPos
-    while (endPos < data.len) and (data.modified[endPos] == true):
+    var endPos = startPos
+    while endPos < data.len and data.modified[endPos]:
       inc endPos
 
-    if (endPos < data.len) and (data.data[startPos] == data.data[endPos]):
+    if endPos < data.len and data.data[startPos] == data.data[endPos]:
       data.modified[startPos] = false
       data.modified[endPos] = true
     else:
       startPos = endPos
 
 proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int;
-  downVector, upVector: var openArray[int]): SMSRD =
+         downVector, upVector: var openArray[int]): Smsrd =
   ## This is the algorithm to find the Shortest Middle Snake (sms).
   ## ``dataA`` sequence A
   ## ``lowerA`` lower bound of the actual range in dataA
@@ -151,10 +96,10 @@ proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, uppe
   ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
   ## Returns a MiddleSnakeData record containing x,y and u,v.
 
-  let max: int = dataA.len + dataB.len + 1
+  let max = dataA.len + dataB.len + 1
 
-  let downK = lowerA - lowerB; # the k-line to start the forward search
-  let upK = upperA - upperB; # the k-line to start the reverse search
+  let downK = lowerA - lowerB # the k-line to start the forward search
+  let upK = upperA - upperB # the k-line to start the reverse search
 
   let delta = (upperA - lowerA) - (upperB - lowerB)
   let oddDelta = (delta and 1) != 0
@@ -166,75 +111,64 @@ proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, uppe
 
   let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1
 
-  # Debug.Write(2, "sms", String.Format("Search the box: A[{0}-{1}] to B[{2}-{3}]", lowerA, upperA, lowerB, upperB))
-
-  # init vectors
   downVector[downOffset + downK + 1] = lowerA
   upVector[upOffset + upK - 1] = upperA
 
   for D in 0 .. maxD:
-
     # Extend the forward path.
-
     for k in countUp(downK - D, downK + D, 2):
-      # Debug.Write(0, "sms", "extend forward path " + k.ToString())
-
       # find the only or better starting point
-      var x, y: int
+      var x: int
       if k == downK - D:
-        x = downVector[downOffset + k + 1]; # down
+        x = downVector[downOffset + k + 1] # down
       else:
-        x = downVector[downOffset + k - 1] + 1; # a step to the right
-        if (k < downK + D) and (downVector[downOffset + k + 1] >= x):
-          x = downVector[downOffset + k + 1]; # down
+        x = downVector[downOffset + k - 1] + 1 # a step to the right
+        if k < downK + D and downVector[downOffset + k + 1] >= x:
+          x = downVector[downOffset + k + 1] # down
 
-      y = x - k
+      var y = x - k
 
       # find the end of the furthest reaching forward D-path in diagonal k.
-      while (x < upperA) and (y < upperB) and (dataA.data[x] == dataB.data[y]):
+      while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]:
         inc x
         inc y
 
       downVector[downOffset + k] = x
 
       # overlap ?
-      if oddDelta and (upK - D < k) and (k < upK + D):
+      if oddDelta and upK - D < k and k < upK + D:
         if upVector[upOffset + k] <= downVector[downOffset + k]:
-          result.x = downVector[downOffset + k]
-          result.y = downVector[downOffset + k] - k
-          return
+          return Smsrd(x: downVector[downOffset + k],
+                       y: downVector[downOffset + k] - k)
 
     # Extend the reverse path.
     for k in countUp(upK - D, upK + D, 2):
-      # Debug.Write(0, "sms", "extend reverse path " + k.ToString())
-
       # find the only or better starting point
-      var x, y: int
+      var x: int
       if k == upK + D:
-        x = upVector[upOffset + k - 1]; # up
+        x = upVector[upOffset + k - 1] # up
       else:
-        x = upVector[upOffset + k + 1] - 1; # left
-        if (k > upK - D) and (upVector[upOffset + k - 1] < x):
-          x = upVector[upOffset + k - 1]; # up
-      # if
-      y = x - k
+        x = upVector[upOffset + k + 1] - 1 # left
+        if k > upK - D and upVector[upOffset + k - 1] < x:
+          x = upVector[upOffset + k - 1] # up
 
-      while (x > lowerA) and (y > lowerB) and (dataA.data[x - 1] == dataB.data[y - 1]):
+      var y = x - k
+      while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]:
         dec x
         dec y
 
       upVector[upOffset + k] = x
 
       # overlap ?
-      if (not oddDelta) and (downK - D <= k) and (k <= downK + D):
+      if not oddDelta and downK-D <= k and k <= downK+D:
         if upVector[upOffset + k] <= downVector[downOffset + k]:
-          result.x = downVector[downOffset + k]
-          result.y = downVector[downOffset + k] - k
-          return
+          return Smsrd(x: downVector[downOffset + k],
+                       y: downVector[downOffset + k] - k)
 
   assert false, "the algorithm should never come here."
 
-proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int; downVector, upVector: var openArray[int]): void =
+proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int;
+         downVector, upVector: var openArray[int]) =
   ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs)
   ## algorithm.
   ## The published algorithm passes recursively parts of the A and B sequences.
@@ -247,14 +181,13 @@ proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB,
   ## ``upperB`` upper bound of the actual range in dataB (exclusive)
   ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
   ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
-  # make mutable copy
+
+  # make mutable copy:
   var lowerA = lowerA
   var lowerB = lowerB
   var upperA = upperA
   var upperB = upperB
 
-  # Debug.Write(2, "lcs", String.Format("Analyse the box: A[{0}-{1}] to B[{2}-{3}]", lowerA, upperA, lowerB, upperB))
-
   # Fast walkthrough equal lines at the start
   while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]:
     inc lowerA
@@ -286,20 +219,16 @@ proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB,
     lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector)
     lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector)  # 2002.09.20: no need for 2 points
 
-# lcs()
-
 proc createDiffs(dataA, dataB: DiffData): seq[Item] =
   ## Scan the tables of which lines are inserted and deleted,
   ## producing an edit script in forward order.
-
-  var startA: int
-  var startB: int
-  var lineA: int
-  var lineB: int
-
+  var startA = 0
+  var startB = 0
+  var lineA = 0
+  var lineB = 0
   while lineA < dataA.len or lineB < dataB.len:
-    if (lineA < dataA.len) and (not dataA.modified[lineA]) and
-       (lineB < dataB.len) and (not dataB.modified[lineB]):
+    if lineA < dataA.len and not dataA.modified[lineA] and
+       lineB < dataB.len and not dataB.modified[lineB]:
       # equal lines
       inc lineA
       inc lineB
@@ -309,21 +238,16 @@ proc createDiffs(dataA, dataB: DiffData): seq[Item] =
       startB = lineB
 
       while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]):
-        # while (lineA < dataA.len and dataA.modified[lineA])
         inc lineA
 
       while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]):
-        # while (lineB < dataB.len and dataB.modified[lineB])
         inc lineB
 
       if (startA < lineA) or (startB < lineB):
-        # store a new difference-item
-        var aItem: Item
-        aItem.startA = startA
-        aItem.startB = startB
-        aItem.deletedA = lineA - startA
-        aItem.insertedB = lineB - startB
-        result.add aItem
+        result.add Item(startA: startA,
+                        startB: startB,
+                        deletedA: lineA - startA,
+                        insertedB: lineB - startB)
 
 
 proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
@@ -333,10 +257,10 @@ proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
   ## Returns a array of Items that describe the differences.
 
   # The A-Version of the data (original data) to be compared.
-  var dataA = newDiffData(@arrayA)
+  var dataA = newDiffData(@arrayA, arrayA.len)
 
   # The B-Version of the data (modified data) to be compared.
-  var dataB = newDiffData(@arrayB)
+  var dataB = newDiffData(@arrayB, arrayB.len)
 
   let max = dataA.len + dataB.len + 1
   ## vector for the (0,0) to (x,y) search
@@ -345,9 +269,9 @@ proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
   var upVector = newSeq[int](2 * max + 2)
 
   lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
-  return createDiffs(dataA, dataB)
+  result = createDiffs(dataA, dataB)
 
-proc diffText*(textA,textB: string): seq[Item] =
+proc diffText*(textA, textB: string): seq[Item] =
   ## Find the difference in 2 text documents, comparing by textlines.
   ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
   ## each line is converted into a (hash) number. This hash-value is computed by storing all
@@ -361,12 +285,12 @@ proc diffText*(textA,textB: string): seq[Item] =
   ## Returns a seq of Items that describe the differences.
 
   # prepare the input-text and convert to comparable numbers.
-  var h = initTable[string,int]()  # TextA.len + TextB.len  <- probably wrong initial size
+  var h = initTable[string, int]()  # TextA.len + TextB.len  <- probably wrong initial size
   # The A-Version of the data (original data) to be compared.
-  var dataA = newDiffData(diffCodes(textA, h))
+  var dataA = diffCodes(textA, h)
 
   # The B-Version of the data (modified data) to be compared.
-  var dataB = newDiffData(diffCodes(textB, h))
+  var dataB = diffCodes(textB, h)
 
   h.clear # free up hashtable memory (maybe)
 
@@ -380,7 +304,7 @@ proc diffText*(textA,textB: string): seq[Item] =
 
   optimize(dataA)
   optimize(dataB)
-  return createDiffs(dataA, dataB)
+  result = createDiffs(dataA, dataB)
 
 when isMainModule:
 
@@ -390,7 +314,7 @@ when isMainModule:
         $it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*"
       )
 
-  proc main(): void =
+  proc main() =
     var a, b: string
 
     stdout.writeLine("Diff Self Test...")
-- 
cgit 1.4.1-2-gfad0