move diff.nim to experimental

author: Araq <rumpf_a@web.de> 2018-12-12 17:48:30 +0100
committer: Araq <rumpf_a@web.de> 2018-12-12 17:48:30 +0100
commit: eb8383cb28e52bb9a790a743de442e4b71687001 (patch)
tree: 8d9c23187e1ba4bc20f45ae065f62ff756e87509 /lib/std
parent: 070bcf4cea28a3238089379f5884787b2084b2de (diff)
download: Nim-eb8383cb28e52bb9a790a743de442e4b71687001.tar.gz
1 files changed, 0 insertions, 387 deletions
diff --git a/lib/std/diff.nim b/lib/std/diff.nim
deleted file mode 100644
index bffce2803..000000000
--- a/lib/std/diff.nim
+++ /dev/null
@@ -1,387 +0,0 @@
-#
-#
-#            Nim's Runtime Library
-#        (c) Copyright 2018 Nim contributors
-#
-#    See the file "copying.txt", included in this
-#    distribution, for details about the copyright.
-#
-
-## This module implements an algorithm to compute the
-## `diff`:idx: between two sequences of lines.
-
-# code owner: Arne Döring
-#
-# This is based on C# code written by Matthias Hertel, http://www.mathertel.de
-#
-# This Class implements the Difference Algorithm published in
-# "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
-# Algorithmica Vol. 1 No. 2, 1986, p 251.
-
-import tables, strutils
-
-type
-  Item* = object    ## An Item in the list of differences.
-    startA*: int    ## Start Line number in Data A.
-    startB*: int    ## Start Line number in Data B.
-    deletedA*: int  ## Number of changes in Data A.
-    insertedB*: int ## Number of changes in Data B.
-
-  DiffData = object ## Data on one input file being compared.
-    data: seq[int] ## Buffer of numbers that will be compared.
-    modified: seq[bool] ## Array of booleans that flag for modified
-                        ## data. This is the result of the diff.
-                        ## This means deletedA in the first Data or
-                        ## inserted in the second Data.
-
-  Smsrd = object
-    x, y: int
-
-# template to avoid a seq copy. Required until ``sink`` parameters are ready.
-template newDiffData(initData: seq[int]; L: int): DiffData =
-  DiffData(
-    data: initData,
-    modified: newSeq[bool](L + 2)
-  )
-
-proc len(d: DiffData): int {.inline.} = d.data.len
-
-proc diffCodes(aText: string; h: var Table[string, int]): DiffData =
-  ## This function converts all textlines of the text into unique numbers for every unique textline
-  ## so further work can work only with simple numbers.
-  ## ``aText`` the input text
-  ## ``h`` This extern initialized hashtable is used for storing all ever used textlines.
-  ## ``trimSpace`` ignore leading and trailing space characters
-  ## Returns a array of integers.
-  var lastUsedCode = h.len
-  result.data = newSeq[int]()
-  for s in aText.splitLines:
-    if h.contains s:
-      result.data.add h[s]
-    else:
-      inc lastUsedCode
-      h[s] = lastUsedCode
-      result.data.add lastUsedCode
-  result.modified = newSeq[bool](result.data.len + 2)
-
-proc optimize(data: var DiffData) =
-  ## If a sequence of modified lines starts with a line that contains the same content
-  ## as the line that appends the changes, the difference sequence is modified so that the
-  ## appended line and not the starting line is marked as modified.
-  ## This leads to more readable diff sequences when comparing text files.
-  var startPos = 0
-  while startPos < data.len:
-    while startPos < data.len and not data.modified[startPos]:
-      inc startPos
-    var endPos = startPos
-    while endPos < data.len and data.modified[endPos]:
-      inc endPos
-
-    if endPos < data.len and data.data[startPos] == data.data[endPos]:
-      data.modified[startPos] = false
-      data.modified[endPos] = true
-    else:
-      startPos = endPos
-
-proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int;
-         downVector, upVector: var openArray[int]): Smsrd =
-  ## This is the algorithm to find the Shortest Middle Snake (sms).
-  ## ``dataA`` sequence A
-  ## ``lowerA`` lower bound of the actual range in dataA
-  ## ``upperA`` upper bound of the actual range in dataA (exclusive)
-  ## ``dataB`` sequence B
-  ## ``lowerB`` lower bound of the actual range in dataB
-  ## ``upperB`` upper bound of the actual range in dataB (exclusive)
-  ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
-  ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
-  ## Returns a MiddleSnakeData record containing x,y and u,v.
-
-  let max = dataA.len + dataB.len + 1
-
-  let downK = lowerA - lowerB # the k-line to start the forward search
-  let upK = upperA - upperB # the k-line to start the reverse search
-
-  let delta = (upperA - lowerA) - (upperB - lowerB)
-  let oddDelta = (delta and 1) != 0
-
-  # The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
-  # and are access using a specific offset: upOffset upVector and downOffset for downVector
-  let downOffset = max - downK
-  let upOffset = max - upK
-
-  let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1
-
-  downVector[downOffset + downK + 1] = lowerA
-  upVector[upOffset + upK - 1] = upperA
-
-  for D in 0 .. maxD:
-    # Extend the forward path.
-    for k in countUp(downK - D, downK + D, 2):
-      # find the only or better starting point
-      var x: int
-      if k == downK - D:
-        x = downVector[downOffset + k + 1] # down
-      else:
-        x = downVector[downOffset + k - 1] + 1 # a step to the right
-        if k < downK + D and downVector[downOffset + k + 1] >= x:
-          x = downVector[downOffset + k + 1] # down
-
-      var y = x - k
-
-      # find the end of the furthest reaching forward D-path in diagonal k.
-      while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]:
-        inc x
-        inc y
-
-      downVector[downOffset + k] = x
-
-      # overlap ?
-      if oddDelta and upK - D < k and k < upK + D:
-        if upVector[upOffset + k] <= downVector[downOffset + k]:
-          return Smsrd(x: downVector[downOffset + k],
-                       y: downVector[downOffset + k] - k)
-
-    # Extend the reverse path.
-    for k in countUp(upK - D, upK + D, 2):
-      # find the only or better starting point
-      var x: int
-      if k == upK + D:
-        x = upVector[upOffset + k - 1] # up
-      else:
-        x = upVector[upOffset + k + 1] - 1 # left
-        if k > upK - D and upVector[upOffset + k - 1] < x:
-          x = upVector[upOffset + k - 1] # up
-
-      var y = x - k
-      while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]:
-        dec x
-        dec y
-
-      upVector[upOffset + k] = x
-
-      # overlap ?
-      if not oddDelta and downK-D <= k and k <= downK+D:
-        if upVector[upOffset + k] <= downVector[downOffset + k]:
-          return Smsrd(x: downVector[downOffset + k],
-                       y: downVector[downOffset + k] - k)
-
-  assert false, "the algorithm should never come here."
-
-proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int;
-         downVector, upVector: var openArray[int]) =
-  ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs)
-  ## algorithm.
-  ## The published algorithm passes recursively parts of the A and B sequences.
-  ## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant.
-  ## ``dataA`` sequence A
-  ## ``lowerA`` lower bound of the actual range in dataA
-  ## ``upperA`` upper bound of the actual range in dataA (exclusive)
-  ## ``dataB`` sequence B
-  ## ``lowerB`` lower bound of the actual range in dataB
-  ## ``upperB`` upper bound of the actual range in dataB (exclusive)
-  ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
-  ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
-
-  # make mutable copy:
-  var lowerA = lowerA
-  var lowerB = lowerB
-  var upperA = upperA
-  var upperB = upperB
-
-  # Fast walkthrough equal lines at the start
-  while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]:
-    inc lowerA
-    inc lowerB
-
-  # Fast walkthrough equal lines at the end
-  while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]:
-    dec upperA
-    dec upperB
-
-  if lowerA == upperA:
-    # mark as inserted lines.
-    while lowerB < upperB:
-      dataB.modified[lowerB] = true
-      inc lowerB
-
-  elif lowerB == upperB:
-    # mark as deleted lines.
-    while lowerA < upperA:
-      dataA.modified[lowerA] = true
-      inc lowerA
-
-  else:
-    # Find the middle snakea and length of an optimal path for A and B
-    let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector)
-    # Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y))
-
-    # The path is from LowerX to (x,y) and (x,y) to UpperX
-    lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector)
-    lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector)  # 2002.09.20: no need for 2 points
-
-proc createDiffs(dataA, dataB: DiffData): seq[Item] =
-  ## Scan the tables of which lines are inserted and deleted,
-  ## producing an edit script in forward order.
-  var startA = 0
-  var startB = 0
-  var lineA = 0
-  var lineB = 0
-  while lineA < dataA.len or lineB < dataB.len:
-    if lineA < dataA.len and not dataA.modified[lineA] and
-       lineB < dataB.len and not dataB.modified[lineB]:
-      # equal lines
-      inc lineA
-      inc lineB
-    else:
-      # maybe deleted and/or inserted lines
-      startA = lineA
-      startB = lineB
-
-      while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]):
-        inc lineA
-
-      while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]):
-        inc lineB
-
-      if (startA < lineA) or (startB < lineB):
-        result.add Item(startA: startA,
-                        startB: startB,
-                        deletedA: lineA - startA,
-                        insertedB: lineB - startB)
-
-
-proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
-  ## Find the difference in 2 arrays of integers.
-  ## ``arrayA`` A-version of the numbers (usualy the old one)
-  ## ``arrayB`` B-version of the numbers (usualy the new one)
-  ## Returns a array of Items that describe the differences.
-
-  # The A-Version of the data (original data) to be compared.
-  var dataA = newDiffData(@arrayA, arrayA.len)
-
-  # The B-Version of the data (modified data) to be compared.
-  var dataB = newDiffData(@arrayB, arrayB.len)
-
-  let max = dataA.len + dataB.len + 1
-  ## vector for the (0,0) to (x,y) search
-  var downVector = newSeq[int](2 * max + 2)
-  ## vector for the (u,v) to (N,M) search
-  var upVector = newSeq[int](2 * max + 2)
-
-  lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
-  result = createDiffs(dataA, dataB)
-
-proc diffText*(textA, textB: string): seq[Item] =
-  ## Find the difference in 2 text documents, comparing by textlines.
-  ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
-  ## each line is converted into a (hash) number. This hash-value is computed by storing all
-  ## textlines into a common hashtable so i can find dublicates in there, and generating a
-  ## new number each time a new textline is inserted.
-  ## ``TextA`` A-version of the text (usualy the old one)
-  ## ``TextB`` B-version of the text (usualy the new one)
-  ## ``trimSpace`` When set to true, all leading and trailing whitespace characters are stripped out before the comparation is done.
-  ## ``ignoreSpace`` When set to true, all whitespace characters are converted to a single space character before the comparation is done.
-  ## ``ignoreCase`` When set to true, all characters are converted to their lowercase equivivalence before the comparation is done.
-  ## Returns a seq of Items that describe the differences.
-
-  # prepare the input-text and convert to comparable numbers.
-  var h = initTable[string, int]()  # TextA.len + TextB.len  <- probably wrong initial size
-  # The A-Version of the data (original data) to be compared.
-  var dataA = diffCodes(textA, h)
-
-  # The B-Version of the data (modified data) to be compared.
-  var dataB = diffCodes(textB, h)
-
-  h.clear # free up hashtable memory (maybe)
-
-  let max = dataA.len + dataB.len + 1
-  ## vector for the (0,0) to (x,y) search
-  var downVector = newSeq[int](2 * max + 2)
-  ## vector for the (u,v) to (N,M) search
-  var upVector = newSeq[int](2 * max + 2)
-
-  lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
-
-  optimize(dataA)
-  optimize(dataB)
-  result = createDiffs(dataA, dataB)
-
-when isMainModule:
-
-  proc testHelper(f: seq[Item]): string =
-    for it in f:
-      result.add(
-        $it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*"
-      )
-
-  proc main() =
-    var a, b: string
-
-    stdout.writeLine("Diff Self Test...")
-
-    # test all changes
-    a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
-    b = "0,1,2,3,4,5,6,7,8,9".replace(',', '\n')
-    assert(testHelper(diffText(a, b)) ==
-      "12.10.0.0*",
-      "all-changes test failed.")
-    stdout.writeLine("all-changes test passed.")
-    # test all same
-    a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
-    b = a
-    assert(testHelper(diffText(a, b)) ==
-      "",
-      "all-same test failed.")
-    stdout.writeLine("all-same test passed.")
-
-    # test snake
-    a = "a,b,c,d,e,f".replace(',', '\n')
-    b = "b,c,d,e,f,x".replace(',', '\n')
-    assert(testHelper(diffText(a, b)) ==
-      "1.0.0.0*0.1.6.5*",
-      "snake test failed.")
-    stdout.writeLine("snake test passed.")
-
-    # 2002.09.20 - repro
-    a = "c1,a,c2,b,c,d,e,g,h,i,j,c3,k,l".replace(',', '\n')
-    b = "C1,a,C2,b,c,d,e,I1,e,g,h,i,j,C3,k,I2,l".replace(',', '\n')
-    assert(testHelper(diffText(a, b)) ==
-      "1.1.0.0*1.1.2.2*0.2.7.7*1.1.11.13*0.1.13.15*",
-      "repro20020920 test failed.")
-    stdout.writeLine("repro20020920 test passed.")
-
-    # 2003.02.07 - repro
-    a = "F".replace(',', '\n')
-    b = "0,F,1,2,3,4,5,6,7".replace(',', '\n')
-    assert(testHelper(diffText(a, b)) ==
-      "0.1.0.0*0.7.1.2*",
-      "repro20030207 test failed.")
-    stdout.writeLine("repro20030207 test passed.")
-
-    # Muegel - repro
-    a = "HELLO\nWORLD"
-    b = "\n\nhello\n\n\n\nworld\n"
-    assert(testHelper(diffText(a, b)) ==
-      "2.8.0.0*",
-      "repro20030409 test failed.")
-    stdout.writeLine("repro20030409 test passed.")
-
-    # test some differences
-    a = "a,b,-,c,d,e,f,f".replace(',', '\n')
-    b = "a,b,x,c,e,f".replace(',', '\n')
-    assert(testHelper(diffText(a, b)) ==
-      "1.1.2.2*1.0.4.4*1.0.7.6*",
-      "some-changes test failed.")
-    stdout.writeLine("some-changes test passed.")
-
-    # test one change within long chain of repeats
-    a = "a,a,a,a,a,a,a,a,a,a".replace(',', '\n')
-    b = "a,a,a,a,-,a,a,a,a,a".replace(',', '\n')
-    assert(testHelper(diffText(a, b)) ==
-      "0.1.4.4*1.0.9.10*",
-      "long chain of repeats test failed.")
-
-    stdout.writeLine("End.")
-    stdout.flushFile
-
-  main()
author	Araq <rumpf_a@web.de>	2018-12-12 17:48:30 +0100
committer	Araq <rumpf_a@web.de>	2018-12-12 17:48:30 +0100
commit	eb8383cb28e52bb9a790a743de442e4b71687001 (patch)
tree	8d9c23187e1ba4bc20f45ae065f62ff756e87509 /lib/std
parent	070bcf4cea28a3238089379f5884787b2084b2de (diff)
download	Nim-eb8383cb28e52bb9a790a743de442e4b71687001.tar.gz