lib/experimental/diff.nim


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413

#
#
#            Nim's Runtime Library
#        (c) Copyright 2018 Nim contributors
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module implements an algorithm to compute the
## `diff`:idx: between two sequences of lines.
##
## A basic example of ``diffInt`` on 2 arrays of integers:
##
## .. code::nim
##
##   import experimental/diff
##   echo diffInt([0, 1, 2, 3, 4, 5, 6, 7, 8], [-1, 1, 2, 3, 4, 5, 666, 7, 42])
##
## Another short example of ``diffText`` to diff strings:
##
## .. code::nim
##
##   import experimental/diff
##   # 2 samples of text for testing (from "The Call of Cthulhu" by Lovecraft)
##   let txt0 = """I have looked upon all the universe has to hold of horror,
##   even skies of spring and flowers of summer must ever be poison to me."""
##   let txt1 = """I have looked upon all your code has to hold of bugs,
##   even skies of spring and flowers of summer must ever be poison to me."""
##
##   echo diffText(txt0, txt1)
##
## - To learn more see `Diff on Wikipedia. <http://wikipedia.org/wiki/Diff>`_

# code owner: Arne Döring
#
# This is based on C# code written by Matthias Hertel, http://www.mathertel.de
#
# This Class implements the Difference Algorithm published in
# "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
# Algorithmica Vol. 1 No. 2, 1986, p 251.

import tables, strutils

type
  Item* = object    ## An Item in the list of differences.
    startA*: int    ## Start Line number in Data A.
    startB*: int    ## Start Line number in Data B.
    deletedA*: int  ## Number of changes in Data A.
    insertedB*: int ## Number of changes in Data B.

  DiffData = object ## Data on one input file being compared.
    data: seq[int] ## Buffer of numbers that will be compared.
    modified: seq[bool] ## Array of booleans that flag for modified
                        ## data. This is the result of the diff.
                        ## This means deletedA in the first Data or
                        ## inserted in the second Data.

  Smsrd = object
    x, y: int

# template to avoid a seq copy. Required until ``sink`` parameters are ready.
template newDiffData(initData: seq[int]; L: int): DiffData =
  DiffData(
    data: initData,
    modified: newSeq[bool](L + 2)
  )

proc len(d: DiffData): int {.inline.} = d.data.len

proc diffCodes(aText: string; h: var Table[string, int]): DiffData =
  ## This function converts all textlines of the text into unique numbers for every unique textline
  ## so further work can work only with simple numbers.
  ## ``aText`` the input text
  ## ``h`` This extern initialized hashtable is used for storing all ever used textlines.
  ## ``trimSpace`` ignore leading and trailing space characters
  ## Returns a array of integers.
  var lastUsedCode = h.len
  result.data = newSeq[int]()
  for s in aText.splitLines:
    if h.contains s:
      result.data.add h[s]
    else:
      inc lastUsedCode
      h[s] = lastUsedCode
      result.data.add lastUsedCode
  result.modified = newSeq[bool](result.data.len + 2)

proc optimize(data: var DiffData) =
  ## If a sequence of modified lines starts with a line that contains the same content
  ## as the line that appends the changes, the difference sequence is modified so that the
  ## appended line and not the starting line is marked as modified.
  ## This leads to more readable diff sequences when comparing text files.
  var startPos = 0
  while startPos < data.len:
    while startPos < data.len and not data.modified[startPos]:
      inc startPos
    var endPos = startPos
    while endPos < data.len and data.modified[endPos]:
      inc endPos

    if endPos < data.len and data.data[startPos] == data.data[endPos]:
      data.modified[startPos] = false
      data.modified[endPos] = true
    else:
      startPos = endPos

proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int;
         downVector, upVector: var openArray[int]): Smsrd =
  ## This is the algorithm to find the Shortest Middle Snake (sms).
  ## ``dataA`` sequence A
  ## ``lowerA`` lower bound of the actual range in dataA
  ## ``upperA`` upper bound of the actual range in dataA (exclusive)
  ## ``dataB`` sequence B
  ## ``lowerB`` lower bound of the actual range in dataB
  ## ``upperB`` upper bound of the actual range in dataB (exclusive)
  ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
  ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
  ## Returns a MiddleSnakeData record containing x,y and u,v.

  let max = dataA.len + dataB.len + 1

  let downK = lowerA - lowerB # the k-line to start the forward search
  let upK = upperA - upperB # the k-line to start the reverse search

  let delta = (upperA - lowerA) - (upperB - lowerB)
  let oddDelta = (delta and 1) != 0

  # The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
  # and are access using a specific offset: upOffset upVector and downOffset for downVector
  let downOffset = max - downK
  let upOffset = max - upK

  let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1

  downVector[downOffset + downK + 1] = lowerA
  upVector[upOffset + upK - 1] = upperA

  for D in 0 .. maxD:
    # Extend the forward path.
    for k in countUp(downK - D, downK + D, 2):
      # find the only or better starting point
      var x: int
      if k == downK - D:
        x = downVector[downOffset + k + 1] # down
      else:
        x = downVector[downOffset + k - 1] + 1 # a step to the right
        if k < downK + D and downVector[downOffset + k + 1] >= x:
          x = downVector[downOffset + k + 1] # down

      var y = x - k

      # find the end of the furthest reaching forward D-path in diagonal k.
      while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]:
        inc x
        inc y

      downVector[downOffset + k] = x

      # overlap ?
      if oddDelta and upK - D < k and k < upK + D:
        if upVector[upOffset + k] <= downVector[downOffset + k]:
          return Smsrd(x: downVector[downOffset + k],
                       y: downVector[downOffset + k] - k)

    # Extend the reverse path.
    for k in countUp(upK - D, upK + D, 2):
      # find the only or better starting point
      var x: int
      if k == upK + D:
        x = upVector[upOffset + k - 1] # up
      else:
        x = upVector[upOffset + k + 1] - 1 # left
        if k > upK - D and upVector[upOffset + k - 1] < x:
          x = upVector[upOffset + k - 1] # up

      var y = x - k
      while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]:
        dec x
        dec y

      upVector[upOffset + k] = x

      # overlap ?
      if not oddDelta and downK-D <= k and k <= downK+D:
        if upVector[upOffset + k] <= downVector[downOffset + k]:
          return Smsrd(x: downVector[downOffset + k],
                       y: downVector[downOffset + k] - k)

  assert false, "the algorithm should never come here."

proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int;
         downVector, upVector: var openArray[int]) =
  ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs)
  ## algorithm.
  ## The published algorithm passes recursively parts of the A and B sequences.
  ## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant.
  ## ``dataA`` sequence A
  ## ``lowerA`` lower bound of the actual range in dataA
  ## ``upperA`` upper bound of the actual range in dataA (exclusive)
  ## ``dataB`` sequence B
  ## ``lowerB`` lower bound of the actual range in dataB
  ## ``upperB`` upper bound of the actual range in dataB (exclusive)
  ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
  ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.

  # make mutable copy:
  var lowerA = lowerA
  var lowerB = lowerB
  var upperA = upperA
  var upperB = upperB

  # Fast walkthrough equal lines at the start
  while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]:
    inc lowerA
    inc lowerB

  # Fast walkthrough equal lines at the end
  while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]:
    dec upperA
    dec upperB

  if lowerA == upperA:
    # mark as inserted lines.
    while lowerB < upperB:
      dataB.modified[lowerB] = true
      inc lowerB

  elif lowerB == upperB:
    # mark as deleted lines.
    while lowerA < upperA:
      dataA.modified[lowerA] = true
      inc lowerA

  else:
    # Find the middle snakea and length of an optimal path for A and B
    let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector)
    # Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y))

    # The path is from LowerX to (x,y) and (x,y) to UpperX
    lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector)
    lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector)  # 2002.09.20: no need for 2 points

proc createDiffs(dataA, dataB: DiffData): seq[Item] =
  ## Scan the tables of which lines are inserted and deleted,
  ## producing an edit script in forward order.
  var startA = 0
  var startB = 0
  var lineA = 0
  var lineB = 0
  while lineA < dataA.len or lineB < dataB.len:
    if lineA < dataA.len and not dataA.modified[lineA] and
       lineB < dataB.len and not dataB.modified[lineB]:
      # equal lines
      inc lineA
      inc lineB
    else:
      # maybe deleted and/or inserted lines
      startA = lineA
      startB = lineB

      while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]):
        inc lineA

      while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]):
        inc lineB

      if (startA < lineA) or (startB < lineB):
        result.add Item(startA: startA,
                        startB: startB,
                        deletedA: lineA - startA,
                        insertedB: lineB - startB)


proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
  ## Find the difference in 2 arrays of integers.
  ##
  ## ``arrayA`` A-version of the numbers (usually the old one)
  ##
  ## ``arrayB`` B-version of the numbers (usually the new one)
  ##
  ## Returns a sequence of Items that describe the differences.

  # The A-Version of the data (original data) to be compared.
  var dataA = newDiffData(@arrayA, arrayA.len)

  # The B-Version of the data (modified data) to be compared.
  var dataB = newDiffData(@arrayB, arrayB.len)

  let max = dataA.len + dataB.len + 1
  ## vector for the (0,0) to (x,y) search
  var downVector = newSeq[int](2 * max + 2)
  ## vector for the (u,v) to (N,M) search
  var upVector = newSeq[int](2 * max + 2)

  lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
  result = createDiffs(dataA, dataB)

proc diffText*(textA, textB: string): seq[Item] =
  ## Find the difference in 2 text documents, comparing by textlines.
  ##
  ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
  ## each line is converted into a (hash) number. This hash-value is computed by storing all
  ## textlines into a common hashtable so i can find duplicates in there, and generating a
  ## new number each time a new textline is inserted.
  ##
  ## ``textA`` A-version of the text (usually the old one)
  ##
  ## ``textB`` B-version of the text (usually the new one)
  ##
  ## Returns a seq of Items that describe the differences.

  # prepare the input-text and convert to comparable numbers.
  var h = initTable[string, int]()  # TextA.len + TextB.len  <- probably wrong initial size
  # The A-Version of the data (original data) to be compared.
  var dataA = diffCodes(textA, h)

  # The B-Version of the data (modified data) to be compared.
  var dataB = diffCodes(textB, h)

  h.clear # free up hashtable memory (maybe)

  let max = dataA.len + dataB.len + 1
  ## vector for the (0,0) to (x,y) search
  var downVector = newSeq[int](2 * max + 2)
  ## vector for the (u,v) to (N,M) search
  var upVector = newSeq[int](2 * max + 2)

  lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)

  optimize(dataA)
  optimize(dataB)
  result = createDiffs(dataA, dataB)

when isMainModule:

  proc testHelper(f: seq[Item]): string =
    for it in f:
      result.add(
        $it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*"
      )

  proc main() =
    var a, b: string

    stdout.writeLine("Diff Self Test...")

    # test all changes
    a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
    b = "0,1,2,3,4,5,6,7,8,9".replace(',', '\n')
    assert(testHelper(diffText(a, b)) ==
      "12.10.0.0*",
      "all-changes test failed.")
    stdout.writeLine("all-changes test passed.")
    # test all same
    a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
    b = a
    assert(testHelper(diffText(a, b)) ==
      "",
      "all-same test failed.")
    stdout.writeLine("all-same test passed.")

    # test snake
    a = "a,b,c,d,e,f".replace(',', '\n')
    b = "b,c,d,e,f,x".replace(',', '\n')
    assert(testHelper(diffText(a, b)) ==
      "1.0.0.0*0.1.6.5*",
      "snake test failed.")
    stdout.writeLine("snake test passed.")

    # 2002.09.20 - repro
    a = "c1,a,c2,b,c,d,e,g,h,i,j,c3,k,l".replace(',', '\n')
    b = "C1,a,C2,b,c,d,e,I1,e,g,h,i,j,C3,k,I2,l".replace(',', '\n')
    assert(testHelper(diffText(a, b)) ==
      "1.1.0.0*1.1.2.2*0.2.7.7*1.1.11.13*0.1.13.15*",
      "repro20020920 test failed.")
    stdout.writeLine("repro20020920 test passed.")

    # 2003.02.07 - repro
    a = "F".replace(',', '\n')
    b = "0,F,1,2,3,4,5,6,7".replace(',', '\n')
    assert(testHelper(diffText(a, b)) ==
      "0.1.0.0*0.7.1.2*",
      "repro20030207 test failed.")
    stdout.writeLine("repro20030207 test passed.")

    # Muegel - repro
    a = "HELLO\nWORLD"
    b = "\n\nhello\n\n\n\nworld\n"
    assert(testHelper(diffText(a, b)) ==
      "2.8.0.0*",
      "repro20030409 test failed.")
    stdout.writeLine("repro20030409 test passed.")

    # test some differences
    a = "a,b,-,c,d,e,f,f".replace(',', '\n')
    b = "a,b,x,c,e,f".replace(',', '\n')
    assert(testHelper(diffText(a, b)) ==
      "1.1.2.2*1.0.4.4*1.0.7.6*",
      "some-changes test failed.")
    stdout.writeLine("some-changes test passed.")

    # test one change within long chain of repeats
    a = "a,a,a,a,a,a,a,a,a,a".replace(',', '\n')
    b = "a,a,a,a,-,a,a,a,a,a".replace(',', '\n')
    assert(testHelper(diffText(a, b)) ==
      "0.1.4.4*1.0.9.10*",
      "long chain of repeats test failed.")

    stdout.writeLine("End.")
    stdout.flushFile

  main()