1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
|
#
#
# Nim's Runtime Library
# (c) Copyright 2018 Nim contributors
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## This module implements an algorithm to compute the
## `diff`:idx: between two sequences of lines.
##
## A basic example of ``diffInt`` on 2 arrays of integers:
##
## .. code::nim
##
## import experimental/diff
## echo diffInt([0, 1, 2, 3, 4, 5, 6, 7, 8], [-1, 1, 2, 3, 4, 5, 666, 7, 42])
##
## Another short example of ``diffText`` to diff strings:
##
## .. code::nim
##
## import experimental/diff
## # 2 samples of text for testing (from "The Call of Cthulhu" by Lovecraft)
## let txt0 = """I have looked upon all the universe has to hold of horror,
## even skies of spring and flowers of summer must ever be poison to me."""
## let txt1 = """I have looked upon all your code has to hold of bugs,
## even skies of spring and flowers of summer must ever be poison to me."""
##
## echo diffText(txt0, txt1)
##
## - To learn more see `Diff on Wikipedia. <http://wikipedia.org/wiki/Diff>`_
# code owner: Arne Döring
#
# This is based on C# code written by Matthias Hertel, http://www.mathertel.de
#
# This Class implements the Difference Algorithm published in
# "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
# Algorithmica Vol. 1 No. 2, 1986, p 251.
import tables, strutils
type
Item* = object ## An Item in the list of differences.
startA*: int ## Start Line number in Data A.
startB*: int ## Start Line number in Data B.
deletedA*: int ## Number of changes in Data A.
insertedB*: int ## Number of changes in Data B.
DiffData = object ## Data on one input file being compared.
data: seq[int] ## Buffer of numbers that will be compared.
modified: seq[bool] ## Array of booleans that flag for modified
## data. This is the result of the diff.
## This means deletedA in the first Data or
## inserted in the second Data.
Smsrd = object
x, y: int
# template to avoid a seq copy. Required until ``sink`` parameters are ready.
template newDiffData(initData: seq[int]; L: int): DiffData =
DiffData(
data: initData,
modified: newSeq[bool](L + 2)
)
proc len(d: DiffData): int {.inline.} = d.data.len
proc diffCodes(aText: string; h: var Table[string, int]): DiffData =
## This function converts all textlines of the text into unique numbers for every unique textline
## so further work can work only with simple numbers.
## ``aText`` the input text
## ``h`` This extern initialized hashtable is used for storing all ever used textlines.
## ``trimSpace`` ignore leading and trailing space characters
## Returns a array of integers.
var lastUsedCode = h.len
result.data = newSeq[int]()
for s in aText.splitLines:
if h.contains s:
result.data.add h[s]
else:
inc lastUsedCode
h[s] = lastUsedCode
result.data.add lastUsedCode
result.modified = newSeq[bool](result.data.len + 2)
proc optimize(data: var DiffData) =
## If a sequence of modified lines starts with a line that contains the same content
## as the line that appends the changes, the difference sequence is modified so that the
## appended line and not the starting line is marked as modified.
## This leads to more readable diff sequences when comparing text files.
var startPos = 0
while startPos < data.len:
while startPos < data.len and not data.modified[startPos]:
inc startPos
var endPos = startPos
while endPos < data.len and data.modified[endPos]:
inc endPos
if endPos < data.len and data.data[startPos] == data.data[endPos]:
data.modified[startPos] = false
data.modified[endPos] = true
else:
startPos = endPos
proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int;
downVector, upVector: var openArray[int]): Smsrd =
## This is the algorithm to find the Shortest Middle Snake (sms).
## ``dataA`` sequence A
## ``lowerA`` lower bound of the actual range in dataA
## ``upperA`` upper bound of the actual range in dataA (exclusive)
## ``dataB`` sequence B
## ``lowerB`` lower bound of the actual range in dataB
## ``upperB`` upper bound of the actual range in dataB (exclusive)
## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
## Returns a MiddleSnakeData record containing x,y and u,v.
let max = dataA.len + dataB.len + 1
let downK = lowerA - lowerB # the k-line to start the forward search
let upK = upperA - upperB # the k-line to start the reverse search
let delta = (upperA - lowerA) - (upperB - lowerB)
let oddDelta = (delta and 1) != 0
# The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
# and are access using a specific offset: upOffset upVector and downOffset for downVector
let downOffset = max - downK
let upOffset = max - upK
let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1
downVector[downOffset + downK + 1] = lowerA
upVector[upOffset + upK - 1] = upperA
for D in 0 .. maxD:
# Extend the forward path.
for k in countUp(downK - D, downK + D, 2):
# find the only or better starting point
var x: int
if k == downK - D:
x = downVector[downOffset + k + 1] # down
else:
x = downVector[downOffset + k - 1] + 1 # a step to the right
if k < downK + D and downVector[downOffset + k + 1] >= x:
x = downVector[downOffset + k + 1] # down
var y = x - k
# find the end of the furthest reaching forward D-path in diagonal k.
while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]:
inc x
inc y
downVector[downOffset + k] = x
# overlap ?
if oddDelta and upK - D < k and k < upK + D:
if upVector[upOffset + k] <= downVector[downOffset + k]:
return Smsrd(x: downVector[downOffset + k],
y: downVector[downOffset + k] - k)
# Extend the reverse path.
for k in countUp(upK - D, upK + D, 2):
# find the only or better starting point
var x: int
if k == upK + D:
x = upVector[upOffset + k - 1] # up
else:
x = upVector[upOffset + k + 1] - 1 # left
if k > upK - D and upVector[upOffset + k - 1] < x:
x = upVector[upOffset + k - 1] # up
var y = x - k
while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]:
dec x
dec y
upVector[upOffset + k] = x
# overlap ?
if not oddDelta and downK-D <= k and k <= downK+D:
if upVector[upOffset + k] <= downVector[downOffset + k]:
return Smsrd(x: downVector[downOffset + k],
y: downVector[downOffset + k] - k)
assert false, "the algorithm should never come here."
proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int;
downVector, upVector: var openArray[int]) =
## This is the divide-and-conquer implementation of the longes common-subsequence (lcs)
## algorithm.
## The published algorithm passes recursively parts of the A and B sequences.
## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant.
## ``dataA`` sequence A
## ``lowerA`` lower bound of the actual range in dataA
## ``upperA`` upper bound of the actual range in dataA (exclusive)
## ``dataB`` sequence B
## ``lowerB`` lower bound of the actual range in dataB
## ``upperB`` upper bound of the actual range in dataB (exclusive)
## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
# make mutable copy:
var lowerA = lowerA
var lowerB = lowerB
var upperA = upperA
var upperB = upperB
# Fast walkthrough equal lines at the start
while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]:
inc lowerA
inc lowerB
# Fast walkthrough equal lines at the end
while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]:
dec upperA
dec upperB
if lowerA == upperA:
# mark as inserted lines.
while lowerB < upperB:
dataB.modified[lowerB] = true
inc lowerB
elif lowerB == upperB:
# mark as deleted lines.
while lowerA < upperA:
dataA.modified[lowerA] = true
inc lowerA
else:
# Find the middle snakea and length of an optimal path for A and B
let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector)
# Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y))
# The path is from LowerX to (x,y) and (x,y) to UpperX
lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector)
lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector) # 2002.09.20: no need for 2 points
proc createDiffs(dataA, dataB: DiffData): seq[Item] =
## Scan the tables of which lines are inserted and deleted,
## producing an edit script in forward order.
var startA = 0
var startB = 0
var lineA = 0
var lineB = 0
while lineA < dataA.len or lineB < dataB.len:
if lineA < dataA.len and not dataA.modified[lineA] and
lineB < dataB.len and not dataB.modified[lineB]:
# equal lines
inc lineA
inc lineB
else:
# maybe deleted and/or inserted lines
startA = lineA
startB = lineB
while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]):
inc lineA
while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]):
inc lineB
if (startA < lineA) or (startB < lineB):
result.add Item(startA: startA,
startB: startB,
deletedA: lineA - startA,
insertedB: lineB - startB)
proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
## Find the difference in 2 arrays of integers.
##
## ``arrayA`` A-version of the numbers (usually the old one)
##
## ``arrayB`` B-version of the numbers (usually the new one)
##
## Returns a sequence of Items that describe the differences.
# The A-Version of the data (original data) to be compared.
var dataA = newDiffData(@arrayA, arrayA.len)
# The B-Version of the data (modified data) to be compared.
var dataB = newDiffData(@arrayB, arrayB.len)
let max = dataA.len + dataB.len + 1
## vector for the (0,0) to (x,y) search
var downVector = newSeq[int](2 * max + 2)
## vector for the (u,v) to (N,M) search
var upVector = newSeq[int](2 * max + 2)
lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
result = createDiffs(dataA, dataB)
proc diffText*(textA, textB: string): seq[Item] =
## Find the difference in 2 text documents, comparing by textlines.
##
## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
## each line is converted into a (hash) number. This hash-value is computed by storing all
## textlines into a common hashtable so i can find duplicates in there, and generating a
## new number each time a new textline is inserted.
##
## ``textA`` A-version of the text (usually the old one)
##
## ``textB`` B-version of the text (usually the new one)
##
## Returns a seq of Items that describe the differences.
# prepare the input-text and convert to comparable numbers.
var h = initTable[string, int]() # TextA.len + TextB.len <- probably wrong initial size
# The A-Version of the data (original data) to be compared.
var dataA = diffCodes(textA, h)
# The B-Version of the data (modified data) to be compared.
var dataB = diffCodes(textB, h)
h.clear # free up hashtable memory (maybe)
let max = dataA.len + dataB.len + 1
## vector for the (0,0) to (x,y) search
var downVector = newSeq[int](2 * max + 2)
## vector for the (u,v) to (N,M) search
var upVector = newSeq[int](2 * max + 2)
lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
optimize(dataA)
optimize(dataB)
result = createDiffs(dataA, dataB)
when isMainModule:
proc testHelper(f: seq[Item]): string =
for it in f:
result.add(
$it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*"
)
proc main() =
var a, b: string
stdout.writeLine("Diff Self Test...")
# test all changes
a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
b = "0,1,2,3,4,5,6,7,8,9".replace(',', '\n')
assert(testHelper(diffText(a, b)) ==
"12.10.0.0*",
"all-changes test failed.")
stdout.writeLine("all-changes test passed.")
# test all same
a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
b = a
assert(testHelper(diffText(a, b)) ==
"",
"all-same test failed.")
stdout.writeLine("all-same test passed.")
# test snake
a = "a,b,c,d,e,f".replace(',', '\n')
b = "b,c,d,e,f,x".replace(',', '\n')
assert(testHelper(diffText(a, b)) ==
"1.0.0.0*0.1.6.5*",
"snake test failed.")
stdout.writeLine("snake test passed.")
# 2002.09.20 - repro
a = "c1,a,c2,b,c,d,e,g,h,i,j,c3,k,l".replace(',', '\n')
b = "C1,a,C2,b,c,d,e,I1,e,g,h,i,j,C3,k,I2,l".replace(',', '\n')
assert(testHelper(diffText(a, b)) ==
"1.1.0.0*1.1.2.2*0.2.7.7*1.1.11.13*0.1.13.15*",
"repro20020920 test failed.")
stdout.writeLine("repro20020920 test passed.")
# 2003.02.07 - repro
a = "F".replace(',', '\n')
b = "0,F,1,2,3,4,5,6,7".replace(',', '\n')
assert(testHelper(diffText(a, b)) ==
"0.1.0.0*0.7.1.2*",
"repro20030207 test failed.")
stdout.writeLine("repro20030207 test passed.")
# Muegel - repro
a = "HELLO\nWORLD"
b = "\n\nhello\n\n\n\nworld\n"
assert(testHelper(diffText(a, b)) ==
"2.8.0.0*",
"repro20030409 test failed.")
stdout.writeLine("repro20030409 test passed.")
# test some differences
a = "a,b,-,c,d,e,f,f".replace(',', '\n')
b = "a,b,x,c,e,f".replace(',', '\n')
assert(testHelper(diffText(a, b)) ==
"1.1.2.2*1.0.4.4*1.0.7.6*",
"some-changes test failed.")
stdout.writeLine("some-changes test passed.")
# test one change within long chain of repeats
a = "a,a,a,a,a,a,a,a,a,a".replace(',', '\n')
b = "a,a,a,a,-,a,a,a,a,a".replace(',', '\n')
assert(testHelper(diffText(a, b)) ==
"0.1.4.4*1.0.9.10*",
"long chain of repeats test failed.")
stdout.writeLine("End.")
stdout.flushFile
main()
|