diff options
Diffstat (limited to 'lib/std')
-rw-r--r-- | lib/std/diff.nim | 387 | ||||
-rw-r--r-- | lib/std/editdistance.nim | 295 | ||||
-rw-r--r-- | lib/std/wordwrap.nim | 90 |
3 files changed, 772 insertions, 0 deletions
diff --git a/lib/std/diff.nim b/lib/std/diff.nim new file mode 100644 index 000000000..bffce2803 --- /dev/null +++ b/lib/std/diff.nim @@ -0,0 +1,387 @@ +# +# +# Nim's Runtime Library +# (c) Copyright 2018 Nim contributors +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module implements an algorithm to compute the +## `diff`:idx: between two sequences of lines. + +# code owner: Arne Döring +# +# This is based on C# code written by Matthias Hertel, http://www.mathertel.de +# +# This Class implements the Difference Algorithm published in +# "An O(ND) Difference Algorithm and its Variations" by Eugene Myers +# Algorithmica Vol. 1 No. 2, 1986, p 251. + +import tables, strutils + +type + Item* = object ## An Item in the list of differences. + startA*: int ## Start Line number in Data A. + startB*: int ## Start Line number in Data B. + deletedA*: int ## Number of changes in Data A. + insertedB*: int ## Number of changes in Data B. + + DiffData = object ## Data on one input file being compared. + data: seq[int] ## Buffer of numbers that will be compared. + modified: seq[bool] ## Array of booleans that flag for modified + ## data. This is the result of the diff. + ## This means deletedA in the first Data or + ## inserted in the second Data. + + Smsrd = object + x, y: int + +# template to avoid a seq copy. Required until ``sink`` parameters are ready. +template newDiffData(initData: seq[int]; L: int): DiffData = + DiffData( + data: initData, + modified: newSeq[bool](L + 2) + ) + +proc len(d: DiffData): int {.inline.} = d.data.len + +proc diffCodes(aText: string; h: var Table[string, int]): DiffData = + ## This function converts all textlines of the text into unique numbers for every unique textline + ## so further work can work only with simple numbers. + ## ``aText`` the input text + ## ``h`` This extern initialized hashtable is used for storing all ever used textlines. + ## ``trimSpace`` ignore leading and trailing space characters + ## Returns a array of integers. + var lastUsedCode = h.len + result.data = newSeq[int]() + for s in aText.splitLines: + if h.contains s: + result.data.add h[s] + else: + inc lastUsedCode + h[s] = lastUsedCode + result.data.add lastUsedCode + result.modified = newSeq[bool](result.data.len + 2) + +proc optimize(data: var DiffData) = + ## If a sequence of modified lines starts with a line that contains the same content + ## as the line that appends the changes, the difference sequence is modified so that the + ## appended line and not the starting line is marked as modified. + ## This leads to more readable diff sequences when comparing text files. + var startPos = 0 + while startPos < data.len: + while startPos < data.len and not data.modified[startPos]: + inc startPos + var endPos = startPos + while endPos < data.len and data.modified[endPos]: + inc endPos + + if endPos < data.len and data.data[startPos] == data.data[endPos]: + data.modified[startPos] = false + data.modified[endPos] = true + else: + startPos = endPos + +proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int; + downVector, upVector: var openArray[int]): Smsrd = + ## This is the algorithm to find the Shortest Middle Snake (sms). + ## ``dataA`` sequence A + ## ``lowerA`` lower bound of the actual range in dataA + ## ``upperA`` upper bound of the actual range in dataA (exclusive) + ## ``dataB`` sequence B + ## ``lowerB`` lower bound of the actual range in dataB + ## ``upperB`` upper bound of the actual range in dataB (exclusive) + ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons. + ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons. + ## Returns a MiddleSnakeData record containing x,y and u,v. + + let max = dataA.len + dataB.len + 1 + + let downK = lowerA - lowerB # the k-line to start the forward search + let upK = upperA - upperB # the k-line to start the reverse search + + let delta = (upperA - lowerA) - (upperB - lowerB) + let oddDelta = (delta and 1) != 0 + + # The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based + # and are access using a specific offset: upOffset upVector and downOffset for downVector + let downOffset = max - downK + let upOffset = max - upK + + let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1 + + downVector[downOffset + downK + 1] = lowerA + upVector[upOffset + upK - 1] = upperA + + for D in 0 .. maxD: + # Extend the forward path. + for k in countUp(downK - D, downK + D, 2): + # find the only or better starting point + var x: int + if k == downK - D: + x = downVector[downOffset + k + 1] # down + else: + x = downVector[downOffset + k - 1] + 1 # a step to the right + if k < downK + D and downVector[downOffset + k + 1] >= x: + x = downVector[downOffset + k + 1] # down + + var y = x - k + + # find the end of the furthest reaching forward D-path in diagonal k. + while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]: + inc x + inc y + + downVector[downOffset + k] = x + + # overlap ? + if oddDelta and upK - D < k and k < upK + D: + if upVector[upOffset + k] <= downVector[downOffset + k]: + return Smsrd(x: downVector[downOffset + k], + y: downVector[downOffset + k] - k) + + # Extend the reverse path. + for k in countUp(upK - D, upK + D, 2): + # find the only or better starting point + var x: int + if k == upK + D: + x = upVector[upOffset + k - 1] # up + else: + x = upVector[upOffset + k + 1] - 1 # left + if k > upK - D and upVector[upOffset + k - 1] < x: + x = upVector[upOffset + k - 1] # up + + var y = x - k + while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]: + dec x + dec y + + upVector[upOffset + k] = x + + # overlap ? + if not oddDelta and downK-D <= k and k <= downK+D: + if upVector[upOffset + k] <= downVector[downOffset + k]: + return Smsrd(x: downVector[downOffset + k], + y: downVector[downOffset + k] - k) + + assert false, "the algorithm should never come here." + +proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int; + downVector, upVector: var openArray[int]) = + ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs) + ## algorithm. + ## The published algorithm passes recursively parts of the A and B sequences. + ## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant. + ## ``dataA`` sequence A + ## ``lowerA`` lower bound of the actual range in dataA + ## ``upperA`` upper bound of the actual range in dataA (exclusive) + ## ``dataB`` sequence B + ## ``lowerB`` lower bound of the actual range in dataB + ## ``upperB`` upper bound of the actual range in dataB (exclusive) + ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons. + ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons. + + # make mutable copy: + var lowerA = lowerA + var lowerB = lowerB + var upperA = upperA + var upperB = upperB + + # Fast walkthrough equal lines at the start + while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]: + inc lowerA + inc lowerB + + # Fast walkthrough equal lines at the end + while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]: + dec upperA + dec upperB + + if lowerA == upperA: + # mark as inserted lines. + while lowerB < upperB: + dataB.modified[lowerB] = true + inc lowerB + + elif lowerB == upperB: + # mark as deleted lines. + while lowerA < upperA: + dataA.modified[lowerA] = true + inc lowerA + + else: + # Find the middle snakea and length of an optimal path for A and B + let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector) + # Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y)) + + # The path is from LowerX to (x,y) and (x,y) to UpperX + lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector) + lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector) # 2002.09.20: no need for 2 points + +proc createDiffs(dataA, dataB: DiffData): seq[Item] = + ## Scan the tables of which lines are inserted and deleted, + ## producing an edit script in forward order. + var startA = 0 + var startB = 0 + var lineA = 0 + var lineB = 0 + while lineA < dataA.len or lineB < dataB.len: + if lineA < dataA.len and not dataA.modified[lineA] and + lineB < dataB.len and not dataB.modified[lineB]: + # equal lines + inc lineA + inc lineB + else: + # maybe deleted and/or inserted lines + startA = lineA + startB = lineB + + while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]): + inc lineA + + while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]): + inc lineB + + if (startA < lineA) or (startB < lineB): + result.add Item(startA: startA, + startB: startB, + deletedA: lineA - startA, + insertedB: lineB - startB) + + +proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] = + ## Find the difference in 2 arrays of integers. + ## ``arrayA`` A-version of the numbers (usualy the old one) + ## ``arrayB`` B-version of the numbers (usualy the new one) + ## Returns a array of Items that describe the differences. + + # The A-Version of the data (original data) to be compared. + var dataA = newDiffData(@arrayA, arrayA.len) + + # The B-Version of the data (modified data) to be compared. + var dataB = newDiffData(@arrayB, arrayB.len) + + let max = dataA.len + dataB.len + 1 + ## vector for the (0,0) to (x,y) search + var downVector = newSeq[int](2 * max + 2) + ## vector for the (u,v) to (N,M) search + var upVector = newSeq[int](2 * max + 2) + + lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector) + result = createDiffs(dataA, dataB) + +proc diffText*(textA, textB: string): seq[Item] = + ## Find the difference in 2 text documents, comparing by textlines. + ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents + ## each line is converted into a (hash) number. This hash-value is computed by storing all + ## textlines into a common hashtable so i can find dublicates in there, and generating a + ## new number each time a new textline is inserted. + ## ``TextA`` A-version of the text (usualy the old one) + ## ``TextB`` B-version of the text (usualy the new one) + ## ``trimSpace`` When set to true, all leading and trailing whitespace characters are stripped out before the comparation is done. + ## ``ignoreSpace`` When set to true, all whitespace characters are converted to a single space character before the comparation is done. + ## ``ignoreCase`` When set to true, all characters are converted to their lowercase equivivalence before the comparation is done. + ## Returns a seq of Items that describe the differences. + + # prepare the input-text and convert to comparable numbers. + var h = initTable[string, int]() # TextA.len + TextB.len <- probably wrong initial size + # The A-Version of the data (original data) to be compared. + var dataA = diffCodes(textA, h) + + # The B-Version of the data (modified data) to be compared. + var dataB = diffCodes(textB, h) + + h.clear # free up hashtable memory (maybe) + + let max = dataA.len + dataB.len + 1 + ## vector for the (0,0) to (x,y) search + var downVector = newSeq[int](2 * max + 2) + ## vector for the (u,v) to (N,M) search + var upVector = newSeq[int](2 * max + 2) + + lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector) + + optimize(dataA) + optimize(dataB) + result = createDiffs(dataA, dataB) + +when isMainModule: + + proc testHelper(f: seq[Item]): string = + for it in f: + result.add( + $it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*" + ) + + proc main() = + var a, b: string + + stdout.writeLine("Diff Self Test...") + + # test all changes + a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n') + b = "0,1,2,3,4,5,6,7,8,9".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "12.10.0.0*", + "all-changes test failed.") + stdout.writeLine("all-changes test passed.") + # test all same + a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n') + b = a + assert(testHelper(diffText(a, b)) == + "", + "all-same test failed.") + stdout.writeLine("all-same test passed.") + + # test snake + a = "a,b,c,d,e,f".replace(',', '\n') + b = "b,c,d,e,f,x".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "1.0.0.0*0.1.6.5*", + "snake test failed.") + stdout.writeLine("snake test passed.") + + # 2002.09.20 - repro + a = "c1,a,c2,b,c,d,e,g,h,i,j,c3,k,l".replace(',', '\n') + b = "C1,a,C2,b,c,d,e,I1,e,g,h,i,j,C3,k,I2,l".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "1.1.0.0*1.1.2.2*0.2.7.7*1.1.11.13*0.1.13.15*", + "repro20020920 test failed.") + stdout.writeLine("repro20020920 test passed.") + + # 2003.02.07 - repro + a = "F".replace(',', '\n') + b = "0,F,1,2,3,4,5,6,7".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "0.1.0.0*0.7.1.2*", + "repro20030207 test failed.") + stdout.writeLine("repro20030207 test passed.") + + # Muegel - repro + a = "HELLO\nWORLD" + b = "\n\nhello\n\n\n\nworld\n" + assert(testHelper(diffText(a, b)) == + "2.8.0.0*", + "repro20030409 test failed.") + stdout.writeLine("repro20030409 test passed.") + + # test some differences + a = "a,b,-,c,d,e,f,f".replace(',', '\n') + b = "a,b,x,c,e,f".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "1.1.2.2*1.0.4.4*1.0.7.6*", + "some-changes test failed.") + stdout.writeLine("some-changes test passed.") + + # test one change within long chain of repeats + a = "a,a,a,a,a,a,a,a,a,a".replace(',', '\n') + b = "a,a,a,a,-,a,a,a,a,a".replace(',', '\n') + assert(testHelper(diffText(a, b)) == + "0.1.4.4*1.0.9.10*", + "long chain of repeats test failed.") + + stdout.writeLine("End.") + stdout.flushFile + + main() diff --git a/lib/std/editdistance.nim b/lib/std/editdistance.nim new file mode 100644 index 000000000..40beb7d93 --- /dev/null +++ b/lib/std/editdistance.nim @@ -0,0 +1,295 @@ +# +# +# Nim's Runtime Library +# (c) Copyright 2018 Nim contributors +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module implements an algorithm to compute the +## `edit distance`:idx: between two Unicode strings. + +import unicode + +proc editDistance*(a, b: string): int {.noSideEffect.} = + ## Returns the unicode-rune edit distance between ``a`` and ``b``. + ## + ## This uses the `Levenshtein`:idx: distance algorithm with only a linear + ## memory overhead. + if len(a) > len(b): + # make ``b`` the longer string + return editDistance(b, a) + # strip common prefix + var + i_start = 0 ## The character starting index of the first rune in both strings ``a`` and ``b`` + i_next_a = 0 + i_next_b = 0 + rune_a, rune_b: Rune + len_runes_a = 0 ## The number of relevant runes in string ``a``. + len_runes_b = 0 ## The number of relevant runes in string ``b``. + block commonPrefix: + # ``a`` is the shorter string + while i_start < len(a): + i_next_a = i_start + a.fastRuneAt(i_next_a, rune_a, doInc = true) + i_next_b = i_start + b.fastRuneAt(i_next_b, rune_b, doInc = true) + if rune_a != rune_b: + inc(len_runes_a) + inc(len_runes_b) + break + i_start = i_next_a + var + # we know that we are either at the start of the strings + # or that the current value of rune_a is not equal to rune_b + # => start search for common suffix after the current rune (``i_next_*``) + i_end_a = i_next_a ## The exclusive upper index bound of string ``a``. + i_end_b = i_next_b ## The exclusive upper index bound of string ``b``. + i_current_a = i_next_a + i_current_b = i_next_b + block commonSuffix: + var + add_runes_a = 0 + add_runes_b = 0 + while i_current_a < len(a) and i_current_b < len(b): + i_next_a = i_current_a + a.fastRuneAt(i_next_a, rune_a) + i_next_b = i_current_b + b.fastRuneAt(i_next_b, rune_b) + inc(add_runes_a) + inc(add_runes_b) + if rune_a != rune_b: + i_end_a = i_next_a + i_end_b = i_next_b + inc(len_runes_a, add_runes_a) + inc(len_runes_b, add_runes_b) + add_runes_a = 0 + add_runes_b = 0 + i_current_a = i_next_a + i_current_b = i_next_b + if i_current_a >= len(a): # ``a`` exhausted + if i_current_b < len(b): # ``b`` not exhausted + i_end_a = i_current_a + i_end_b = i_current_b + inc(len_runes_a, add_runes_a) + inc(len_runes_b, add_runes_b) + while true: + b.fastRuneAt(i_end_b, rune_b) + inc(len_runes_b) + if i_end_b >= len(b): break + elif i_current_b >= len(b): # ``b`` exhausted and ``a`` not exhausted + i_end_a = i_current_a + i_end_b = i_current_b + inc(len_runes_a, add_runes_a) + inc(len_runes_b, add_runes_b) + while true: + a.fastRuneAt(i_end_a, rune_a) + inc(len_runes_a) + if i_end_a >= len(a): break + block specialCases: + # trivial cases: + if len_runes_a == 0: return len_runes_b + if len_runes_b == 0: return len_runes_a + # another special case: + if len_runes_a == 1: + a.fastRuneAt(i_start, rune_a, doInc = false) + var i_current_b = i_start + while i_current_b < i_end_b: + b.fastRuneAt(i_current_b, rune_b, doInc = true) + if rune_a == rune_b: return len_runes_b - 1 + return len_runes_b + # common case: + var + len1 = len_runes_a + 1 + len2 = len_runes_b + 1 + row: seq[int] + let half = len_runes_a div 2 + newSeq(row, len2) + var e = i_start + len2 - 1 # end marker + # initialize first row: + for i in 1 .. (len2 - half - 1): row[i] = i + row[0] = len1 - half - 1 + i_current_a = i_start + var + char2p_i = -1 + char2p_prev: int + for i in 1 .. (len1 - 1): + i_next_a = i_current_a + a.fastRuneAt(i_next_a, rune_a) + var + char2p: int + D, x: int + p: int + if i >= (len1 - half): + # skip the upper triangle: + let offset = i + half - len1 + if char2p_i == i: + b.fastRuneAt(char2p_prev, rune_b) + char2p = char2p_prev + char2p_i = i + 1 + else: + char2p = i_start + for j in 0 ..< offset: + rune_b = b.runeAt(char2p) + inc(char2p, rune_b.size) + char2p_i = i + 1 + char2p_prev = char2p + p = offset + rune_b = b.runeAt(char2p) + var c3 = row[p] + (if rune_a != rune_b: 1 else: 0) + inc(char2p, rune_b.size) + inc(p) + x = row[p] + 1 + D = x + if x > c3: x = c3 + row[p] = x + inc(p) + else: + p = 1 + char2p = i_start + D = i + x = i + if i <= (half + 1): + # skip the lower triangle: + e = len2 + i - half - 2 + # main: + while p <= e: + dec(D) + rune_b = b.runeAt(char2p) + var c3 = D + (if rune_a != rune_b: 1 else: 0) + inc(char2p, rune_b.size) + inc(x) + if x > c3: x = c3 + D = row[p] + 1 + if x > D: x = D + row[p] = x + inc(p) + # lower triangle sentinel: + if i <= half: + dec(D) + rune_b = b.runeAt(char2p) + var c3 = D + (if rune_a != rune_b: 1 else: 0) + inc(x) + if x > c3: x = c3 + row[p] = x + i_current_a = i_next_a + result = row[e] + +proc editDistanceAscii*(a, b: string): int {.noSideEffect.} = + ## Returns the edit distance between `a` and `b`. + ## + ## This uses the `Levenshtein`:idx: distance algorithm with only a linear + ## memory overhead. + var len1 = a.len + var len2 = b.len + if len1 > len2: + # make `b` the longer string + return editDistanceAscii(b, a) + + # strip common prefix: + var s = 0 + while s < len1 and a[s] == b[s]: + inc(s) + dec(len1) + dec(len2) + # strip common suffix: + while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]: + dec(len1) + dec(len2) + # trivial cases: + if len1 == 0: return len2 + if len2 == 0: return len1 + + # another special case: + if len1 == 1: + for j in s..s+len2-1: + if a[s] == b[j]: return len2 - 1 + return len2 + + inc(len1) + inc(len2) + var half = len1 shr 1 + # initalize first row: + #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int))) + var row: seq[int] + newSeq(row, len2) + var e = s + len2 - 1 # end marker + for i in 1..len2 - half - 1: row[i] = i + row[0] = len1 - half - 1 + for i in 1 .. len1 - 1: + var char1 = a[i + s - 1] + var char2p: int + var D, x: int + var p: int + if i >= len1 - half: + # skip the upper triangle: + var offset = i - len1 + half + char2p = offset + p = offset + var c3 = row[p] + ord(char1 != b[s + char2p]) + inc(p) + inc(char2p) + x = row[p] + 1 + D = x + if x > c3: x = c3 + row[p] = x + inc(p) + else: + p = 1 + char2p = 0 + D = i + x = i + if i <= half + 1: + # skip the lower triangle: + e = len2 + i - half - 2 + # main: + while p <= e: + dec(D) + var c3 = D + ord(char1 != b[char2p + s]) + inc(char2p) + inc(x) + if x > c3: x = c3 + D = row[p] + 1 + if x > D: x = D + row[p] = x + inc(p) + # lower triangle sentinel: + if i <= half: + dec(D) + var c3 = D + ord(char1 != b[char2p + s]) + inc(x) + if x > c3: x = c3 + row[p] = x + result = row[e] + + +when isMainModule: + doAssert editDistance("", "") == 0 + doAssert editDistance("kitten", "sitting") == 3 # from Wikipedia + doAssert editDistance("flaw", "lawn") == 2 # from Wikipedia + + doAssert editDistance("привет", "превет") == 1 + doAssert editDistance("Åge", "Age") == 1 + # editDistance, one string is longer in bytes, but shorter in rune length + # first string: 4 bytes, second: 6 bytes, but only 3 runes + doAssert editDistance("aaaa", "×××") == 4 + + block veryLongStringEditDistanceTest: + const cap = 256 + var + s1 = newStringOfCap(cap) + s2 = newStringOfCap(cap) + while len(s1) < cap: + s1.add 'a' + while len(s2) < cap: + s2.add 'b' + doAssert editDistance(s1, s2) == cap + + block combiningCodePointsEditDistanceTest: + const s = "A\xCC\x8Age" + doAssert editDistance(s, "Age") == 1 + + doAssert editDistanceAscii("", "") == 0 + doAssert editDistanceAscii("kitten", "sitting") == 3 # from Wikipedia + doAssert editDistanceAscii("flaw", "lawn") == 2 # from Wikipedia diff --git a/lib/std/wordwrap.nim b/lib/std/wordwrap.nim new file mode 100644 index 000000000..c7898b339 --- /dev/null +++ b/lib/std/wordwrap.nim @@ -0,0 +1,90 @@ +# +# +# Nim's Runtime Library +# (c) Copyright 2018 Nim contributors +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module contains an algorithm to wordwrap a Unicode string. + +import strutils, unicode + +proc olen(s: string): int = + var i = 0 + result = 0 + while i < s.len: + inc result + let L = graphemeLen(s, i) + inc i, L + +proc wrapWords*(s: string, maxLineWidth = 80, + splitLongWords = true, + seps: set[char] = Whitespace, + newLine = "\n"): string {.noSideEffect.} = + ## Word wraps `s`. + result = newStringOfCap(s.len + s.len shr 6) + var spaceLeft = maxLineWidth + var lastSep = "" + for word, isSep in tokenize(s, seps): + let wlen = olen(word) + if isSep: + lastSep = word + spaceLeft = spaceLeft - wlen + elif wlen > spaceLeft: + if splitLongWords and wlen > maxLineWidth: + var i = 0 + while i < word.len: + if spaceLeft <= 0: + spaceLeft = maxLineWidth + result.add newLine + dec spaceLeft + let L = graphemeLen(word, i) + for j in 0 ..< L: result.add word[i+j] + inc i, L + else: + spaceLeft = maxLineWidth - wlen + result.add(newLine) + result.add(word) + else: + spaceLeft = spaceLeft - wlen + result.add(lastSep) + result.add(word) + lastSep.setLen(0) + +when isMainModule: + + when true: + let + inp = """ this is a long text -- muchlongerthan10chars and here + it goes""" + outp = " this is a\nlong text\n--\nmuchlongerthan10chars\nand here\nit goes" + doAssert wrapWords(inp, 10, false) == outp + + let + longInp = """ThisIsOneVeryLongStringWhichWeWillSplitIntoEightSeparatePartsNow""" + longOutp = "ThisIsOn\neVeryLon\ngStringW\nhichWeWi\nllSplitI\nntoEight\nSeparate\nPartsNow" + doAssert wrapWords(longInp, 8, true) == longOutp + + # test we don't break Umlauts into invalid bytes: + let fies = "äöüöäöüöäöüöäöüööäöüöäößßßßüöäößßßßßß" + let fiesRes = "ä\nö\nü\nö\nä\nö\nü\nö\nä\nö\nü\nö\nä\nö\nü\nö\nö\nä\nö\nü\nö\nä\nö\nß\nß\nß\nß\nü\nö\nä\nö\nß\nß\nß\nß\nß\nß" + doAssert wrapWords(fies, 1, true) == fiesRes + + let longlongword = """abc uitdaeröägfßhydüäpydqfü,träpydqgpmüdträpydföägpydörztdüöäfguiaeowäzjdtrüöäp psnrtuiydrözenrüöäpyfdqazpesnrtulocjtüö +äzydgyqgfqfgprtnwjlcydkqgfüöezmäzydydqüüöäpdtrnvwfhgckdumböäpydfgtdgfhtdrntdrntydfogiayqfguiatrnydrntüöärtniaoeydfgaoeiqfglwcßqfgxvlcwgtfhiaoen +rsüöäapmböäptdrniaoydfglckqfhouenrtsüöäptrniaoeyqfgulocfqclgwxßqflgcwßqfxglcwrniatrnmüböäpmöäbpümöäbpüöämpbaoestnriaesnrtdiaesrtdniaesdrtnaetdr +iaoenvlcyfglwckßqfgvwkßqgfvlwkßqfgvlwckßqvlwkgfUIαοιαοιαχολωχσωχνωκψρχκψρτιεαοσηζϵηζιοεννκεωνιαλωσωκνκψρκγτφγτχκγτεκργτιχνκιωχσιλωσλωχξλξλξωχωχ +ξχλωωχαοεοιαεοαεοιαεοαεοιαοεσναοεκνρκψγκψφϵιηαααοε""" + let longlongwordRes = """ +abc uitdaeröägfßhydüäpydqfü,träpydqgpmüdträpydföägpydörztdüöäfguiaeowäzjdtrüöäp +psnrtuiydrözenrüöäpyfdqazpesnrtulocjtüöäzydgyqgfqfgprtnwjlcydkqgfüöezmäzydydqüü +öäpdtrnvwfhgckdumböäpydfgtdgfhtdrntdrntydfogiayqfguiatrnydrntüöärtniaoeydfgaoeiq +fglwcßqfgxvlcwgtfhiaoenrsüöäapmböäptdrniaoydfglckqfhouenrtsüöäptrniaoeyqfgulocf +qclgwxßqflgcwßqfxglcwrniatrnmüböäpmöäbpümöäbpüöämpbaoestnriaesnrtdiaesrtdniaesdr +tnaetdriaoenvlcyfglwckßqfgvwkßqgfvlwkßqfgvlwckßqvlwkgfUIαοιαοιαχολωχσωχνωκψρχκψ +ρτιεαοσηζϵηζιοεννκεωνιαλωσωκνκψρκγτφγτχκγτεκργτιχνκιωχσιλωσλωχξλξλξωχωχ +ξχλωωχαοεοιαεοαεοιαεοαεοιαοεσναοεκνρκψγκψφϵιηαααοε""" + doAssert wrapWords(longlongword) == longlongwordRes + |