summary refs log tree commit diff stats
path: root/lib/std
diff options
context:
space:
mode:
Diffstat (limited to 'lib/std')
-rw-r--r--lib/std/diff.nim387
-rw-r--r--lib/std/editdistance.nim295
-rw-r--r--lib/std/wordwrap.nim90
3 files changed, 772 insertions, 0 deletions
diff --git a/lib/std/diff.nim b/lib/std/diff.nim
new file mode 100644
index 000000000..bffce2803
--- /dev/null
+++ b/lib/std/diff.nim
@@ -0,0 +1,387 @@
+#
+#
+#            Nim's Runtime Library
+#        (c) Copyright 2018 Nim contributors
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements an algorithm to compute the
+## `diff`:idx: between two sequences of lines.
+
+# code owner: Arne Döring
+#
+# This is based on C# code written by Matthias Hertel, http://www.mathertel.de
+#
+# This Class implements the Difference Algorithm published in
+# "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
+# Algorithmica Vol. 1 No. 2, 1986, p 251.
+
+import tables, strutils
+
+type
+  Item* = object    ## An Item in the list of differences.
+    startA*: int    ## Start Line number in Data A.
+    startB*: int    ## Start Line number in Data B.
+    deletedA*: int  ## Number of changes in Data A.
+    insertedB*: int ## Number of changes in Data B.
+
+  DiffData = object ## Data on one input file being compared.
+    data: seq[int] ## Buffer of numbers that will be compared.
+    modified: seq[bool] ## Array of booleans that flag for modified
+                        ## data. This is the result of the diff.
+                        ## This means deletedA in the first Data or
+                        ## inserted in the second Data.
+
+  Smsrd = object
+    x, y: int
+
+# template to avoid a seq copy. Required until ``sink`` parameters are ready.
+template newDiffData(initData: seq[int]; L: int): DiffData =
+  DiffData(
+    data: initData,
+    modified: newSeq[bool](L + 2)
+  )
+
+proc len(d: DiffData): int {.inline.} = d.data.len
+
+proc diffCodes(aText: string; h: var Table[string, int]): DiffData =
+  ## This function converts all textlines of the text into unique numbers for every unique textline
+  ## so further work can work only with simple numbers.
+  ## ``aText`` the input text
+  ## ``h`` This extern initialized hashtable is used for storing all ever used textlines.
+  ## ``trimSpace`` ignore leading and trailing space characters
+  ## Returns a array of integers.
+  var lastUsedCode = h.len
+  result.data = newSeq[int]()
+  for s in aText.splitLines:
+    if h.contains s:
+      result.data.add h[s]
+    else:
+      inc lastUsedCode
+      h[s] = lastUsedCode
+      result.data.add lastUsedCode
+  result.modified = newSeq[bool](result.data.len + 2)
+
+proc optimize(data: var DiffData) =
+  ## If a sequence of modified lines starts with a line that contains the same content
+  ## as the line that appends the changes, the difference sequence is modified so that the
+  ## appended line and not the starting line is marked as modified.
+  ## This leads to more readable diff sequences when comparing text files.
+  var startPos = 0
+  while startPos < data.len:
+    while startPos < data.len and not data.modified[startPos]:
+      inc startPos
+    var endPos = startPos
+    while endPos < data.len and data.modified[endPos]:
+      inc endPos
+
+    if endPos < data.len and data.data[startPos] == data.data[endPos]:
+      data.modified[startPos] = false
+      data.modified[endPos] = true
+    else:
+      startPos = endPos
+
+proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int;
+         downVector, upVector: var openArray[int]): Smsrd =
+  ## This is the algorithm to find the Shortest Middle Snake (sms).
+  ## ``dataA`` sequence A
+  ## ``lowerA`` lower bound of the actual range in dataA
+  ## ``upperA`` upper bound of the actual range in dataA (exclusive)
+  ## ``dataB`` sequence B
+  ## ``lowerB`` lower bound of the actual range in dataB
+  ## ``upperB`` upper bound of the actual range in dataB (exclusive)
+  ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
+  ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
+  ## Returns a MiddleSnakeData record containing x,y and u,v.
+
+  let max = dataA.len + dataB.len + 1
+
+  let downK = lowerA - lowerB # the k-line to start the forward search
+  let upK = upperA - upperB # the k-line to start the reverse search
+
+  let delta = (upperA - lowerA) - (upperB - lowerB)
+  let oddDelta = (delta and 1) != 0
+
+  # The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
+  # and are access using a specific offset: upOffset upVector and downOffset for downVector
+  let downOffset = max - downK
+  let upOffset = max - upK
+
+  let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1
+
+  downVector[downOffset + downK + 1] = lowerA
+  upVector[upOffset + upK - 1] = upperA
+
+  for D in 0 .. maxD:
+    # Extend the forward path.
+    for k in countUp(downK - D, downK + D, 2):
+      # find the only or better starting point
+      var x: int
+      if k == downK - D:
+        x = downVector[downOffset + k + 1] # down
+      else:
+        x = downVector[downOffset + k - 1] + 1 # a step to the right
+        if k < downK + D and downVector[downOffset + k + 1] >= x:
+          x = downVector[downOffset + k + 1] # down
+
+      var y = x - k
+
+      # find the end of the furthest reaching forward D-path in diagonal k.
+      while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]:
+        inc x
+        inc y
+
+      downVector[downOffset + k] = x
+
+      # overlap ?
+      if oddDelta and upK - D < k and k < upK + D:
+        if upVector[upOffset + k] <= downVector[downOffset + k]:
+          return Smsrd(x: downVector[downOffset + k],
+                       y: downVector[downOffset + k] - k)
+
+    # Extend the reverse path.
+    for k in countUp(upK - D, upK + D, 2):
+      # find the only or better starting point
+      var x: int
+      if k == upK + D:
+        x = upVector[upOffset + k - 1] # up
+      else:
+        x = upVector[upOffset + k + 1] - 1 # left
+        if k > upK - D and upVector[upOffset + k - 1] < x:
+          x = upVector[upOffset + k - 1] # up
+
+      var y = x - k
+      while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]:
+        dec x
+        dec y
+
+      upVector[upOffset + k] = x
+
+      # overlap ?
+      if not oddDelta and downK-D <= k and k <= downK+D:
+        if upVector[upOffset + k] <= downVector[downOffset + k]:
+          return Smsrd(x: downVector[downOffset + k],
+                       y: downVector[downOffset + k] - k)
+
+  assert false, "the algorithm should never come here."
+
+proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int;
+         downVector, upVector: var openArray[int]) =
+  ## This is the divide-and-conquer implementation of the longes common-subsequence (lcs)
+  ## algorithm.
+  ## The published algorithm passes recursively parts of the A and B sequences.
+  ## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant.
+  ## ``dataA`` sequence A
+  ## ``lowerA`` lower bound of the actual range in dataA
+  ## ``upperA`` upper bound of the actual range in dataA (exclusive)
+  ## ``dataB`` sequence B
+  ## ``lowerB`` lower bound of the actual range in dataB
+  ## ``upperB`` upper bound of the actual range in dataB (exclusive)
+  ## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
+  ## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
+
+  # make mutable copy:
+  var lowerA = lowerA
+  var lowerB = lowerB
+  var upperA = upperA
+  var upperB = upperB
+
+  # Fast walkthrough equal lines at the start
+  while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]:
+    inc lowerA
+    inc lowerB
+
+  # Fast walkthrough equal lines at the end
+  while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]:
+    dec upperA
+    dec upperB
+
+  if lowerA == upperA:
+    # mark as inserted lines.
+    while lowerB < upperB:
+      dataB.modified[lowerB] = true
+      inc lowerB
+
+  elif lowerB == upperB:
+    # mark as deleted lines.
+    while lowerA < upperA:
+      dataA.modified[lowerA] = true
+      inc lowerA
+
+  else:
+    # Find the middle snakea and length of an optimal path for A and B
+    let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector)
+    # Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y))
+
+    # The path is from LowerX to (x,y) and (x,y) to UpperX
+    lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector)
+    lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector)  # 2002.09.20: no need for 2 points
+
+proc createDiffs(dataA, dataB: DiffData): seq[Item] =
+  ## Scan the tables of which lines are inserted and deleted,
+  ## producing an edit script in forward order.
+  var startA = 0
+  var startB = 0
+  var lineA = 0
+  var lineB = 0
+  while lineA < dataA.len or lineB < dataB.len:
+    if lineA < dataA.len and not dataA.modified[lineA] and
+       lineB < dataB.len and not dataB.modified[lineB]:
+      # equal lines
+      inc lineA
+      inc lineB
+    else:
+      # maybe deleted and/or inserted lines
+      startA = lineA
+      startB = lineB
+
+      while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]):
+        inc lineA
+
+      while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]):
+        inc lineB
+
+      if (startA < lineA) or (startB < lineB):
+        result.add Item(startA: startA,
+                        startB: startB,
+                        deletedA: lineA - startA,
+                        insertedB: lineB - startB)
+
+
+proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
+  ## Find the difference in 2 arrays of integers.
+  ## ``arrayA`` A-version of the numbers (usualy the old one)
+  ## ``arrayB`` B-version of the numbers (usualy the new one)
+  ## Returns a array of Items that describe the differences.
+
+  # The A-Version of the data (original data) to be compared.
+  var dataA = newDiffData(@arrayA, arrayA.len)
+
+  # The B-Version of the data (modified data) to be compared.
+  var dataB = newDiffData(@arrayB, arrayB.len)
+
+  let max = dataA.len + dataB.len + 1
+  ## vector for the (0,0) to (x,y) search
+  var downVector = newSeq[int](2 * max + 2)
+  ## vector for the (u,v) to (N,M) search
+  var upVector = newSeq[int](2 * max + 2)
+
+  lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
+  result = createDiffs(dataA, dataB)
+
+proc diffText*(textA, textB: string): seq[Item] =
+  ## Find the difference in 2 text documents, comparing by textlines.
+  ## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
+  ## each line is converted into a (hash) number. This hash-value is computed by storing all
+  ## textlines into a common hashtable so i can find dublicates in there, and generating a
+  ## new number each time a new textline is inserted.
+  ## ``TextA`` A-version of the text (usualy the old one)
+  ## ``TextB`` B-version of the text (usualy the new one)
+  ## ``trimSpace`` When set to true, all leading and trailing whitespace characters are stripped out before the comparation is done.
+  ## ``ignoreSpace`` When set to true, all whitespace characters are converted to a single space character before the comparation is done.
+  ## ``ignoreCase`` When set to true, all characters are converted to their lowercase equivivalence before the comparation is done.
+  ## Returns a seq of Items that describe the differences.
+
+  # prepare the input-text and convert to comparable numbers.
+  var h = initTable[string, int]()  # TextA.len + TextB.len  <- probably wrong initial size
+  # The A-Version of the data (original data) to be compared.
+  var dataA = diffCodes(textA, h)
+
+  # The B-Version of the data (modified data) to be compared.
+  var dataB = diffCodes(textB, h)
+
+  h.clear # free up hashtable memory (maybe)
+
+  let max = dataA.len + dataB.len + 1
+  ## vector for the (0,0) to (x,y) search
+  var downVector = newSeq[int](2 * max + 2)
+  ## vector for the (u,v) to (N,M) search
+  var upVector = newSeq[int](2 * max + 2)
+
+  lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
+
+  optimize(dataA)
+  optimize(dataB)
+  result = createDiffs(dataA, dataB)
+
+when isMainModule:
+
+  proc testHelper(f: seq[Item]): string =
+    for it in f:
+      result.add(
+        $it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*"
+      )
+
+  proc main() =
+    var a, b: string
+
+    stdout.writeLine("Diff Self Test...")
+
+    # test all changes
+    a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
+    b = "0,1,2,3,4,5,6,7,8,9".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "12.10.0.0*",
+      "all-changes test failed.")
+    stdout.writeLine("all-changes test passed.")
+    # test all same
+    a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
+    b = a
+    assert(testHelper(diffText(a, b)) ==
+      "",
+      "all-same test failed.")
+    stdout.writeLine("all-same test passed.")
+
+    # test snake
+    a = "a,b,c,d,e,f".replace(',', '\n')
+    b = "b,c,d,e,f,x".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "1.0.0.0*0.1.6.5*",
+      "snake test failed.")
+    stdout.writeLine("snake test passed.")
+
+    # 2002.09.20 - repro
+    a = "c1,a,c2,b,c,d,e,g,h,i,j,c3,k,l".replace(',', '\n')
+    b = "C1,a,C2,b,c,d,e,I1,e,g,h,i,j,C3,k,I2,l".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "1.1.0.0*1.1.2.2*0.2.7.7*1.1.11.13*0.1.13.15*",
+      "repro20020920 test failed.")
+    stdout.writeLine("repro20020920 test passed.")
+
+    # 2003.02.07 - repro
+    a = "F".replace(',', '\n')
+    b = "0,F,1,2,3,4,5,6,7".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "0.1.0.0*0.7.1.2*",
+      "repro20030207 test failed.")
+    stdout.writeLine("repro20030207 test passed.")
+
+    # Muegel - repro
+    a = "HELLO\nWORLD"
+    b = "\n\nhello\n\n\n\nworld\n"
+    assert(testHelper(diffText(a, b)) ==
+      "2.8.0.0*",
+      "repro20030409 test failed.")
+    stdout.writeLine("repro20030409 test passed.")
+
+    # test some differences
+    a = "a,b,-,c,d,e,f,f".replace(',', '\n')
+    b = "a,b,x,c,e,f".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "1.1.2.2*1.0.4.4*1.0.7.6*",
+      "some-changes test failed.")
+    stdout.writeLine("some-changes test passed.")
+
+    # test one change within long chain of repeats
+    a = "a,a,a,a,a,a,a,a,a,a".replace(',', '\n')
+    b = "a,a,a,a,-,a,a,a,a,a".replace(',', '\n')
+    assert(testHelper(diffText(a, b)) ==
+      "0.1.4.4*1.0.9.10*",
+      "long chain of repeats test failed.")
+
+    stdout.writeLine("End.")
+    stdout.flushFile
+
+  main()
diff --git a/lib/std/editdistance.nim b/lib/std/editdistance.nim
new file mode 100644
index 000000000..40beb7d93
--- /dev/null
+++ b/lib/std/editdistance.nim
@@ -0,0 +1,295 @@
+#
+#
+#            Nim's Runtime Library
+#        (c) Copyright 2018 Nim contributors
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements an algorithm to compute the
+## `edit distance`:idx: between two Unicode strings.
+
+import unicode
+
+proc editDistance*(a, b: string): int {.noSideEffect.} =
+  ## Returns the unicode-rune edit distance between ``a`` and ``b``.
+  ##
+  ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
+  ## memory overhead.
+  if len(a) > len(b):
+    # make ``b`` the longer string
+    return editDistance(b, a)
+  # strip common prefix
+  var
+    i_start = 0 ## The character starting index of the first rune in both strings ``a`` and ``b``
+    i_next_a = 0
+    i_next_b = 0
+    rune_a, rune_b: Rune
+    len_runes_a = 0 ## The number of relevant runes in string ``a``.
+    len_runes_b = 0 ## The number of relevant runes in string ``b``.
+  block commonPrefix:
+    # ``a`` is the shorter string
+    while i_start < len(a):
+      i_next_a = i_start
+      a.fastRuneAt(i_next_a, rune_a, doInc = true)
+      i_next_b = i_start
+      b.fastRuneAt(i_next_b, rune_b, doInc = true)
+      if rune_a != rune_b:
+        inc(len_runes_a)
+        inc(len_runes_b)
+        break
+      i_start = i_next_a
+  var
+    # we know that we are either at the start of the strings
+    # or that the current value of rune_a is not equal to rune_b
+    # => start search for common suffix after the current rune (``i_next_*``)
+    i_end_a = i_next_a ## The exclusive upper index bound of string ``a``.
+    i_end_b = i_next_b ## The exclusive upper index bound of string ``b``.
+    i_current_a = i_next_a
+    i_current_b = i_next_b
+  block commonSuffix:
+    var
+      add_runes_a = 0
+      add_runes_b = 0
+    while i_current_a < len(a) and i_current_b < len(b):
+      i_next_a = i_current_a
+      a.fastRuneAt(i_next_a, rune_a)
+      i_next_b = i_current_b
+      b.fastRuneAt(i_next_b, rune_b)
+      inc(add_runes_a)
+      inc(add_runes_b)
+      if rune_a != rune_b:
+        i_end_a = i_next_a
+        i_end_b = i_next_b
+        inc(len_runes_a, add_runes_a)
+        inc(len_runes_b, add_runes_b)
+        add_runes_a = 0
+        add_runes_b = 0
+      i_current_a = i_next_a
+      i_current_b = i_next_b
+    if i_current_a >= len(a): # ``a`` exhausted
+      if i_current_b < len(b): # ``b`` not exhausted
+        i_end_a = i_current_a
+        i_end_b = i_current_b
+        inc(len_runes_a, add_runes_a)
+        inc(len_runes_b, add_runes_b)
+        while true:
+          b.fastRuneAt(i_end_b, rune_b)
+          inc(len_runes_b)
+          if i_end_b >= len(b): break
+    elif i_current_b >= len(b): # ``b`` exhausted and ``a`` not exhausted
+      i_end_a = i_current_a
+      i_end_b = i_current_b
+      inc(len_runes_a, add_runes_a)
+      inc(len_runes_b, add_runes_b)
+      while true:
+        a.fastRuneAt(i_end_a, rune_a)
+        inc(len_runes_a)
+        if i_end_a >= len(a): break
+  block specialCases:
+    # trivial cases:
+    if len_runes_a == 0: return len_runes_b
+    if len_runes_b == 0: return len_runes_a
+    # another special case:
+    if len_runes_a == 1:
+      a.fastRuneAt(i_start, rune_a, doInc = false)
+      var i_current_b = i_start
+      while i_current_b < i_end_b:
+        b.fastRuneAt(i_current_b, rune_b, doInc = true)
+        if rune_a == rune_b: return len_runes_b - 1
+      return len_runes_b
+  # common case:
+  var
+    len1 = len_runes_a + 1
+    len2 = len_runes_b + 1
+    row: seq[int]
+  let half = len_runes_a div 2
+  newSeq(row, len2)
+  var e = i_start + len2 - 1 # end marker
+  # initialize first row:
+  for i in 1 .. (len2 - half - 1): row[i] = i
+  row[0] = len1 - half - 1
+  i_current_a = i_start
+  var
+    char2p_i = -1
+    char2p_prev: int
+  for i in 1 .. (len1 - 1):
+    i_next_a = i_current_a
+    a.fastRuneAt(i_next_a, rune_a)
+    var
+      char2p: int
+      D, x: int
+      p: int
+    if i >= (len1 - half):
+      # skip the upper triangle:
+      let offset = i + half - len1
+      if char2p_i == i:
+        b.fastRuneAt(char2p_prev, rune_b)
+        char2p = char2p_prev
+        char2p_i = i + 1
+      else:
+        char2p = i_start
+        for j in 0 ..< offset:
+          rune_b = b.runeAt(char2p)
+          inc(char2p, rune_b.size)
+        char2p_i = i + 1
+        char2p_prev = char2p
+      p = offset
+      rune_b = b.runeAt(char2p)
+      var c3 = row[p] + (if rune_a != rune_b: 1 else: 0)
+      inc(char2p, rune_b.size)
+      inc(p)
+      x = row[p] + 1
+      D = x
+      if x > c3: x = c3
+      row[p] = x
+      inc(p)
+    else:
+      p = 1
+      char2p = i_start
+      D = i
+      x = i
+    if i <= (half + 1):
+      # skip the lower triangle:
+      e = len2 + i - half - 2
+    # main:
+    while p <= e:
+      dec(D)
+      rune_b = b.runeAt(char2p)
+      var c3 = D + (if rune_a != rune_b: 1 else: 0)
+      inc(char2p, rune_b.size)
+      inc(x)
+      if x > c3: x = c3
+      D = row[p] + 1
+      if x > D: x = D
+      row[p] = x
+      inc(p)
+    # lower triangle sentinel:
+    if i <= half:
+      dec(D)
+      rune_b = b.runeAt(char2p)
+      var c3 = D + (if rune_a != rune_b: 1 else: 0)
+      inc(x)
+      if x > c3: x = c3
+      row[p] = x
+    i_current_a = i_next_a
+  result = row[e]
+
+proc editDistanceAscii*(a, b: string): int {.noSideEffect.} =
+  ## Returns the edit distance between `a` and `b`.
+  ##
+  ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
+  ## memory overhead.
+  var len1 = a.len
+  var len2 = b.len
+  if len1 > len2:
+    # make `b` the longer string
+    return editDistanceAscii(b, a)
+
+  # strip common prefix:
+  var s = 0
+  while s < len1 and a[s] == b[s]:
+    inc(s)
+    dec(len1)
+    dec(len2)
+  # strip common suffix:
+  while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]:
+    dec(len1)
+    dec(len2)
+  # trivial cases:
+  if len1 == 0: return len2
+  if len2 == 0: return len1
+
+  # another special case:
+  if len1 == 1:
+    for j in s..s+len2-1:
+      if a[s] == b[j]: return len2 - 1
+    return len2
+
+  inc(len1)
+  inc(len2)
+  var half = len1 shr 1
+  # initalize first row:
+  #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int)))
+  var row: seq[int]
+  newSeq(row, len2)
+  var e = s + len2 - 1 # end marker
+  for i in 1..len2 - half - 1: row[i] = i
+  row[0] = len1 - half - 1
+  for i in 1 .. len1 - 1:
+    var char1 = a[i + s - 1]
+    var char2p: int
+    var D, x: int
+    var p: int
+    if i >= len1 - half:
+      # skip the upper triangle:
+      var offset = i - len1 + half
+      char2p = offset
+      p = offset
+      var c3 = row[p] + ord(char1 != b[s + char2p])
+      inc(p)
+      inc(char2p)
+      x = row[p] + 1
+      D = x
+      if x > c3: x = c3
+      row[p] = x
+      inc(p)
+    else:
+      p = 1
+      char2p = 0
+      D = i
+      x = i
+    if i <= half + 1:
+      # skip the lower triangle:
+      e = len2 + i - half - 2
+    # main:
+    while p <= e:
+      dec(D)
+      var c3 = D + ord(char1 != b[char2p + s])
+      inc(char2p)
+      inc(x)
+      if x > c3: x = c3
+      D = row[p] + 1
+      if x > D: x = D
+      row[p] = x
+      inc(p)
+    # lower triangle sentinel:
+    if i <= half:
+      dec(D)
+      var c3 = D + ord(char1 != b[char2p + s])
+      inc(x)
+      if x > c3: x = c3
+      row[p] = x
+  result = row[e]
+
+
+when isMainModule:
+  doAssert editDistance("", "") == 0
+  doAssert editDistance("kitten", "sitting") == 3 # from Wikipedia
+  doAssert editDistance("flaw", "lawn") == 2 # from Wikipedia
+
+  doAssert editDistance("привет", "превет") == 1
+  doAssert editDistance("Åge", "Age") == 1
+  # editDistance, one string is longer in bytes, but shorter in rune length
+  # first string: 4 bytes, second: 6 bytes, but only 3 runes
+  doAssert editDistance("aaaa", "×××") == 4
+
+  block veryLongStringEditDistanceTest:
+    const cap = 256
+    var
+      s1 = newStringOfCap(cap)
+      s2 = newStringOfCap(cap)
+    while len(s1) < cap:
+      s1.add 'a'
+    while len(s2) < cap:
+      s2.add 'b'
+    doAssert editDistance(s1, s2) == cap
+
+  block combiningCodePointsEditDistanceTest:
+    const s = "A\xCC\x8Age"
+    doAssert editDistance(s, "Age") == 1
+
+  doAssert editDistanceAscii("", "") == 0
+  doAssert editDistanceAscii("kitten", "sitting") == 3 # from Wikipedia
+  doAssert editDistanceAscii("flaw", "lawn") == 2 # from Wikipedia
diff --git a/lib/std/wordwrap.nim b/lib/std/wordwrap.nim
new file mode 100644
index 000000000..c7898b339
--- /dev/null
+++ b/lib/std/wordwrap.nim
@@ -0,0 +1,90 @@
+#
+#
+#            Nim's Runtime Library
+#        (c) Copyright 2018 Nim contributors
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module contains an algorithm to wordwrap a Unicode string.
+
+import strutils, unicode
+
+proc olen(s: string): int =
+  var i = 0
+  result = 0
+  while i < s.len:
+    inc result
+    let L = graphemeLen(s, i)
+    inc i, L
+
+proc wrapWords*(s: string, maxLineWidth = 80,
+               splitLongWords = true,
+               seps: set[char] = Whitespace,
+               newLine = "\n"): string {.noSideEffect.} =
+  ## Word wraps `s`.
+  result = newStringOfCap(s.len + s.len shr 6)
+  var spaceLeft = maxLineWidth
+  var lastSep = ""
+  for word, isSep in tokenize(s, seps):
+    let wlen = olen(word)
+    if isSep:
+      lastSep = word
+      spaceLeft = spaceLeft - wlen
+    elif wlen > spaceLeft:
+      if splitLongWords and wlen > maxLineWidth:
+        var i = 0
+        while i < word.len:
+          if spaceLeft <= 0:
+            spaceLeft = maxLineWidth
+            result.add newLine
+          dec spaceLeft
+          let L = graphemeLen(word, i)
+          for j in 0 ..< L: result.add word[i+j]
+          inc i, L
+      else:
+        spaceLeft = maxLineWidth - wlen
+        result.add(newLine)
+        result.add(word)
+    else:
+      spaceLeft = spaceLeft - wlen
+      result.add(lastSep)
+      result.add(word)
+      lastSep.setLen(0)
+
+when isMainModule:
+
+  when true:
+    let
+      inp = """ this is a long text --  muchlongerthan10chars and here
+                 it goes"""
+      outp = " this is a\nlong text\n--\nmuchlongerthan10chars\nand here\nit goes"
+    doAssert wrapWords(inp, 10, false) == outp
+
+    let
+      longInp = """ThisIsOneVeryLongStringWhichWeWillSplitIntoEightSeparatePartsNow"""
+      longOutp = "ThisIsOn\neVeryLon\ngStringW\nhichWeWi\nllSplitI\nntoEight\nSeparate\nPartsNow"
+    doAssert wrapWords(longInp, 8, true) == longOutp
+
+  # test we don't break Umlauts into invalid bytes:
+  let fies = "äöüöäöüöäöüöäöüööäöüöäößßßßüöäößßßßßß"
+  let fiesRes = "ä\nö\nü\nö\nä\nö\nü\nö\nä\nö\nü\nö\nä\nö\nü\nö\nö\nä\nö\nü\nö\nä\nö\nß\nß\nß\nß\nü\nö\nä\nö\nß\nß\nß\nß\nß\nß"
+  doAssert wrapWords(fies, 1, true) == fiesRes
+
+  let longlongword = """abc uitdaeröägfßhydüäpydqfü,träpydqgpmüdträpydföägpydörztdüöäfguiaeowäzjdtrüöäp psnrtuiydrözenrüöäpyfdqazpesnrtulocjtüö
+äzydgyqgfqfgprtnwjlcydkqgfüöezmäzydydqüüöäpdtrnvwfhgckdumböäpydfgtdgfhtdrntdrntydfogiayqfguiatrnydrntüöärtniaoeydfgaoeiqfglwcßqfgxvlcwgtfhiaoen
+rsüöäapmböäptdrniaoydfglckqfhouenrtsüöäptrniaoeyqfgulocfqclgwxßqflgcwßqfxglcwrniatrnmüböäpmöäbpümöäbpüöämpbaoestnriaesnrtdiaesrtdniaesdrtnaetdr
+iaoenvlcyfglwckßqfgvwkßqgfvlwkßqfgvlwckßqvlwkgfUIαοιαοιαχολωχσωχνωκψρχκψρτιεαοσηζϵηζιοεννκεωνιαλωσωκνκψρκγτφγτχκγτεκργτιχνκιωχσιλωσλωχξλξλξωχωχ
+ξχλωωχαοεοιαεοαεοιαεοαεοιαοεσναοεκνρκψγκψφϵιηαααοε"""
+  let longlongwordRes = """
+abc uitdaeröägfßhydüäpydqfü,träpydqgpmüdträpydföägpydörztdüöäfguiaeowäzjdtrüöäp
+psnrtuiydrözenrüöäpyfdqazpesnrtulocjtüöäzydgyqgfqfgprtnwjlcydkqgfüöezmäzydydqüü
+öäpdtrnvwfhgckdumböäpydfgtdgfhtdrntdrntydfogiayqfguiatrnydrntüöärtniaoeydfgaoeiq
+fglwcßqfgxvlcwgtfhiaoenrsüöäapmböäptdrniaoydfglckqfhouenrtsüöäptrniaoeyqfgulocf
+qclgwxßqflgcwßqfxglcwrniatrnmüböäpmöäbpümöäbpüöämpbaoestnriaesnrtdiaesrtdniaesdr
+tnaetdriaoenvlcyfglwckßqfgvwkßqgfvlwkßqfgvlwckßqvlwkgfUIαοιαοιαχολωχσωχνωκψρχκψ
+ρτιεαοσηζϵηζιοεννκεωνιαλωσωκνκψρκγτφγτχκγτεκργτιχνκιωχσιλωσλωχξλξλξωχωχ
+ξχλωωχαοεοιαεοαεοιαεοαεοιαοεσναοεκνρκψγκψφϵιηαααοε"""
+  doAssert wrapWords(longlongword) == longlongwordRes
+