summary refs log tree commit diff stats
path: root/rod/ropes.nim
blob: 1fe5ed55e656f1089c057287cdec3ed508af6deb (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
#
#
#           The Nimrod Compiler
#        (c) Copyright 2009 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

# Ropes for the C code generator
#
#  Ropes are a data structure that represents a very long string
#  efficiently; especially concatenation is done in O(1) instead of O(N).
#  Ropes make use a lazy evaluation: They are essentially concatenation
#  trees that are only flattened when converting to a native Nimrod
#  string or when written to disk. The empty string is represented by a
#  nil pointer.
#  A little picture makes everything clear:
#
#  "this string" & " is internally " & "represented as"
#
#             con  -- inner nodes do not contain raw data
#            /   \
#           /     \
#          /       \
#        con       "represented as"
#       /   \
#      /     \
#     /       \
#    /         \
#   /           \
#"this string"  " is internally "
#
#  Note that this is the same as:
#  "this string" & (" is internally " & "represented as")
#
#             con
#            /   \
#           /     \
#          /       \
# "this string"    con
#                 /   \
#                /     \
#               /       \
#              /         \
#             /           \
#" is internally "        "represented as"
#
#  The 'con' operator is associative! This does not matter however for
#  the algorithms we use for ropes.
#
#  Note that the left and right pointers are not needed for leafs.
#  Leafs have relatively high memory overhead (~30 bytes on a 32
#  bit machines) and we produce many of them. This is why we cache and
#  share leafs accross different rope trees.
#  To cache them they are inserted in another tree, a splay tree for best
#  performance. But for the caching tree we use the leafs' left and right
#  pointers.
#

import 
  msgs, strutils, platform, nhashes, crc

const 
  CacheLeafs* = true
  countCacheMisses* = False   # see what our little optimization gives

type 
  TFormatStr* = string # later we may change it to CString for better
                       # performance of the code generator (assignments copy the format strings
                       # though it is not necessary)
  PRope* = ref TRope
  TRope*{.acyclic.} = object of TObject # the empty rope is represented by nil to safe space
    left*, right*: PRope
    length*: int
    data*: string             # != nil if a leaf
  
  TRopeSeq* = seq[PRope]

proc con*(a, b: PRope): PRope
proc con*(a: PRope, b: string): PRope
proc con*(a: string, b: PRope): PRope
proc con*(a: openarray[PRope]): PRope
proc app*(a: var PRope, b: PRope)
proc app*(a: var PRope, b: string)
proc prepend*(a: var PRope, b: PRope)
proc toRope*(s: string): PRope
proc toRopeF*(r: BiggestFloat): PRope
proc toRope*(i: BiggestInt): PRope
proc ropeLen*(a: PRope): int
proc WriteRope*(head: PRope, filename: string)
proc writeRopeIfNotEqual*(r: PRope, filename: string): bool
proc ropeToStr*(p: PRope): string
proc ropef*(frmt: TFormatStr, args: openarray[PRope]): PRope
proc appf*(c: var PRope, frmt: TFormatStr, args: openarray[PRope])
proc getCacheStats*(): string
proc RopeEqualsFile*(r: PRope, f: string): bool
  # returns true if the rope r is the same as the contents of file f
proc RopeInvariant*(r: PRope): bool
  # exported for debugging
# implementation

proc ropeLen(a: PRope): int = 
  if a == nil: result = 0
  else: result = a.length
  
proc newRope(data: string = nil): PRope = 
  new(result)
  if data != nil: 
    result.length = len(data)
    result.data = data

var 
  cache: PRope                # the root of the cache tree
  misses, hits: int
  N: PRope                    # dummy rope needed for splay algorithm

proc getCacheStats(): string = 
  if hits + misses != 0: 
    result = "Misses: " & $(misses) & " total: " & $(hits + misses) & " quot: " &
        $(toFloat(misses) / toFloat(hits + misses))
  else: 
    result = ""
  
proc splay(s: string, tree: PRope, cmpres: var int): PRope = 
  var c: int
  var t = tree
  N.left = nil
  N.right = nil               # reset to nil
  var le = N
  var r = N
  while true: 
    c = cmp(s, t.data)
    if c < 0: 
      if (t.left != nil) and (s < t.left.data): 
        var y = t.left
        t.left = y.right
        y.right = t
        t = y
      if t.left == nil: break 
      r.left = t
      r = t
      t = t.left
    elif c > 0: 
      if (t.right != nil) and (s > t.right.data): 
        var y = t.right
        t.right = y.left
        y.left = t
        t = y
      if t.right == nil: break 
      le.right = t
      le = t
      t = t.right
    else: 
      break 
  cmpres = c
  le.right = t.left
  r.left = t.right
  t.left = N.right
  t.right = N.left
  result = t

proc insertInCache(s: string, tree: PRope): PRope = 
  # Insert i into the tree t, unless it's already there.
  # Return a pointer to the resulting tree.
  var t = tree
  if t == nil: 
    result = newRope(s)
    if countCacheMisses: inc(misses)
    return 
  var cmp: int
  t = splay(s, t, cmp)
  if cmp == 0: 
    # We get here if it's already in the Tree
    # Don't add it again
    result = t
    if countCacheMisses: inc(hits)
  else: 
    if countCacheMisses: inc(misses)
    result = newRope(s)
    if cmp < 0: 
      result.left = t.left
      result.right = t
      t.left = nil
    else: 
      # i > t.item:
      result.right = t.right
      result.left = t
      t.right = nil

proc RopeInvariant(r: PRope): bool = 
  if r == nil: 
    result = true
  else: 
    result = true #
                  #    if r.data <> snil then
                  #      result := true
                  #    else begin
                  #      result := (r.left <> nil) and (r.right <> nil);
                  #      if result then result := ropeInvariant(r.left);
                  #      if result then result := ropeInvariant(r.right);
                  #    end 
  
proc toRope(s: string): PRope = 
  if s == "": 
    result = nil
  elif cacheLeafs: 
    result = insertInCache(s, cache)
    cache = result
  else: 
    result = newRope(s)
  assert(RopeInvariant(result))

proc RopeSeqInsert(rs: var TRopeSeq, r: PRope, at: Natural) = 
  var length = len(rs)
  if at > length: 
    setlen(rs, at + 1)
  else: 
    setlen(rs, length + 1)    # move old rope elements:
  for i in countdown(length, at + 1): 
    rs[i] = rs[i - 1] # this is correct, I used pen and paper to validate it
  rs[at] = r

proc recRopeToStr(result: var string, resultLen: var int, p: PRope) = 
  if p == nil: 
    return                    # do not add to result
  if (p.data == nil): 
    recRopeToStr(result, resultLen, p.left)
    recRopeToStr(result, resultLen, p.right)
  else: 
    CopyMem(addr(result[resultLen + 0]), addr(p.data[0]), p.length)
    Inc(resultLen, p.length)
    assert(resultLen <= len(result))

proc newRecRopeToStr(result: var string, resultLen: var int, r: PRope) = 
  var stack = @[r]
  while len(stack) > 0: 
    var it = pop(stack)
    while it.data == nil: 
      add(stack, it.right)
      it = it.left
    assert(it.data != nil)
    CopyMem(addr(result[resultLen]), addr(it.data[0]), it.length)
    Inc(resultLen, it.length)
    assert(resultLen <= len(result))

proc ropeToStr(p: PRope): string = 
  if p == nil: 
    result = ""
  else: 
    result = newString(p.length)
    var resultLen = 0
    newRecRopeToStr(result, resultLen, p)

proc con(a, b: PRope): PRope = 
  if a == nil: result = b
  elif b == nil: result = a
  else:
    result = newRope()
    result.length = a.length + b.length
    result.left = a
    result.right = b

proc con(a: PRope, b: string): PRope = result = con(a, toRope(b))
proc con(a: string, b: PRope): PRope = result = con(toRope(a), b)

proc con(a: openarray[PRope]): PRope = 
  for i in countup(0, high(a)): result = con(result, a[i])

proc toRope(i: BiggestInt): PRope = result = toRope($i)
proc toRopeF(r: BiggestFloat): PRope = result = toRope($r)
proc app(a: var PRope, b: PRope) = a = con(a, b)
proc app(a: var PRope, b: string) = a = con(a, b)
proc prepend(a: var PRope, b: PRope) = a = con(b, a)

proc WriteRopeRec(f: var tfile, c: PRope) = 
  if c == nil: return 
  if (c.data != nil): 
    write(f, c.data)
  else: 
    writeRopeRec(f, c.left)
    writeRopeRec(f, c.right)

proc newWriteRopeRec(f: var tfile, c: PRope) = 
  var stack = @[c]
  while len(stack) > 0: 
    var it = pop(stack)
    while it.data == nil: 
      add(stack, it.right)
      it = it.left
      assert(it != nil)
    assert(it.data != nil)
    write(f, it.data)

proc WriteRope(head: PRope, filename: string) = 
  var f: tfile                # we use a textfile for automatic buffer handling
  if open(f, filename, fmWrite): 
    if head != nil: newWriteRopeRec(f, head)
    close(f)
  else: 
    rawMessage(errCannotOpenFile, filename)

proc ropef(frmt: TFormatStr, args: openarray[PRope]): PRope = 
  var i, j, length, start, num: int
  i = 0
  length = len(frmt)
  result = nil
  num = 0
  while i <= length - 1: 
    if frmt[i] == '$': 
      inc(i)                  # skip '$'
      case frmt[i]
      of '$': 
        app(result, "$")
        inc(i)
      of '#': 
        inc(i)
        app(result, args[num])
        inc(num)
      of '0'..'9': 
        j = 0
        while true: 
          j = (j * 10) + Ord(frmt[i]) - ord('0')
          inc(i)
          if (i > length + 0 - 1) or not (frmt[i] in {'0'..'9'}): break 
        num = j
        if j > high(args) + 1: 
          internalError("ropes: invalid format string $" & $(j))
        app(result, args[j - 1])
      of 'N', 'n': 
        app(result, tnl)
        inc(i)
      else: InternalError("ropes: invalid format string $" & frmt[i])
    start = i
    while (i <= length - 1): 
      if (frmt[i] != '$'): inc(i)
      else: break 
    if i - 1 >= start: 
      app(result, copy(frmt, start, i - 1))
  assert(RopeInvariant(result))

proc appf(c: var PRope, frmt: TFormatStr, args: openarray[PRope]) = 
  app(c, ropef(frmt, args))

const 
  bufSize = 1024              # 1 KB is reasonable

proc auxRopeEqualsFile(r: PRope, bin: var tfile, buf: Pointer): bool = 
  if (r.data != nil): 
    if r.length > bufSize: 
      internalError("ropes: token too long")
    var readBytes = readBuffer(bin, buf, r.length)
    result = (readBytes == r.length) and
        equalMem(buf, addr(r.data[0]), r.length) # BUGFIX
  else: 
    result = auxRopeEqualsFile(r.left, bin, buf)
    if result: result = auxRopeEqualsFile(r.right, bin, buf)
  
proc RopeEqualsFile(r: PRope, f: string): bool = 
  var bin: tfile
  result = open(bin, f)
  if not result: 
    return                    # not equal if file does not exist
  var buf = alloc(BufSize)
  result = auxRopeEqualsFile(r, bin, buf)
  if result: 
    result = readBuffer(bin, buf, bufSize) == 0 # really at the end of file?
  dealloc(buf)
  close(bin)

proc crcFromRopeAux(r: PRope, startVal: TCrc32): TCrc32 = 
  if r.data != nil: 
    result = startVal
    for i in countup(0, len(r.data) - 1): 
      result = updateCrc32(r.data[i], result)
  else: 
    result = crcFromRopeAux(r.left, startVal)
    result = crcFromRopeAux(r.right, result)

proc newCrcFromRopeAux(r: PRope, startVal: TCrc32): TCrc32 = 
  var stack: TRopeSeq = @[r]
  result = startVal
  while len(stack) > 0: 
    var it = pop(stack)
    while it.data == nil: 
      add(stack, it.right)
      it = it.left
    assert(it.data != nil)
    var i = 0
    var L = len(it.data)
    while i < L: 
      result = updateCrc32(it.data[i], result)
      inc(i)

proc crcFromRope(r: PRope): TCrc32 = 
  result = newCrcFromRopeAux(r, initCrc32)

proc writeRopeIfNotEqual(r: PRope, filename: string): bool = 
  # returns true if overwritten
  var c: TCrc32
  c = crcFromFile(filename)
  if c != crcFromRope(r): 
    writeRope(r, filename)
    result = true
  else: 
    result = false
  
new(N) # init dummy node for splay algorithm