summary refs log tree commit diff stats
path: root/nim/ropes.pas
diff options
context:
space:
mode:
authorAndreas Rumpf <andreas@andi>2008-06-22 16:14:11 +0200
committerAndreas Rumpf <andreas@andi>2008-06-22 16:14:11 +0200
commit405b86068e6a3d39970b9129ceec0a9108464b28 (patch)
treec0449946f54baae6ea88baf453157ddd7faa8f86 /nim/ropes.pas
downloadNim-405b86068e6a3d39970b9129ceec0a9108464b28.tar.gz
Initial import
Diffstat (limited to 'nim/ropes.pas')
-rwxr-xr-xnim/ropes.pas522
1 files changed, 522 insertions, 0 deletions
diff --git a/nim/ropes.pas b/nim/ropes.pas
new file mode 100755
index 000000000..48a38d7b4
--- /dev/null
+++ b/nim/ropes.pas
@@ -0,0 +1,522 @@
+//
+//
+//           The Nimrod Compiler
+//        (c) Copyright 2008 Andreas Rumpf
+//
+//    See the file "copying.txt", included in this
+//    distribution, for details about the copyright.
+//
+unit ropes;
+
+{ Ropes for the C code generator
+
+  Ropes are a data structure that represents a very long string
+  efficiently; especially concatenation is done in O(1) instead of O(N).
+  Ropes make use a lazy evaluation: They are essentially concatenation
+  trees that are only flattened when converting to a native Nimrod
+  string or when written to disk. The empty string is represented by a
+  nil pointer.
+  A little picture makes everything clear:
+
+  "this string" & " is internally " & "represented as"
+
+             con  -- inner nodes do not contain raw data
+            /   \
+           /     \
+          /       \
+        con       "represented as"
+       /   \
+      /     \
+     /       \
+    /         \
+   /           \
+"this string"  " is internally "
+
+  Note that this is the same as:
+  "this string" & (" is internally " & "represented as")
+
+             con
+            /   \
+           /     \
+          /       \
+ "this string"    con
+                 /   \
+                /     \
+               /       \
+              /         \
+             /           \
+" is internally "        "represented as"
+
+  The 'con' operator is associative! This does not matter however for
+  the algorithms we use for ropes.
+
+  Note that the left and right pointers are not needed for leafs.
+  Leafs have relatively high memory overhead (~30 bytes on a 32
+  bit machines) and we produce many of them. This is why we cache and
+  share leafs accross different rope trees.
+  To cache them they are inserted in another tree, a splay tree for best
+  performance. But for the caching tree we use the leafs' left and right
+  pointers.
+
+  Experiments show that for bootstrapping the whole compiler needs
+  ~1 MB less space because of this optimization. For bigger programs
+  this is likely to increase even further.
+}
+
+interface
+
+{$include 'config.inc'}
+
+uses
+  nsystem, msgs, strutils, platform, hashes, crc;
+
+const
+  CacheLeafs = True;
+  countCacheMisses = False; // see what our little optimization gives
+
+type
+  TFormatStr = string;
+  // later we may change it to CString for better
+  // performance of the code generator (assignments copy the format strings
+  // though it is not necessary)
+
+  PRope = ^TRope;
+  TRope = object(NObject)
+    left, right: PRope;
+    len: int;
+    data: string; // != nil if a leaf
+  end;
+  // the empty rope is represented by nil to safe space
+
+  TRopeSeq = array of PRope;
+
+function con(a, b: PRope): PRope; overload;
+function con(a: PRope; const b: string): PRope; overload;
+function con(const a: string; b: PRope): PRope; overload;
+function con(a: array of PRope): PRope; overload;
+
+procedure app(var a: PRope; b: PRope); overload;
+procedure app(var a: PRope; const b: string); overload;
+
+procedure prepend(var a: PRope; b: PRope);
+
+function toRope(const s: string): PRope; overload;
+function toRopeF(const r: BiggestFloat): PRope;
+function toRope(i: BiggestInt): PRope; overload;
+
+function ropeLen(a: PRope): int;
+
+procedure WriteRope(head: PRope; const filename: string);
+function writeRopeIfNotEqual(r: PRope; const filename: string): boolean;
+
+function ropeToStr(p: PRope): string;
+
+function ropeFormat(const frmt: TFormatStr; const args: array of PRope): PRope;
+
+procedure appRopeFormat(var c: PRope; const frmt: TFormatStr;
+  const args: array of PRope);
+
+procedure RopeSeqInsert(var rs: TRopeSeq; r: PRope; at: Natural);
+
+function getCacheStats: string;
+
+function RopeEqualsFile(r: PRope; const f: string): Boolean;
+// returns true if the rope r is the same as the contents of file f
+
+implementation
+
+function ropeLen(a: PRope): int;
+begin
+  if a = nil then result := 0
+  else result := a.len
+end;
+
+function newRope(const data: string = snil): PRope;
+begin
+  new(result);
+  {@ignore}
+  fillChar(result^, sizeof(TRope), 0);
+  {@emit}
+  result.len := length(data);
+  result.data := data;
+end;
+
+// -------------- leaf cache: ---------------------------------------
+var
+  cache: PRope; // the root of the cache tree
+  misses, hits: int;
+  N: PRope; // dummy rope needed for splay algorithm
+
+function getCacheStats: string;
+begin
+  if hits+misses <> 0 then
+    result := 'Misses: ' +{&} ToString(misses) +{&}
+              ' total: ' +{&} toString(hits+misses) +{&}
+              ' quot: '  +{&} toStringF(misses / (hits+misses))
+  else
+    result := ''
+end;
+
+function splay(const s: string; tree: PRope; out cmpres: int): PRope;
+var
+  le, r, y, t: PRope;
+  c: int;
+begin
+  t := tree;
+  N.left := nil; N.right := nil; // reset to nil
+  le := N;
+  r := N;
+  repeat
+    c := cmp(s, t.data);
+    if c < 0 then begin
+      if (t.left <> nil) and (s < t.left.data) then begin
+        y := t.left; t.left := y.right; y.right := t; t := y
+      end;
+      if t.left = nil then break;
+      r.left := t; r := t; t := t.left
+    end
+    else if c > 0 then begin
+      if (t.right <> nil) and (s > t.right.data) then begin
+        y := t.right; t.right := y.left; y.left := t; t := y
+      end;
+      if t.right = nil then break;
+      le.right := t; le := t; t := t.right
+    end
+    else break
+  until false;
+  cmpres := c;
+  le.right := t.left; r.left := t.right; t.left := N.right; t.right := N.left;
+  result := t
+end;
+
+function insertInCache(const s: string; tree: PRope): PRope;
+// Insert i into the tree t, unless it's already there.
+// Return a pointer to the resulting tree.
+var
+  t: PRope;
+  cmp: int;
+begin
+  t := tree;
+  if t = nil then begin
+    result := newRope(s);
+    if countCacheMisses then inc(misses);
+    exit
+  end;
+  t := splay(s, t, cmp);
+  if cmp = 0 then begin
+    // We get here if it's already in the Tree
+    // Don't add it again
+    result := t;
+    if countCacheMisses then inc(hits);
+  end
+  else begin
+    if countCacheMisses then inc(misses);
+    result := newRope(s);
+    if cmp < 0 then begin
+      result.left := t.left; result.right := t; t.left := nil
+    end
+    else begin // i > t.item:
+      result.right := t.right; result.left := t; t.right := nil
+    end
+  end
+end;
+
+function toRope(const s: string): PRope;
+begin
+  if s = '' then
+    result := nil
+  else if cacheLeafs then begin
+    result := insertInCache(s, cache);
+    cache := result;
+  end
+  else
+    result := newRope(s)
+end;
+
+// ------------------------------------------------------------------
+
+procedure RopeSeqInsert(var rs: TRopeSeq; r: PRope; at: Natural);
+var
+  len, i: int;
+begin
+  len := length(rs);
+  if at > len then
+    SetLength(rs, at+1)
+  else
+    SetLength(rs, len+1);
+
+  // move old rope elements:
+  for i := len downto at+1 do
+    rs[i] := rs[i-1]; // this is correct, I used pen and paper to validate it
+  rs[at] := r
+end;
+
+function RopeInvariant(r: PRope): Boolean;
+begin
+  if r = nil then
+    result := true
+  else
+    result := true
+end;
+
+function con(a, b: PRope): PRope; overload;
+begin
+  assert(RopeInvariant(a));
+  assert(RopeInvariant(b));
+  if a = nil then // len is valid for every cord not only for leafs
+    result := b
+  else if b = nil then
+    result := a
+  else begin
+    result := newRope();
+    result.len := a.len + b.len;
+    result.left := a;
+    result.right := b
+  end;
+  assert(RopeInvariant(result));
+end;
+
+function con(a: PRope; const b: string): PRope; overload;
+var
+  r: PRope;
+begin
+  assert(RopeInvariant(a));
+  if b = '' then
+    result := a
+  else begin
+    r := toRope(b);
+    if a = nil then begin
+      result := r
+    end
+    else begin
+      result := newRope();
+      result.len := a.len + r.len;
+      result.left := a;
+      result.right := r;
+    end
+  end;
+  assert(RopeInvariant(result));
+end;
+
+function con(const a: string; b: PRope): PRope; overload;
+var
+  r: PRope;
+begin
+  assert(RopeInvariant(b));
+  if a = '' then
+    result := b
+  else begin
+    r := toRope(a);
+
+    if b = nil then
+      result := r
+    else begin
+      result := newRope();
+      result.len := b.len + r.len;
+      result.left := r;
+      result.right := b;
+    end
+  end;
+  assert(RopeInvariant(result));
+end;
+
+function con(a: array of PRope): PRope; overload;
+var
+  i: int;
+begin
+  result := nil;
+  for i := 0 to high(a) do result := con(result, a[i])
+end;
+
+function toRope(i: BiggestInt): PRope;
+begin
+  result := toRope(ToString(i))
+end;
+
+function toRopeF(const r: BiggestFloat): PRope;
+begin
+  result := toRope(toStringF(r))
+end;
+
+procedure app(var a: PRope; b: PRope); overload;
+begin
+  a := con(a, b)
+end;
+
+procedure app(var a: PRope; const b: string); overload;
+begin
+  a := con(a, b);
+end;
+
+procedure prepend(var a: PRope; b: PRope);
+begin
+  a := con(b, a)
+end;
+
+procedure WriteRopeRec(var f: TTextFile; c: PRope);
+begin
+  assert(RopeInvariant(c));
+
+  if c = nil then exit;
+  if (c.data <> snil) then begin
+    nimWrite(f, c.data)
+  end
+  else begin
+    writeRopeRec(f, c.left);
+    writeRopeRec(f, c.right)
+  end
+end;
+
+procedure WriteRope(head: PRope; const filename: string);
+var
+  f: TTextFile; // we use a textfile for automatic buffer handling
+begin
+  if OpenFile(f, filename, fmWrite) then begin
+    writeRopeRec(f, head);
+    nimCloseFile(f);
+  end
+end;
+
+procedure recRopeToStr(var result: string; var resultLen: int; p: PRope);
+begin
+  if p = nil then exit; // do not add to result
+  if (p.data = snil) then begin
+    recRopeToStr(result, resultLen, p.left);
+    recRopeToStr(result, resultLen, p.right);
+  end
+  else begin
+    CopyMem(@result[resultLen+StrStart], @p.data[strStart], p.len);
+    Inc(resultLen, p.len);
+    assert(resultLen <= length(result));
+  end
+end;
+
+function ropeToStr(p: PRope): string;
+var
+  resultLen: int;
+begin
+  assert(RopeInvariant(p));
+  result := newString(p.len);
+  resultLen := 0;
+  recRopeToStr(result, resultLen, p);
+end;
+
+function ropeFormat(const frmt: TFormatStr; const args: array of PRope): PRope;
+var
+  i, j, len, start: int;
+begin
+  i := strStart;
+  len := length(frmt);
+  result := nil;
+  while i <= len + StrStart - 1 do begin
+    if frmt[i] = '$' then begin
+      inc(i); // skip '$'
+      case frmt[i] of
+        '$': begin app(result, '$'+''); inc(i); end;
+        '0'..'9': begin
+          j := 0;
+          repeat
+            j := (j*10) + Ord(frmt[i]) - ord('0');
+            inc(i);
+          until (i > len + StrStart - 1) or not (frmt[i] in ['0'..'9']);
+          if j > high(args)+1 then
+            internalError('ropes: invalid format string $' + toString(j));
+          app(result, args[j-1]);
+        end;
+        'N', 'n': begin app(result, tnl); inc(i); end;
+        else InternalError('ropes: invalid format string$' + frmt[i]);
+      end
+    end;
+    start := i;
+    while (i <= len + StrStart - 1) do
+      if (frmt[i] <> '$') then inc(i) else break;
+    if i-1 >= start then app(result, ncopy(frmt, start, i-1));
+  end;
+  assert(RopeInvariant(result));
+end;
+
+procedure appRopeFormat(var c: PRope; const frmt: TFormatStr;
+  const args: array of PRope);
+begin
+  app(c, ropeformat(frmt, args))
+end;
+
+const
+  bufSize = 1024; // 1 KB is reasonable
+
+function auxRopeEqualsFile(r: PRope; var bin: TBinaryFile;
+                           buf: Pointer): Boolean;
+var
+  readBytes: int;
+begin
+  if (r.data <> snil) then begin
+    if r.len > bufSize then
+      // A token bigger than 1 KB? - This cannot happen in reality.
+      // Well, at least I hope so. 1 KB did happen!
+      internalError('ropes: token too long');
+    readBytes := readBuffer(bin, buf, r.len);
+    result := (readBytes = r.len) // BUGFIX
+      and equalMem(buf, addr(r.data[strStart]), r.len);
+  end
+  else begin
+    result := auxRopeEqualsFile(r.left, bin, buf);
+    if result then
+      result := auxRopeEqualsFile(r.right, bin, buf);
+  end
+end;
+
+function RopeEqualsFile(r: PRope; const f: string): Boolean;
+var
+  bin: TBinaryFile;
+  buf: Pointer;
+begin
+  result := openFile(bin, f);
+  if not result then exit; // not equal if file does not exist
+  buf := alloc(BufSize);
+  result := auxRopeEqualsFile(r, bin, buf);
+  if result then
+    result := readBuffer(bin, buf, bufSize) = 0; // really at the end of file?
+  dealloc(buf);
+  CloseFile(bin);
+end;
+
+function crcFromRopeAux(r: PRope; startVal: TCrc32): TCrc32;
+var
+  i: int;
+begin
+  if r.data <> snil then begin
+    result := startVal;
+    for i := strStart to length(r.data)+strStart-1 do
+      result := updateCrc32(r.data[i], result);
+  end
+  else begin
+    result := crcFromRopeAux(r.left, startVal);
+    result := crcFromRopeAux(r.right, result);
+  end
+end;
+
+function crcFromRope(r: PRope): TCrc32;
+begin
+  result := crcFromRopeAux(r, initCrc32)
+end;
+
+function writeRopeIfNotEqual(r: PRope; const filename: string): boolean;
+// returns true if overwritten
+var
+  c: TCrc32;
+begin
+  c := crcFromFile(filename);
+  if int(c) <> crcFromRope(r) then begin
+    writeRope(r, filename);
+    result := true
+  end
+  else
+    result := false
+end;
+
+initialization
+  new(N); // init dummy node for splay algorithm
+{@ignore}
+  fillChar(N^, sizeof(N^), 0);
+{@emit}
+end.