 lib/gc.nim | 1150 +++++++++++----------
 1 file changed, 578 insertions(+), 572 deletions(-)
diff --git a/lib/gc.nim b/lib/gc.nim
index 72a287064..680256a93 100644
--- a/lib/gc.nim
+++ b/lib/gc.nim
@@ -9,20 +9,71 @@
 
 
 #            Garbage Collector
+# Current Features:
+# * incremental
+# * non-recursive
+# * generational
+# * excellent performance
+
+# Future Improvements:
+# * Both dlmalloc and TLSF lack zero-overhead object allocation. Thus, for
+#   small objects we should use our own allocator.
+# * Support for multi-threading. However, locks for the reference counting
+#   might turn out to be too slow.
+
+# ---------------------------------------------------------------------------
+# Interface to TLSF:
+const
+  useTLSF = false # benchmarking showed that *dlmalloc* is faster than *TLSF*
+
+when useTLSF:
+  {.compile: "tlsf.c".}
+
+  proc tlsfUsed: int {.importc: "TLSF_GET_USED_SIZE", noconv.}
+  proc tlsfMax: int {.importc: "TLSF_GET_MAX_SIZE", noconv.}
+
+  proc tlsf_malloc(size: int): pointer {.importc, noconv.}
+  proc tlsf_free(p: pointer) {.importc, noconv.}
+  proc tlsf_realloc(p: pointer, size: int): pointer {.importc, noconv.}
+else:
+  # use DL malloc
+  {.compile: "dlmalloc.c".}
+  proc tlsfUsed: int {.importc: "dlmalloc_footprint", noconv.}
+  proc tlsfMax: int {.importc: "dlmalloc_max_footprint", noconv.}
+
+  proc tlsf_malloc(size: int): pointer {.importc: "dlmalloc", noconv.}
+  proc tlsf_free(p: pointer) {.importc: "dlfree", noconv.}
+  proc tlsf_realloc(p: pointer, size: int): pointer {.
+    importc: "dlrealloc", noconv.}
 
-# For a description of the algorithms used here see:
-# intern.html
+# ---------------------------------------------------------------------------
 
-{.define: debugGC.}   # we wish to debug the GC...
+proc getOccupiedMem(): int = return tlsfUsed()
+proc getFreeMem(): int = return tlsfMax() - tlsfUsed()
+proc getTotalMem(): int = return tlsfMax()
 
-#when defined(debugGC):
-#  {.define: logGC.} # define if the GC should log some of its activities
+# ---------------------------------------------------------------------------
 
-{.define: cycleGC.}
+# After several attempts, we now use a novel approach for cycle detection:
+# increments/decrements of the reference counters are enqueued into a buffer
+# and not performed immediately. The reason is that increments may introduce
+# new garbage cycles. The cycle detector only scans the changed subgraph,
+# which provides superior performance. Of course, only cells that may be part
+# of a cycle are considered. However, reallocation does not work with this
+# scheme, because the queue may contain references to the old cell.
+# The queue is in thread-local storage, so no synchronization is needed for
+# reference counting.
+
+# With this scheme, the entire heap is never searched and there is no need for
+# the AT.
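
A minimal sketch of the scheme described above (hypothetical names, not code
from this diff): reference-count mutations go through a thread-local buffer
and are applied in batch, so the cycle detector only ever visits the cells
that actually changed.

  type
    Cell = object
      rc: int
    MutKind = enum mkInc, mkDec
    Mutation = object
      kind: MutKind
      c: ptr Cell

  var rcBuffer: seq[Mutation] = @[]   # thread-local in the real design

  proc deferredIncRef(c: ptr Cell) = rcBuffer.add(Mutation(kind: mkInc, c: c))
  proc deferredDecRef(c: ptr Cell) = rcBuffer.add(Mutation(kind: mkDec, c: c))

  proc flushRcBuffer() =
    # apply the buffered mutations; cells that drop to zero become
    # candidates for the ZCT and the cycle detector
    for m in rcBuffer:
      case m.kind
      of mkInc: inc(m.c.rc)
      of mkDec: dec(m.c.rc)
    rcBuffer.setLen(0)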
 
 const
+  debugGC = false # we wish to debug the GC...
+  logGC = false
   traceGC = false # extensive debugging
   reallyDealloc = true # for debugging purposes this can be set to false
+  cycleGC = true # (de)activate the cycle GC
+  stressGC = debugGC
 
 # Guess the page size of the system; if it is the
 # wrong value, performance may be worse (this is not
@@ -32,103 +83,107 @@ const
   PageSize = 1 shl PageShift # on 32 bit systems 4096
   CycleIncrease = 2 # is a multiplicative increase
 
-  InitialCycleThreshold = 8*1024*1024 # X MB because cycle checking is slow
-  ZctThreshold = 512  # we collect garbage if the ZCT's size
+  InitialCycleThreshold = 4*1024*1024 # X MB because cycle checking is slow
+  ZctThreshold = 256  # we collect garbage if the ZCT's size
                       # reaches this threshold
-                      # this needs benchmarking...
-
-when defined(debugGC):
-  const stressGC = False
-else:
-  const stressGC = False
+                      # this seems to be a good value
 
-# things the System module thinks should be available:
-when defined(useDL) or defined(nativeDL):
-  type
-    TMallocInfo {.importc: "struct mallinfo", nodecl, final.} = object
-      arena: cint    # non-mmapped space allocated from system
-      ordblks: cint  # number of free chunks
-      smblks: cint   # number of fastbin blocks
-      hblks: cint    # number of mmapped regions
-      hblkhd: cint   # space in mmapped regions
-      usmblks: cint  # maximum total allocated space
-      fsmblks: cint  # space available in freed fastbin blocks
-      uordblks: cint # total allocated space
-      fordblks: cint # total free space
-      keepcost: cint # top-most, releasable (via malloc_trim) space
-
-when defined(useDL):
-  proc mallinfo: TMallocInfo {.importc: "dlmallinfo", nodecl.}
-elif defined(nativeDL):
-  proc mallinfo: TMallocInfo {.importc: "mallinfo", nodecl.}
-
-when defined(useDL) or defined(nativeDL):
-  proc getOccupiedMem(): int = return mallinfo().uordblks
-  proc getFreeMem(): int = return mallinfo().fordblks
-  proc getTotalMem(): int =
-    var m = mallinfo()
-    return int(m.hblkhd) + int(m.arena)
-else: # not available:
-  proc getOccupiedMem(): int = return -1
-  proc getFreeMem(): int = return -1
-  proc getTotalMem(): int = return -1
+const
+  MemAlignment = sizeof(pointer)*2 # minimal memory block that can be allocated
+  BitsPerUnit = sizeof(int)*8
+    # a "unit" is a word, i.e. 4 bytes
+    # on a 32 bit system; I do not use the term "word" because under 32-bit
+    # Windows it is sometimes only 16 bits
 
-var
-  cycleThreshold: int = InitialCycleThreshold
+  BitsPerPage = PageSize div MemAlignment
+  UnitsPerPage = BitsPerPage div BitsPerUnit
+    # how many units do we need to describe a page:
+    # on 32 bit systems this is only 16 (!)
 
-  memUsed: int = 0 # we have to keep track how much we have allocated
+  rcIncrement = 0b1000 # so that lowest 3 bits are not touched
+  # NOTE: Most colors are currently unused
+  rcBlack = 0b000 # cell is colored black; in use or free
+  rcGray = 0b001  # possible member of a cycle
+  rcWhite = 0b010 # member of a garbage cycle
+  rcPurple = 0b011 # possible root of a cycle
+  rcZct = 0b100  # in ZCT
+  rcRed = 0b101 # Candidate cycle undergoing sigma-computation
+  rcOrange = 0b110 # Candidate cycle awaiting epoch boundary
+  rcShift = 3 # shift by rcShift to get the reference counter
+  colorMask = 0b111
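
How the packed refcount word reads under the constants above (a worked
example, not code from this diff): a cell with logical refcount 5 that
currently sits in the ZCT stores both facts in a single integer.

  let packed = (5 shl rcShift) or rcZct   # = 0b101100
  assert(packed shr rcShift == 5)         # the actual reference count
  assert((packed and colorMask) == rcZct) # the color/flag bits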
+type
+  TWalkOp = enum
+    waZctDecRef, waPush, waCycleDecRef
 
-  recGcLock: int = 0
-    # we use a lock to prevend the garbage collector to
-    # be triggered in a finalizer; the collector should not call
-    # itself this way! Thus every object allocated by a finalizer
-    # will not trigger a garbage collection. This is wasteful but safe.
-    # This is a lock against recursive garbage collection, not a lock for
-    # threads!
-
-when defined(useDL) and not defined(nativeDL):
-  {.compile: "dlmalloc.c".}
+  TCell {.pure.} = object
+    refcount: int  # the refcount and some flags
+    typ: PNimType
+    when debugGC:
+      filename: cstring
+      line: int
 
-type
+  PCell = ptr TCell
   TFinalizer {.compilerproc.} = proc (self: pointer)
     # A ref type can have a finalizer that is called before the object's
     # storage is freed.
   PPointer = ptr pointer
+  TByteArray = array[0..1000_0000, byte]
+  PByte = ptr TByteArray
+  PString = ptr string
 
-proc asgnRef(dest: ppointer, src: pointer) {.compilerproc.}
-proc unsureAsgnRef(dest: ppointer, src: pointer) {.compilerproc.}
-  # unsureAsgnRef updates the reference counters only if dest is not on the
-  # stack. It is used by the code generator if it cannot decide wether a
-  # reference is in the stack or not (this can happen for out/var parameters).
-proc growObj(old: pointer, newsize: int): pointer {.compilerproc.}
-proc newObj(typ: PNimType, size: int): pointer {.compilerproc.}
-proc newSeq(typ: PNimType, len: int): pointer {.compilerproc.}
-
-# implementation:
-
-when defined(useDL):
-  proc nimSize(p: pointer): int {.
-    importc: "dlmalloc_usable_size", header: "dlmalloc.h".}
-elif defined(nativeDL):
-  proc nimSize(p: pointer): int {.
-    importc: "malloc_usable_size", header: "<malloc.h>".}
+  PPageDesc = ptr TPageDesc
+  TBitIndex = range[0..UnitsPerPage-1]
+  TPageDesc {.final, pure.} = object
+    next: PPageDesc # all nodes are connected with this pointer
+    key: TAddress   # start address at bit 0
+    bits: array[TBitIndex, int] # a bit vector
 
-type
-  TWalkOp = enum
-    waNone, waRelease, waZctDecRef, waCycleDecRef, waCycleIncRef, waDebugIncRef
+  PPageDescArray = ptr array[0..1000_000, PPageDesc]
+  TCellSet {.final, pure.} = object
+    counter, max: int
+    head: PPageDesc
+    data: PPageDescArray
 
-  TCollectorData = int
-  TCell {.final.} = object
-    refcount: TCollectorData  # the refcount and bit flags
-    typ: PNimType
-    when stressGC:
-      stackcount: int           # stack counter for debugging
-      drefc: int                # real reference counter for debugging
+  PCellArray = ptr array[0..100_000_000, PCell]
+  TCellSeq {.final, pure.} = object
+    len, cap: int
+    d: PCellArray
 
-  PCell = ptr TCell
+  TGcHeap {.final, pure.} = object # this contains the zero count and
+                                   # non-zero count table
+    mask: TAddress           # mask for fast pointer detection
+    zct: TCellSeq            # the zero count table
+    stackCells: TCellSet     # cells of the hardware stack (and addresses
+                             # that merely look like cells, but aren't)
+
+    stackScans: int          # number of performed stack scans (for statistics)
+    cycleCollections: int    # number of performed full collections
+    maxThreshold: int        # max threshold that has been set
+    maxStackSize: int        # max stack size
+    maxStackPages: int       # max number of pages in stack
+    cycleTableSize: int      # max entries in cycle table
+    cycleRoots: TCellSet
+    tempStack: TCellSeq      # temporary stack for recursion elimination
 
 var
   gOutOfMem: ref EOutOfMemory
+  stackBottom: pointer
+  gch: TGcHeap
+  cycleThreshold: int = InitialCycleThreshold
+  recGcLock: int = 0
+    # we use a lock to prevent the garbage collector from being triggered in
+    # a finalizer; the collector should not call itself this way! Thus every
+    # object allocated by a finalizer will not trigger a garbage collection.
+    # This is wasteful but safe. This is a lock against recursive garbage
+    # collection, not a lock for threads!
+
+proc unsureAsgnRef(dest: ppointer, src: pointer) {.compilerproc.}
+  # unsureAsgnRef updates the reference counters only if dest is not on the
+  # stack. It is used by the code generator if it cannot decide whether a
+  # reference is on the stack or not (this can happen for out/var parameters).
+#proc growObj(old: pointer, newsize: int): pointer {.compilerproc.}
+proc newObj(typ: PNimType, size: int): pointer {.compilerproc.}
+proc newSeq(typ: PNimType, len: int): pointer {.compilerproc.}
 
 proc raiseOutOfMem() {.noreturn.} =
   if gOutOfMem == nil:
@@ -145,17 +200,22 @@ proc usrToCell(usr: pointer): PCell {.inline.} =
   # convert pointer to userdata to object (=pointer to refcount)
   result = cast[PCell](cast[TAddress](usr)-%TAddress(sizeof(TCell)))
 
+proc canbeCycleRoot(c: PCell): bool {.inline.} =
+  result = ntfAcyclic notin c.typ.flags
+
 proc extGetCellType(c: pointer): PNimType {.compilerproc.} =
   # used for code generation concerning debugging
   result = usrToCell(c).typ
 
 proc internRefcount(p: pointer): int {.exportc: "getRefcount".} =
   result = int(usrToCell(p).refcount)
-  if result < 0: result = 0
+  if result > 0: result = result shr rcShift
+  else: result = 0
 
 proc gcAlloc(size: int): pointer =
-  result = alloc0(size)
+  result = tlsf_malloc(size)
   if result == nil: raiseOutOfMem()
+  zeroMem(result, size)
 
 proc GC_disable() = inc(recGcLock)
 proc GC_enable() =
@@ -181,95 +241,29 @@ proc nextTry(h, maxHash: int): int {.inline.} =
   # generates each int in range(maxHash) exactly once (see any text on
   # random-number generation for proof).
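
A quick check of that claim (a sketch, assuming the probe formula
h' = ((5*h)+1) and maxHash that this table uses): with maxHash = 7 the
sequence starting at h = 0 is 0, 1, 6, 7, 4, 5, 2, 3; every slot is
visited exactly once before the sequence returns to 0.

  var h = 0
  for i in 0..7:
    echo h                  # prints 0 1 6 7 4 5 2 3 (one per line)
    h = ((5*h) + 1) and 7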
 
-# ------------------ Any table (AT) -------------
-
-# these values are for DL-malloc known for sure (and other allocators
-# can only be worse):
-when defined(useDL) or not defined(bcc):
-  const MemAlignment = 8 # minimal memory block that can be allocated
-else:
-  const MemAlignment = 4 # Borland's memory manager is terrible!
-
-const
-  BitsPerUnit = sizeof(int)*8
-    # a "unit" is a word, i.e. 4 bytes
-    # on a 32 bit system; I do not use the term "word" because under 32-bit
-    # Windows it is sometimes only 16 bits
-
-  BitsPerPage = PageSize div MemAlignment
-  UnitsPerPage = BitsPerPage div BitsPerUnit
-    # how many units do we need to describe a page:
-    # on 32 bit systems this is only 16 (!)
-
 # this has to equal zero; otherwise we would have to round up UnitsPerPage:
 when BitsPerPage mod BitsPerUnit != 0:
   {.error: "(BitsPerPage mod BitsPerUnit) should be zero!".}
 
-# ------------------- cell set handling ------------------------------
-# A cellset consists of a hash table of page descriptors. A page
-# descriptor has a bit for every Memalignment'th byte in the page.
-# However, only bits corresponding to addresses that start memory blocks
-# are set.
-# Page descriptors are also linked to a list; the list
-# is used for easy traversing of all page descriptors; this allows a
-# fast iterator.
-# We use a specialized hashing scheme; the formula is :
-# hash = Page bitand max
-# We use linear probing with the formular: (5*h)+1
-# Thus we likely get no collisions at all if the pages are given us
-# sequentially by the operating system!
-type
-  PPageDesc = ptr TPageDesc
-
-  TBitIndex = range[0..UnitsPerPage-1]
-
-  TPageDesc {.final.} = object
-    next: PPageDesc # all nodes are connected with this pointer
-    key: TAddress   # start address at bit 0
-    bits: array[TBitIndex, int] # a bit vector
-
-  PPageDescArray = ptr array[0..1000_000, PPageDesc]
-  TCellSet {.final.} = object
-    counter, max: int
-    head: PPageDesc
-    data: PPageDescArray
-
-  PCellArray = ptr array[0..100_000_000, PCell]
-  TCellSeq {.final.} = object
-    len, cap: int
-    d: PCellArray
-
-  TSlowSet {.final.} = object  # used for debugging purposes only
-    L: int # current length
-    cap: int # capacity
-    d: PCellArray
-
-  TGcHeap {.final.} = object # this contains the zero count and
-                             # non-zero count table
-    mask: TAddress           # mask for fast pointer detection
-    zct: TCellSeq            # the zero count table
-    at: TCellSet             # a table that contains all references
-    newAT: TCellSet
-    stackCells: TCellSeq     # cells that need to be decremented because they
-                             # are in the hardware stack; a cell may occur
-                             # several times in this data structure
+# ------------------- cell set handling ---------------------------------------
 
-var
-  stackBottom: pointer
-  gch: TGcHeap
+proc inOperator(s: TCellSeq, c: PCell): bool {.inline.} =
+  for i in 0 .. s.len-1:
+    if s.d[i] == c: return True
+  return False
 
 proc add(s: var TCellSeq, c: PCell) {.inline.} =
   if s.len >= s.cap:
     s.cap = s.cap * 3 div 2
-    s.d = cast[PCellArray](realloc(s.d, s.cap * sizeof(PCell)))
+    s.d = cast[PCellArray](tlsf_realloc(s.d, s.cap * sizeof(PCell)))
     if s.d == nil: raiseOutOfMem()
   s.d[s.len] = c
   inc(s.len)
 
-proc inOperator(s: TCellSeq, c: PCell): bool {.inline.} =
-  for i in 0 .. s.len-1:
-    if s.d[i] == c: return True
-  return False
+proc addZCT(s: var TCellSeq, c: PCell) =
+  if (c.refcount and colorMask) != rcZct:
+    c.refcount = c.refcount and not colorMask or rcZct
+    add(s, c)
 
 proc init(s: var TCellSeq, cap: int = 1024) =
   s.len = 0
@@ -289,13 +283,13 @@ proc CellSetDeinit(s: var TCellSet) =
   var it = s.head
   while it != nil:
     var n = it.next
-    dealloc(it)
+    tlsf_free(it)
     it = n
   s.head = nil # play it safe here
-  dealloc(s.data)
+  tlsf_free(s.data)
   s.data = nil
   s.counter = 0
-
+  
 proc CellSetGet(t: TCellSet, key: TAddress): PPageDesc =
   var h = cast[int](key) and t.max
   while t.data[h] != nil:
@@ -313,15 +307,13 @@ proc CellSetRawInsert(t: TCellSet, data: PPageDescArray,
   data[h] = desc
 
 proc CellSetEnlarge(t: var TCellSet) =
-  var
-    n: PPageDescArray
-    oldMax = t.max
+  var oldMax = t.max
   t.max = ((t.max+1)*2)-1
-  n = cast[PPageDescArray](gcAlloc((t.max + 1) * sizeof(PPageDesc)))
+  var n = cast[PPageDescArray](gcAlloc((t.max + 1) * sizeof(PPageDesc)))
   for i in 0 .. oldmax:
     if t.data[i] != nil:
       CellSetRawInsert(t, n, t.data[i])
-  dealloc(t.data)
+  tlsf_free(t.data)
   t.data = n
 
 proc CellSetPut(t: var TCellSet, key: TAddress): PPageDesc =
@@ -345,14 +337,11 @@ proc CellSetPut(t: var TCellSet, key: TAddress): PPageDesc =
   t.head = result
   t.data[h] = result
 
-# ---------- slightly higher level procs ----------------------------------
+# ---------- slightly higher level procs --------------------------------------
 
 proc in_Operator(s: TCellSet, cell: PCell): bool =
-  var
-    u: TAddress
-    t: PPageDesc
-  u = cast[TAddress](cell)
-  t = CellSetGet(s, u shr PageShift)
+  var u = cast[TAddress](cell)
+  var t = CellSetGet(s, u shr PageShift)
   if t != nil:
     u = (u %% PageSize) /% MemAlignment
     result = (t.bits[u /% BitsPerUnit] and (1 shl (u %% BitsPerUnit))) != 0
@@ -360,21 +349,15 @@ proc in_Operator(s: TCellSet, cell: PCell): bool =
     result = false
 
 proc incl(s: var TCellSet, cell: PCell) =
-  var
-    u: TAddress
-    t: PPageDesc
-  u = cast[TAddress](cell)
-  t = CellSetPut(s, u shr PageShift)
+  var u = cast[TAddress](cell)
+  var t = CellSetPut(s, u shr PageShift)
   u = (u %% PageSize) /% MemAlignment
   t.bits[u /% BitsPerUnit] = t.bits[u /% BitsPerUnit] or
     (1 shl (u %% BitsPerUnit))
 
 proc excl(s: var TCellSet, cell: PCell) =
-  var
-    u: TAddress
-    t: PPageDesc
-  u = cast[TAddress](cell)
-  t = CellSetGet(s, u shr PageShift)
+  var u = cast[TAddress](cell)
+  var t = CellSetGet(s, u shr PageShift)
   if t != nil:
     u = (u %% PageSize) /% MemAlignment
     t.bits[u /% BitsPerUnit] = (t.bits[u /% BitsPerUnit] and
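
A worked example of the bit arithmetic above (assuming a 32-bit layout with
PageSize = 4096, MemAlignment = 8, BitsPerUnit = 32; the address is the one
from the old testPageDescs code):

  var u = 0x81CDFB8            # example cell address
  assert(u shr 12 == 0x81CD)   # page key used for the hash lookup
  u = (u mod 4096) div 8       # 4024 div 8 = 503: bit index inside the page
  assert(u div 32 == 15)       # the bit lives in bits[15] ...
  assert(u mod 32 == 23)       # ... at position 23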
@@ -400,97 +383,43 @@ iterator elements(t: TCellSet): PCell {.inline.} =
 
 # --------------- end of Cellset routines -------------------------------------
 
-proc testPageDescs() =
-  var root: TCellSet
-  CellSetInit(root)
-  #var u = 10_000
-  #while u <= 20_000:
-  #  incl(root, cast[PCell](u))
-  #  inc(u, 8)
-
-  incl(root, cast[PCell](0x81cdfb8))
-  for cell in elements(root):
-    c_fprintf(c_stdout, "%p\n", cast[int](cell))
-
-#testPageDescs()
-
-when defined(debugGC):
+when logGC or traceGC:
   proc writeCell(msg: CString, c: PCell) =
-    if c.typ != nil:
-      if c.typ.kind == tyString:
-        c_fprintf(c_stdout, "%s\n", cast[TAddress](cellToUsr(c)) + sizeof(int)*2)
-      c_fprintf(c_stdout, "%s: %p %d\n", msg, c, c.typ.kind)
-    else: c_fprintf(c_stdout, "%s: %p (nil type)\n", msg, c)
-  proc writePtr(msg: CString, p: Pointer) =
-    c_fprintf(c_stdout, "%s: %p\n", msg, p)
-
+    var kind = -1
+    if c.typ != nil: kind = ord(c.typ.kind)
+    when debugGC:
+      c_fprintf(c_stdout, "[GC] %s: %p %d rc=%ld from %s(%ld)\n",
+                msg, c, kind, c.refcount shr rcShift, c.filename, c.line)
+    else:
+      c_fprintf(c_stdout, "[GC] %s: %p %d rc=%ld\n",
+                msg, c, kind, c.refcount shr rcShift)
 
 when traceGC:
   # traceGC is a special switch to enable extensive debugging
   type
     TCellState = enum
       csAllocated, csZctFreed, csCycFreed
-
-  proc cellSetInit(s: var TSlowSet) =
-    s.L = 0
-    s.cap = 4096
-    s.d = cast[PCellArray](gcAlloc(s.cap * sizeof(PCell)))
-
-  proc cellSetDeinit(s: var TSlowSet) =
-    s.L = 0
-    s.cap = 0
-    dealloc(s.d)
-
-  proc incl(s: var TSlowSet, c: PCell) =
-    if s.L >= s.cap:
-      s.cap = s.cap * 3 div 2
-      s.d = cast[PCellArray](realloc(s.d, s.cap * sizeof(PCell)))
-      if s.d == nil: raiseOutOfMem()
-    s.d[s.L] = c
-    inc(s.L)
-
-  proc excl(s: var TSlowSet, c: PCell) =
-    var i = 0
-    while i < s.L:
-      if s.d[i] == c:
-        s.d[i] = s.d[s.L-1]
-        dec(s.L)
-        break
-      inc(i)
-
-  proc inOperator(s: TSlowSet, c: PCell): bool =
-    var i = 0
-    while i < s.L:
-      if s.d[i] == c: return true
-      inc(i)
-
-  iterator elements(s: TSlowSet): PCell =
-    var i = 0
-    while i < s.L:
-      yield s.d[i]
-      inc(i)
-
   var
-    states: array[TCellState, TSlowSet] # TCellSet]
+    states: array[TCellState, TCellSet]
 
   proc traceCell(c: PCell, state: TCellState) =
     case state
     of csAllocated:
       if c in states[csAllocated]:
-        writeCell("attempt to alloc a already allocated cell", c)
+        writeCell("attempt to alloc an already allocated cell", c)
         assert(false)
       excl(states[csCycFreed], c)
       excl(states[csZctFreed], c)
     of csZctFreed:
-      if c notin states[csAllocated]:
-        writeCell("attempt to free a not allocated cell", c)
-        assert(false)
       if c in states[csZctFreed]:
         writeCell("attempt to free zct cell twice", c)
         assert(false)
       if c in states[csCycFreed]:
         writeCell("attempt to free with zct, but already freed with cyc", c)
         assert(false)
+      if c notin states[csAllocated]:
+        writeCell("attempt to free not an allocated cell", c)
+        assert(false)
       excl(states[csAllocated], c)
     of csCycFreed:
       if c notin states[csAllocated]:
@@ -505,17 +434,30 @@ when traceGC:
       excl(states[csAllocated], c)
     incl(states[state], c)
 
+  proc writeLeakage() =
+    var z = 0
+    var y = 0
+    var e = 0
+    for c in elements(states[csAllocated]):
+      inc(e)
+      if c in states[csZctFreed]: inc(z)
+      elif c in states[csCycFreed]: inc(y)
+      else: writeCell("leak", c)
+    cfprintf(cstdout, "Allocations: %ld; ZCT freed: %ld; CYC freed: %ld\n",
+             e, z, y)
+
 template gcTrace(cell, state: expr): stmt =
   when traceGC: traceCell(cell, state)
 
-# -------------------------------------------------------------------------
+# -----------------------------------------------------------------------------
 
 # forward declarations:
-proc collectCT(gch: var TGcHeap)
-proc IsOnStack(p: pointer): bool
+proc updateZCT()
+proc collectCT(gch: var TGcHeap, zctUpdated: bool)
+proc IsOnStack(p: pointer): bool {.noinline.}
 proc forAllChildren(cell: PCell, op: TWalkOp)
-proc collectCycles(gch: var TGcHeap)
-
+proc doOperation(p: pointer, op: TWalkOp)
+proc forAllChildrenAux(dest: Pointer, mt: PNimType, op: TWalkOp)
 proc reprAny(p: pointer, typ: PNimType): string {.compilerproc.}
 # we need the prototype here for debugging purposes
 
@@ -530,157 +472,86 @@ proc prepareDealloc(cell: PCell) =
     (cast[TFinalizer](cell.typ.finalizer))(cellToUsr(cell))
     dec(recGcLock)
 
-  when defined(nimSize):
-    memUsed = memUsed - nimSize(cell)
-  else:
-    memUsed = memUsed - cell.typ.size
-
-proc checkZCT(): bool =
-  if recGcLock >= 1: return true # prevent endless recursion
-  inc(recGcLock)
-  result = true
-  for i in 0..gch.zct.len-1:
-    var c = gch.zct.d[i]
-    if c.refcount > 0: # should be in the ZCT!
-      writeCell("wrong ZCT entry", c)
-      result = false
-    elif gch.zct.d[-c.refcount] != c:
-      writeCell("wrong ZCT position", c)
-      result = false
-  dec(recGcLock)
-
-proc GC_invariant(): bool =
-  if recGcLock >= 1: return true # prevent endless recursion
-  inc(recGcLock)
-  result = True
-  block checks:
-    if not checkZCT():
-      result = false
-      break checks
-    # set counters back to zero:
-    for c in elements(gch.AT):
-      var t = c.typ
-      if t == nil or t.kind notin {tySequence, tyString, tyRef}:
-        writeCell("corrupt cell?", c)
-        result = false
-        break checks
-      when stressGC: c.drefc = 0
-    for c in elements(gch.AT):
-      forAllChildren(c, waDebugIncRef)
-    when stressGC:
-      for c in elements(gch.AT):
-        var rc = c.refcount
-        if rc < 0: rc = 0
-        if c.drefc > rc + c.stackcount:
-          result = false # failed
-          c_fprintf(c_stdout,
-             "broken cell: %p, refc: %ld, stack: %ld, real: %ld\n",
-             c, c.refcount, c.stackcount, c.drefc)
-          break checks
-  dec(recGcLock)
-
-when stressGC:
-  proc GCdebugHook() =
-    if not GC_invariant():
-      assert(false)
-
-  dbgLineHook = GCdebugHook
-
 proc setStackBottom(theStackBottom: pointer) {.compilerproc.} =
   stackBottom = theStackBottom
 
 proc initGC() =
   when traceGC:
     for i in low(TCellState)..high(TCellState): CellSetInit(states[i])
+  gch.stackScans = 0
+  gch.cycleCollections = 0
+  gch.maxThreshold = 0
+  gch.maxStackSize = 0
+  gch.maxStackPages = 0
+  gch.cycleTableSize = 0
   # init the rt
   init(gch.zct)
-  CellSetInit(gch.at)
-  init(gch.stackCells)
+  init(gch.tempStack)
+  CellSetInit(gch.cycleRoots)
+  CellSetInit(gch.stackCells)
   gch.mask = 0
   new(gOutOfMem) # reserve space for the EOutOfMemory exception here!
-  assert(GC_invariant())
-
-proc decRef(cell: PCell) {.inline.} =
-  assert(cell.refcount > 0) # this should be the case!
-  when stressGC: assert(cell in gch.AT)
-  dec(cell.refcount)
-  if cell.refcount == 0:
-    cell.refcount = -gch.zct.len
-    when stressGC: assert(cell notin gch.zct)
-    add(gch.zct, cell)
-  when stressGC: assert(checkZCT())
-
-proc incRef(cell: PCell) {.inline.} =
-  var rc = cell.refcount
-  if rc <= 0:
-    # remove from zero count table:
-    when stressGC: assert(gch.zct.len > -rc)
-    when stressGC: assert(gch.zct.d[-rc] == cell)
-    gch.zct.d[-rc] = gch.zct.d[gch.zct.len-1]
-    gch.zct.d[-rc].refcount = rc
-    dec(gch.zct.len)
-    cell.refcount = 1
-    when stressGC: assert(checkZCT())
-  else:
-    inc(cell.refcount)
-    when stressGC: assert(checkZCT())
 
-proc asgnRef(dest: ppointer, src: pointer) =
+proc PossibleRoot(gch: var TGcHeap, c: PCell) {.inline.} =
+  if canbeCycleRoot(c): incl(gch.cycleRoots, c)
+
+proc decRef(c: PCell) {.inline.} =
+  when stressGC:
+    if c.refcount <% rcIncrement:
+      writeCell("broken cell", c)
+  assert(c.refcount >=% rcIncrement)
+  c.refcount = c.refcount -% rcIncrement
+  if c.refcount <% rcIncrement:
+    addZCT(gch.zct, c)
+  elif canBeCycleRoot(c):
+    possibleRoot(gch, c)
+
+proc incRef(c: PCell) {.inline.} = 
+  c.refcount = c.refcount +% rcIncrement
+  if canBeCycleRoot(c):
+    # OPT: the code generator should special case this
+    possibleRoot(gch, c)
+
+proc nimGCref(p: pointer) {.compilerproc, inline.} = incRef(usrToCell(p))
+proc nimGCunref(p: pointer) {.compilerproc, inline.} = decRef(usrToCell(p))
+
+proc asgnRef(dest: ppointer, src: pointer) {.compilerproc, inline.} =
   # the code generator calls this proc!
   assert(not isOnStack(dest))
   # BUGFIX: first incRef then decRef!
   if src != nil: incRef(usrToCell(src))
   if dest^ != nil: decRef(usrToCell(dest^))
   dest^ = src
-  when stressGC: assert(GC_invariant())
+
+proc asgnRefNoCycle(dest: ppointer, src: pointer) {.compilerproc, inline.} =
+  # the code generator calls this proc if it is known at compile time that no 
+  # cycle is possible.
+  if src != nil: 
+    var c = usrToCell(src)
+    c.refcount = c.refcount +% rcIncrement
+  if dest^ != nil: 
+    var c = usrToCell(dest^)
+    c.refcount = c.refcount -% rcIncrement
+    if c.refcount <% rcIncrement:
+      addZCT(gch.zct, c)
+  dest^ = src
 
 proc unsureAsgnRef(dest: ppointer, src: pointer) =
   if not IsOnStack(dest):
     if src != nil: incRef(usrToCell(src))
     if dest^ != nil: decRef(usrToCell(dest^))
   dest^ = src
-  when stressGC: assert(GC_invariant())
-
-proc restore(cell: PCell) =
-  if cell notin gch.newAT:
-    incl(gch.newAT, Cell)
-    forAllChildren(cell, waCycleIncRef)
-
-proc doOperation(p: pointer, op: TWalkOp) =
-  if p == nil: return
-  var cell: PCell = usrToCell(p)
-  assert(cell != nil)
-  case op # faster than function pointers because of easy prediction
-  of waNone: assert(false)
-  of waRelease: decRef(cell) # DEAD CODE!
-  of waZctDecRef:
-    decRef(cell)
-  of waCycleDecRef:
-    assert(cell.refcount > 0)
-    dec(cell.refcount)
-  of waCycleIncRef:
-    inc(cell.refcount) # restore proper reference counts!
-    restore(cell)
-  of waDebugIncRef:
-    when stressGC: inc(cell.drefc)
-
-type
-  TByteArray = array[0..1000_0000, byte]
-  PByte = ptr TByteArray
-  PString = ptr string
-
-proc forAllChildrenAux(dest: Pointer, mt: PNimType, op: TWalkOp)
 
 proc getDiscriminant(aa: Pointer, n: ptr TNimNode): int =
   assert(n.kind == nkCase)
-  var d: int32
+  var d: int
   var a = cast[TAddress](aa)
   case n.typ.size
-  of 1: d = toU32(cast[ptr int8](a +% n.offset)^)
-  of 2: d = toU32(cast[ptr int16](a +% n.offset)^)
-  of 4: d = toU32(cast[ptr int32](a +% n.offset)^)
+  of 1: d = ze(cast[ptr int8](a +% n.offset)^)
+  of 2: d = ze(cast[ptr int16](a +% n.offset)^)
+  of 4: d = int(cast[ptr int32](a +% n.offset)^)
   else: assert(false)
-  return int(d)
+  return d
 
 proc selectBranch(aa: Pointer, n: ptr TNimNode): ptr TNimNode =
   var discr = getDiscriminant(aa, n)
@@ -692,8 +563,7 @@ proc selectBranch(aa: Pointer, n: ptr TNimNode): ptr TNimNode =
     result = n.sons[n.len]
 
 proc forAllSlotsAux(dest: pointer, n: ptr TNimNode, op: TWalkOp) =
-  var
-    d = cast[TAddress](dest)
+  var d = cast[TAddress](dest)
   case n.kind
   of nkNone: assert(false)
   of nkSlot: forAllChildrenAux(cast[pointer](d +% n.offset), n.typ, op)
@@ -704,35 +574,21 @@ proc forAllSlotsAux(dest: pointer, n: ptr TNimNode, op: TWalkOp) =
     if m != nil: forAllSlotsAux(dest, m, op)
 
 proc forAllChildrenAux(dest: Pointer, mt: PNimType, op: TWalkOp) =
-  const
-    handledTypes = {tyArray, tyArrayConstr, tyOpenArray, tyRef,
-                    tyString, tySequence, tyObject, tyPureObject, tyTuple}
-  var
-    d = cast[TAddress](dest)
+  var d = cast[TAddress](dest)
   if dest == nil: return # nothing to do
-  case mt.Kind
-  of tyArray, tyArrayConstr, tyOpenArray:
-    if mt.base.kind in handledTypes:
+  if ntfNoRefs notin mt.flags:
+    case mt.Kind
+    of tyArray, tyArrayConstr, tyOpenArray:
       for i in 0..(mt.size div mt.base.size)-1:
         forAllChildrenAux(cast[pointer](d +% i *% mt.base.size), mt.base, op)
-  of tyRef, tyString, tySequence: # leaf:
-    doOperation(cast[ppointer](d)^, op)
-  of tyObject, tyTuple, tyPureObject:
-    forAllSlotsAux(dest, mt.node, op)
-  else: nil
+    of tyRef, tyString, tySequence: # leaf:
+      doOperation(cast[ppointer](d)^, op)
+    of tyObject, tyTuple, tyPureObject:
+      forAllSlotsAux(dest, mt.node, op)
+    else: nil
 
 proc forAllChildren(cell: PCell, op: TWalkOp) =
   assert(cell != nil)
-  when defined(debugGC):
-    if cell.typ == nil:
-      writeCell("cell has no type descriptor", cell)
-      when traceGC:
-        if cell notin states[csAllocated]:
-          writeCell("cell has never been allocated!", cell)
-        if cell in states[csCycFreed]:
-          writeCell("cell has been freed by Cyc", cell)
-        if cell in states[csZctFreed]:
-          writeCell("cell has been freed by Zct", cell)
   assert(cell.typ != nil)
   case cell.typ.Kind
   of tyRef: # common case
@@ -740,46 +596,41 @@ proc forAllChildren(cell: PCell, op: TWalkOp) =
   of tySequence:
     var d = cast[TAddress](cellToUsr(cell))
     var s = cast[PGenericSeq](d)
-    if s != nil: # BUGFIX
+    if s != nil:
       for i in 0..s.len-1:
         forAllChildrenAux(cast[pointer](d +% i *% cell.typ.base.size +%
           GenericSeqSize), cell.typ.base, op)
   of tyString: nil
   else: assert(false)
 
-proc checkCollection() {.inline.} =
+proc checkCollection(zctUpdated: bool) {.inline.} =
   # checks if a collection should be done
   if recGcLock == 0:
-    collectCT(gch)
+    collectCT(gch, zctUpdated)
 
 proc newObj(typ: PNimType, size: int): pointer =
   # generates a new object and sets its reference counter to 0
-  var
-    res: PCell
-  when stressGC: assert(checkZCT())
   assert(typ.kind in {tyRef, tyString, tySequence})
+  var zctUpdated = false
+  if gch.zct.len >= ZctThreshold:
+    updateZCT()
+    zctUpdated = true
   # check if we have to collect:
-  checkCollection()
-  res = cast[PCell](Alloc0(size + sizeof(TCell)))
+  checkCollection(zctUpdated)
+  var res = cast[PCell](gcAlloc(size + sizeof(TCell)))
   when stressGC: assert((cast[TAddress](res) and (MemAlignment-1)) == 0)
-  if res == nil: raiseOutOfMem()
-  when defined(nimSize):
-    memUsed = memUsed + nimSize(res)
-  else:
-    memUsed = memUsed + size
-
   # now it is buffered in the ZCT
   res.typ = typ
-  res.refcount = -gch.zct.len
-  add(gch.zct, res)  # its refcount is zero, so add it to the ZCT
-  incl(gch.at, res)  # add it to the any table too
+  when debugGC:
+    if framePtr != nil and framePtr.prev != nil:
+      res.filename = framePtr.prev.filename
+      res.line = framePtr.prev.line
+  res.refcount = rcZct # refcount is zero, but mark it to be in the ZCT
+  add(gch.zct, res) # its refcount is zero, so add it to the ZCT
   gch.mask = gch.mask or cast[TAddress](res)
-  when defined(debugGC):
-    when defined(logGC): writeCell("new cell", res)
+  when logGC: writeCell("new cell", res)
   gcTrace(res, csAllocated)
   result = cellToUsr(res)
-  assert(res.typ == typ)
-  when stressGC: assert(checkZCT())
 
 proc newSeq(typ: PNimType, len: int): pointer =
   # XXX: overflow checks!
@@ -788,96 +639,138 @@ proc newSeq(typ: PNimType, len: int): pointer =
   cast[PGenericSeq](result).space = len
 
 proc growObj(old: pointer, newsize: int): pointer =
-  var
-    res, ol: PCell
-  when stressGC: assert(checkZCT())
-  checkCollection()
-  ol = usrToCell(old)
+  checkCollection(false)
+  var ol = usrToCell(old)
+  assert(ol.typ != nil)
   assert(ol.typ.kind in {tyString, tySequence})
-  when defined(nimSize):
-    memUsed = memUsed - nimSize(ol)
-  else:
-    memUsed = memUsed - ol.size # this is not exact
-                                # pity that we don't know the old size
-  res = cast[PCell](realloc(ol, newsize + sizeof(TCell)))
-  #res = cast[PCell](gcAlloc(newsize + sizeof(TCell)))
-  #copyMem(res, ol, nimSize(ol))
+  var res = cast[PCell](gcAlloc(newsize + sizeof(TCell)))
+  var elemSize = 1
+  if ol.typ.kind != tyString:
+    elemSize = ol.typ.base.size
+  copyMem(res, ol, cast[PGenericSeq](old).len*elemSize +
+          GenericSeqSize + sizeof(TCell))
+
   assert((cast[TAddress](res) and (MemAlignment-1)) == 0)
-  when defined(nimSize):
-    memUsed = memUsed + nimSize(res)
+  assert(res.refcount shr rcShift <=% 1)
+  #if res.refcount <% rcIncrement:
+  #  add(gch.zct, res)
+  #else: # XXX: what to do here?
+  #  decRef(ol)
+  if (ol.refcount and colorMask) == rcZct:
+    var j = gch.zct.len-1
+    var d = gch.zct.d
+    while j >= 0: 
+      if d[j] == ol:
+        d[j] = res
+        break
+      dec(j)
+  if canBeCycleRoot(ol): excl(gch.cycleRoots, ol)
+  gch.mask = gch.mask or cast[TAddress](res)
+  when logGC:
+    writeCell("growObj old cell", ol)
+    writeCell("growObj new cell", res)
+  gcTrace(ol, csZctFreed)
+  gcTrace(res, csAllocated)
+  when reallyDealloc: tlsf_free(ol)
   else:
-    memUsed = memUsed + newsize
-
-  if res != ol:
-    if res == nil: raiseOutOfMem()
-    if res.refcount <= 0:
-      assert(gch.zct.d[-res.refcount] == ol)
-      gch.zct.d[-res.refcount] = res
-    excl(gch.at, ol)
-    incl(gch.at, res)
-    gch.mask = gch.mask or cast[TAddress](res)
-    when defined(logGC):
-      writeCell("growObj old cell", ol)
-      writeCell("growObj new cell", res)
-    gcTrace(ol, csZctFreed)
-    gcTrace(res, csAllocated)
+    assert(ol.typ != nil)
+    zeroMem(ol, sizeof(TCell))
   result = cellToUsr(res)
-  when stressGC: assert(checkZCT())
 
-proc collectCycles(gch: var TGcHeap) =
-  when defined(logGC):
-    c_fprintf(c_stdout, "collecting cycles!\n")
+# ---------------- cycle collector -------------------------------------------
+
+# When collecting cycles, we have to consider the following:
+# * there may still be references in the stack
+# * some cells may still be in the ZCT, because they are referenced from
+#   the stack (!), so their refcounts are zero
+# the ZCT is a subset of stackCells here, so we only need to care
+# about stackCells
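
The kind of garbage this pass exists for (a sketch, not code from this
diff): two cells that reference each other keep a refcount of 1 each after
all external references are gone, so plain reference counting never frees
them; only the cycle collector can.

  type
    TNode = object
      next: ref TNode
    PNode = ref TNode

  proc makeCycle() =
    var a, b: PNode
    new(a)
    new(b)
    a.next = b
    b.next = a
    # on return, a and b are unreachable, but each still has refcount 1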
+
+proc doOperation(p: pointer, op: TWalkOp) =
+  if p == nil: return
+  var c: PCell = usrToCell(p)
+  assert(c != nil)
+  case op # faster than function pointers because of easy prediction
+  of waZctDecRef:
+    assert(c.refcount >=% rcIncrement)
+    c.refcount = c.refcount -% rcIncrement
+    when logGC: writeCell("decref (from doOperation)", c)
+    if c.refcount <% rcIncrement: addZCT(gch.zct, c)
+  of waPush:
+    add(gch.tempStack, c)
+  of waCycleDecRef:
+    assert(c.refcount >=% rcIncrement)
+    c.refcount = c.refcount -% rcIncrement
 
-  # step 1: pretend that any node is dead
-  for c in elements(gch.at):
+# we now use a much simpler and non-recursive algorithm for cycle removal
+proc collectCycles(gch: var TGcHeap) =
+  var tabSize = 0
+  for c in elements(gch.cycleRoots):
+    inc(tabSize)
+    assert(c.typ != nil)
     forallChildren(c, waCycleDecRef)
-  CellSetInit(gch.newAt)
-  # step 2: restore life cells
-  for c in elements(gch.at):
-    if c.refcount > 0: restore(c)
-  # step 3: free dead cells:
-  for cell in elements(gch.at):
-    if cell.refcount == 0:
-      # We free an object that is part of a cycle here. Its children
-      # may have been freed already. Thus the finalizer could access
-      # garbage. To handle this case properly we need two passes for
-      # freeing here which is too expensive. We just don't call the
-      # finalizer for now. YYY: Any better ideas?
-      prepareDealloc(cell)
-      gcTrace(cell, csCycFreed)
-      when defined(logGC):
-        writeCell("cycle collector dealloc cell", cell)
-      when reallyDealloc: dealloc(cell)
-  CellSetDeinit(gch.at)
-  gch.at = gch.newAt
-
-proc gcMark(gch: var TGcHeap, p: pointer) =
+  gch.cycleTableSize = max(gch.cycleTableSize, tabSize)
+
+  # restore reference counts (a depth-first traversal is needed):
+  var marker, newRoots: TCellSet
+  CellSetInit(marker)
+  CellSetInit(newRoots)
+  for c in elements(gch.cycleRoots):
+    var needsRestore = false
+    if c in gch.stackCells:
+      needsRestore = true
+      incl(newRoots, c)
+      # we need to scan this later again; maybe stack changes
+      # NOTE: adding to ZCT here does NOT work
+    elif c.refcount >=% rcIncrement:
+      needsRestore = true
+    if needsRestore:
+      if c notin marker:
+        incl(marker, c)
+        gch.tempStack.len = 0
+        forAllChildren(c, waPush)
+        while gch.tempStack.len > 0:
+          dec(gch.tempStack.len)
+          var d = gch.tempStack.d[gch.tempStack.len]
+          d.refcount = d.refcount +% rcIncrement
+          if d notin marker and d in gch.cycleRoots:
+            incl(marker, d)
+            forAllChildren(d, waPush)
+  # remove cycles:
+  for c in elements(gch.cycleRoots):
+    if c.refcount <% rcIncrement and c notin gch.stackCells:
+      gch.tempStack.len = 0
+      forAllChildren(c, waPush)
+      while gch.tempStack.len > 0:
+        dec(gch.tempStack.len)
+        var d = gch.tempStack.d[gch.tempStack.len]
+        if d.refcount <% rcIncrement:
+          if d notin gch.cycleRoots: # d is leaf of c and not part of cycle
+            addZCT(gch.zct, d)
+            when logGC: writeCell("add to ZCT (from cycle collector)", d)
+      prepareDealloc(c)
+      gcTrace(c, csCycFreed)
+      when logGC: writeCell("cycle collector dealloc cell", c)
+      when reallyDealloc: tlsf_free(c)
+      else:
+        assert(c.typ != nil)
+        zeroMem(c, sizeof(TCell))
+  CellSetDeinit(gch.cycleRoots)
+  gch.cycleRoots = newRoots
+
+proc gcMark(p: pointer) {.fastcall.} =
   # the addresses on the stack point to user data, not to cells, so convert them:
   var cell = usrToCell(p)
   var c = cast[TAddress](cell)
-  if ((c and gch.mask) == c) and cell in gch.at:
-    # is the page that p "points to" in the AT? (All allocated pages are
-    # always in the AT)
-    incRef(cell)
-    when stressGC: inc(cell.stackcount)
-    add(gch.stackCells, cell)
-
-proc unmarkStackAndRegisters(gch: var TGcHeap) =
-  when stressGC: assert(checkZCT())
-  for i in 0 .. gch.stackCells.len-1:
-    var cell = gch.stackCells.d[i]
-    assert(cell.refcount > 0)
-    when stressGC:
-      assert(cell.stackcount > 0)
-      dec(cell.stackcount)
-    decRef(cell)
-  gch.stackCells.len = 0 # reset to zero
-  when stressGC: assert(checkZCT())
+  if ((c and gch.mask) == c) and c >% 1024:
+    # fast check: does it look like a cell?
+    when logGC: cfprintf(cstdout, "in stackcells %p\n", cell)
+    incl(gch.stackCells, cell)  # yes: mark it
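
Why the mask test rejects most non-pointers cheaply (a sketch; gch.mask is
the bitwise OR of every cell address the allocator has handed out): if a
candidate value has any bit set that no real cell address has, then
(c and mask) != c and it cannot be a cell.

  # suppose cells were allocated at 0x00A0 and 0x00C0:
  let mask = 0x00A0 or 0x00C0          # = 0x00E0
  assert((0x0100 and mask) != 0x0100)  # rejected without a set lookup
  assert((0x00C0 and mask) == 0x00C0)  # passes; still needs the real check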
 
 # ----------------- stack management --------------------------------------
#  inspired by Smart Eiffel (c)
 
-proc stackSize(): int =
+proc stackSize(): int {.noinline.} =
   var stackTop: array[0..1, pointer]
   result = abs(cast[int](addr(stackTop[0])) - cast[int](stackBottom))
 
@@ -888,26 +781,24 @@ when defined(sparc): # For SPARC architecture.
       stackTop: array[0..1, pointer]
     result = p >= addr(stackTop[0]) and p <= stackBottom
 
-  proc markStackAndRegisters(gch: var TGcHeap) =
+  proc markStackAndRegisters(gch: var TGcHeap) {.noinline, cdecl.} =
     when defined(sparcv9):
-      asm  " flushw"
+      asm  """"flushw \n" """
     else:
-      asm  " ta      0x3   ! ST_FLUSH_WINDOWS"
+      asm  """"ta      0x3   ! ST_FLUSH_WINDOWS\n" """
 
     var
       max = stackBottom
       sp: PPointer
       stackTop: array[0..1, pointer]
-    stackTop[0] = nil
-    stackTop[1] = nil
     sp = addr(stackTop[0])
     # Addresses decrease as the stack grows.
     while sp <= max:
-      gcMark(gch, sp^)
+      gcMark(sp^)
       sp = cast[ppointer](cast[TAddress](sp) +% sizeof(pointer))
 
 elif defined(ELATE):
-  {.error: "stack marking code has to be written for this architecture".}
+  {.error: "stack marking code has yet to be written for this architecture".}
 
 elif defined(hppa) or defined(hp9000) or defined(hp9000s300) or
      defined(hp9000s700) or defined(hp9000s800) or defined(hp9000s820):
@@ -921,24 +812,71 @@ elif defined(hppa) or defined(hp9000) or defined(hp9000s300) or
     result = p <= addr(stackTop[0]) and p >= stackBottom
 
   var
-    jmpbufSize {.importc: "sizeof(jmp_buf)".}: int
+    jmpbufSize {.importc: "sizeof(jmp_buf)", nodecl.}: int
       # a little hack to get the size of a TJmpBuf in the generated C code
      # in a platform independent way
 
-  proc markStackAndRegisters(gch: var TGcHeap) =
+  proc markStackAndRegisters(gch: var TGcHeap) {.noinline, cdecl.} =
     var
       max = stackBottom
       registers: C_JmpBuf # The jmp_buf buffer is in the C stack.
       sp: PPointer        # Used to traverse the stack and registers assuming
                           # that `setjmp' will save registers in the C stack.
-    c_setjmp(registers)   # To fill the C stack with registers.
-    sp = cast[ppointer](cast[TAddress](addr(registers)) +%
-           jmpbufSize -% sizeof(pointer))
-    # sp will traverse the JMP_BUF as well (jmp_buf size is added,
-    # otherwise sp would be below the registers structure).
-    while sp >= max:
-      gcMark(gch, sp^)
-      sp = cast[ppointer](cast[TAddress](sp) -% sizeof(pointer))
+    if c_setjmp(registers) == 0: # To fill the C stack with registers.
+      sp = cast[ppointer](cast[TAddress](addr(registers)) +%
+             jmpbufSize -% sizeof(pointer))
+      # sp will traverse the JMP_BUF as well (jmp_buf size is added,
+      # otherwise sp would be below the registers structure).
+      while sp >= max:
+        gcMark(sp^)
+        sp = cast[ppointer](cast[TAddress](sp) -% sizeof(pointer))
+
+elif defined(I386) and asmVersion:
+  # addresses decrease as the stack grows:
+  proc isOnStack(p: pointer): bool =
+    var
+      stackTop: array [0..1, pointer]
+    result = p >= addr(stackTop[0]) and p <= stackBottom
+
+  proc markStackAndRegisters(gch: var TGcHeap) {.noinline, cdecl.} =
+    # This code should be safe even with aggressive optimizers. The try
+    # statement saves all registers into the safepoint, which we scan
+    # in addition to the stack.
+    type
+      TPtrArray = array[0..0xffffff, pointer]
+    try:
+      var pa = cast[ptr TPtrArray](excHandler)
+      for i in 0 .. sizeof(TSafePoint) - 1:
+        gcMark(pa[i])
+    finally:
+      # iterate over the stack:
+      var max = cast[TAddress](stackBottom)
+      var stackTop{.volatile.}: array [0..15, pointer]
+      var sp {.volatile.} = cast[TAddress](addr(stackTop[0]))
+      while sp <= max:
+        gcMark(cast[ppointer](sp)^)
+        sp = sp +% sizeof(pointer)
+    when false:
+      var counter = 0
+      #mov ebx, OFFSET `stackBottom`
+      #mov ebx, [ebx]
+      asm """
+        pusha
+        mov edi, esp
+        call `getStackBottom`
+        mov ebx, eax
+      L1:
+        cmp edi, ebx
+        ja L2
+        mov eax, [edi]
+        call `gcMark`
+        add edi, 4
+        inc [`counter`]
+        jmp L1
+      L2:
+        popa
+      """
+      cfprintf(cstdout, "stack %ld\n", counter)
 
 else:
   # ---------------------------------------------------------------------------
@@ -949,67 +887,135 @@ else:
       stackTop: array [0..1, pointer]
     result = p >= addr(stackTop[0]) and p <= stackBottom
 
-  proc markStackAndRegisters(gch: var TGcHeap) =
-    var
-      max = stackBottom
-      registers: C_JmpBuf # The jmp_buf buffer is in the C stack.
-      sp: PPointer        # Used to traverse the stack and registers assuming
-                          # that `setjmp' will save registers in the C stack.
-    c_setjmp(registers)   # To fill the C stack with registers.
-    sp = cast[ppointer](addr(registers))
-    while sp <= max:
-      gcMark(gch, sp^)
-      sp = cast[ppointer](cast[TAddress](sp) +% sizeof(pointer))
+  var
+    gRegisters: C_JmpBuf
+    jmpbufSize {.importc: "sizeof(jmp_buf)", nodecl.}: int
+      # a little hack to get the size of a TJmpBuf in the generated C code
+      # in a platform independent way
+
+  proc markStackAndRegisters(gch: var TGcHeap) {.noinline, cdecl.} =
+    when true:
+      # new version: several C compilers are too smart here
+      var
+        max = cast[TAddress](stackBottom)
+        stackTop: array [0..15, pointer]
+      if c_setjmp(gregisters) == 0'i32: # To fill the C stack with registers.
+        # iterate over the registers:
+        var sp = cast[TAddress](addr(gregisters))
+        while sp < cast[TAddress](addr(gregisters))+%jmpbufSize:
+          gcMark(cast[ppointer](sp)^)
+          sp = sp +% sizeof(pointer)
+        # iterate over the stack:
+        sp = cast[TAddress](addr(stackTop[0]))
+        while sp <= max:
+          gcMark(cast[ppointer](sp)^)
+          sp = sp +% sizeof(pointer)
+      else:
+        c_longjmp(gregisters, 42)
+        # this can never happen, but should trick any compiler that is
+        # not as smart as a human
+    else:
+      var
+        max = stackBottom
+        registers: C_JmpBuf # The jmp_buf buffer is in the C stack.
+        sp: PPointer        # Used to traverse the stack and registers assuming
+                            # that `setjmp' will save registers in the C stack.
+      if c_setjmp(registers) == 0'i32: # To fill the C stack with registers.
+        sp = cast[ppointer](addr(registers))
+        while sp <= max:
+          gcMark(sp^)
+          sp = cast[ppointer](cast[TAddress](sp) +% sizeof(pointer))
 
 # ----------------------------------------------------------------------------
 # end of non-portable code
 # ----------------------------------------------------------------------------
 
+proc updateZCT() =
+  # We have to make an additional pass over the ZCT unfortunately, because 
+  # the ZCT may be out of date, which means it contains cells with a
+  # refcount > 0. The reason is that ``incRef`` does not bother to remove
+  # the cell from the ZCT as this might be too slow.
+  var j = 0
+  var L = gch.zct.len # because globals make it hard for the optimizer
+  var d = gch.zct.d
+  while j < L:
+    var c = d[j]
+    if c.refcount >=% rcIncrement:
+      when logGC: writeCell("remove from ZCT", c)
+      # remove from ZCT:
+      dec(L)
+      d[j] = d[L]
+      c.refcount = c.refcount and not colorMask
+      # we have a new cell at position j, so don't increment j
+    else:
+      inc(j)
+  gch.zct.len = L
+
 proc CollectZCT(gch: var TGcHeap) =
-  while gch.zct.len > 0:
-    var c = gch.zct.d[0]
-    assert(c.refcount <= 0)
-    # remove from ZCT:
-    gch.zct.d[0] = gch.zct.d[gch.zct.len-1]
-    gch.zct.d[0].refcount = 0
-    dec(gch.zct.len)
-    # We are about to free the object, call the finalizer BEFORE its
-    # children are deleted as well, because otherwise the finalizer may
-    # access invalid memory. This is done by prepareDealloc():
-    gcTrace(c, csZctFreed)
-    prepareDealloc(c)
-    forAllChildren(c, waZctDecRef)
-    excl(gch.at, c)
-    when defined(logGC):
-      writeCell("zct dealloc cell", c)
-    #when defined(debugGC) and defined(nimSize): zeroMem(c, nimSize(c))
-    when reallyDealloc: dealloc(c)
-
-proc collectCT(gch: var TGcHeap) =
-  when defined(logGC):
-    c_fprintf(c_stdout, "collecting zero count table; stack size: %ld\n",
-              stackSize())
-  when stressGC: assert(checkZCT())
-  if gch.zct.len >= ZctThreshold or memUsed >= cycleThreshold or stressGC:
+  var i = 0
+  while i < gch.zct.len:
+    var c = gch.zct.d[i]
+    assert(c.refcount <% rcIncrement)
+    assert((c.refcount and colorMask) == rcZct)
+    if canBeCycleRoot(c): excl(gch.cycleRoots, c)
+    if c notin gch.stackCells:
+      # remove from ZCT:
+      c.refcount = c.refcount and not colorMask
+      gch.zct.d[i] = gch.zct.d[gch.zct.len-1]
+      # we have a new cell at position i, so don't increment i
+      dec(gch.zct.len)
+      when logGC: writeCell("zct dealloc cell", c)
+      gcTrace(c, csZctFreed)
+      # We are about to free the object, call the finalizer BEFORE its
+      # children are deleted as well, because otherwise the finalizer may
+      # access invalid memory. This is done by prepareDealloc():
+      prepareDealloc(c)
+      forAllChildren(c, waZctDecRef)
+      when reallyDealloc: tlsf_free(c)
+      else:
+        assert(c.typ != nil)
+        zeroMem(c, sizeof(TCell))
+    else:
+      inc(i)
+  when stressGC:
+    for j in 0..gch.zct.len-1: assert(gch.zct.d[j] in gch.stackCells)
+
+proc collectCT(gch: var TGcHeap, zctUpdated: bool) =
+  if gch.zct.len >= ZctThreshold or (cycleGC and
+      getOccupiedMem() >= cycleThreshold) or stressGC:    
+    if not zctUpdated: updateZCT()
+    gch.maxStackSize = max(gch.maxStackSize, stackSize())
+    CellSetInit(gch.stackCells)
     markStackAndRegisters(gch)
-    when stressGC: assert(GC_invariant())
+    gch.maxStackPages = max(gch.maxStackPages, gch.stackCells.counter)
+    inc(gch.stackScans)
     collectZCT(gch)
-    when stressGC: assert(GC_invariant())
-    assert(gch.zct.len == 0)
-    when defined(cycleGC):
-      if memUsed >= cycleThreshold or stressGC:
-        when defined(logGC):
-          c_fprintf(c_stdout, "collecting cycles; memory used: %ld\n", memUsed)
+    when cycleGC:
+      if getOccupiedMem() >= cycleThreshold or stressGC:
         collectCycles(gch)
-        cycleThreshold = max(InitialCycleThreshold, memUsed * cycleIncrease)
-        when defined(logGC):
-          c_fprintf(c_stdout, "now used: %ld; threshold: %ld\n",
-                    memUsed, cycleThreshold)
-    unmarkStackAndRegisters(gch)
-  when stressGC: assert(GC_invariant())
+        collectZCT(gch)
+        inc(gch.cycleCollections)
+        cycleThreshold = max(InitialCycleThreshold, getOccupiedMem() *
+                             cycleIncrease)
+        gch.maxThreshold = max(gch.maxThreshold, cycleThreshold)
+    CellSetDeinit(gch.stackCells)
 
 proc GC_fullCollect() =
   var oldThreshold = cycleThreshold
   cycleThreshold = 0 # forces cycle collection
-  collectCT(gch)
+  collectCT(gch, false)
   cycleThreshold = oldThreshold
+
+proc GC_getStatistics(): string =
+  GC_disable()
+  result = "[GC] total memory: " & $(getTotalMem()) & "\n" &
+           "[GC] occupied memory: " & $(getOccupiedMem()) & "\n" &
+           "[GC] stack scans: " & $gch.stackScans & "\n" &
+           "[GC] stack pages: " & $gch.maxStackPages & "\n" &
+           "[GC] cycle collections: " & $gch.cycleCollections & "\n" &
+           "[GC] max threshold: " & $gch.maxThreshold & "\n" &
+           "[GC] zct capacity: " & $gch.zct.cap & "\n" &
+           "[GC] max cycle table size: " & $gch.cycleTableSize & "\n" &
+           "[GC] max stack size: " & $gch.maxStackSize
+  when traceGC: writeLeakage()
+  GC_enable()
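
Typical usage (a sketch, not part of this diff): force a full collection
and dump the counters at program exit.

  GC_fullCollect()
  echo(GC_getStatistics())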