Diffstat (limited to 'lib/system/alloc.nim'):
 lib/system/alloc.nim (mode 100755 -> 100644) | 1602
 1 file changed, 1058 insertions(+), 544 deletions(-)
diff --git a/lib/system/alloc.nim b/lib/system/alloc.nim
index 7b52780fe..3de6d8713 100755..100644
--- a/lib/system/alloc.nim
+++ b/lib/system/alloc.nim
@@ -1,292 +1,448 @@
 #
 #
-#            Nimrod's Runtime Library
+#            Nim's Runtime Library
 #        (c) Copyright 2012 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
 #
 
-# Low level allocator for Nimrod. Has been designed to support the GC.
-# TODO: 
-# - eliminate "used" field
-# - make searching for block O(1)
+# Low level allocator for Nim. Has been designed to support the GC.
 {.push profiler:off.}
 
-# ------------ platform specific chunk allocation code -----------------------
-
-# some platforms have really weird unmap behaviour: unmap(blockStart, PageSize)
-# really frees the whole block. Happens for Linux/PowerPC for example. Amd64
-# and x86 are safe though; Windows is special because MEM_RELEASE can only be
-# used with a size of 0:
-const weirdUnmap = not (defined(amd64) or defined(i386)) or defined(windows)
-
-when defined(posix): 
-  const
-    PROT_READ  = 1             # page can be read 
-    PROT_WRITE = 2             # page can be written 
-    MAP_PRIVATE = 2'i32        # Changes are private 
-  
-  when defined(macosx) or defined(bsd):
-    const MAP_ANONYMOUS = 0x1000
-  elif defined(solaris): 
-    const MAP_ANONYMOUS = 0x100
-  else:
-    var
-      MAP_ANONYMOUS {.importc: "MAP_ANONYMOUS", header: "<sys/mman.h>".}: cint
-    
-  proc mmap(adr: pointer, len: int, prot, flags, fildes: cint,
-            off: int): pointer {.header: "<sys/mman.h>".}
-
-  proc munmap(adr: pointer, len: int) {.header: "<sys/mman.h>".}
-  
-  proc osAllocPages(size: int): pointer {.inline.} = 
-    result = mmap(nil, size, PROT_READ or PROT_WRITE, 
-                             MAP_PRIVATE or MAP_ANONYMOUS, -1, 0)
-    if result == nil or result == cast[pointer](-1):
-      raiseOutOfMem()
-      
-  proc osDeallocPages(p: pointer, size: int) {.inline} =
-    when reallyOsDealloc: munmap(p, size)
-  
-elif defined(windows): 
-  const
-    MEM_RESERVE = 0x2000 
-    MEM_COMMIT = 0x1000
-    MEM_TOP_DOWN = 0x100000
-    PAGE_READWRITE = 0x04
-
-    MEM_DECOMMIT = 0x4000
-    MEM_RELEASE = 0x8000
-
-  proc VirtualAlloc(lpAddress: pointer, dwSize: int, flAllocationType,
-                    flProtect: int32): pointer {.
-                    header: "<windows.h>", stdcall.}
-  
-  proc VirtualFree(lpAddress: pointer, dwSize: int, 
-                   dwFreeType: int32) {.header: "<windows.h>", stdcall.}
-  
-  proc osAllocPages(size: int): pointer {.inline.} = 
-    result = VirtualAlloc(nil, size, MEM_RESERVE or MEM_COMMIT,
-                          PAGE_READWRITE)
-    if result == nil: raiseOutOfMem()
-
-  proc osDeallocPages(p: pointer, size: int) {.inline.} =
-    # according to Microsoft, 0 is the only correct value for MEM_RELEASE:
-    # This means that the OS has some different view over how big the block is
-    # that we want to free! So, we cannot reliably release the memory back to
-    # Windows :-(. We have to live with MEM_DECOMMIT instead.
-    # Well that used to be the case but MEM_DECOMMIT fragments the address
-    # space heavily, so we now treat Windows as a strange unmap target.
-    when reallyOsDealloc: VirtualFree(p, 0, MEM_RELEASE)
-    #VirtualFree(p, size, MEM_DECOMMIT)
-
-else: 
-  {.error: "Port memory manager to your platform".}
-
-# --------------------- end of non-portable code -----------------------------
+include osalloc
+import std/private/syslocks
+import std/sysatomics
+
+template track(op, address, size) =
+  when defined(memTracker):
+    memTrackerOp(op, address, size)
 
 # We manage *chunks* of memory. Each chunk is a multiple of the page size.
-# Each chunk starts at an address that is divisible by the page size. Chunks
-# that are bigger than ``ChunkOsReturn`` are returned back to the operating
-# system immediately.
+# Each chunk starts at an address that is divisible by the page size.
+# Small chunks are subdivided into equally sized cells that can be handed out
+# and reused individually, reducing the number of page allocations.
+
+# An allocation of a small pointer looks approximately like this:
+#[
+
+  alloc -> rawAlloc -> No free chunk available -> Request a new page from tlsf -> result = chunk.data ------------+
+              |                                                                                                   |
+              v                                                                                                   |
+    Free chunk available                                                                                          |
+              |                                                                                                   |
+              v                                                                                                   v
+      Fetch shared cells -> No free cells available -> Advance acc -> result = chunk.data + chunk.acc -------> return
+    (may not add new cells)                                                                                       ^
+              |                                                                                                   |
+              v                                                                                                   |
+     Free cells available -> result = chunk.freeList -> Advance chunk.freeList -----------------------------------+
+]#
+# So an allocation is split into three paths, where the last path is preferred to prevent unnecessary allocations.
+#
+#
+# A deallocation of a small pointer then looks like this:
+#[
+  dealloc -> rawDealloc -> chunk.owner == addr(a) --------------> This thread owns the chunk ------> The current chunk is active    -> Chunk is completely unused -----> Chunk references no foreign cells
+                                      |                                       |                   (Add cell into the current chunk)                 |                  Return the current chunk back to tlsf
+                                      |                                       |                                   |                                 |
+                                      v                                       v                                   v                                 v
+                      A different thread owns this chunk.     The current chunk is not active.          chunk.free was < size      Chunk references foreign cells, noop
+                      Add the cell to a.sharedFreeLists      Add the cell into the active chunk          Activate the chunk                       (end)
+                                    (end)                                    (end)                              (end)
+]#
+# So "true" deallocation is delayed for as long as possible in favor of reusing cells.
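+
+# A minimal sketch of the path selection above (illustrative only; a
+# hypothetical helper, assuming the MemRegion/SmallChunk fields defined below):
+when false:
+  proc whichAllocPath(a: var MemRegion; requestedSize: int): string =
+    let size = roundup(requestedSize, MemAlign)  # e.g. 24 -> 32
+    let s = size div MemAlign                    # index into freeSmallChunks
+    let c = a.freeSmallChunks[s]
+    if c == nil: "request a new page via tlsf"
+    elif c.freeList != nil: "pop a cell from chunk.freeList"
+    else: "bump chunk.acc within the current chunk"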
 
 const
-  ChunkOsReturn = 256 * PageSize # 1 MB
-  InitialMemoryRequest = ChunkOsReturn div 2 # < ChunkOsReturn!
+  nimMinHeapPages {.intdefine.} = 128 # 0.5 MB
   SmallChunkSize = PageSize
+  MaxFli = when sizeof(int) > 2: 30 else: 14
+  MaxLog2Sli = 5 # 32, this cannot be increased without changing 'uint32'
+                 # everywhere!
+  MaxSli = 1 shl MaxLog2Sli
+  FliOffset = 6
+  RealFli = MaxFli - FliOffset
 
-type 
-  PTrunk = ptr TTrunk
-  TTrunk {.final.} = object 
+  # size of chunks in last matrix bin
+  MaxBigChunkSize = int(1'i32 shl MaxFli - 1'i32 shl (MaxFli-MaxLog2Sli-1))
+  HugeChunkSize = MaxBigChunkSize + 1
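+
+  # Worked values on a 64-bit target (illustrative):
+  #   MaxFli == 30, RealFli == 24, MaxSli == 32
+  #   MaxBigChunkSize == 2^30 - 2^24 == 1_056_964_608 bytes (1008 MiB)
+  # Chunks above MaxBigChunkSize bypass the matrix and are requested from and
+  # returned to the OS directly as "huge" chunks.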
+
+type
+  PTrunk = ptr Trunk
+  Trunk = object
     next: PTrunk         # all nodes are connected with this pointer
     key: int             # start address at bit 0
-    bits: array[0..IntsPerTrunk-1, int] # a bit vector
-  
-  TTrunkBuckets = array[0..255, PTrunk]
-  TIntSet {.final.} = object 
-    data: TTrunkBuckets
-  
-type
-  TAlignType = biggestFloat
-  TFreeCell {.final, pure.} = object
-    next: ptr TFreeCell  # next free cell in chunk (overlaid with refcount)
-    zeroField: int       # 0 means cell is not used (overlaid with typ field)
-                         # 1 means cell is manually managed pointer
-                         # otherwise a PNimType is stored in there
-
-  PChunk = ptr TBaseChunk
-  PBigChunk = ptr TBigChunk
-  PSmallChunk = ptr TSmallChunk
-  TBaseChunk {.pure, inheritable.} = object
-    prevSize: int        # size of previous chunk; for coalescing
-    size: int            # if < PageSize it is a small chunk
-    used: bool           # later will be optimized into prevSize...
-  
-  TSmallChunk = object of TBaseChunk
-    next, prev: PSmallChunk  # chunks of the same size
-    freeList: ptr TFreeCell
-    free: int            # how many bytes remain    
-    acc: int             # accumulator for small object allocation
-    data: TAlignType     # start of usable memory
-  
-  TBigChunk = object of TBaseChunk # not necessarily > PageSize!
-    next, prev: PBigChunk    # chunks of the same (or bigger) size
-    align: int
-    data: TAlignType     # start of usable memory
-
-template smallChunkOverhead(): expr = sizeof(TSmallChunk)-sizeof(TAlignType)
-template bigChunkOverhead(): expr = sizeof(TBigChunk)-sizeof(TAlignType)
+    bits: array[0..IntsPerTrunk-1, uint] # a bit vector
 
-proc roundup(x, v: int): int {.inline.} = 
-  result = (x + (v-1)) and not (v-1)
-  sysAssert(result >= x, "roundup: result < x")
-  #return ((-x) and (v-1)) +% x
-
-sysAssert(roundup(14, PageSize) == PageSize, "invalid PageSize")
-sysAssert(roundup(15, 8) == 16, "roundup broken")
-sysAssert(roundup(65, 8) == 72, "roundup broken 2")
+  TrunkBuckets = array[0..255, PTrunk]
+  IntSet = object
+    data: TrunkBuckets
 
 # ------------- chunk table ---------------------------------------------------
 # We use a PtrSet of chunk starts and a table[Page, chunksize] for chunk
 # endings of big chunks. This is needed by the merging operation. The only
 # remaining operation is best-fit for big chunks. Since there is a size-limit
 # for big chunks (because greater than the limit means they are returned back
-# to the OS), a fixed size array can be used. 
+# to the OS), a fixed size array can be used.
 
 type
-  PLLChunk = ptr TLLChunk
-  TLLChunk {.pure.} = object ## *low-level* chunk
+  PLLChunk = ptr LLChunk
+  LLChunk = object ## *low-level* chunk
     size: int                # remaining size
     acc: int                 # accumulator
     next: PLLChunk           # next low-level chunk; only needed for dealloc
 
-  PAvlNode = ptr TAvlNode
-  TAvlNode {.pure, final.} = object 
-    link: array[0..1, PAvlNode] # Left (0) and right (1) links 
+  PAvlNode = ptr AvlNode
+  AvlNode = object
+    link: array[0..1, PAvlNode] # Left (0) and right (1) links
     key, upperBound: int
     level: int
-    
-  TMemRegion {.final, pure.} = object
-    minLargeObj, maxLargeObj: int
-    freeSmallChunks: array[0..SmallChunkSize div MemAlign-1, PSmallChunk]
+
+const
+  RegionHasLock = false # hasThreadSupport and defined(gcDestructors)
+
+type
+  FreeCell {.final, pure.} = object
+    # A free cell is a pointer that has been freed, meaning it became available for reuse.
+    # It may become foreign if it is lent to a chunk that did not create it;
+    # doing so reduces the number of pages needed.
+    next: ptr FreeCell  # next free cell in chunk (overlaid with refcount)
+    when not defined(gcDestructors):
+      zeroField: int       # 0 means cell is not used (overlaid with typ field)
+                           # 1 means cell is a manually managed pointer
+                           # otherwise a PNimType is stored in there
+    else:
+      alignment: int
+
+  PChunk = ptr BaseChunk
+  PBigChunk = ptr BigChunk
+  PSmallChunk = ptr SmallChunk
+  BaseChunk {.pure, inheritable.} = object
+    prevSize: int        # size of previous chunk; for coalescing
+                         # 0th bit == 1 if 'used'
+    size: int            # if < PageSize it is a small chunk
+    owner: ptr MemRegion
+
+  SmallChunk = object of BaseChunk
+    next, prev: PSmallChunk  # chunks of the same size
+    freeList: ptr FreeCell   # Singly linked list of cells. They may be from foreign chunks or from the current chunk.
+                             #  Should be `nil` when the chunk isn't active in `a.freeSmallChunks`.
+    free: int32              # Bytes this chunk is able to provide using both the accumulator and free cells.
+                             # When a cell is considered foreign, its source chunk's free field is NOT adjusted until it
+                             #  reaches dealloc while the source chunk is active.
+                             # Instead, the receiving chunk gains the capacity and thus reserves space in the foreign chunk.
+    acc: uint32              # Offset from data, used when there are no free cells available but the chunk is considered free.
+    foreignCells: int        # When a free cell is given to a chunk that is not its origin,
+                             #  both the cell and the source chunk are considered foreign.
+                             # Receiving a foreign cell can happen both when deallocating from another thread or when
+                             #  the active chunk in `a.freeSmallChunks` is not the current chunk.
+                             # Freeing a chunk while `foreignCells > 0` leaks memory as all references to it become lost.
+    data {.align: MemAlign.}: UncheckedArray[byte]      # start of usable memory
+
+  BigChunk = object of BaseChunk # not necessarily > PageSize!
+    next, prev: PBigChunk    # chunks of the same (or bigger) size
+    data {.align: MemAlign.}: UncheckedArray[byte]      # start of usable memory
+
+  HeapLinks = object
+    len: int
+    chunks: array[30, (PBigChunk, int)]
+    next: ptr HeapLinks
+
+  MemRegion = object
+    when not defined(gcDestructors):
+      minLargeObj, maxLargeObj: int
+    freeSmallChunks: array[0..max(1, SmallChunkSize div MemAlign-1), PSmallChunk]
+      # List of available chunks per size class. Only one is expected to be active per class.
+    when defined(gcDestructors):
+      sharedFreeLists: array[0..max(1, SmallChunkSize div MemAlign-1), ptr FreeCell]
+        # When a thread frees a pointer it did not create, it must not adjust the counters.
+        # Instead, the cell is placed here and deferred until the next allocation.
+    flBitmap: uint32
+    slBitmap: array[RealFli, uint32]
+    matrix: array[RealFli, array[MaxSli, PBigChunk]]
     llmem: PLLChunk
-    currMem, maxMem, freeMem: int # memory sizes (allocated from OS)
-    lastSize: int # needed for the case that OS gives us pages linearly 
-    freeChunksList: PBigChunk # XXX make this a datastructure with O(1) access
-    chunkStarts: TIntSet
-    root, deleted, last, freeAvlNodes: PAvlNode
-  
-# shared:
-var
-  bottomData: TAvlNode
-  bottom: PAvlNode
-
-{.push stack_trace: off.}
-proc initAllocator() =
-  when not defined(useNimRtl):
-    bottom = addr(bottomData)
-    bottom.link[0] = bottom
-    bottom.link[1] = bottom
-{.pop.}
+    currMem, maxMem, freeMem, occ: int # memory sizes (allocated from OS)
+    lastSize: int # needed for the case that OS gives us pages linearly
+    when RegionHasLock:
+      lock: SysLock
+    when defined(gcDestructors):
+      sharedFreeListBigChunks: PBigChunk # no attempt at avoiding false sharing for this field, for now
+
+    chunkStarts: IntSet
+    when not defined(gcDestructors):
+      root, deleted, last, freeAvlNodes: PAvlNode
+    lockActive, locked, blockChunkSizeIncrease: bool # if locked, we cannot free pages.
+    nextChunkSize: int
+    when not defined(gcDestructors):
+      bottomData: AvlNode
+    heapLinks: HeapLinks
+    when defined(nimTypeNames):
+      allocCounter, deallocCounter: int
+
+template smallChunkOverhead(): untyped = sizeof(SmallChunk)
+template bigChunkOverhead(): untyped = sizeof(BigChunk)
+
+when hasThreadSupport:
+  template loada(x: untyped): untyped = atomicLoadN(unsafeAddr x, ATOMIC_RELAXED)
+  template storea(x, y: untyped) = atomicStoreN(unsafeAddr x, y, ATOMIC_RELAXED)
+
+  when false:
+    # not yet required
+    template atomicStatDec(x, diff: untyped) = discard atomicSubFetch(unsafeAddr x, diff, ATOMIC_RELAXED)
+    template atomicStatInc(x, diff: untyped) = discard atomicAddFetch(unsafeAddr x, diff, ATOMIC_RELAXED)
+else:
+  template loada(x: untyped): untyped = x
+  template storea(x, y: untyped) = x = y
 
-proc incCurrMem(a: var TMemRegion, bytes: int) {.inline.} = 
-  inc(a.currMem, bytes)
+template atomicStatDec(x, diff: untyped) = dec x, diff
+template atomicStatInc(x, diff: untyped) = inc x, diff
 
-proc decCurrMem(a: var TMemRegion, bytes: int) {.inline.} =
+const
+  fsLookupTable: array[byte, int8] = [
+    -1'i8, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5,
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7
+  ]
+
+proc msbit(x: uint32): int {.inline.} =
+  let a = if x <= 0xff_ff'u32:
+            (if x <= 0xff: 0 else: 8)
+          else:
+            (if x <= 0xff_ff_ff'u32: 16 else: 24)
+  result = int(fsLookupTable[byte(x shr a)]) + a
+
+proc lsbit(x: uint32): int {.inline.} =
+  msbit(x and ((not x) + 1))
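+
+# Worked examples (illustrative): `msbit` returns the index of the most
+# significant set bit via the byte-wide lookup table above; `lsbit` first
+# isolates the lowest set bit with `x and (not x + 1)`:
+when false:
+  sysAssert msbit(0x1000'u32) == 12, "4096 == 2^12"
+  sysAssert msbit(4191'u32) == 12, "msbit rounds down to the leading bit"
+  sysAssert lsbit(0b1011000'u32) == 3, "bit 3 is the lowest set bit"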
+
+proc setBit(nr: int; dest: var uint32) {.inline.} =
+  dest = dest or (1u32 shl (nr and 0x1f))
+
+proc clearBit(nr: int; dest: var uint32) {.inline.} =
+  dest = dest and not (1u32 shl (nr and 0x1f))
+
+proc mappingSearch(r, fl, sl: var int) {.inline.} =
+  #let t = (1 shl (msbit(uint32 r) - MaxLog2Sli)) - 1
+  # This diverges from the standard TLSF algorithm because we need to ensure
+  # PageSize alignment:
+  let t = roundup((1 shl (msbit(uint32 r) - MaxLog2Sli)), PageSize) - 1
+  r = r + t
+  r = r and not t
+  r = min(r, MaxBigChunkSize).int
+  fl = msbit(uint32 r)
+  sl = (r shr (fl - MaxLog2Sli)) - MaxSli
+  dec fl, FliOffset
+  sysAssert((r and PageMask) == 0, "mappingSearch: still not aligned")
+
+# See http://www.gii.upv.es/tlsf/files/papers/tlsf_desc.pdf for details of
+# this algorithm.
+
+proc mappingInsert(r: int): tuple[fl, sl: int] {.inline.} =
+  sysAssert((r and PageMask) == 0, "mappingInsert: still not aligned")
+  result.fl = msbit(uint32 r)
+  result.sl = (r shr (result.fl - MaxLog2Sli)) - MaxSli
+  dec result.fl, FliOffset
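+
+# Worked example (illustrative, assuming PageSize == 4096): inserting a free
+# chunk of r == 8192 bytes gives msbit(8192) == 13, so
+#   sl = (8192 shr (13 - MaxLog2Sli)) - MaxSli = 32 - 32 = 0
+#   fl = 13 - FliOffset = 7
+# i.e. the chunk is linked into matrix[7][0]; r == 12288 lands in matrix[7][16].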
+
+template mat(): untyped = a.matrix[fl][sl]
+
+proc findSuitableBlock(a: MemRegion; fl, sl: var int): PBigChunk {.inline.} =
+  let tmp = a.slBitmap[fl] and (not 0u32 shl sl)
+  result = nil
+  if tmp != 0:
+    sl = lsbit(tmp)
+    result = mat()
+  else:
+    fl = lsbit(a.flBitmap and (not 0u32 shl (fl + 1)))
+    if fl > 0:
+      sl = lsbit(a.slBitmap[fl])
+      result = mat()
+
+template clearBits(sl, fl) =
+  clearBit(sl, a.slBitmap[fl])
+  if a.slBitmap[fl] == 0u32:
+    # do not forget to cascade:
+    clearBit(fl, a.flBitmap)
+
+proc removeChunkFromMatrix(a: var MemRegion; b: PBigChunk) =
+  let (fl, sl) = mappingInsert(b.size)
+  if b.next != nil: b.next.prev = b.prev
+  if b.prev != nil: b.prev.next = b.next
+  if mat() == b:
+    mat() = b.next
+    if mat() == nil:
+      clearBits(sl, fl)
+  b.prev = nil
+  b.next = nil
+
+proc removeChunkFromMatrix2(a: var MemRegion; b: PBigChunk; fl, sl: int) =
+  mat() = b.next
+  if mat() != nil:
+    mat().prev = nil
+  else:
+    clearBits(sl, fl)
+  b.prev = nil
+  b.next = nil
+
+proc addChunkToMatrix(a: var MemRegion; b: PBigChunk) =
+  let (fl, sl) = mappingInsert(b.size)
+  b.prev = nil
+  b.next = mat()
+  if mat() != nil:
+    mat().prev = b
+  mat() = b
+  setBit(sl, a.slBitmap[fl])
+  setBit(fl, a.flBitmap)
+
+proc incCurrMem(a: var MemRegion, bytes: int) {.inline.} =
+  atomicStatInc(a.currMem, bytes)
+
+proc decCurrMem(a: var MemRegion, bytes: int) {.inline.} =
   a.maxMem = max(a.maxMem, a.currMem)
-  dec(a.currMem, bytes)
+  atomicStatDec(a.currMem, bytes)
 
-proc getMaxMem(a: var TMemRegion): int =
-  # Since we update maxPagesCount only when freeing pages, 
+proc getMaxMem(a: var MemRegion): int =
+  # Since we update maxPagesCount only when freeing pages,
   # maxPagesCount may not be up to date. Thus we use the
   # maximum of these both values here:
   result = max(a.currMem, a.maxMem)
-  
-proc llAlloc(a: var TMemRegion, size: int): pointer =
+
+const nimMaxHeap {.intdefine.} = 0
+
+proc allocPages(a: var MemRegion, size: int): pointer =
+  when nimMaxHeap != 0:
+    if a.occ + size > nimMaxHeap * 1024 * 1024:
+      raiseOutOfMem()
+  osAllocPages(size)
+
+proc tryAllocPages(a: var MemRegion, size: int): pointer =
+  when nimMaxHeap != 0:
+    if a.occ + size > nimMaxHeap * 1024 * 1024:
+      raiseOutOfMem()
+  osTryAllocPages(size)
+
+proc llAlloc(a: var MemRegion, size: int): pointer =
   # *low-level* alloc for the memory managers data structures. Deallocation
-  # is done at he end of the allocator's life time.
+  # is done at the end of the allocator's life time.
   if a.llmem == nil or size > a.llmem.size:
-    # the requested size is ``roundup(size+sizeof(TLLChunk), PageSize)``, but
+    # the requested size is ``roundup(size+sizeof(LLChunk), PageSize)``, but
     # since we know ``size`` is a (small) constant, we know the requested size
     # is one page:
-    sysAssert roundup(size+sizeof(TLLChunk), PageSize) == PageSize, "roundup 6"
+    sysAssert roundup(size+sizeof(LLChunk), PageSize) == PageSize, "roundup 6"
     var old = a.llmem # can be nil and is correct with nil
-    a.llmem = cast[PLLChunk](osAllocPages(PageSize))
+    a.llmem = cast[PLLChunk](allocPages(a, PageSize))
+    when defined(nimAvlcorruption):
+      trackLocation(a.llmem, PageSize)
     incCurrMem(a, PageSize)
-    a.llmem.size = PageSize - sizeof(TLLChunk)
-    a.llmem.acc = sizeof(TLLChunk)
+    a.llmem.size = PageSize - sizeof(LLChunk)
+    a.llmem.acc = sizeof(LLChunk)
     a.llmem.next = old
-  result = cast[pointer](cast[TAddress](a.llmem) + a.llmem.acc)
+  result = cast[pointer](cast[int](a.llmem) + a.llmem.acc)
   dec(a.llmem.size, size)
   inc(a.llmem.acc, size)
   zeroMem(result, size)
 
-proc allocAvlNode(a: var TMemRegion, key, upperBound: int): PAvlNode =
-  if a.freeAvlNodes != nil:
-    result = a.freeAvlNodes
-    a.freeAvlNodes = a.freeAvlNodes.link[0]
+when not defined(gcDestructors):
+  proc getBottom(a: var MemRegion): PAvlNode =
+    result = addr(a.bottomData)
+    if result.link[0] == nil:
+      result.link[0] = result
+      result.link[1] = result
+
+  proc allocAvlNode(a: var MemRegion, key, upperBound: int): PAvlNode =
+    if a.freeAvlNodes != nil:
+      result = a.freeAvlNodes
+      a.freeAvlNodes = a.freeAvlNodes.link[0]
+    else:
+      result = cast[PAvlNode](llAlloc(a, sizeof(AvlNode)))
+      when defined(nimAvlcorruption):
+        cprintf("tracking location: %p\n", result)
+    result.key = key
+    result.upperBound = upperBound
+    let bottom = getBottom(a)
+    result.link[0] = bottom
+    result.link[1] = bottom
+    result.level = 1
+    #when defined(nimAvlcorruption):
+    #  track("allocAvlNode", result, sizeof(AvlNode))
+    sysAssert(bottom == addr(a.bottomData), "bottom data")
+    sysAssert(bottom.link[0] == bottom, "bottom link[0]")
+    sysAssert(bottom.link[1] == bottom, "bottom link[1]")
+
+  proc deallocAvlNode(a: var MemRegion, n: PAvlNode) {.inline.} =
+    n.link[0] = a.freeAvlNodes
+    a.freeAvlNodes = n
+
+proc addHeapLink(a: var MemRegion; p: PBigChunk, size: int): ptr HeapLinks =
+  var it = addr(a.heapLinks)
+  while it != nil and it.len >= it.chunks.len: it = it.next
+  if it == nil:
+    var n = cast[ptr HeapLinks](llAlloc(a, sizeof(HeapLinks)))
+    n.next = a.heapLinks.next
+    a.heapLinks.next = n
+    n.chunks[0] = (p, size)
+    n.len = 1
+    result = n
   else:
-    result = cast[PAvlNode](llAlloc(a, sizeof(TAvlNode)))
-  result.key = key
-  result.upperBound = upperBound
-  result.link[0] = bottom
-  result.link[1] = bottom
-  result.level = 1
-  sysAssert(bottom == addr(bottomData), "bottom data")
-  sysAssert(bottom.link[0] == bottom, "bottom link[0]")
-  sysAssert(bottom.link[1] == bottom, "bottom link[1]")
-
-proc deallocAvlNode(a: var TMemRegion, n: PAvlNode) {.inline.} =
-  n.link[0] = a.freeAvlNodes
-  a.freeAvlNodes = n
-
-include "system/avltree"
-
-proc llDeallocAll(a: var TMemRegion) =
+    let L = it.len
+    it.chunks[L] = (p, size)
+    inc it.len
+    result = it
+
+when not defined(gcDestructors):
+  include "system/avltree"
+
+proc llDeallocAll(a: var MemRegion) =
   var it = a.llmem
   while it != nil:
     # we know each block in the list has the size of 1 page:
     var next = it.next
     osDeallocPages(it, PageSize)
     it = next
-  
-proc IntSetGet(t: TIntSet, key: int): PTrunk = 
+  a.llmem = nil
+
+proc intSetGet(t: IntSet, key: int): PTrunk =
   var it = t.data[key and high(t.data)]
-  while it != nil: 
+  while it != nil:
     if it.key == key: return it
     it = it.next
   result = nil
 
-proc IntSetPut(a: var TMemRegion, t: var TIntSet, key: int): PTrunk = 
-  result = IntSetGet(t, key)
+proc intSetPut(a: var MemRegion, t: var IntSet, key: int): PTrunk =
+  result = intSetGet(t, key)
   if result == nil:
     result = cast[PTrunk](llAlloc(a, sizeof(result[])))
     result.next = t.data[key and high(t.data)]
     t.data[key and high(t.data)] = result
     result.key = key
 
-proc Contains(s: TIntSet, key: int): bool = 
-  var t = IntSetGet(s, key shr TrunkShift)
-  if t != nil: 
+proc contains(s: IntSet, key: int): bool =
+  var t = intSetGet(s, key shr TrunkShift)
+  if t != nil:
     var u = key and TrunkMask
-    result = (t.bits[u shr IntShift] and (1 shl (u and IntMask))) != 0
-  else: 
+    result = (t.bits[u shr IntShift] and (uint(1) shl (u and IntMask))) != 0
+  else:
     result = false
-  
-proc Incl(a: var TMemRegion, s: var TIntSet, key: int) = 
-  var t = IntSetPut(a, s, key shr TrunkShift)
+
+proc incl(a: var MemRegion, s: var IntSet, key: int) =
+  var t = intSetPut(a, s, key shr TrunkShift)
   var u = key and TrunkMask
-  t.bits[u shr IntShift] = t.bits[u shr IntShift] or (1 shl (u and IntMask))
+  t.bits[u shr IntShift] = t.bits[u shr IntShift] or (uint(1) shl (u and IntMask))
 
-proc Excl(s: var TIntSet, key: int) = 
-  var t = IntSetGet(s, key shr TrunkShift)
+proc excl(s: var IntSet, key: int) =
+  var t = intSetGet(s, key shr TrunkShift)
   if t != nil:
     var u = key and TrunkMask
     t.bits[u shr IntShift] = t.bits[u shr IntShift] and not
-        (1 shl (u and IntMask))
+        (uint(1) shl (u and IntMask))
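+
+# Worked example (illustrative; assuming TrunkShift == 9 and 64-bit ints, so
+# IntShift == 6 and IntMask == 63): for the page index key == 0x12345,
+#   trunk key = key shr TrunkShift = 0x91   (selects the trunk in the table)
+#   u         = key and TrunkMask  = 0x145  (bit number within the trunk)
+#   word      = u shr IntShift     = 5      (index into t.bits)
+#   bit       = u and IntMask      = 5      (bit within that word)
+# so `incl` sets bit 5 of t.bits[5] in the trunk hashed into data[0x91 and 255].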
 
-iterator elements(t: TIntSet): int {.inline.} =
+iterator elements(t: IntSet): int {.inline.} =
   # while traversing it is forbidden to change the set!
   for h in 0..high(t.data):
     var r = t.data[h]
@@ -303,117 +459,140 @@ iterator elements(t: TIntSet): int {.inline.} =
           w = w shr 1
         inc(i)
       r = r.next
-  
-proc isSmallChunk(c: PChunk): bool {.inline.} = 
-  return c.size <= SmallChunkSize-smallChunkOverhead()
-  
-proc chunkUnused(c: PChunk): bool {.inline.} = 
-  result = not c.used
-
-iterator allObjects(m: TMemRegion): pointer {.inline.} =
-  for s in elements(m.chunkStarts):
-    let c = cast[PChunk](s shl PageShift)
-    if not chunkUnused(c):
-      if isSmallChunk(c):
-        var c = cast[PSmallChunk](c)
-        
-        let size = c.size
-        var a = cast[TAddress](addr(c.data))
-        let limit = a + c.acc
-        while a <% limit:
-          yield cast[pointer](a)
-          a = a +% size
-      else:
-        let c = cast[PBigChunk](c)
-        yield addr(c.data)
 
-proc isCell(p: pointer): bool {.inline.} =
-  result = cast[ptr TFreeCell](p).zeroField >% 1
+proc isSmallChunk(c: PChunk): bool {.inline.} =
+  result = c.size <= SmallChunkSize-smallChunkOverhead()
+
+proc chunkUnused(c: PChunk): bool {.inline.} =
+  result = (c.prevSize and 1) == 0
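+
+# Illustrative sketch (hypothetical helpers, not used below): chunk sizes are
+# multiples of PageSize, so bit 0 of `prevSize` is free to carry the 'used'
+# flag, which is what `chunkUnused` tests:
+when false:
+  template realPrevSize(c: PChunk): int = c.prevSize and not 1
+  template markUsed(c: PChunk) = c.prevSize = c.prevSize or 1
+  template markUnused(c: PChunk) = c.prevSize = c.prevSize and not 1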
+
+iterator allObjects(m: var MemRegion): pointer {.inline.} =
+  m.locked = true
+  for s in elements(m.chunkStarts):
+    # we need to check here again as it could have been modified:
+    if s in m.chunkStarts:
+      let c = cast[PChunk](s shl PageShift)
+      if not chunkUnused(c):
+        if isSmallChunk(c):
+          var c = cast[PSmallChunk](c)
+
+          let size = c.size
+          var a = cast[int](addr(c.data))
+          let limit = a + c.acc.int
+          while a <% limit:
+            yield cast[pointer](a)
+            a = a +% size
+        else:
+          let c = cast[PBigChunk](c)
+          yield addr(c.data)
+  m.locked = false
+
+proc iterToProc*(iter: typed, envType: typedesc; procName: untyped) {.
+                      magic: "Plugin", compileTime.}
+
+when not defined(gcDestructors):
+  proc isCell(p: pointer): bool {.inline.} =
+    result = cast[ptr FreeCell](p).zeroField >% 1
 
 # ------------- chunk management ----------------------------------------------
-proc pageIndex(c: PChunk): int {.inline.} = 
-  result = cast[TAddress](c) shr PageShift
+proc pageIndex(c: PChunk): int {.inline.} =
+  result = cast[int](c) shr PageShift
 
-proc pageIndex(p: pointer): int {.inline.} = 
-  result = cast[TAddress](p) shr PageShift
+proc pageIndex(p: pointer): int {.inline.} =
+  result = cast[int](p) shr PageShift
 
-proc pageAddr(p: pointer): PChunk {.inline.} = 
-  result = cast[PChunk](cast[TAddress](p) and not PageMask)
+proc pageAddr(p: pointer): PChunk {.inline.} =
+  result = cast[PChunk](cast[int](p) and not PageMask)
   #sysAssert(Contains(allocator.chunkStarts, pageIndex(result)))
 
-proc requestOsChunks(a: var TMemRegion, size: int): PBigChunk = 
+when false:
+  proc writeFreeList(a: MemRegion) =
+    var it = a.freeChunksList
+    c_fprintf(stdout, "freeChunksList: %p\n", it)
+    while it != nil:
+      c_fprintf(stdout, "it: %p, next: %p, prev: %p, size: %ld\n",
+                it, it.next, it.prev, it.size)
+      it = it.next
+
+proc requestOsChunks(a: var MemRegion, size: int): PBigChunk =
+  when not defined(emscripten):
+    if not a.blockChunkSizeIncrease:
+      let usedMem = a.occ #a.currMem # - a.freeMem
+      if usedMem < 64 * 1024:
+        a.nextChunkSize = PageSize*4
+      else:
+        a.nextChunkSize = min(roundup(usedMem shr 2, PageSize), a.nextChunkSize * 2)
+        a.nextChunkSize = min(a.nextChunkSize, MaxBigChunkSize).int
+
+  var size = size
+  if size > a.nextChunkSize:
+    result = cast[PBigChunk](allocPages(a, size))
+  else:
+    result = cast[PBigChunk](tryAllocPages(a, a.nextChunkSize))
+    if result == nil:
+      result = cast[PBigChunk](allocPages(a, size))
+      a.blockChunkSizeIncrease = true
+    else:
+      size = a.nextChunkSize
+
   incCurrMem(a, size)
   inc(a.freeMem, size)
-  result = cast[PBigChunk](osAllocPages(size))
-  sysAssert((cast[TAddress](result) and PageMask) == 0, "requestOsChunks 1")
+  let heapLink = a.addHeapLink(result, size)
+  when defined(debugHeapLinks):
+    cprintf("owner: %p; result: %p; next pointer %p; size: %ld\n", addr(a),
+      result, heapLink, size)
+
+  when defined(memtracker):
+    trackLocation(addr result.size, sizeof(int))
+
+  sysAssert((cast[int](result) and PageMask) == 0, "requestOsChunks 1")
   #zeroMem(result, size)
   result.next = nil
   result.prev = nil
-  result.used = false
   result.size = size
   # update next.prevSize:
-  var nxt = cast[TAddress](result) +% size
+  var nxt = cast[int](result) +% size
   sysAssert((nxt and PageMask) == 0, "requestOsChunks 2")
   var next = cast[PChunk](nxt)
   if pageIndex(next) in a.chunkStarts:
     #echo("Next already allocated!")
-    next.prevSize = size
+    next.prevSize = size or (next.prevSize and 1)
   # set result.prevSize:
   var lastSize = if a.lastSize != 0: a.lastSize else: PageSize
-  var prv = cast[TAddress](result) -% lastSize
+  var prv = cast[int](result) -% lastSize
   sysAssert((nxt and PageMask) == 0, "requestOsChunks 3")
   var prev = cast[PChunk](prv)
   if pageIndex(prev) in a.chunkStarts and prev.size == lastSize:
     #echo("Prev already allocated!")
-    result.prevSize = lastSize
+    result.prevSize = lastSize or (result.prevSize and 1)
   else:
-    result.prevSize = 0 # unknown
+    result.prevSize = 0 or (result.prevSize and 1) # unknown
+    # but do not overwrite 'used' field
   a.lastSize = size # for next request
+  sysAssert((cast[int](result) and PageMask) == 0, "requestOschunks: unaligned chunk")
 
-proc freeOsChunks(a: var TMemRegion, p: pointer, size: int) = 
-  # update next.prevSize:
-  var c = cast[PChunk](p)
-  var nxt = cast[TAddress](p) +% c.size
-  sysAssert((nxt and PageMask) == 0, "freeOsChunks")
-  var next = cast[PChunk](nxt)
-  if pageIndex(next) in a.chunkStarts:
-    next.prevSize = 0 # XXX used
-  excl(a.chunkStarts, pageIndex(p))
-  osDeallocPages(p, size)
-  decCurrMem(a, size)
-  dec(a.freeMem, size)
-  #c_fprintf(c_stdout, "[Alloc] back to OS: %ld\n", size)
-
-proc isAccessible(a: TMemRegion, p: pointer): bool {.inline.} = 
-  result = Contains(a.chunkStarts, pageIndex(p))
+proc isAccessible(a: MemRegion, p: pointer): bool {.inline.} =
+  result = contains(a.chunkStarts, pageIndex(p))
 
-proc contains[T](list, x: T): bool = 
+proc contains[T](list, x: T): bool =
   var it = list
   while it != nil:
     if it == x: return true
     it = it.next
-    
-proc writeFreeList(a: TMemRegion) =
-  var it = a.freeChunksList
-  c_fprintf(c_stdout, "freeChunksList: %p\n", it)
-  while it != nil: 
-    c_fprintf(c_stdout, "it: %p, next: %p, prev: %p\n", 
-              it, it.next, it.prev)
-    it = it.next
 
-proc ListAdd[T](head: var T, c: T) {.inline.} = 
+proc listAdd[T](head: var T, c: T) {.inline.} =
   sysAssert(c notin head, "listAdd 1")
   sysAssert c.prev == nil, "listAdd 2"
   sysAssert c.next == nil, "listAdd 3"
   c.next = head
-  if head != nil: 
+  if head != nil:
     sysAssert head.prev == nil, "listAdd 4"
     head.prev = c
   head = c
 
-proc ListRemove[T](head: var T, c: T) {.inline.} =
+proc listRemove[T](head: var T, c: T) {.inline.} =
   sysAssert(c in head, "listRemove")
-  if c == head: 
+  if c == head:
     head = c.next
     sysAssert c.prev == nil, "listRemove 2"
     if head != nil: head.prev = nil
@@ -423,353 +602,639 @@ proc ListRemove[T](head: var T, c: T) {.inline.} =
     if c.next != nil: c.next.prev = c.prev
   c.next = nil
   c.prev = nil
-  
-proc updatePrevSize(a: var TMemRegion, c: PBigChunk, 
-                    prevSize: int) {.inline.} = 
-  var ri = cast[PChunk](cast[TAddress](c) +% c.size)
-  sysAssert((cast[TAddress](ri) and PageMask) == 0, "updatePrevSize")
+
+proc updatePrevSize(a: var MemRegion, c: PBigChunk,
+                    prevSize: int) {.inline.} =
+  var ri = cast[PChunk](cast[int](c) +% c.size)
+  sysAssert((cast[int](ri) and PageMask) == 0, "updatePrevSize")
   if isAccessible(a, ri):
-    ri.prevSize = prevSize
-  
-proc freeBigChunk(a: var TMemRegion, c: PBigChunk) = 
+    ri.prevSize = prevSize or (ri.prevSize and 1)
+
+proc splitChunk2(a: var MemRegion, c: PBigChunk, size: int): PBigChunk =
+  result = cast[PBigChunk](cast[int](c) +% size)
+  result.size = c.size - size
+  track("result.size", addr result.size, sizeof(int))
+  when not defined(nimOptimizedSplitChunk):
+    # still active because of a weird codegen issue on some of our CIs:
+    result.next = nil
+    result.prev = nil
+  # size and not used:
+  result.prevSize = size
+  result.owner = addr a
+  sysAssert((size and 1) == 0, "splitChunk 2")
+  sysAssert((size and PageMask) == 0,
+      "splitChunk: size is not a multiple of the PageSize")
+  updatePrevSize(a, c, result.size)
+  c.size = size
+  incl(a, a.chunkStarts, pageIndex(result))
+
+proc splitChunk(a: var MemRegion, c: PBigChunk, size: int) =
+  let rest = splitChunk2(a, c, size)
+  addChunkToMatrix(a, rest)
+
+proc freeBigChunk(a: var MemRegion, c: PBigChunk) =
   var c = c
   sysAssert(c.size >= PageSize, "freeBigChunk")
   inc(a.freeMem, c.size)
-  when coalescRight:
-    var ri = cast[PChunk](cast[TAddress](c) +% c.size)
-    sysAssert((cast[TAddress](ri) and PageMask) == 0, "freeBigChunk 2")
-    if isAccessible(a, ri) and chunkUnused(ri):
-      sysAssert(not isSmallChunk(ri), "freeBigChunk 3")
-      if not isSmallChunk(ri):
-        ListRemove(a.freeChunksList, cast[PBigChunk](ri))
-        inc(c.size, ri.size)
-        excl(a.chunkStarts, pageIndex(ri))
+  c.prevSize = c.prevSize and not 1  # set 'used' to false
   when coalescLeft:
-    if c.prevSize != 0: 
-      var le = cast[PChunk](cast[TAddress](c) -% c.prevSize)
-      sysAssert((cast[TAddress](le) and PageMask) == 0, "freeBigChunk 4")
+    let prevSize = c.prevSize
+    if prevSize != 0:
+      var le = cast[PChunk](cast[int](c) -% prevSize)
+      sysAssert((cast[int](le) and PageMask) == 0, "freeBigChunk 4")
       if isAccessible(a, le) and chunkUnused(le):
         sysAssert(not isSmallChunk(le), "freeBigChunk 5")
-        if not isSmallChunk(le):
-          ListRemove(a.freeChunksList, cast[PBigChunk](le))
+        if not isSmallChunk(le) and le.size < MaxBigChunkSize:
+          removeChunkFromMatrix(a, cast[PBigChunk](le))
           inc(le.size, c.size)
           excl(a.chunkStarts, pageIndex(c))
           c = cast[PBigChunk](le)
+          if c.size > MaxBigChunkSize:
+            let rest = splitChunk2(a, c, MaxBigChunkSize)
+            when defined(nimOptimizedSplitChunk):
+              rest.next = nil
+              rest.prev = nil
+            addChunkToMatrix(a, c)
+            c = rest
+  when coalescRight:
+    var ri = cast[PChunk](cast[int](c) +% c.size)
+    sysAssert((cast[int](ri) and PageMask) == 0, "freeBigChunk 2")
+    if isAccessible(a, ri) and chunkUnused(ri):
+      sysAssert(not isSmallChunk(ri), "freeBigChunk 3")
+      if not isSmallChunk(ri) and c.size < MaxBigChunkSize:
+        removeChunkFromMatrix(a, cast[PBigChunk](ri))
+        inc(c.size, ri.size)
+        excl(a.chunkStarts, pageIndex(ri))
+        if c.size > MaxBigChunkSize:
+          let rest = splitChunk2(a, c, MaxBigChunkSize)
+          addChunkToMatrix(a, rest)
+  addChunkToMatrix(a, c)
 
-  if c.size < ChunkOsReturn or weirdUnmap:
-    incl(a, a.chunkStarts, pageIndex(c))
-    updatePrevSize(a, c, c.size)
-    ListAdd(a.freeChunksList, c)
-    c.used = false
-  else:
-    freeOsChunks(a, c, c.size)
-
-proc splitChunk(a: var TMemRegion, c: PBigChunk, size: int) = 
-  var rest = cast[PBigChunk](cast[TAddress](c) +% size)
-  sysAssert(rest notin a.freeChunksList, "splitChunk")
-  rest.size = c.size - size
-  rest.used = false
-  rest.next = nil
-  rest.prev = nil
-  rest.prevSize = size
-  updatePrevSize(a, c, rest.size)
-  c.size = size
-  incl(a, a.chunkStarts, pageIndex(rest))
-  ListAdd(a.freeChunksList, rest)
-
-proc getBigChunk(a: var TMemRegion, size: int): PBigChunk = 
-  # use first fit for now:
-  sysAssert((size and PageMask) == 0, "getBigChunk 1")
+proc getBigChunk(a: var MemRegion, size: int): PBigChunk =
   sysAssert(size > 0, "getBigChunk 2")
-  result = a.freeChunksList
-  block search:
-    while result != nil:
-      sysAssert chunkUnused(result), "getBigChunk 3"
-      if result.size == size: 
-        ListRemove(a.freeChunksList, result)
-        break search
-      elif result.size > size:
-        ListRemove(a.freeChunksList, result)
-        splitChunk(a, result, size)
-        break search
-      result = result.next
-      sysAssert result != a.freeChunksList, "getBigChunk 4"
-    if size < InitialMemoryRequest: 
-      result = requestOsChunks(a, InitialMemoryRequest)
+  var size = size # roundup(size, PageSize)
+  var fl = 0
+  var sl = 0
+  mappingSearch(size, fl, sl)
+  sysAssert((size and PageMask) == 0, "getBigChunk: unaligned chunk")
+  result = findSuitableBlock(a, fl, sl)
+
+  when RegionHasLock:
+    if not a.lockActive:
+      a.lockActive = true
+      initSysLock(a.lock)
+    acquireSys a.lock
+
+  if result == nil:
+    if size < nimMinHeapPages * PageSize:
+      result = requestOsChunks(a, nimMinHeapPages * PageSize)
       splitChunk(a, result, size)
     else:
       result = requestOsChunks(a, size)
-  result.prevSize = 0 # XXX why is this needed?
-  result.used = true
+      # if we over-allocated, split the chunk:
+      if result.size > size:
+        splitChunk(a, result, size)
+    result.owner = addr a
+  else:
+    removeChunkFromMatrix2(a, result, fl, sl)
+    if result.size >= size + PageSize:
+      splitChunk(a, result, size)
+  # set 'used' to true:
+  result.prevSize = 1
+  track("setUsedToTrue", addr result.size, sizeof(int))
+  sysAssert result.owner == addr a, "getBigChunk: No owner set!"
+
   incl(a, a.chunkStarts, pageIndex(result))
   dec(a.freeMem, size)
+  when RegionHasLock:
+    releaseSys a.lock
+
+proc getHugeChunk(a: var MemRegion; size: int): PBigChunk =
+  result = cast[PBigChunk](allocPages(a, size))
+  when RegionHasLock:
+    if not a.lockActive:
+      a.lockActive = true
+      initSysLock(a.lock)
+    acquireSys a.lock
+  incCurrMem(a, size)
+  # XXX add this to the heap links. But also remove it from it later.
+  when false: a.addHeapLink(result, size)
+  sysAssert((cast[int](result) and PageMask) == 0, "getHugeChunk")
+  result.next = nil
+  result.prev = nil
+  result.size = size
+  # set 'used' to true:
+  result.prevSize = 1
+  result.owner = addr a
+  incl(a, a.chunkStarts, pageIndex(result))
+  when RegionHasLock:
+    releaseSys a.lock
 
-proc getSmallChunk(a: var TMemRegion): PSmallChunk = 
+proc freeHugeChunk(a: var MemRegion; c: PBigChunk) =
+  let size = c.size
+  sysAssert(size >= HugeChunkSize, "freeHugeChunk: invalid size")
+  excl(a.chunkStarts, pageIndex(c))
+  decCurrMem(a, size)
+  osDeallocPages(c, size)
+
+proc getSmallChunk(a: var MemRegion): PSmallChunk =
   var res = getBigChunk(a, PageSize)
   sysAssert res.prev == nil, "getSmallChunk 1"
   sysAssert res.next == nil, "getSmallChunk 2"
   result = cast[PSmallChunk](res)
 
 # -----------------------------------------------------------------------------
-proc isAllocatedPtr(a: TMemRegion, p: pointer): bool
-
-proc allocInv(a: TMemRegion): bool =
-  ## checks some (not all yet) invariants of the allocator's data structures.
-  for s in low(a.freeSmallChunks)..high(a.freeSmallChunks):
-    var c = a.freeSmallChunks[s]
-    while c != nil:
-      if c.next == c: return false
-      if c.size != s * MemAlign: return false
-      var it = c.freeList
-      while it != nil:
-        if it.zeroField != 0: return false
-        it = it.next
-      c = c.next
-  result = true
-
-proc rawAlloc(a: var TMemRegion, requestedSize: int): pointer =
+when not defined(gcDestructors):
+  proc isAllocatedPtr(a: MemRegion, p: pointer): bool {.benign.}
+
+when true:
+  template allocInv(a: MemRegion): bool = true
+else:
+  proc allocInv(a: MemRegion): bool =
+    ## checks some (not all yet) invariants of the allocator's data structures.
+    for s in low(a.freeSmallChunks)..high(a.freeSmallChunks):
+      var c = a.freeSmallChunks[s]
+      while not (c == nil):
+        if c.next == c:
+          echo "[SYSASSERT] c.next == c"
+          return false
+        if not (c.size == s * MemAlign):
+          echo "[SYSASSERT] c.size != s * MemAlign"
+          return false
+        var it = c.freeList
+        while not (it == nil):
+          if not (it.zeroField == 0):
+            echo "[SYSASSERT] it.zeroField != 0"
+            c_printf("%ld %p\n", it.zeroField, it)
+            return false
+          it = it.next
+        c = c.next
+    result = true
+
+when false:
+  var
+    rsizes: array[50_000, int]
+    rsizesLen: int
+
+  proc trackSize(size: int) =
+    rsizes[rsizesLen] = size
+    inc rsizesLen
+
+  proc untrackSize(size: int) =
+    for i in 0 .. rsizesLen-1:
+      if rsizes[i] == size:
+        rsizes[i] = rsizes[rsizesLen-1]
+        dec rsizesLen
+        return
+    c_fprintf(stdout, "%ld\n", size)
+    sysAssert(false, "untracked size!")
+else:
+  template trackSize(x) = discard
+  template untrackSize(x) = discard
+
+proc deallocBigChunk(a: var MemRegion, c: PBigChunk) =
+  when RegionHasLock:
+    acquireSys a.lock
+  dec a.occ, c.size
+  untrackSize(c.size)
+  sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case B)"
+  when not defined(gcDestructors):
+    a.deleted = getBottom(a)
+    del(a, a.root, cast[int](addr(c.data)))
+  if c.size >= HugeChunkSize: freeHugeChunk(a, c)
+  else: freeBigChunk(a, c)
+  when RegionHasLock:
+    releaseSys a.lock
+
+when defined(gcDestructors):
+  template atomicPrepend(head, elem: untyped) =
+    # see also https://en.cppreference.com/w/cpp/atomic/atomic_compare_exchange
+    when hasThreadSupport:
+      while true:
+        elem.next.storea head.loada
+        if atomicCompareExchangeN(addr head, addr elem.next, elem, weak = true, ATOMIC_RELEASE, ATOMIC_RELAXED):
+          break
+    else:
+      elem.next.storea head.loada
+      head.storea elem
+
+  proc addToSharedFreeListBigChunks(a: var MemRegion; c: PBigChunk) {.inline.} =
+    sysAssert c.next == nil, "c.next pointer must be nil"
+    atomicPrepend a.sharedFreeListBigChunks, c
+
+  proc addToSharedFreeList(c: PSmallChunk; f: ptr FreeCell; size: int) {.inline.} =
+    atomicPrepend c.owner.sharedFreeLists[size], f
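+
+  # Design note: a thread freeing a cell it does not own performs only the
+  # single lock-free push above; all bookkeeping (a.occ, c.free, c.foreignCells)
+  # is deferred to the owning thread's next rawAlloc, which drains the list and
+  # calls compensateCounters below.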
+
+  const MaxSteps = 20
+
+  proc compensateCounters(a: var MemRegion; c: PSmallChunk; size: int) =
+    # rawDealloc did NOT do the usual:
+    # `inc(c.free, size); dec(a.occ, size)` because it wasn't the owner of these
+    # memory locations. We have to compensate here for these for the entire list.
+    var it = c.freeList
+    var total = 0
+    while it != nil:
+      inc total, size
+      let chunk = cast[PSmallChunk](pageAddr(it))
+      if c != chunk:
+        # The cell is foreign, potentially even from a foreign thread.
+        # It must block the current chunk from being freed, as doing so would leak memory.
+        inc c.foreignCells
+      it = it.next
+    # By not adjusting the foreign chunk we reserve space in it to prevent deallocation
+    inc(c.free, total)
+    dec(a.occ, total)
+
+  proc freeDeferredObjects(a: var MemRegion; root: PBigChunk) =
+    var it = root
+    var maxIters = MaxSteps # make it time-bounded
+    while true:
+      let rest = it.next.loada
+      it.next.storea nil
+      deallocBigChunk(a, cast[PBigChunk](it))
+      if maxIters == 0:
+        if rest != nil:
+          addToSharedFreeListBigChunks(a, rest)
+          sysAssert a.sharedFreeListBigChunks != nil, "re-enqueing failed"
+        break
+      it = rest
+      dec maxIters
+      if it == nil: break
+
+proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
+  when defined(nimTypeNames):
+    inc(a.allocCounter)
   sysAssert(allocInv(a), "rawAlloc: begin")
-  sysAssert(roundup(65, 8) == 72, "rawAlloc 1")
-  sysAssert requestedSize >= sizeof(TFreeCell), "rawAlloc 2"
+  sysAssert(roundup(65, 8) == 72, "rawAlloc: roundup broken")
   var size = roundup(requestedSize, MemAlign)
+  sysAssert(size >= sizeof(FreeCell), "rawAlloc: requested size too small")
   sysAssert(size >= requestedSize, "insufficient allocated size!")
-  #c_fprintf(c_stdout, "alloc; size: %ld; %ld\n", requestedSize, size)
-  if size <= SmallChunkSize-smallChunkOverhead(): 
+  #c_fprintf(stdout, "alloc; size: %ld; %ld\n", requestedSize, size)
+
+  if size <= SmallChunkSize-smallChunkOverhead():
+    template fetchSharedCells(tc: PSmallChunk) =
+      # Consumes cells from (potentially) foreign threads from `a.sharedFreeLists[s]`
+      when defined(gcDestructors):
+        if tc.freeList == nil:
+          when hasThreadSupport:
+            # Steal the entire list from `sharedFreeList`:
+            tc.freeList = atomicExchangeN(addr a.sharedFreeLists[s], nil, ATOMIC_RELAXED)
+          else:
+            tc.freeList = a.sharedFreeLists[s]
+            a.sharedFreeLists[s] = nil
+          # if `tc.freeList` isn't nil, `tc` will gain capacity.
+          # We must calculate how much it gained and how many foreign cells are included.
+          compensateCounters(a, tc, size)
+
     # allocate a small block: for small chunks, we use only its next pointer
-    var s = size div MemAlign
+    let s = size div MemAlign
     var c = a.freeSmallChunks[s]
-    if c == nil: 
+    if c == nil:
+      # There is no free chunk of the requested size available, so we need a new one.
       c = getSmallChunk(a)
+      # init all fields in case memory didn't get zeroed
       c.freeList = nil
+      c.foreignCells = 0
       sysAssert c.size == PageSize, "rawAlloc 3"
       c.size = size
-      c.acc = size
-      c.free = SmallChunkSize - smallChunkOverhead() - size
+      c.acc = size.uint32
+      c.free = SmallChunkSize - smallChunkOverhead() - size.int32
+      sysAssert c.owner == addr(a), "rawAlloc: No owner set!"
       c.next = nil
       c.prev = nil
-      ListAdd(a.freeSmallChunks[s], c)
+      # Shared cells are fetched here in case `c.size * 2 >= SmallChunkSize - smallChunkOverhead()`.
+      # For those single-cell chunks, we would otherwise have to allocate a new one almost every time.
+      fetchSharedCells(c)
+      if c.free >= size:
+        # Because removals from `a.freeSmallChunks[s]` only happen in the other alloc branch and during dealloc,
+        #  we must not add it to the list if it cannot be used the next time a pointer of `size` bytes is needed.
+        listAdd(a.freeSmallChunks[s], c)
       result = addr(c.data)
-      sysAssert((cast[TAddress](result) and (MemAlign-1)) == 0, "rawAlloc 4")
+      sysAssert((cast[int](result) and (MemAlign-1)) == 0, "rawAlloc 4")
     else:
+      # There is a free chunk of the requested size available; use it.
       sysAssert(allocInv(a), "rawAlloc: begin c != nil")
       sysAssert c.next != c, "rawAlloc 5"
       #if c.size != size:
-      #  c_fprintf(c_stdout, "csize: %lld; size %lld\n", c.size, size)
+      #  c_fprintf(stdout, "csize: %lld; size %lld\n", c.size, size)
       sysAssert c.size == size, "rawAlloc 6"
       if c.freeList == nil:
-        sysAssert(c.acc + smallChunkOverhead() + size <= SmallChunkSize, 
+        sysAssert(c.acc.int + smallChunkOverhead() + size <= SmallChunkSize,
                   "rawAlloc 7")
-        result = cast[pointer](cast[TAddress](addr(c.data)) +% c.acc)
+        result = cast[pointer](cast[int](addr(c.data)) +% c.acc.int)
         inc(c.acc, size)
       else:
+        # There are free cells available, prefer them over the accumulator
         result = c.freeList
-        sysAssert(c.freeList.zeroField == 0, "rawAlloc 8")
+        when not defined(gcDestructors):
+          sysAssert(c.freeList.zeroField == 0, "rawAlloc 8")
         c.freeList = c.freeList.next
+        if cast[PSmallChunk](pageAddr(result)) != c:
+          # This cell isn't a blocker for the current chunk's deallocation anymore
+          dec(c.foreignCells)
+        else:
+          sysAssert(c == cast[PSmallChunk](pageAddr(result)), "rawAlloc: Bad cell")
+      # Even if the cell we return is foreign, the local chunk's capacity decreases.
+      # The capacity was previously reserved in the source chunk (when it first got allocated),
+      #  then added into the current chunk during dealloc,
+      #  so the source chunk will not be freed or leak memory because of this.
       dec(c.free, size)
-      sysAssert((cast[TAddress](result) and (MemAlign-1)) == 0, "rawAlloc 9")
+      sysAssert((cast[int](result) and (MemAlign-1)) == 0, "rawAlloc 9")
       sysAssert(allocInv(a), "rawAlloc: end c != nil")
-    sysAssert(allocInv(a), "rawAlloc: before c.free < size")
-    if c.free < size:
-      sysAssert(allocInv(a), "rawAlloc: before listRemove test")
-      ListRemove(a.freeSmallChunks[s], c)
-      sysAssert(allocInv(a), "rawAlloc: end listRemove test")
-    sysAssert(((cast[TAddress](result) and PageMask) - smallChunkOverhead()) %%
+      # We fetch deferred cells *after* advancing `c.freeList`/`acc` to adjust `c.free`.
+      # If after the adjustment it turns out there's free cells available,
+      #  the chunk stays in `a.freeSmallChunks[s]` and the need for a new chunk is delayed.
+      fetchSharedCells(c)
+      sysAssert(allocInv(a), "rawAlloc: before c.free < size")
+      if c.free < size:
+        # Even after fetching shared cells, the chunk has no usable memory left; it is no longer the active chunk.
+        sysAssert(allocInv(a), "rawAlloc: before listRemove test")
+        listRemove(a.freeSmallChunks[s], c)
+        sysAssert(allocInv(a), "rawAlloc: end listRemove test")
+    sysAssert(((cast[int](result) and PageMask) - smallChunkOverhead()) %%
                size == 0, "rawAlloc 21")
     sysAssert(allocInv(a), "rawAlloc: end small size")
+    inc a.occ, size
+    trackSize(c.size)
   else:
-    size = roundup(requestedSize+bigChunkOverhead(), PageSize)
+    when defined(gcDestructors):
+      when hasThreadSupport:
+        let deferredFrees = atomicExchangeN(addr a.sharedFreeListBigChunks, nil, ATOMIC_RELAXED)
+      else:
+        let deferredFrees = a.sharedFreeListBigChunks
+        a.sharedFreeListBigChunks = nil
+      if deferredFrees != nil:
+        freeDeferredObjects(a, deferredFrees)
+
+    size = requestedSize + bigChunkOverhead() #  roundup(requestedSize+bigChunkOverhead(), PageSize)
     # allocate a large block
-    var c = getBigChunk(a, size)
+    var c = if size >= HugeChunkSize: getHugeChunk(a, size)
+            else: getBigChunk(a, size)
     sysAssert c.prev == nil, "rawAlloc 10"
     sysAssert c.next == nil, "rawAlloc 11"
-    sysAssert c.size == size, "rawAlloc 12"
     result = addr(c.data)
-    sysAssert((cast[TAddress](result) and (MemAlign-1)) == 0, "rawAlloc 13")
-    if a.root == nil: a.root = bottom
-    add(a, a.root, cast[TAddress](result), cast[TAddress](result)+%size)
+    sysAssert((cast[int](c) and (MemAlign-1)) == 0, "rawAlloc 13")
+    sysAssert((cast[int](c) and PageMask) == 0, "rawAlloc: Not aligned on a page boundary")
+    when not defined(gcDestructors):
+      if a.root == nil: a.root = getBottom(a)
+      add(a, a.root, cast[int](result), cast[int](result)+%size)
+    inc a.occ, c.size
+    trackSize(c.size)
   sysAssert(isAccessible(a, result), "rawAlloc 14")
   sysAssert(allocInv(a), "rawAlloc: end")
+  when logAlloc: cprintf("var pointer_%p = alloc(%ld) # %p\n", result, requestedSize, addr a)
 
-proc rawAlloc0(a: var TMemRegion, requestedSize: int): pointer =
+proc rawAlloc0(a: var MemRegion, requestedSize: int): pointer =
   result = rawAlloc(a, requestedSize)
   zeroMem(result, requestedSize)
 
-proc rawDealloc(a: var TMemRegion, p: pointer) =
+proc rawDealloc(a: var MemRegion, p: pointer) =
+  when defined(nimTypeNames):
+    inc(a.deallocCounter)
   #sysAssert(isAllocatedPtr(a, p), "rawDealloc: no allocated pointer")
   sysAssert(allocInv(a), "rawDealloc: begin")
   var c = pageAddr(p)
+  sysAssert(c != nil, "rawDealloc: begin")
   if isSmallChunk(c):
     # `p` is within a small chunk:
     var c = cast[PSmallChunk](c)
-    var s = c.size
-    sysAssert(((cast[TAddress](p) and PageMask) - smallChunkOverhead()) %%
-               s == 0, "rawDealloc 3")
-    var f = cast[ptr TFreeCell](p)
-    #echo("setting to nil: ", $cast[TAddress](addr(f.zeroField)))
-    sysAssert(f.zeroField != 0, "rawDealloc 1")
-    f.zeroField = 0
-    f.next = c.freeList
-    c.freeList = f
-    when overwriteFree: 
-      # set to 0xff to check for usage after free bugs:
-      c_memset(cast[pointer](cast[int](p) +% sizeof(TFreeCell)), -1'i32, 
-               s -% sizeof(TFreeCell))
-    # check if it is not in the freeSmallChunks[s] list:
-    if c.free < s:
-      # add it to the freeSmallChunks[s] array:
-      ListAdd(a.freeSmallChunks[s div memAlign], c)
-      inc(c.free, s)
+    let s = c.size
+    #       ^ We might access thread-foreign storage here.
+    # The other thread cannot possibly free this block as it's still alive.
+    var f = cast[ptr FreeCell](p)
+    if c.owner == addr(a):
+      # We own the block, there is no foreign thread involved.
+      dec a.occ, s
+      untrackSize(s)
+      sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case A)"
+      sysAssert(((cast[int](p) and PageMask) - smallChunkOverhead()) %%
+                s == 0, "rawDealloc 3")
+      when not defined(gcDestructors):
+        #echo("setting to nil: ", $cast[int](addr(f.zeroField)))
+        sysAssert(f.zeroField != 0, "rawDealloc 1")
+        f.zeroField = 0
+      when overwriteFree:
+        # set to 0xff to check for usage after free bugs:
+        nimSetMem(cast[pointer](cast[int](p) +% sizeof(FreeCell)), -1'i32,
+                s -% sizeof(FreeCell))
+      let activeChunk = a.freeSmallChunks[s div MemAlign]
+      if activeChunk != nil and c != activeChunk:
+        # This cell is not part of the active chunk: lend it to the active
+        #  chunk rather than adjusting `c` (same logic as compensateCounters).
+        # Putting the cell into the active chunk prevents a queue of available
+        #  chunks from forming in a.freeSmallChunks[s div MemAlign]; such a
+        #  queue would waste memory as free cells until we return to those
+        #  chunks. (A sketch of the scheme follows this if/else.)
+        f.next = activeChunk.freeList
+        activeChunk.freeList = f # lend the cell
+        inc(activeChunk.free, s) # c.free stays unchanged, so `c` cannot be freed while cells are lent out
+        inc(activeChunk.foreignCells) # the cell now counts as foreign from the active chunk's perspective
+      else:
+        f.next = c.freeList
+        c.freeList = f
+        if c.free < s:
+          # The chunk cannot have been the active one: it had no free cell left to hand out
+          listAdd(a.freeSmallChunks[s div MemAlign], c)
+          inc(c.free, s)
+        else:
+          inc(c.free, s)
+          # Free only if the entire chunk is unused and no cells are lent out.
+          # If the chunk were freed while it still references foreign cells,
+          #  the chunks owning those cells would leak memory and could never be freed.
+          if c.free == SmallChunkSize-smallChunkOverhead() and c.foreignCells == 0:
+            listRemove(a.freeSmallChunks[s div MemAlign], c)
+            c.size = SmallChunkSize
+            freeBigChunk(a, cast[PBigChunk](c))
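+      # Sketch of the lending scheme above (illustrative values): with
+      #  s == 32, freeing a cell `f` of chunk `c` while another chunk
+      #  `active` heads a.freeSmallChunks[32 div MemAlign] does
+      #    f.next = active.freeList; active.freeList = f
+      #    inc(active.free, 32); inc(active.foreignCells)
+      #  c.free is untouched, so `c` never looks fully free while its cells
+      #  are lent out, and a chunk is only released once it is entirely
+      #  unused and its foreignCells count is 0.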
     else:
-      inc(c.free, s)
-      if c.free == SmallChunkSize-smallChunkOverhead():
-        ListRemove(a.freeSmallChunks[s div memAlign], c)
-        c.size = SmallChunkSize
-        freeBigChunk(a, cast[PBigChunk](c))
-    sysAssert(((cast[TAddress](p) and PageMask) - smallChunkOverhead()) %%
+      when logAlloc: cprintf("dealloc(pointer_%p) # SMALL FROM %p CALLER %p\n", p, c.owner, addr(a))
+
+      when defined(gcDestructors):
+        addToSharedFreeList(c, f, s div MemAlign)
+    sysAssert(((cast[int](p) and PageMask) - smallChunkOverhead()) %%
                s == 0, "rawDealloc 2")
   else:
     # set to 0xff to check for usage after free bugs:
-    when overwriteFree: c_memset(p, -1'i32, c.size -% bigChunkOverhead())
-    # free big chunk
-    var c = cast[PBigChunk](c)
-    a.deleted = bottom
-    del(a, a.root, cast[int](addr(c.data)))
-    freeBigChunk(a, c)
-  sysAssert(allocInv(a), "rawDealloc: end")
-
-proc isAllocatedPtr(a: TMemRegion, p: pointer): bool = 
-  if isAccessible(a, p):
-    var c = pageAddr(p)
-    if not chunkUnused(c):
-      if isSmallChunk(c):
-        var c = cast[PSmallChunk](c)
-        var offset = (cast[TAddress](p) and (PageSize-1)) -% 
-                     smallChunkOverhead()
-        result = (c.acc >% offset) and (offset %% c.size == 0) and
-          (cast[ptr TFreeCell](p).zeroField >% 1)
+    when overwriteFree: nimSetMem(p, -1'i32, c.size -% bigChunkOverhead())
+    when logAlloc: cprintf("dealloc(pointer_%p) # BIG %p\n", p, c.owner)
+    when defined(gcDestructors):
+      if c.owner == addr(a):
+        deallocBigChunk(a, cast[PBigChunk](c))
       else:
-        var c = cast[PBigChunk](c)
-        result = p == addr(c.data) and cast[ptr TFreeCell](p).zeroField >% 1
-
-proc prepareForInteriorPointerChecking(a: var TMemRegion) {.inline.} =
-  a.minLargeObj = lowGauge(a.root)
-  a.maxLargeObj = highGauge(a.root)
+        addToSharedFreeListBigChunks(c.owner[], cast[PBigChunk](c))
+    else:
+      deallocBigChunk(a, cast[PBigChunk](c))
 
-proc interiorAllocatedPtr(a: TMemRegion, p: pointer): pointer =
-  if isAccessible(a, p):
-    var c = pageAddr(p)
-    if not chunkUnused(c):
-      if isSmallChunk(c):
-        var c = cast[PSmallChunk](c)
-        var offset = (cast[TAddress](p) and (PageSize-1)) -% 
-                     smallChunkOverhead()
-        if c.acc >% offset:
-          sysAssert(cast[TAddress](addr(c.data)) +% offset ==
-                    cast[TAddress](p), "offset is not what you think it is")
-          var d = cast[ptr TFreeCell](cast[TAddress](addr(c.data)) +% 
-                    offset -% (offset %% c.size))
-          if d.zeroField >% 1:
+  sysAssert(allocInv(a), "rawDealloc: end")
+  #when logAlloc: cprintf("dealloc(pointer_%p)\n", p)
+
+when not defined(gcDestructors):
+  proc isAllocatedPtr(a: MemRegion, p: pointer): bool =
+    if isAccessible(a, p):
+      var c = pageAddr(p)
+      if not chunkUnused(c):
+        if isSmallChunk(c):
+          var c = cast[PSmallChunk](c)
+          var offset = (cast[int](p) and (PageSize-1)) -%
+                      smallChunkOverhead()
+          result = (c.acc.int >% offset) and (offset %% c.size == 0) and
+            (cast[ptr FreeCell](p).zeroField >% 1)
+        else:
+          var c = cast[PBigChunk](c)
+          result = p == addr(c.data) and cast[ptr FreeCell](p).zeroField >% 1
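+  # Worked example for the small-chunk test above (illustrative numbers):
+  # with c.size == 32, a pointer whose offset from the chunk's data area is
+  # 96 counts as allocated iff 96 < c.acc (the cell was ever handed out),
+  # 96 mod 32 == 0 (it points at a cell start) and its zeroField is > 1
+  # (a live cell header).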
+
+  proc prepareForInteriorPointerChecking(a: var MemRegion) {.inline.} =
+    a.minLargeObj = lowGauge(a.root)
+    a.maxLargeObj = highGauge(a.root)
+
+  proc interiorAllocatedPtr(a: MemRegion, p: pointer): pointer =
+    if isAccessible(a, p):
+      var c = pageAddr(p)
+      if not chunkUnused(c):
+        if isSmallChunk(c):
+          var c = cast[PSmallChunk](c)
+          var offset = (cast[int](p) and (PageSize-1)) -%
+                      smallChunkOverhead()
+          if c.acc.int >% offset:
+            sysAssert(cast[int](addr(c.data)) +% offset ==
+                      cast[int](p), "offset is not what you think it is")
+            var d = cast[ptr FreeCell](cast[int](addr(c.data)) +%
+                      offset -% (offset %% c.size))
+            if d.zeroField >% 1:
+              result = d
+              sysAssert isAllocatedPtr(a, result), " result wrong pointer!"
+        else:
+          var c = cast[PBigChunk](c)
+          var d = addr(c.data)
+          if p >= d and cast[ptr FreeCell](d).zeroField >% 1:
             result = d
             sysAssert isAllocatedPtr(a, result), " result wrong pointer!"
-      else:
-        var c = cast[PBigChunk](c)
-        var d = addr(c.data)
-        if p >= d and cast[ptr TFreeCell](d).zeroField >% 1:
-          result = d
-          sysAssert isAllocatedPtr(a, result), " result wrong pointer!"
-  else:
-    var q = cast[int](p)
-    if q >=% a.minLargeObj and q <=% a.maxLargeObj:
-      # this check is highly effective! Test fails for 99,96% of all checks on
-      # an x86-64.
-      var avlNode = inRange(a.root, q)
-      if avlNode != nil:
-        var k = cast[pointer](avlNode.key)
-        var c = cast[PBigChunk](pageAddr(k))
-        sysAssert(addr(c.data) == k, " k is not the same as addr(c.data)!")
-        if cast[ptr TFreeCell](k).zeroField >% 1:
-          result = k
-          sysAssert isAllocatedPtr(a, result), " result wrong pointer!"
+    else:
+      var q = cast[int](p)
+      if q >=% a.minLargeObj and q <=% a.maxLargeObj:
+        # this check is highly effective! The test fails for 99.96% of all
+        # checks on an x86-64 (see the sketch after this proc).
+        var avlNode = inRange(a.root, q)
+        if avlNode != nil:
+          var k = cast[pointer](avlNode.key)
+          var c = cast[PBigChunk](pageAddr(k))
+          sysAssert(addr(c.data) == k, " k is not the same as addr(c.data)!")
+          if cast[ptr FreeCell](k).zeroField >% 1:
+            result = k
+            sysAssert isAllocatedPtr(a, result), " result wrong pointer!"
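+  # The gauge check above is a cheap prefilter: two unsigned comparisons
+  # reject almost every candidate before the AVL lookup. Illustrative shape:
+  #   if q >=% a.minLargeObj and q <=% a.maxLargeObj:  # rare
+  #     # consult the AVL tree of big chunks
+  #   else:
+  #     discard  # common fast path: q cannot be a large object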
 
 proc ptrSize(p: pointer): int =
-  var x = cast[pointer](cast[TAddress](p) -% sizeof(TFreeCell))
-  var c = pageAddr(p)
-  sysAssert(not chunkUnused(c), "ptrSize")
-  result = c.size -% sizeof(TFreeCell)
-  if not isSmallChunk(c):
-    dec result, bigChunkOverhead()
-
-proc alloc(allocator: var TMemRegion, size: int): pointer =
-  result = rawAlloc(allocator, size+sizeof(TFreeCell))
-  cast[ptr TFreeCell](result).zeroField = 1 # mark it as used
-  sysAssert(not isAllocatedPtr(allocator, result), "alloc")
-  result = cast[pointer](cast[TAddress](result) +% sizeof(TFreeCell))
-
-proc alloc0(allocator: var TMemRegion, size: int): pointer =
+  when not defined(gcDestructors):
+    var x = cast[pointer](cast[int](p) -% sizeof(FreeCell))
+    var c = pageAddr(p)
+    sysAssert(not chunkUnused(c), "ptrSize")
+    result = c.size -% sizeof(FreeCell)
+    if not isSmallChunk(c):
+      dec result, bigChunkOverhead()
+  else:
+    var c = pageAddr(p)
+    sysAssert(not chunkUnused(c), "ptrSize")
+    result = c.size
+    if not isSmallChunk(c):
+      dec result, bigChunkOverhead()
+
+proc alloc(allocator: var MemRegion, size: Natural): pointer {.gcsafe.} =
+  when not defined(gcDestructors):
+    result = rawAlloc(allocator, size+sizeof(FreeCell))
+    cast[ptr FreeCell](result).zeroField = 1 # mark it as used
+    sysAssert(not isAllocatedPtr(allocator, result), "alloc")
+    result = cast[pointer](cast[int](result) +% sizeof(FreeCell))
+    track("alloc", result, size)
+  else:
+    result = rawAlloc(allocator, size)
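+# Memory layout produced by `alloc` without gcDestructors (illustrative):
+#
+#   rawAlloc result                 returned pointer
+#   v                               v
+#   [ FreeCell header, zeroField=1 ][ `size` user bytes ... ]
+#
+# `dealloc` below undoes the header offset before calling rawDealloc.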
+
+proc alloc0(allocator: var MemRegion, size: Natural): pointer =
   result = alloc(allocator, size)
   zeroMem(result, size)
 
-proc dealloc(allocator: var TMemRegion, p: pointer) =
-  var x = cast[pointer](cast[TAddress](p) -% sizeof(TFreeCell))
-  sysAssert(cast[ptr TFreeCell](x).zeroField == 1, "dealloc 1")
-  rawDealloc(allocator, x)
-  sysAssert(not isAllocatedPtr(allocator, x), "dealloc 2")
+proc dealloc(allocator: var MemRegion, p: pointer) =
+  when not defined(gcDestructors):
+    sysAssert(p != nil, "dealloc: p is nil")
+    var x = cast[pointer](cast[int](p) -% sizeof(FreeCell))
+    sysAssert(x != nil, "dealloc: x is nil")
+    sysAssert(isAccessible(allocator, x), "is not accessible")
+    sysAssert(cast[ptr FreeCell](x).zeroField == 1, "dealloc: object header corrupted")
+    rawDealloc(allocator, x)
+    sysAssert(not isAllocatedPtr(allocator, x), "dealloc: object still accessible")
+    track("dealloc", p, 0)
+  else:
+    rawDealloc(allocator, p)
 
-proc realloc(allocator: var TMemRegion, p: pointer, newsize: int): pointer =
+proc realloc(allocator: var MemRegion, p: pointer, newsize: Natural): pointer =
   if newsize > 0:
-    result = alloc0(allocator, newsize)
+    result = alloc(allocator, newsize)
     if p != nil:
-      copyMem(result, p, ptrSize(p))
+      copyMem(result, p, min(ptrSize(p), newsize))
       dealloc(allocator, p)
   elif p != nil:
     dealloc(allocator, p)
 
-proc deallocOsPages(a: var TMemRegion) =
+proc realloc0(allocator: var MemRegion, p: pointer, oldsize, newsize: Natural): pointer =
+  result = realloc(allocator, p, newsize)
+  if newsize > oldsize:
+    zeroMem(cast[pointer](cast[uint](result) + uint(oldsize)), newsize - oldsize)
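+
+# Hedged usage sketch of the realloc contract above (illustrative only):
+#   var region: MemRegion
+#   var p = alloc(region, 64)           # 64 usable bytes
+#   p = realloc(region, p, 128)         # old contents preserved, rest undefined
+#   p = realloc0(region, p, 128, 256)   # bytes 128..255 are zeroed
+#   dealloc(region, p)
+# `realloc` copies only min(ptrSize(p), newsize) bytes, so shrinking never
+# reads past the new size.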
+
+proc deallocOsPages(a: var MemRegion) =
   # we free every 'ordinarily' allocated page by iterating over the page bits:
-  for p in elements(a.chunkStarts):
-    var page = cast[PChunk](p shl pageShift)
-    when not weirdUnmap:
-      var size = if page.size < PageSize: PageSize else: page.size
-      osDeallocPages(page, size)
-    else:
-      # Linux on PowerPC for example frees MORE than asked if 'munmap'
-      # receives the start of an originally mmap'ed memory block. This is not
-      # too bad, but we must not access 'page.size' then as that could trigger
-      # a segfault. But we don't need to access 'page.size' here anyway,
-      # because calling munmap with PageSize suffices:
-      osDeallocPages(page, PageSize)
+  var it = addr(a.heapLinks)
+  while true:
+    let next = it.next
+    for i in 0..it.len-1:
+      let (p, size) = it.chunks[i]
+      when defined(debugHeapLinks):
+        cprintf("owner %p; dealloc A: %p size: %ld; next: %p\n", addr(a),
+          it, size, next)
+      sysAssert size >= PageSize, "origSize too small"
+      osDeallocPages(p, size)
+    it = next
+    if it == nil: break
   # And then we free the pages that are in use for the page bits:
   llDeallocAll(a)
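+
+# a.heapLinks is an unrolled linked list: each node carries an array of
+# (pointer, size) pairs plus a `next` link, so the teardown loop above visits
+# one node per batch of chunks. Sketch of the node shape it assumes (field
+# names as used above; array length illustrative):
+#   HeapLinks = object
+#     len: int
+#     chunks: array[MaxChunks, (pointer, int)]
+#     next: ptr HeapLinks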
 
-proc getFreeMem(a: TMemRegion): int {.inline.} = result = a.freeMem
-proc getTotalMem(a: TMemRegion): int {.inline.} = result = a.currMem
-proc getOccupiedMem(a: TMemRegion): int {.inline.} = 
-  result = a.currMem - a.freeMem
+proc getFreeMem(a: MemRegion): int {.inline.} = result = a.freeMem
+proc getTotalMem(a: MemRegion): int {.inline.} = result = a.currMem
+proc getOccupiedMem(a: MemRegion): int {.inline.} =
+  result = a.occ
+  # a.currMem - a.freeMem
+
+when defined(nimTypeNames):
+  proc getMemCounters(a: MemRegion): (int, int) {.inline.} =
+    (a.allocCounter, a.deallocCounter)
 
 # ---------------------- thread memory region -------------------------------
 
-template InstantiateForRegion(allocator: expr) =
-  when false:
+template instantiateForRegion(allocator: untyped) {.dirty.} =
+  {.push stackTrace: off.}
+
+  when defined(nimFulldebug):
     proc interiorAllocatedPtr*(p: pointer): pointer =
       result = interiorAllocatedPtr(allocator, p)
 
     proc isAllocatedPtr*(p: pointer): bool =
-      let p = cast[pointer](cast[TAddress](p)-%TAddress(sizeof(TCell)))
+      let p = cast[pointer](cast[int](p)-%ByteAddress(sizeof(Cell)))
       result = isAllocatedPtr(allocator, p)
 
   proc deallocOsPages = deallocOsPages(allocator)
 
-  proc alloc(size: int): pointer =
+  proc allocImpl(size: Natural): pointer =
     result = alloc(allocator, size)
 
-  proc alloc0(size: int): pointer =
+  proc alloc0Impl(size: Natural): pointer =
     result = alloc0(allocator, size)
 
-  proc dealloc(p: pointer) =
+  proc deallocImpl(p: pointer) =
     dealloc(allocator, p)
 
-  proc realloc(p: pointer, newsize: int): pointer =
-    result = realloc(allocator, p, newsize)
+  proc reallocImpl(p: pointer, newSize: Natural): pointer =
+    result = realloc(allocator, p, newSize)
+
+  proc realloc0Impl(p: pointer, oldSize, newSize: Natural): pointer =
+    result = realloc(allocator, p, newSize)
+    if newSize > oldSize:
+      zeroMem(cast[pointer](cast[uint](result) + uint(oldSize)), newSize - oldSize)
 
   when false:
     proc countFreeMem(): int =
@@ -779,45 +1244,94 @@ template InstantiateForRegion(allocator: expr) =
         inc(result, it.size)
         it = it.next
 
-  proc getFreeMem(): int = 
-    result = allocator.freeMem
+  when hasThreadSupport and not defined(gcDestructors):
+    proc addSysExitProc(quitProc: proc() {.noconv.}) {.importc: "atexit", header: "<stdlib.h>".}
+
+    var sharedHeap: MemRegion
+    var heapLock: SysLock
+    initSysLock(heapLock)
+    addSysExitProc(proc() {.noconv.} = deinitSys(heapLock))
+
+  proc getFreeMem(): int =
     #sysAssert(result == countFreeMem())
+    result = allocator.freeMem
+
+  proc getTotalMem(): int =
+    result = allocator.currMem
+
+  proc getOccupiedMem(): int =
+    result = allocator.occ #getTotalMem() - getFreeMem()
 
-  proc getTotalMem(): int = return allocator.currMem
-  proc getOccupiedMem(): int = return getTotalMem() - getFreeMem()
+  proc getMaxMem*(): int =
+    result = getMaxMem(allocator)
+
+  when defined(nimTypeNames):
+    proc getMemCounters*(): (int, int) = getMemCounters(allocator)
 
   # -------------------- shared heap region ----------------------------------
-  when hasThreadSupport:
-    var sharedHeap: TMemRegion
-    var heapLock: TSysLock
-    InitSysLock(HeapLock)
 
-  proc allocShared(size: int): pointer =
-    when hasThreadSupport:
-      AcquireSys(HeapLock)
+  proc allocSharedImpl(size: Natural): pointer =
+    when hasThreadSupport and not defined(gcDestructors):
+      acquireSys(heapLock)
       result = alloc(sharedHeap, size)
-      ReleaseSys(HeapLock)
+      releaseSys(heapLock)
     else:
-      result = alloc(size)
+      result = allocImpl(size)
 
-  proc allocShared0(size: int): pointer =
-    result = allocShared(size)
+  proc allocShared0Impl(size: Natural): pointer =
+    result = allocSharedImpl(size)
     zeroMem(result, size)
 
-  proc deallocShared(p: pointer) =
-    when hasThreadSupport: 
-      AcquireSys(HeapLock)
+  proc deallocSharedImpl(p: pointer) =
+    when hasThreadSupport and not defined(gcDestructors):
+      acquireSys(heapLock)
       dealloc(sharedHeap, p)
-      ReleaseSys(HeapLock)
+      releaseSys(heapLock)
+    else:
+      deallocImpl(p)
+
+  proc reallocSharedImpl(p: pointer, newSize: Natural): pointer =
+    when hasThreadSupport and not defined(gcDestructors):
+      acquireSys(heapLock)
+      result = realloc(sharedHeap, p, newSize)
+      releaseSys(heapLock)
     else:
-      dealloc(p)
+      result = reallocImpl(p, newSize)
 
-  proc reallocShared(p: pointer, newsize: int): pointer =
-    when hasThreadSupport: 
-      AcquireSys(HeapLock)
-      result = realloc(sharedHeap, p, newsize)
-      ReleaseSys(HeapLock)
+  proc reallocShared0Impl(p: pointer, oldSize, newSize: Natural): pointer =
+    when hasThreadSupport and not defined(gcDestructors):
+      acquireSys(heapLock)
+      result = realloc0(sharedHeap, p, oldSize, newSize)
+      releaseSys(heapLock)
     else:
-      result = realloc(p, newsize)
+      result = realloc0Impl(p, oldSize, newSize)
+
+  when hasThreadSupport:
+    when defined(gcDestructors):
+      proc getFreeSharedMem(): int =
+        allocator.freeMem
+
+      proc getTotalSharedMem(): int =
+        allocator.currMem
+
+      proc getOccupiedSharedMem(): int =
+        allocator.occ
+
+    else:
+      template sharedMemStatsShared(v: int) =
+        acquireSys(heapLock)
+        result = v
+        releaseSys(heapLock)
+
+      proc getFreeSharedMem(): int =
+        sharedMemStatsShared(sharedHeap.freeMem)
+
+      proc getTotalSharedMem(): int =
+        sharedMemStatsShared(sharedHeap.currMem)
+
+      proc getOccupiedSharedMem(): int =
+        sharedMemStatsShared(sharedHeap.occ)
+        #sharedMemStatsShared(sharedHeap.currMem - sharedHeap.freeMem)
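+      # `sharedMemStatsShared` expands into the calling proc's body, so the
+      # `result` it assigns is the caller's. getFreeSharedMem() is effectively
+      # (expansion sketch):
+      #   acquireSys(heapLock)
+      #   result = sharedHeap.freeMem
+      #   releaseSys(heapLock)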
+  {.pop.}
 
 {.pop.}