author     Andreas Rumpf <rumpf_a@web.de>   2022-10-09 11:44:02 +0200
committer  GitHub <noreply@github.com>      2022-10-09 11:44:02 +0200
commit     fbc6975922ea644f0bcf1ad034edd780b3dcfa87 (patch)
tree       38bbc743e66a340c64b515d9ba774f16a16a2bbe
parent     b47d5486dbd0e6677e42d2afd613fb921b3e12eb (diff)
threaded alloc (#20492)
* allocator: catch up with multi-threading techniques
* removed the global thread lock
* more atomics for fun and profit
* added important sysAssert
* stats remain thread local and don't have to be atomic
* undo split chunk optimizations in the hope it makes the CI happy
-rw-r--r--  lib/system/alloc.nim    331
-rw-r--r--  lib/system/atomics.nim    4
-rw-r--r--  lib/system/threads.nim   30
3 files changed, 247 insertions, 118 deletions
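
The gist of the patch, before the per-file diffs: every MemRegion is owned by
one thread and every chunk records its owner. A thread freeing memory it does
not own no longer takes a global lock; it prepends the block onto the owning
region's lock-free shared free list, and the owner later drains that list with
a single atomic exchange. A minimal standalone sketch of that push/drain pair,
using the same GCC atomic intrinsics as the patch (the names Node, push and
drain are illustrative, not the patch's code):

  type
    Node = ptr NodeObj
    NodeObj = object
      next: Node

  var sharedHead: Node  # one such list head per owning region

  proc push(elem: Node) =
    ## foreign thread: Treiber-stack prepend, retried until the CAS lands
    while true:
      elem.next = atomicLoadN(addr sharedHead, ATOMIC_RELAXED)
      if atomicCompareExchangeN(addr sharedHead, addr elem.next, elem,
                                weak = true, ATOMIC_RELEASE, ATOMIC_RELAXED):
        break

  proc drain(): Node =
    ## owning thread: take the whole list in one atomic step,
    ## then walk it privately without further synchronization
    atomicExchangeN(addr sharedHead, nil, ATOMIC_RELAXED)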
diff --git a/lib/system/alloc.nim b/lib/system/alloc.nim
index c55aea079..88f680500 100644
--- a/lib/system/alloc.nim
+++ b/lib/system/alloc.nim
@@ -44,6 +44,29 @@ type
   IntSet = object
     data: TrunkBuckets
 
+# ------------- chunk table ---------------------------------------------------
+# We use a PtrSet of chunk starts and a table[Page, chunksize] for chunk
+# endings of big chunks. This is needed by the merging operation. The only
+# remaining operation is best-fit for big chunks. Since there is a size-limit
+# for big chunks (because greater than the limit means they are returned back
+# to the OS), a fixed size array can be used.
+
+type
+  PLLChunk = ptr LLChunk
+  LLChunk = object ## *low-level* chunk
+    size: int                # remaining size
+    acc: int                 # accumulator
+    next: PLLChunk           # next low-level chunk; only needed for dealloc
+
+  PAvlNode = ptr AvlNode
+  AvlNode = object
+    link: array[0..1, PAvlNode] # Left (0) and right (1) links
+    key, upperBound: int
+    level: int
+
+const
+  RegionHasLock = false # hasThreadSupport and defined(gcDestructors)
+
 type
   FreeCell {.final, pure.} = object
     next: ptr FreeCell  # next free cell in chunk (overlaid with refcount)
@@ -61,12 +84,15 @@ type
     prevSize: int        # size of previous chunk; for coalescing
                          # 0th bit == 1 if 'used'
     size: int            # if < PageSize it is a small chunk
+    owner: ptr MemRegion
 
   SmallChunk = object of BaseChunk
     next, prev: PSmallChunk  # chunks of the same size
     freeList: ptr FreeCell
     free: int            # how many bytes remain
     acc: int             # accumulator for small object allocation
+    when defined(gcDestructors):
+      sharedFreeList: ptr FreeCell # make no attempt at avoiding false sharing for now for this object field
     when defined(nimAlignPragma):
       data {.align: MemAlign.}: UncheckedArray[byte]      # start of usable memory
     else:
@@ -79,29 +105,6 @@ type
     else:
       data: UncheckedArray[byte]
 
-template smallChunkOverhead(): untyped = sizeof(SmallChunk)
-template bigChunkOverhead(): untyped = sizeof(BigChunk)
-
-# ------------- chunk table ---------------------------------------------------
-# We use a PtrSet of chunk starts and a table[Page, chunksize] for chunk
-# endings of big chunks. This is needed by the merging operation. The only
-# remaining operation is best-fit for big chunks. Since there is a size-limit
-# for big chunks (because greater than the limit means they are returned back
-# to the OS), a fixed size array can be used.
-
-type
-  PLLChunk = ptr LLChunk
-  LLChunk = object ## *low-level* chunk
-    size: int                # remaining size
-    acc: int                 # accumulator
-    next: PLLChunk           # next low-level chunk; only needed for dealloc
-
-  PAvlNode = ptr AvlNode
-  AvlNode = object
-    link: array[0..1, PAvlNode] # Left (0) and right (1) links
-    key, upperBound: int
-    level: int
-
   HeapLinks = object
     len: int
     chunks: array[30, (PBigChunk, int)]
@@ -117,10 +120,15 @@ type
     llmem: PLLChunk
     currMem, maxMem, freeMem, occ: int # memory sizes (allocated from OS)
     lastSize: int # needed for the case that OS gives us pages linearly
+    when RegionHasLock:
+      lock: SysLock
+    when defined(gcDestructors):
+      sharedFreeListBigChunks: PBigChunk # make no attempt at avoiding false sharing for now for this object field
+
     chunkStarts: IntSet
     when not defined(gcDestructors):
       root, deleted, last, freeAvlNodes: PAvlNode
-    locked, blockChunkSizeIncrease: bool # if locked, we cannot free pages.
+    lockActive, locked, blockChunkSizeIncrease: bool # if locked, we cannot free pages.
     nextChunkSize: int
     when not defined(gcDestructors):
       bottomData: AvlNode
@@ -128,6 +136,24 @@ type
     when defined(nimTypeNames):
       allocCounter, deallocCounter: int
 
+template smallChunkOverhead(): untyped = sizeof(SmallChunk)
+template bigChunkOverhead(): untyped = sizeof(BigChunk)
+
+when hasThreadSupport:
+  template loada(x: untyped): untyped = atomicLoadN(unsafeAddr x, ATOMIC_RELAXED)
+  template storea(x, y: untyped) = atomicStoreN(unsafeAddr x, y, ATOMIC_RELAXED)
+
+  when false:
+    # not yet required
+    template atomicStatDec(x, diff: untyped) = discard atomicSubFetch(unsafeAddr x, diff, ATOMIC_RELAXED)
+    template atomicStatInc(x, diff: untyped) = discard atomicAddFetch(unsafeAddr x, diff, ATOMIC_RELAXED)
+else:
+  template loada(x: untyped): untyped = x
+  template storea(x, y: untyped) = x = y
+
+template atomicStatDec(x, diff: untyped) = dec x, diff
+template atomicStatInc(x, diff: untyped) = inc x, diff
+
 const
   fsLookupTable: array[byte, int8] = [
     -1'i8, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -234,11 +260,11 @@ proc addChunkToMatrix(a: var MemRegion; b: PBigChunk) =
   setBit(fl, a.flBitmap)
 
 proc incCurrMem(a: var MemRegion, bytes: int) {.inline.} =
-  inc(a.currMem, bytes)
+  atomicStatInc(a.currMem, bytes)
 
 proc decCurrMem(a: var MemRegion, bytes: int) {.inline.} =
   a.maxMem = max(a.maxMem, a.currMem)
-  dec(a.currMem, bytes)
+  atomicStatDec(a.currMem, bytes)
 
 proc getMaxMem(a: var MemRegion): int =
   # Since we update maxPagesCount only when freeing pages,
@@ -536,12 +562,13 @@ proc splitChunk2(a: var MemRegion, c: PBigChunk, size: int): PBigChunk =
   result = cast[PBigChunk](cast[ByteAddress](c) +% size)
   result.size = c.size - size
   track("result.size", addr result.size, sizeof(int))
-  # XXX check if these two nil assignments are dead code given
-  # addChunkToMatrix's implementation:
-  result.next = nil
-  result.prev = nil
+  when not defined(nimOptimizedSplitChunk):
+    # still active because of weird codegen issue on some of our CIs:
+    result.next = nil
+    result.prev = nil
   # size and not used:
   result.prevSize = size
+  result.owner = addr a
   sysAssert((size and 1) == 0, "splitChunk 2")
   sysAssert((size and PageMask) == 0,
       "splitChunk: size is not a multiple of the PageSize")
@@ -572,6 +599,9 @@ proc freeBigChunk(a: var MemRegion, c: PBigChunk) =
           c = cast[PBigChunk](le)
           if c.size > MaxBigChunkSize:
             let rest = splitChunk2(a, c, MaxBigChunkSize)
+            when defined(nimOptimizedSplitChunk):
+              rest.next = nil
+              rest.prev = nil
             addChunkToMatrix(a, c)
             c = rest
   when coalescRight:
@@ -596,6 +626,13 @@ proc getBigChunk(a: var MemRegion, size: int): PBigChunk =
   mappingSearch(size, fl, sl)
   sysAssert((size and PageMask) == 0, "getBigChunk: unaligned chunk")
   result = findSuitableBlock(a, fl, sl)
+
+  when RegionHasLock:
+    if not a.lockActive:
+      a.lockActive = true
+      initSysLock(a.lock)
+    acquireSys a.lock
+
   if result == nil:
     if size < nimMinHeapPages * PageSize:
       result = requestOsChunks(a, nimMinHeapPages * PageSize)
@@ -605,6 +642,7 @@ proc getBigChunk(a: var MemRegion, size: int): PBigChunk =
       # if we over allocated split the chunk:
       if result.size > size:
         splitChunk(a, result, size)
+    result.owner = addr a
   else:
     removeChunkFromMatrix2(a, result, fl, sl)
     if result.size >= size + PageSize:
@@ -612,12 +650,20 @@ proc getBigChunk(a: var MemRegion, size: int): PBigChunk =
   # set 'used' to true:
   result.prevSize = 1
   track("setUsedToFalse", addr result.size, sizeof(int))
+  sysAssert result.owner == addr a, "getBigChunk: No owner set!"
 
   incl(a, a.chunkStarts, pageIndex(result))
   dec(a.freeMem, size)
+  when RegionHasLock:
+    releaseSys a.lock
 
 proc getHugeChunk(a: var MemRegion; size: int): PBigChunk =
   result = cast[PBigChunk](osAllocPages(size))
+  when RegionHasLock:
+    if not a.lockActive:
+      a.lockActive = true
+      initSysLock(a.lock)
+    acquireSys a.lock
   incCurrMem(a, size)
   # XXX add this to the heap links. But also remove it from it later.
   when false: a.addHeapLink(result, size)
@@ -627,7 +673,10 @@ proc getHugeChunk(a: var MemRegion; size: int): PBigChunk =
   result.size = size
   # set 'used' to true:
   result.prevSize = 1
+  result.owner = addr a
   incl(a, a.chunkStarts, pageIndex(result))
+  when RegionHasLock:
+    releaseSys a.lock
 
 proc freeHugeChunk(a: var MemRegion; c: PBigChunk) =
   let size = c.size
@@ -691,6 +740,69 @@ else:
   template trackSize(x) = discard
   template untrackSize(x) = discard
 
+proc deallocBigChunk(a: var MemRegion, c: PBigChunk) =
+  when RegionHasLock:
+    acquireSys a.lock
+  dec a.occ, c.size
+  untrackSize(c.size)
+  sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case B)"
+  when not defined(gcDestructors):
+    a.deleted = getBottom(a)
+    del(a, a.root, cast[int](addr(c.data)))
+  if c.size >= HugeChunkSize: freeHugeChunk(a, c)
+  else: freeBigChunk(a, c)
+  when RegionHasLock:
+    releaseSys a.lock
+
+when defined(gcDestructors):
+  template atomicPrepend(head, elem: untyped) =
+    # see also https://en.cppreference.com/w/cpp/atomic/atomic_compare_exchange
+    while true:
+      elem.next.storea head.loada
+      if atomicCompareExchangeN(addr head, addr elem.next, elem, weak = true, ATOMIC_RELEASE, ATOMIC_RELAXED):
+        break
+
+  proc addToSharedFreeListBigChunks(a: var MemRegion; c: PBigChunk) {.inline.} =
+    sysAssert c.next == nil, "c.next pointer must be nil"
+    atomicPrepend a.sharedFreeListBigChunks, c
+
+  proc addToSharedFreeList(c: PSmallChunk; f: ptr FreeCell) {.inline.} =
+    atomicPrepend c.sharedFreeList, f
+
+  proc compensateCounters(a: var MemRegion; c: PSmallChunk; size: int) =
+    # rawDealloc did NOT do the usual:
+    # `inc(c.free, size); dec(a.occ, size)` because it wasn't the owner of these
+    # memory locations. We have to compensate here for those cells, but only
+    # for at most `maxIters` elements of the list, because we split the list
+    # in order to achieve bounded response times.
+    var it = c.freeList
+    var x = 0
+    var maxIters = 20 # make it time-bounded
+    while it != nil:
+      if maxIters == 0:
+        let rest = it.next.loada
+        it.next.storea nil
+        addToSharedFreeList(c, rest)
+        break
+      inc x, size
+      it = it.next.loada
+      dec maxIters
+    inc(c.free, x)
+    dec(a.occ, x)
+
+  proc freeDeferredObjects(a: var MemRegion; root: PBigChunk) =
+    var it = root
+    var maxIters = 20 # make it time-bounded
+    while true:
+      if maxIters == 0:
+        let rest = it.next.loada
+        it.next.storea nil
+        addToSharedFreeListBigChunks(a, rest)
+        break
+      it = it.next.loada
+      dec maxIters
+      if it == nil: break
+
 proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
   when defined(nimTypeNames):
     inc(a.allocCounter)
@@ -703,7 +815,7 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
   #c_fprintf(stdout, "alloc; size: %ld; %ld\n", requestedSize, size)
   if size <= SmallChunkSize-smallChunkOverhead():
     # allocate a small block: for small chunks, we use only its next pointer
-    var s = size div MemAlign
+    let s = size div MemAlign
     var c = a.freeSmallChunks[s]
     if c == nil:
       c = getSmallChunk(a)
@@ -711,7 +823,10 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
       sysAssert c.size == PageSize, "rawAlloc 3"
       c.size = size
       c.acc = size
+      when defined(gcDestructors):
+        c.sharedFreeList = nil
       c.free = SmallChunkSize - smallChunkOverhead() - size
+      sysAssert c.owner == addr(a), "rawAlloc: No owner set!"
       c.next = nil
       c.prev = nil
       listAdd(a.freeSmallChunks[s], c)
@@ -723,6 +838,10 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
       #if c.size != size:
       #  c_fprintf(stdout, "csize: %lld; size %lld\n", c.size, size)
       sysAssert c.size == size, "rawAlloc 6"
+      when defined(gcDestructors):
+        if c.freeList == nil:
+          c.freeList = atomicExchangeN(addr c.sharedFreeList, nil, ATOMIC_RELAXED)
+          compensateCounters(a, c, size)
       if c.freeList == nil:
         sysAssert(c.acc + smallChunkOverhead() + size <= SmallChunkSize,
                   "rawAlloc 7")
@@ -747,6 +866,11 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
     inc a.occ, size
     trackSize(c.size)
   else:
+    when defined(gcDestructors):
+      let deferredFrees = atomicExchangeN(addr a.sharedFreeListBigChunks, nil, ATOMIC_RELAXED)
+      if deferredFrees != nil:
+        freeDeferredObjects(a, deferredFrees)
+
     size = requestedSize + bigChunkOverhead() #  roundup(requestedSize+bigChunkOverhead(), PageSize)
     # allocate a large block
     var c = if size >= HugeChunkSize: getHugeChunk(a, size)
@@ -779,48 +903,52 @@ proc rawDealloc(a: var MemRegion, p: pointer) =
     # `p` is within a small chunk:
     var c = cast[PSmallChunk](c)
     var s = c.size
-    dec a.occ, s
-    untrackSize(s)
-    sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case A)"
-    sysAssert(((cast[ByteAddress](p) and PageMask) - smallChunkOverhead()) %%
-               s == 0, "rawDealloc 3")
+    #       ^ We might access thread-foreign storage here.
+    # The other thread cannot possibly free this block as it's still alive.
     var f = cast[ptr FreeCell](p)
-    when not defined(gcDestructors):
-      #echo("setting to nil: ", $cast[ByteAddress](addr(f.zeroField)))
-      sysAssert(f.zeroField != 0, "rawDealloc 1")
-      f.zeroField = 0
-    f.next = c.freeList
-    c.freeList = f
-    when overwriteFree:
-      # set to 0xff to check for usage after free bugs:
-      nimSetMem(cast[pointer](cast[int](p) +% sizeof(FreeCell)), -1'i32,
-               s -% sizeof(FreeCell))
-    # check if it is not in the freeSmallChunks[s] list:
-    if c.free < s:
-      # add it to the freeSmallChunks[s] array:
-      listAdd(a.freeSmallChunks[s div MemAlign], c)
-      inc(c.free, s)
+    if c.owner == addr(a):
+      # We own the block, there is no foreign thread involved.
+      dec a.occ, s
+      untrackSize(s)
+      sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case A)"
+      sysAssert(((cast[ByteAddress](p) and PageMask) - smallChunkOverhead()) %%
+                s == 0, "rawDealloc 3")
+      when not defined(gcDestructors):
+        #echo("setting to nil: ", $cast[ByteAddress](addr(f.zeroField)))
+        sysAssert(f.zeroField != 0, "rawDealloc 1")
+        f.zeroField = 0
+      f.next = c.freeList
+      c.freeList = f
+      when overwriteFree:
+        # set to 0xff to check for usage after free bugs:
+        nimSetMem(cast[pointer](cast[int](p) +% sizeof(FreeCell)), -1'i32,
+                s -% sizeof(FreeCell))
+      # check if it is not in the freeSmallChunks[s] list:
+      if c.free < s:
+        # add it to the freeSmallChunks[s] array:
+        listAdd(a.freeSmallChunks[s div MemAlign], c)
+        inc(c.free, s)
+      else:
+        inc(c.free, s)
+        if c.free == SmallChunkSize-smallChunkOverhead():
+          listRemove(a.freeSmallChunks[s div MemAlign], c)
+          c.size = SmallChunkSize
+          freeBigChunk(a, cast[PBigChunk](c))
     else:
-      inc(c.free, s)
-      if c.free == SmallChunkSize-smallChunkOverhead():
-        listRemove(a.freeSmallChunks[s div MemAlign], c)
-        c.size = SmallChunkSize
-        freeBigChunk(a, cast[PBigChunk](c))
+      when defined(gcDestructors):
+        addToSharedFreeList(c, f)
     sysAssert(((cast[ByteAddress](p) and PageMask) - smallChunkOverhead()) %%
                s == 0, "rawDealloc 2")
   else:
     # set to 0xff to check for usage after free bugs:
     when overwriteFree: nimSetMem(p, -1'i32, c.size -% bigChunkOverhead())
-    # free big chunk
-    var c = cast[PBigChunk](c)
-    dec a.occ, c.size
-    untrackSize(c.size)
-    sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case B)"
-    when not defined(gcDestructors):
-      a.deleted = getBottom(a)
-      del(a, a.root, cast[int](addr(c.data)))
-    if c.size >= HugeChunkSize: freeHugeChunk(a, c)
-    else: freeBigChunk(a, c)
+    when defined(gcDestructors):
+      if c.owner == addr(a):
+        deallocBigChunk(a, cast[PBigChunk](c))
+      else:
+        addToSharedFreeListBigChunks(c.owner[], cast[PBigChunk](c))
+    else:
+      deallocBigChunk(a, cast[PBigChunk](c))
   sysAssert(allocInv(a), "rawDealloc: end")
   when logAlloc: cprintf("dealloc(pointer_%p)\n", p)
 
@@ -1002,7 +1130,7 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
         inc(result, it.size)
         it = it.next
 
-  when hasThreadSupport:
+  when hasThreadSupport and not defined(gcDestructors):
     proc addSysExitProc(quitProc: proc() {.noconv.}) {.importc: "atexit", header: "<stdlib.h>".}
 
     var sharedHeap: MemRegion
@@ -1012,36 +1140,16 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
 
   proc getFreeMem(): int =
     #sysAssert(result == countFreeMem())
-    when hasThreadSupport and defined(gcDestructors):
-      acquireSys(heapLock)
-      result = sharedHeap.freeMem
-      releaseSys(heapLock)
-    else:
-      result = allocator.freeMem
+    result = allocator.freeMem
 
   proc getTotalMem(): int =
-    when hasThreadSupport and defined(gcDestructors):
-      acquireSys(heapLock)
-      result = sharedHeap.currMem
-      releaseSys(heapLock)
-    else:
-      result = allocator.currMem
+    result = allocator.currMem
 
   proc getOccupiedMem(): int =
-    when hasThreadSupport and defined(gcDestructors):
-      acquireSys(heapLock)
-      result = sharedHeap.occ
-      releaseSys(heapLock)
-    else:
-      result = allocator.occ #getTotalMem() - getFreeMem()
+    result = allocator.occ #getTotalMem() - getFreeMem()
 
   proc getMaxMem*(): int =
-    when hasThreadSupport and defined(gcDestructors):
-      acquireSys(heapLock)
-      result = getMaxMem(sharedHeap)
-      releaseSys(heapLock)
-    else:
-      result = getMaxMem(allocator)
+    result = getMaxMem(allocator)
 
   when defined(nimTypeNames):
     proc getMemCounters*(): (int, int) = getMemCounters(allocator)
@@ -1049,7 +1157,7 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
   # -------------------- shared heap region ----------------------------------
 
   proc allocSharedImpl(size: Natural): pointer =
-    when hasThreadSupport:
+    when hasThreadSupport and not defined(gcDestructors):
       acquireSys(heapLock)
       result = alloc(sharedHeap, size)
       releaseSys(heapLock)
@@ -1061,7 +1169,7 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
     zeroMem(result, size)
 
   proc deallocSharedImpl(p: pointer) =
-    when hasThreadSupport:
+    when hasThreadSupport and not defined(gcDestructors):
       acquireSys(heapLock)
       dealloc(sharedHeap, p)
       releaseSys(heapLock)
@@ -1069,7 +1177,7 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
       deallocImpl(p)
 
   proc reallocSharedImpl(p: pointer, newSize: Natural): pointer =
-    when hasThreadSupport:
+    when hasThreadSupport and not defined(gcDestructors):
       acquireSys(heapLock)
       result = realloc(sharedHeap, p, newSize)
       releaseSys(heapLock)
@@ -1077,7 +1185,7 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
       result = reallocImpl(p, newSize)
 
   proc reallocShared0Impl(p: pointer, oldSize, newSize: Natural): pointer =
-    when hasThreadSupport:
+    when hasThreadSupport and not defined(gcDestructors):
       acquireSys(heapLock)
       result = realloc0(sharedHeap, p, oldSize, newSize)
       releaseSys(heapLock)
@@ -1085,20 +1193,31 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
       result = realloc0Impl(p, oldSize, newSize)
 
   when hasThreadSupport:
-    template sharedMemStatsShared(v: int) =
-      acquireSys(heapLock)
-      result = v
-      releaseSys(heapLock)
+    when defined(gcDestructors):
+      proc getFreeSharedMem(): int =
+        allocator.freeMem
+
+      proc getTotalSharedMem(): int =
+        allocator.currMem
+
+      proc getOccupiedSharedMem(): int =
+        allocator.occ
+
+    else:
+      template sharedMemStatsShared(v: int) =
+        acquireSys(heapLock)
+        result = v
+        releaseSys(heapLock)
 
-    proc getFreeSharedMem(): int =
-      sharedMemStatsShared(sharedHeap.freeMem)
+      proc getFreeSharedMem(): int =
+        sharedMemStatsShared(sharedHeap.freeMem)
 
-    proc getTotalSharedMem(): int =
-      sharedMemStatsShared(sharedHeap.currMem)
+      proc getTotalSharedMem(): int =
+        sharedMemStatsShared(sharedHeap.currMem)
 
-    proc getOccupiedSharedMem(): int =
-      sharedMemStatsShared(sharedHeap.occ)
-      #sharedMemStatsShared(sharedHeap.currMem - sharedHeap.freeMem)
+      proc getOccupiedSharedMem(): int =
+        sharedMemStatsShared(sharedHeap.occ)
+        #sharedMemStatsShared(sharedHeap.currMem - sharedHeap.freeMem)
   {.pop.}
 
 {.pop.}
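
That concludes alloc.nim. Condensing the new rawDealloc control flow into one
place (a sketch using the patch's names; localSmallFree is a hypothetical
stand-in for the inlined thread-local fast path shown in the diff above):

  proc rawDeallocSketch(a: var MemRegion; p: pointer) =
    let c = pageAddr(p)               # chunk header that contains p
    if c.owner == addr(a):
      localSmallFree(a, p)            # fast path: we own it, no atomics needed
    else:
      # Slow path: hand the cell back to the owning region. Its owner
      # picks it up via atomicExchangeN in its next rawAlloc of this
      # size class and runs compensateCounters to repair c.free / a.occ.
      addToSharedFreeList(cast[PSmallChunk](c), cast[ptr FreeCell](p))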
diff --git a/lib/system/atomics.nim b/lib/system/atomics.nim
index 8d29c287d..61a60fa53 100644
--- a/lib/system/atomics.nim
+++ b/lib/system/atomics.nim
@@ -17,7 +17,7 @@ type
   AtomType* = SomeNumber|pointer|ptr|char|bool
     ## Type Class representing valid types for use with atomic procs
 
-when someGcc and hasThreadSupport:
+when someGcc:
   type AtomMemModel* = distinct cint
 
   var ATOMIC_RELAXED* {.importc: "__ATOMIC_RELAXED", nodecl.}: AtomMemModel
@@ -164,7 +164,7 @@ when someGcc and hasThreadSupport:
     ## ignore this parameter.
 
   template fence*() = atomicThreadFence(ATOMIC_SEQ_CST)
-elif someVcc and hasThreadSupport:
+elif someVcc:
   type AtomMemModel* = distinct cint
 
   const
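
Note on the atomics.nim hunks: the gcDestructors code paths in alloc.nim above
call atomicExchangeN and atomicCompareExchangeN without any hasThreadSupport
guard, so the intrinsic wrappers can no longer be gated on thread support.
With the guard relaxed they are declared in any someGcc/someVcc build, e.g.
on GCC/Clang (relaxed ordering, mirroring the loada/storea templates; the
counter variable is illustrative):

  var counter: int
  discard atomicAddFetch(addr counter, 1, ATOMIC_RELAXED)   # atomic increment
  let snapshot = atomicLoadN(addr counter, ATOMIC_RELAXED)  # atomic read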
diff --git a/lib/system/threads.nim b/lib/system/threads.nim
index aaaa33bb7..4e190e443 100644
--- a/lib/system/threads.nim
+++ b/lib/system/threads.nim
@@ -53,13 +53,23 @@ when defined(nimPreviewSlimSystem):
 const
   hasAllocStack = defined(zephyr) # maybe freertos too?
 
+when defined(gcDestructors):
+  proc allocThreadStorage(size: int): pointer =
+    result = c_malloc(csize_t size)
+    zeroMem(result, size)
+
+  proc deallocThreadStorage(p: pointer) = c_free(p)
+else:
+  template allocThreadStorage(size: untyped): untyped = allocShared0(size)
+  template deallocThreadStorage(p: pointer) = deallocShared(p)
+
 when hasAllocStack or defined(zephyr) or defined(freertos):
   const
-    nimThreadStackSize {.intdefine.} = 8192 
+    nimThreadStackSize {.intdefine.} = 8192
     nimThreadStackGuard {.intdefine.} = 128
 
-    StackGuardSize = nimThreadStackGuard 
-    ThreadStackSize = nimThreadStackSize - nimThreadStackGuard 
+    StackGuardSize = nimThreadStackGuard
+    ThreadStackSize = nimThreadStackSize - nimThreadStackGuard
 else:
   const
     StackGuardSize = 4096
@@ -176,7 +186,7 @@ else:
     finally:
       afterThreadRuns()
       when hasAllocStack:
-        deallocShared(thrd.rawStack)
+        deallocThreadStorage(thrd.rawStack)
 
 proc threadProcWrapStackFrame[TArg](thrd: ptr Thread[TArg]) {.raises: [].} =
   when defined(boehmgc):
@@ -207,7 +217,7 @@ template threadProcWrapperBody(closure: untyped): untyped =
   # mark as not running anymore:
   thrd.core = nil
   thrd.dataFn = nil
-  deallocShared(cast[pointer](core))
+  deallocThreadStorage(cast[pointer](core))
 
 {.push stack_trace:off.}
 when defined(windows):
@@ -278,7 +288,7 @@ when false:
     t.dataFn = nil
     ## if thread `t` already exited, `t.core` will be `null`.
     if not isNil(t.core):
-      deallocShared(t.core)
+      deallocThreadStorage(t.core)
       t.core = nil
 
 when hostOS == "windows":
@@ -290,7 +300,7 @@ when hostOS == "windows":
     ## Entry point is the proc `tp`.
     ## `param` is passed to `tp`. `TArg` can be `void` if you
     ## don't need to pass any data to the thread.
-    t.core = cast[PGcThread](allocShared0(sizeof(GcThread)))
+    t.core = cast[PGcThread](allocThreadStorage(sizeof(GcThread)))
 
     when TArg isnot void: t.data = param
     t.dataFn = tp
@@ -315,7 +325,7 @@ elif defined(genode):
   proc createThread*[TArg](t: var Thread[TArg],
                            tp: proc (arg: TArg) {.thread, nimcall.},
                            param: TArg) =
-    t.core = cast[PGcThread](allocShared0(sizeof(GcThread)))
+    t.core = cast[PGcThread](allocThreadStorage(sizeof(GcThread)))
 
     when TArg isnot void: t.data = param
     t.dataFn = tp
@@ -339,7 +349,7 @@ else:
     ## Entry point is the proc `tp`. `param` is passed to `tp`.
     ## `TArg` can be `void` if you
     ## don't need to pass any data to the thread.
-    t.core = cast[PGcThread](allocShared0(sizeof(GcThread)))
+    t.core = cast[PGcThread](allocThreadStorage(sizeof(GcThread)))
 
     when TArg isnot void: t.data = param
     t.dataFn = tp
@@ -348,7 +358,7 @@ else:
     doAssert pthread_attr_init(a) == 0
     when hasAllocStack:
       var
-        rawstk = allocShared0(ThreadStackSize + StackGuardSize)
+        rawstk = allocThreadStorage(ThreadStackSize + StackGuardSize)
         stk = cast[pointer](cast[uint](rawstk) + StackGuardSize)
       let setstacksizeResult = pthread_attr_setstack(addr a, stk, ThreadStackSize)
       t.rawStack = rawstk
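
Why threads.nim routes thread bookkeeping through c_malloc/c_free under
gcDestructors: the GcThread block crosses threads by design, allocated by the
creating thread and freed by the exiting one, so the patch keeps it off the
region-owned heap entirely. Condensed from the hunks above:

  # creating thread:
  t.core = cast[PGcThread](allocThreadStorage(sizeof(GcThread)))
  # created thread, on exit (threadProcWrapperBody):
  deallocThreadStorage(cast[pointer](core))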