Diffstat (limited to 'lib')
-rw-r--r--   lib/pure/collections/queues.nim    89
-rwxr-xr-x   lib/pure/marshal.nim               33
-rwxr-xr-x   lib/pure/osproc.nim                22
-rwxr-xr-x   lib/system.nim                     21
-rwxr-xr-x   lib/system/alloc.nim              209
-rwxr-xr-x   lib/system/assign.nim              14
-rw-r--r--   lib/system/atomics.nim              4
-rwxr-xr-x   lib/system/cellsets.nim             6
-rwxr-xr-x   lib/system/ecmasys.nim              2
-rwxr-xr-x   lib/system/gc.nim                 182
-rw-r--r--   lib/system/inboxes.nim            203
-rwxr-xr-x   lib/system/mmdisp.nim              15
-rwxr-xr-x   lib/system/repr.nim                 4
-rw-r--r--   lib/system/syslocks.nim           101
-rwxr-xr-x   lib/system/threads.nim            256
15 files changed, 798 insertions, 363 deletions
diff --git a/lib/pure/collections/queues.nim b/lib/pure/collections/queues.nim
new file mode 100644
index 000000000..2130d9949
--- /dev/null
+++ b/lib/pure/collections/queues.nim
@@ -0,0 +1,89 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2011 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Implementation of a queue. The underlying implementation uses a ``seq``.
+
+import math
+
+type
+  TQueue* {.pure, final.}[T] = object ## a queue
+    data: seq[T]
+    rd, wr, count, mask: int
+    
+proc initQueue*[T](initialSize=4): TQueue[T] =
+  ## creates a new queue. `initialSize` needs to be a power of 2.
+  assert IsPowerOfTwo(initialSize)
+  result.mask = initialSize-1
+  newSeq(result.data, initialSize)
+
+proc len*[T](q: TQueue[T]): int =
+  ## returns the number of elements of `q`.
+  result = q.count
+
+iterator items*[T](q: TQueue[T]): T =
+  ## yields every element of `q`.
+  var i = q.rd
+  var c = q.count
+  while c > 0:
+    dec c
+    yield q.data[i]
+    i = (i + 1) and q.mask
+
+proc add*[T](q: var TQueue[T], item: T) =
+  ## adds an `item` to the end of the queue `q`.
+  var cap = q.mask+1
+  if q.count >= cap:
+    var n: seq[T]
+    newSeq(n, cap*2)
+    var i = 0
+    for x in items(q):
+      shallowCopy(n[i], x)
+      inc i
+    shallowCopy(q.data, n)
+    q.mask = cap*2 - 1
+    q.wr = q.count
+    q.rd = 0
+  inc q.count
+  q.data[q.wr] = item
+  q.wr = (q.wr + 1) and q.mask
+
+proc enqueue*[T](q: var TQueue[T], item: T) =
+  ## alias for the ``add`` operation.
+  add(q, item)
+
+proc dequeue*[T](q: var TQueue[T]): T =
+  ## removes and returns the first element of the queue `q`.
+  assert q.count > 0
+  dec q.count
+  result = q.data[q.rd]
+  q.rd = (q.rd + 1) and q.mask
+
+proc `$`*[T](q: TQueue[T]): string = 
+  ## turns a queue into its string representation.
+  result = "["
+  for x in items(q):
+    if result.len > 1: result.add(", ")
+    result.add($x)
+  result.add("]")
+
+when isMainModule:
+  var q = initQueue[int]()
+  q.add(123)
+  q.add(9)
+  q.add(4)
+  var first = q.dequeue
+  q.add(56)
+  q.add(6)
+  var second = q.dequeue
+  q.add(789)
+  
+  assert first == 123
+  assert second == 9
+  assert($q == "[4, 56, 6, 789]")
+
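Why `initialSize` must be a power of two: with a capacity of 2^k, wrapping the read/write
indices reduces to a bitwise `and` with `mask` instead of a `mod`. A minimal sketch of the
invariant (not part of the patch):

  const cap = 8
  const mask = cap - 1      # 0b0111
  var wr = 7
  wr = (wr + 1) and mask    # (7+1) and 7 == 0: the index wraps to the start
  assert wr == 0
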
diff --git a/lib/pure/marshal.nim b/lib/pure/marshal.nim
index f96d177ae..354d70a71 100755
--- a/lib/pure/marshal.nim
+++ b/lib/pure/marshal.nim
@@ -8,7 +8,26 @@
 #
 
 ## This module contains procs for serialization and deserialization of 
-## arbitrary Nimrod data structures. The serialization format uses JSON.
+## arbitrary Nimrod data structures. The serialization format uses JSON.
+##
+## **Restriction**: For objects, their type is **not** serialized. This
+## means it does not work if the object has some other runtime type than
+## its compiletime type:
+##
+## .. code-block:: nimrod
+## 
+##   type 
+##     TA = object
+##     TB = object of TA
+##       f: int
+##
+##   var
+##     a: ref TA
+##     b: ref TB
+##
+##   new(b)
+##   a = b
+##   echo($$a[]) # produces "{}", not "{f: 0}"
 
 import streams, typeinfo, json, intsets, tables
 
@@ -286,3 +305,15 @@ when isMainModule:
   echo($$test7)
   testit(test7)
 
+  type 
+    TA = object
+    TB = object of TA
+      f: int
+
+  var
+    a: ref TA
+    b: ref TB
+  new(b)
+  a = b
+  echo($$a[]) # produces "{}", not "{f: 0}"
+
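For reference, a round trip through the module's ``$$`` (store) and ``to`` (load) procs;
a sketch assuming the standard marshal API, not part of the patch:

  import marshal

  var x = @[1, 2, 3]
  var s = $$x                # serialize to a JSON string
  var y = to[seq[int]](s)    # deserialize it again
  assert y == x
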
diff --git a/lib/pure/osproc.nim b/lib/pure/osproc.nim
index 60bef813d..2b7047143 100755
--- a/lib/pure/osproc.nim
+++ b/lib/pure/osproc.nim
@@ -77,11 +77,14 @@ proc startProcess*(command: string,
   ## If ``env == nil`` the environment is inherited from
   ## the parent process. `options` are additional flags that may be passed
   ## to `startProcess`. See the documentation of ``TProcessOption`` for the
-  ## meaning of these flags.
+  ## meaning of these flags. You need to `close` the process when done.
   ##
   ## Return value: The newly created process object. Nil is never returned,
   ## but ``EOS`` is raised in case of an error.
 
+proc close*(p: PProcess) {.rtl, extern: "nosp$1".}
+  ## When the process has finished executing, call this to clean up the related handles.
+
 proc suspend*(p: PProcess) {.rtl, extern: "nosp$1".}
   ## Suspends the process `p`.
 
@@ -179,6 +182,7 @@ proc execProcesses*(cmds: openArray[string],
             err.add("\n")
           echo(err)
         result = max(waitForExit(q[r]), result)
+        if q[r] != nil: close(q[r])
         q[r] = startProcessAux(cmds[i], options=options)
         r = (r + 1) mod n
     else:
@@ -189,15 +193,18 @@ proc execProcesses*(cmds: openArray[string],
           if not running(q[r]):
             #echo(outputStream(q[r]).readLine())
             result = max(waitForExit(q[r]), result)
+            if q[r] != nil: close(q[r])
             q[r] = startProcessAux(cmds[i], options=options)
             inc(i)
             if i > high(cmds): break
     for i in 0..m-1:
       result = max(waitForExit(q[i]), result)
+      if q[i] != nil: close(q[i])
   else:
     for i in 0..high(cmds):
       var p = startProcessAux(cmds[i], options=options)
       result = max(waitForExit(p), result)
+      close(p)
 
 proc select*(readfds: var seq[PProcess], timeout = 500): int
   ## `select` with a sensible Nimrod interface. `timeout` is in milliseconds.
@@ -215,6 +222,8 @@ when not defined(useNimRtl):
     while running(p) or not outp.atEnd(outp):
       result.add(outp.readLine())
       result.add("\n")
+    outp.close(outp)
+    close(p)
 
 when false:
   proc deallocCStringArray(a: cstringArray) =
@@ -356,6 +365,12 @@ when defined(Windows) and not defined(useNimRtl):
     result.FProcessHandle = procInfo.hProcess
     result.id = procInfo.dwProcessID
 
+  proc close(p: PProcess) =
+    discard CloseHandle(p.inputHandle)
+    discard CloseHandle(p.outputHandle)
+    discard CloseHandle(p.errorHandle)
+    discard CloseHandle(p.FProcessHandle)
+
   proc suspend(p: PProcess) =
     discard SuspendThread(p.FProcessHandle)
 
@@ -523,6 +538,11 @@ elif not defined(useNimRtl):
     discard close(p_stdin[readIdx])
     discard close(p_stdout[writeIdx])
 
+  proc close(p: PProcess) =
+    discard close(p.inputHandle)
+    discard close(p.outputHandle)
+    discard close(p.errorHandle)
+
   proc suspend(p: PProcess) =
     discard kill(p.id, SIGSTOP)
 
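The resulting usage pattern for the new ``close`` proc, as a sketch (the command is
hypothetical):

  import osproc

  var p = startProcess("ls")    # hypothetical command
  discard waitForExit(p)
  close(p)                      # new: release the process handles
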
diff --git a/lib/system.nim b/lib/system.nim
index 5ece9375e..5c7102664 100755
--- a/lib/system.nim
+++ b/lib/system.nim
@@ -785,6 +785,10 @@ when hasThreadSupport and not hasSharedHeap:
 else:
   {.pragma: rtlThreadVar.}
 
+template sysAssert(cond: expr) =
+  # change this to activate system asserts
+  nil
+
 include "system/inclrtl"
 
 when not defined(ecmascript) and not defined(nimrodVm):
@@ -1251,7 +1255,7 @@ proc each*[T](data: var openArray[T], op: proc (x: var T)) =
   for i in 0..data.len-1: op(data[i])
 
 iterator fields*[T: tuple](x: T): expr {.magic: "Fields", noSideEffect.}
-  ## iterates over every field of `x`. Warning: This is really transforms
+  ## iterates over every field of `x`. Warning: This really transforms
   ## the 'for' and unrolls the loop. The current implementation also has a bug
   ## that affects symbol binding in the loop body.
 iterator fields*[S: tuple, T: tuple](x: S, y: T): tuple[a, b: expr] {.
@@ -1261,13 +1265,13 @@ iterator fields*[S: tuple, T: tuple](x: S, y: T): tuple[a, b: expr] {.
   ## The current implementation also has a bug that affects symbol binding
   ## in the loop body.
 iterator fieldPairs*[T: tuple](x: T): expr {.magic: "FieldPairs", noSideEffect.}
-  ## iterates over every field of `x`. Warning: This is really transforms
+  ## iterates over every field of `x`. Warning: This really transforms
   ## the 'for' and unrolls the loop. The current implementation also has a bug
   ## that affects symbol binding in the loop body.
 iterator fieldPairs*[S: tuple, T: tuple](x: S, y: T): tuple[a, b: expr] {.
   magic: "FieldPairs", noSideEffect.}
   ## iterates over every field of `x` and `y`.
-  ## Warning: This is really transforms the 'for' and unrolls the loop. 
+  ## Warning: This really transforms the 'for' and unrolls the loop. 
   ## The current implementation also has a bug that affects symbol binding
   ## in the loop body.
 
@@ -1703,10 +1707,10 @@ when not defined(EcmaScript) and not defined(NimrodVM):
 
   # ----------------------------------------------------------------------------
 
-  proc atomicInc*(memLoc: var int, x: int): int {.inline.}
+  proc atomicInc*(memLoc: var int, x: int = 1): int {.inline.}
     ## atomic increment of `memLoc`. Returns the value after the operation.
   
-  proc atomicDec*(memLoc: var int, x: int): int {.inline.}
+  proc atomicDec*(memLoc: var int, x: int = 1): int {.inline.}
     ## atomic decrement of `memLoc`. Returns the value after the operation.
 
   include "system/atomics"
@@ -1719,6 +1723,7 @@ when not defined(EcmaScript) and not defined(NimrodVM):
       context: C_JmpBuf
 
   when hasThreadSupport:
+    include "system/syslocks"
     include "system/threads"
   else:
     initStackBottom()
@@ -1739,14 +1744,14 @@ when not defined(EcmaScript) and not defined(NimrodVM):
   proc reprAny(p: pointer, typ: PNimType): string {.compilerRtl.}
 
   proc getDiscriminant(aa: Pointer, n: ptr TNimNode): int =
-    assert(n.kind == nkCase)
+    sysAssert(n.kind == nkCase)
     var d: int
     var a = cast[TAddress](aa)
     case n.typ.size
     of 1: d = ze(cast[ptr int8](a +% n.offset)[])
     of 2: d = ze(cast[ptr int16](a +% n.offset)[])
     of 4: d = int(cast[ptr int32](a +% n.offset)[])
-    else: assert(false)
+    else: sysAssert(false)
     return d
 
   proc selectBranch(aa: Pointer, n: ptr TNimNode): ptr TNimNode =
@@ -1764,6 +1769,8 @@ when not defined(EcmaScript) and not defined(NimrodVM):
   {.pop.}
 
   include "system/sysio"
+  when hasThreadSupport:
+    include "system/inboxes"
 
   iterator lines*(filename: string): string =
     ## Iterate over any line in the file named `filename`.
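For reference, the ``sysAssert`` template introduced above expands to nothing, so the mass
``assert`` to ``sysAssert`` conversion in the files below compiles the checks away. An
activated variant would simply forward to ``assert`` (a sketch, not part of the patch):

  template sysAssert(cond: expr) =
    assert(cond)   # compiles the runtime's internal checks back in
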
diff --git a/lib/system/alloc.nim b/lib/system/alloc.nim
index 3273242d6..8a54e0ddd 100755
--- a/lib/system/alloc.nim
+++ b/lib/system/alloc.nim
@@ -128,12 +128,12 @@ template bigChunkOverhead(): expr = sizeof(TBigChunk)-sizeof(TAlignType)
 
 proc roundup(x, v: int): int {.inline.} = 
   result = (x + (v-1)) and not (v-1)
-  assert(result >= x)
+  sysAssert(result >= x)
   #return ((-x) and (v-1)) +% x
 
-assert(roundup(14, PageSize) == PageSize)
-assert(roundup(15, 8) == 16)
-assert(roundup(65, 8) == 72)
+sysAssert(roundup(14, PageSize) == PageSize)
+sysAssert(roundup(15, 8) == 16)
+sysAssert(roundup(65, 8) == 72)
 
 # ------------- chunk table ---------------------------------------------------
 # We use a PtrSet of chunk starts and a table[Page, chunksize] for chunk
@@ -149,35 +149,35 @@ type
     acc: int                 # accumulator
     next: PLLChunk           # next low-level chunk; only needed for dealloc
     
-  TAllocator {.final, pure.} = object
+  TMemRegion {.final, pure.} = object
     llmem: PLLChunk
     currMem, maxMem, freeMem: int # memory sizes (allocated from OS)
     lastSize: int # needed for the case that OS gives us pages linearly 
     freeSmallChunks: array[0..SmallChunkSize div MemAlign-1, PSmallChunk]
     freeChunksList: PBigChunk # XXX make this a datastructure with O(1) access
     chunkStarts: TIntSet
-   
-proc incCurrMem(a: var TAllocator, bytes: int) {.inline.} = 
+  
+proc incCurrMem(a: var TMemRegion, bytes: int) {.inline.} = 
   inc(a.currMem, bytes)
 
-proc decCurrMem(a: var TAllocator, bytes: int) {.inline.} =
+proc decCurrMem(a: var TMemRegion, bytes: int) {.inline.} =
   a.maxMem = max(a.maxMem, a.currMem)
   dec(a.currMem, bytes)
 
-proc getMaxMem(a: var TAllocator): int =
+proc getMaxMem(a: var TMemRegion): int =
   # Since we update maxPagesCount only when freeing pages, 
   # maxPagesCount may not be up to date. Thus we use the
   # maximum of both values here:
   return max(a.currMem, a.maxMem)
   
-proc llAlloc(a: var TAllocator, size: int): pointer =
+proc llAlloc(a: var TMemRegion, size: int): pointer =
   # *low-level* alloc for the memory manager's data structures. Deallocation
   # is done at the end of the allocator's lifetime.
   if a.llmem == nil or size > a.llmem.size:
     # the requested size is ``roundup(size+sizeof(TLLChunk), PageSize)``, but
     # since we know ``size`` is a (small) constant, we know the requested size
     # is one page:
-    assert roundup(size+sizeof(TLLChunk), PageSize) == PageSize
+    sysAssert roundup(size+sizeof(TLLChunk), PageSize) == PageSize
     var old = a.llmem # can be nil and is correct with nil
     a.llmem = cast[PLLChunk](osAllocPages(PageSize))
     incCurrMem(a, PageSize)
@@ -189,7 +189,7 @@ proc llAlloc(a: var TAllocator, size: int): pointer =
   inc(a.llmem.acc, size)
   zeroMem(result, size)
   
-proc llDeallocAll(a: var TAllocator) =
+proc llDeallocAll(a: var TMemRegion) =
   var it = a.llmem
   while it != nil:
     # we know each block in the list has the size of 1 page:
@@ -204,7 +204,7 @@ proc IntSetGet(t: TIntSet, key: int): PTrunk =
     it = it.next
   result = nil
 
-proc IntSetPut(a: var TAllocator, t: var TIntSet, key: int): PTrunk = 
+proc IntSetPut(a: var TMemRegion, t: var TIntSet, key: int): PTrunk = 
   result = IntSetGet(t, key)
   if result == nil:
     result = cast[PTrunk](llAlloc(a, sizeof(result[])))
@@ -220,7 +220,7 @@ proc Contains(s: TIntSet, key: int): bool =
   else: 
     result = false
   
-proc Incl(a: var TAllocator, s: var TIntSet, key: int) = 
+proc Incl(a: var TMemRegion, s: var TIntSet, key: int) = 
   var t = IntSetPut(a, s, key shr TrunkShift)
   var u = key and TrunkMask
   t.bits[u shr IntShift] = t.bits[u shr IntShift] or (1 shl (u and IntMask))
@@ -259,13 +259,13 @@ proc pageIndex(p: pointer): int {.inline.} =
 
 proc pageAddr(p: pointer): PChunk {.inline.} = 
   result = cast[PChunk](cast[TAddress](p) and not PageMask)
-  #assert(Contains(allocator.chunkStarts, pageIndex(result)))
+  #sysAssert(Contains(allocator.chunkStarts, pageIndex(result)))
 
-proc requestOsChunks(a: var TAllocator, size: int): PBigChunk = 
+proc requestOsChunks(a: var TMemRegion, size: int): PBigChunk = 
   incCurrMem(a, size)
   inc(a.freeMem, size)
   result = cast[PBigChunk](osAllocPages(size))
-  assert((cast[TAddress](result) and PageMask) == 0)
+  sysAssert((cast[TAddress](result) and PageMask) == 0)
   #zeroMem(result, size)
   result.next = nil
   result.prev = nil
@@ -273,7 +273,7 @@ proc requestOsChunks(a: var TAllocator, size: int): PBigChunk =
   result.size = size
   # update next.prevSize:
   var nxt = cast[TAddress](result) +% size
-  assert((nxt and PageMask) == 0)
+  sysAssert((nxt and PageMask) == 0)
   var next = cast[PChunk](nxt)
   if pageIndex(next) in a.chunkStarts:
     #echo("Next already allocated!")
@@ -281,7 +281,7 @@ proc requestOsChunks(a: var TAllocator, size: int): PBigChunk =
   # set result.prevSize:
   var lastSize = if a.lastSize != 0: a.lastSize else: PageSize
   var prv = cast[TAddress](result) -% lastSize
-  assert((nxt and PageMask) == 0)
+  sysAssert((nxt and PageMask) == 0)
   var prev = cast[PChunk](prv)
   if pageIndex(prev) in a.chunkStarts and prev.size == lastSize:
     #echo("Prev already allocated!")
@@ -290,11 +290,11 @@ proc requestOsChunks(a: var TAllocator, size: int): PBigChunk =
     result.prevSize = 0 # unknown
   a.lastSize = size # for next request
 
-proc freeOsChunks(a: var TAllocator, p: pointer, size: int) = 
+proc freeOsChunks(a: var TMemRegion, p: pointer, size: int) = 
   # update next.prevSize:
   var c = cast[PChunk](p)
   var nxt = cast[TAddress](p) +% c.size
-  assert((nxt and PageMask) == 0)
+  sysAssert((nxt and PageMask) == 0)
   var next = cast[PChunk](nxt)
   if pageIndex(next) in a.chunkStarts:
     next.prevSize = 0 # XXX used
@@ -304,7 +304,7 @@ proc freeOsChunks(a: var TAllocator, p: pointer, size: int) =
   dec(a.freeMem, size)
   #c_fprintf(c_stdout, "[Alloc] back to OS: %ld\n", size)
 
-proc isAccessible(a: TAllocator, p: pointer): bool {.inline.} = 
+proc isAccessible(a: TMemRegion, p: pointer): bool {.inline.} = 
   result = Contains(a.chunkStarts, pageIndex(p))
 
 proc contains[T](list, x: T): bool = 
@@ -313,7 +313,7 @@ proc contains[T](list, x: T): bool =
     if it == x: return true
     it = it.next
     
-proc writeFreeList(a: TAllocator) =
+proc writeFreeList(a: TMemRegion) =
   var it = a.freeChunksList
   c_fprintf(c_stdout, "freeChunksList: %p\n", it)
   while it != nil: 
@@ -322,23 +322,23 @@ proc writeFreeList(a: TAllocator) =
     it = it.next
 
 proc ListAdd[T](head: var T, c: T) {.inline.} = 
-  assert(c notin head)
-  assert c.prev == nil
-  assert c.next == nil
+  sysAssert(c notin head)
+  sysAssert c.prev == nil
+  sysAssert c.next == nil
   c.next = head
   if head != nil: 
-    assert head.prev == nil
+    sysAssert head.prev == nil
     head.prev = c
   head = c
 
 proc ListRemove[T](head: var T, c: T) {.inline.} =
-  assert(c in head)
+  sysAssert(c in head)
   if c == head: 
     head = c.next
-    assert c.prev == nil
+    sysAssert c.prev == nil
     if head != nil: head.prev = nil
   else:
-    assert c.prev != nil
+    sysAssert c.prev != nil
     c.prev.next = c.next
     if c.next != nil: c.next.prev = c.prev
   c.next = nil
@@ -350,22 +350,22 @@ proc isSmallChunk(c: PChunk): bool {.inline.} =
 proc chunkUnused(c: PChunk): bool {.inline.} = 
   result = not c.used
   
-proc updatePrevSize(a: var TAllocator, c: PBigChunk, 
+proc updatePrevSize(a: var TMemRegion, c: PBigChunk, 
                     prevSize: int) {.inline.} = 
   var ri = cast[PChunk](cast[TAddress](c) +% c.size)
-  assert((cast[TAddress](ri) and PageMask) == 0)
+  sysAssert((cast[TAddress](ri) and PageMask) == 0)
   if isAccessible(a, ri):
     ri.prevSize = prevSize
   
-proc freeBigChunk(a: var TAllocator, c: PBigChunk) = 
+proc freeBigChunk(a: var TMemRegion, c: PBigChunk) = 
   var c = c
-  assert(c.size >= PageSize)
+  sysAssert(c.size >= PageSize)
   inc(a.freeMem, c.size)
   when coalescRight:
     var ri = cast[PChunk](cast[TAddress](c) +% c.size)
-    assert((cast[TAddress](ri) and PageMask) == 0)
+    sysAssert((cast[TAddress](ri) and PageMask) == 0)
     if isAccessible(a, ri) and chunkUnused(ri):
-      assert(not isSmallChunk(ri))
+      sysAssert(not isSmallChunk(ri))
       if not isSmallChunk(ri):
         ListRemove(a.freeChunksList, cast[PBigChunk](ri))
         inc(c.size, ri.size)
@@ -373,9 +373,9 @@ proc freeBigChunk(a: var TAllocator, c: PBigChunk) =
   when coalescLeft:
     if c.prevSize != 0: 
       var le = cast[PChunk](cast[TAddress](c) -% c.prevSize)
-      assert((cast[TAddress](le) and PageMask) == 0)
+      sysAssert((cast[TAddress](le) and PageMask) == 0)
       if isAccessible(a, le) and chunkUnused(le):
-        assert(not isSmallChunk(le))
+        sysAssert(not isSmallChunk(le))
         if not isSmallChunk(le):
           ListRemove(a.freeChunksList, cast[PBigChunk](le))
           inc(le.size, c.size)
@@ -390,9 +390,9 @@ proc freeBigChunk(a: var TAllocator, c: PBigChunk) =
   else:
     freeOsChunks(a, c, c.size)
 
-proc splitChunk(a: var TAllocator, c: PBigChunk, size: int) = 
+proc splitChunk(a: var TMemRegion, c: PBigChunk, size: int) = 
   var rest = cast[PBigChunk](cast[TAddress](c) +% size)
-  assert(rest notin a.freeChunksList)
+  sysAssert(rest notin a.freeChunksList)
   rest.size = c.size - size
   rest.used = false
   rest.next = nil
@@ -403,14 +403,14 @@ proc splitChunk(a: var TAllocator, c: PBigChunk, size: int) =
   incl(a, a.chunkStarts, pageIndex(rest))
   ListAdd(a.freeChunksList, rest)
 
-proc getBigChunk(a: var TAllocator, size: int): PBigChunk = 
+proc getBigChunk(a: var TMemRegion, size: int): PBigChunk = 
   # use first fit for now:
-  assert((size and PageMask) == 0)
-  assert(size > 0)
+  sysAssert((size and PageMask) == 0)
+  sysAssert(size > 0)
   result = a.freeChunksList
   block search:
     while result != nil:
-      assert chunkUnused(result)
+      sysAssert chunkUnused(result)
       if result.size == size: 
         ListRemove(a.freeChunksList, result)
         break search
@@ -419,7 +419,7 @@ proc getBigChunk(a: var TAllocator, size: int): PBigChunk =
         splitChunk(a, result, size)
         break search
       result = result.next
-      assert result != a.freeChunksList
+      sysAssert result != a.freeChunksList
     if size < InitialMemoryRequest: 
       result = requestOsChunks(a, InitialMemoryRequest)
       splitChunk(a, result, size)
@@ -430,10 +430,10 @@ proc getBigChunk(a: var TAllocator, size: int): PBigChunk =
   incl(a, a.chunkStarts, pageIndex(result))
   dec(a.freeMem, size)
 
-proc getSmallChunk(a: var TAllocator): PSmallChunk = 
+proc getSmallChunk(a: var TMemRegion): PSmallChunk = 
   var res = getBigChunk(a, PageSize)
-  assert res.prev == nil
-  assert res.next == nil
+  sysAssert res.prev == nil
+  sysAssert res.next == nil
   result = cast[PSmallChunk](res)
 
 # -----------------------------------------------------------------------------
@@ -442,9 +442,13 @@ proc getCellSize(p: pointer): int {.inline.} =
   var c = pageAddr(p)
   result = c.size
   
-proc rawAlloc(a: var TAllocator, requestedSize: int): pointer =
-  assert(roundup(65, 8) == 72)
-  assert requestedSize >= sizeof(TFreeCell)
+proc memSize(a: TMemRegion, p: pointer): int {.inline.} =
+  var c = pageAddr(p)
+  result = c.size
+    
+proc rawAlloc(a: var TMemRegion, requestedSize: int): pointer =
+  sysAssert(roundup(65, 8) == 72)
+  sysAssert requestedSize >= sizeof(TFreeCell)
   var size = roundup(requestedSize, MemAlign)
   #c_fprintf(c_stdout, "alloc; size: %ld; %ld\n", requestedSize, size)
   if size <= SmallChunkSize-smallChunkOverhead(): 
@@ -454,7 +458,7 @@ proc rawAlloc(a: var TAllocator, requestedSize: int): pointer =
     if c == nil: 
       c = getSmallChunk(a)
       c.freeList = nil
-      assert c.size == PageSize
+      sysAssert c.size == PageSize
       c.size = size
       c.acc = size
       c.free = SmallChunkSize - smallChunkOverhead() - size
@@ -462,36 +466,40 @@ proc rawAlloc(a: var TAllocator, requestedSize: int): pointer =
       c.prev = nil
       ListAdd(a.freeSmallChunks[s], c)
       result = addr(c.data)
-      assert((cast[TAddress](result) and (MemAlign-1)) == 0)
+      sysAssert((cast[TAddress](result) and (MemAlign-1)) == 0)
     else:
-      assert c.next != c
+      sysAssert c.next != c
       #if c.size != size:
       #  c_fprintf(c_stdout, "csize: %lld; size %lld\n", c.size, size)
-      assert c.size == size
+      sysAssert c.size == size
       if c.freeList == nil:
-        assert(c.acc + smallChunkOverhead() + size <= SmallChunkSize) 
+        sysAssert(c.acc + smallChunkOverhead() + size <= SmallChunkSize) 
         result = cast[pointer](cast[TAddress](addr(c.data)) +% c.acc)
         inc(c.acc, size)      
       else:
         result = c.freeList
-        assert(c.freeList.zeroField == 0)
+        sysAssert(c.freeList.zeroField == 0)
         c.freeList = c.freeList.next
       dec(c.free, size)
-      assert((cast[TAddress](result) and (MemAlign-1)) == 0)
+      sysAssert((cast[TAddress](result) and (MemAlign-1)) == 0)
     if c.free < size: 
       ListRemove(a.freeSmallChunks[s], c)
   else:
     size = roundup(requestedSize+bigChunkOverhead(), PageSize)
     # allocate a large block
     var c = getBigChunk(a, size)
-    assert c.prev == nil
-    assert c.next == nil
-    assert c.size == size
+    sysAssert c.prev == nil
+    sysAssert c.next == nil
+    sysAssert c.size == size
     result = addr(c.data)
-    assert((cast[TAddress](result) and (MemAlign-1)) == 0)
-  assert(isAccessible(a, result))
+    sysAssert((cast[TAddress](result) and (MemAlign-1)) == 0)
+  sysAssert(isAccessible(a, result))
+
+proc rawAlloc0(a: var TMemRegion, requestedSize: int): pointer =
+  result = rawAlloc(a, requestedSize)
+  zeroMem(result, requestedSize)
 
-proc rawDealloc(a: var TAllocator, p: pointer) = 
+proc rawDealloc(a: var TMemRegion, p: pointer) = 
   var c = pageAddr(p)
   if isSmallChunk(c):
     # `p` is within a small chunk:
@@ -499,7 +507,7 @@ proc rawDealloc(a: var TAllocator, p: pointer) =
     var s = c.size
     var f = cast[ptr TFreeCell](p)
     #echo("setting to nil: ", $cast[TAddress](addr(f.zeroField)))
-    assert(f.zeroField != 0)
+    sysAssert(f.zeroField != 0)
     f.zeroField = 0
     f.next = c.freeList
     c.freeList = f
@@ -509,7 +517,7 @@ proc rawDealloc(a: var TAllocator, p: pointer) =
                s -% sizeof(TFreeCell))
     # check if it is not in the freeSmallChunks[s] list:
     if c.free < s:
-      assert c notin a.freeSmallChunks[s div memAlign]
+      sysAssert c notin a.freeSmallChunks[s div memAlign]
       # add it to the freeSmallChunks[s] array:
       ListAdd(a.freeSmallChunks[s div memAlign], c)
       inc(c.free, s)
@@ -525,7 +533,7 @@ proc rawDealloc(a: var TAllocator, p: pointer) =
     # free big chunk
     freeBigChunk(a, cast[PBigChunk](c))
 
-proc isAllocatedPtr(a: TAllocator, p: pointer): bool = 
+proc isAllocatedPtr(a: TMemRegion, p: pointer): bool = 
   if isAccessible(a, p):
     var c = pageAddr(p)
     if not chunkUnused(c):
@@ -539,40 +547,40 @@ proc isAllocatedPtr(a: TAllocator, p: pointer): bool =
         var c = cast[PBigChunk](c)
         result = p == addr(c.data) and cast[ptr TFreeCell](p).zeroField >% 1
 
-proc deallocOsPages(a: var TAllocator) =
-  # we free every 'ordinarily' allocated page by iterating over the page
-  # bits:
-  for p in elements(a.chunkStarts): 
+proc deallocOsPages(a: var TMemRegion) =
+  # we free every 'ordinarily' allocated page by iterating over the page bits:
+  for p in elements(a.chunkStarts):
     var page = cast[PChunk](p shl pageShift)
     var size = if page.size < PageSize: PageSize else: page.size
     osDeallocPages(page, size)
   # And then we free the pages that are in use for the page bits:
   llDeallocAll(a)
 
-var
-  allocator {.rtlThreadVar.}: TAllocator
+proc getFreeMem(a: TMemRegion): int {.inline.} = result = a.freeMem
+proc getTotalMem(a: TMemRegion): int {.inline.} = result = a.currMem
+proc getOccupiedMem(a: TMemRegion): int {.inline.} = 
+  result = a.currMem - a.freeMem
 
-proc deallocOsPages = deallocOsPages(allocator)
+# ---------------------- thread memory region -------------------------------
 
-# ---------------------- interface to programs -------------------------------
+template InstantiateForRegion(allocator: expr) =
+  proc deallocOsPages = deallocOsPages(allocator)
 
-when not defined(useNimRtl):
-
-  proc unlockedAlloc(size: int): pointer {.inline.} =
+  proc unlockedAlloc(size: int): pointer =
     result = rawAlloc(allocator, size+sizeof(TFreeCell))
     cast[ptr TFreeCell](result).zeroField = 1 # mark it as used
-    assert(not isAllocatedPtr(allocator, result))
+    sysAssert(not isAllocatedPtr(allocator, result))
     result = cast[pointer](cast[TAddress](result) +% sizeof(TFreeCell))
 
-  proc unlockedAlloc0(size: int): pointer {.inline.} =
+  proc unlockedAlloc0(size: int): pointer =
     result = unlockedAlloc(size)
     zeroMem(result, size)
 
-  proc unlockedDealloc(p: pointer) {.inline.} =
+  proc unlockedDealloc(p: pointer) =
     var x = cast[pointer](cast[TAddress](p) -% sizeof(TFreeCell))
-    assert(cast[ptr TFreeCell](x).zeroField == 1)
+    sysAssert(cast[ptr TFreeCell](x).zeroField == 1)
     rawDealloc(allocator, x)
-    assert(not isAllocatedPtr(allocator, x))
+    sysAssert(not isAllocatedPtr(allocator, x))
 
   proc alloc(size: int): pointer =
     when hasThreadSupport and hasSharedHeap: AcquireSys(HeapLock)
@@ -601,37 +609,18 @@ when not defined(useNimRtl):
     elif p != nil:
       dealloc(p)
 
-  proc countFreeMem(): int =
-    # only used for assertions
-    var it = allocator.freeChunksList
-    while it != nil:
-      inc(result, it.size)
-      it = it.next
+  when false:
+    proc countFreeMem(): int =
+      # only used for assertions
+      var it = allocator.freeChunksList
+      while it != nil:
+        inc(result, it.size)
+        it = it.next
 
   proc getFreeMem(): int = 
     result = allocator.freeMem
-    #assert(result == countFreeMem())
+    #sysAssert(result == countFreeMem())
 
   proc getTotalMem(): int = return allocator.currMem
   proc getOccupiedMem(): int = return getTotalMem() - getFreeMem()
 
-when isMainModule:
-  const iterations = 4000_000
-  incl(allocator.chunkStarts, 11)
-  assert 11 in allocator.chunkStarts
-  excl(allocator.chunkStarts, 11)
-  assert 11 notin allocator.chunkStarts
-  var p: array [1..iterations, pointer]
-  for i in 7..7:
-    var x = i * 8
-    for j in 1.. iterations:
-      p[j] = alloc(allocator, x)
-    for j in 1..iterations:
-      assert isAllocatedPtr(allocator, p[j])
-    echo($i, " used memory: ", $(allocator.currMem))
-    for j in countdown(iterations, 1):
-      #echo("j: ", $j)
-      dealloc(allocator, p[j])
-      assert(not isAllocatedPtr(allocator, p[j]))
-    echo($i, " after freeing: ", $(allocator.currMem))
-    
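``InstantiateForRegion`` generates the classic global interface (``alloc``, ``dealloc``,
``getFreeMem``, ...) for one concrete region; gc.nim and mmdisp.nim in this patch
instantiate it like this:

  var allocator {.rtlThreadVar.}: TMemRegion   # one region per thread
  InstantiateForRegion(allocator)
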
diff --git a/lib/system/assign.nim b/lib/system/assign.nim
index aa5cd3af3..33bfa15f3 100755
--- a/lib/system/assign.nim
+++ b/lib/system/assign.nim
@@ -24,7 +24,7 @@ proc genericAssignAux(dest, src: Pointer, n: ptr TNimNode, shallow: bool) =
             n.typ.size)
     var m = selectBranch(src, n)
     if m != nil: genericAssignAux(dest, src, m, shallow)
-  of nkNone: assert(false)
+  of nkNone: sysAssert(false)
   #else:
   #  echo "ugh memory corruption! ", n.kind
   #  quit 1
@@ -33,7 +33,7 @@ proc genericAssignAux(dest, src: Pointer, mt: PNimType, shallow: bool) =
   var
     d = cast[TAddress](dest)
     s = cast[TAddress](src)
-  assert(mt != nil)
+  sysAssert(mt != nil)
   case mt.Kind
   of tyString:
     var x = cast[ppointer](dest)
@@ -50,7 +50,7 @@ proc genericAssignAux(dest, src: Pointer, mt: PNimType, shallow: bool) =
       # this can happen! nil sequences are allowed
       unsureAsgnRef(x, s2)
       return
-    assert(dest != nil)
+    sysAssert(dest != nil)
     unsureAsgnRef(x, newObj(mt, seq.len * mt.base.size + GenericSeqSize))
     var dst = cast[taddress](cast[ppointer](dest)[])
     for i in 0..seq.len-1:
@@ -101,7 +101,7 @@ proc objectInit(dest: Pointer, typ: PNimType) {.compilerProc.}
 proc objectInitAux(dest: Pointer, n: ptr TNimNode) =
   var d = cast[TAddress](dest)
   case n.kind
-  of nkNone: assert(false)
+  of nkNone: sysAssert(false)
   of nkSLot: objectInit(cast[pointer](d +% n.offset), n.typ)
   of nkList:
     for i in 0..n.len-1:
@@ -134,7 +134,7 @@ proc genericReset(dest: Pointer, mt: PNimType) {.compilerProc.}
 proc genericResetAux(dest: Pointer, n: ptr TNimNode) =
   var d = cast[TAddress](dest)
   case n.kind
-  of nkNone: assert(false)
+  of nkNone: sysAssert(false)
   of nkSlot: genericReset(cast[pointer](d +% n.offset), n.typ)
   of nkList:
     for i in 0..n.len-1: genericResetAux(dest, n.sons[i])
@@ -145,7 +145,7 @@ proc genericResetAux(dest: Pointer, n: ptr TNimNode) =
 
 proc genericReset(dest: Pointer, mt: PNimType) =
   var d = cast[TAddress](dest)
-  assert(mt != nil)
+  sysAssert(mt != nil)
   case mt.Kind
   of tyString, tyRef, tySequence:
     unsureAsgnRef(cast[ppointer](dest), nil)
@@ -173,4 +173,4 @@ proc FieldDiscriminantCheck(oldDiscVal, newDiscVal: int,
   if newBranch != oldBranch and oldDiscVal != 0:
     raise newException(EInvalidField, 
                        "assignment to discriminant changes object branch")
-  
+
diff --git a/lib/system/atomics.nim b/lib/system/atomics.nim
index 31c25c5af..64f8e03e0 100644
--- a/lib/system/atomics.nim
+++ b/lib/system/atomics.nim
@@ -22,14 +22,14 @@ else:
     inc(p, val)
     result = p
 
-proc atomicInc(memLoc: var int, x: int): int =
+proc atomicInc(memLoc: var int, x: int = 1): int =
   when hasThreadSupport:
     result = sync_add_and_fetch(memLoc, x)
   else:
     inc(memLoc, x)
     result = memLoc
   
-proc atomicDec(memLoc: var int, x: int): int =
+proc atomicDec(memLoc: var int, x: int = 1): int =
   when hasThreadSupport:
     when defined(sync_sub_and_fetch):
       result = sync_sub_and_fetch(memLoc, x)
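With the new default parameter the common case needs no explicit delta; hypothetical usage:

  var counter = 0
  discard atomicInc(counter)       # same as atomicInc(counter, 1)
  discard atomicDec(counter, 2)    # an explicit delta still works
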
diff --git a/lib/system/cellsets.nim b/lib/system/cellsets.nim
index e262d4b77..7502636fa 100755
--- a/lib/system/cellsets.nim
+++ b/lib/system/cellsets.nim
@@ -102,9 +102,9 @@ proc CellSetGet(t: TCellSet, key: TAddress): PPageDesc =
 proc CellSetRawInsert(t: TCellSet, data: PPageDescArray, desc: PPageDesc) =
   var h = cast[int](desc.key) and t.max
   while data[h] != nil:
-    assert(data[h] != desc)
+    sysAssert(data[h] != desc)
     h = nextTry(h, t.max)
-  assert(data[h] == nil)
+  sysAssert(data[h] == nil)
   data[h] = desc
 
 proc CellSetEnlarge(t: var TCellSet) =
@@ -130,7 +130,7 @@ proc CellSetPut(t: var TCellSet, key: TAddress): PPageDesc =
   inc(t.counter)
   h = cast[int](key) and t.max
   while t.data[h] != nil: h = nextTry(h, t.max)
-  assert(t.data[h] == nil)
+  sysAssert(t.data[h] == nil)
   # the new page descriptor goes into result
   result = cast[PPageDesc](unlockedAlloc0(sizeof(TPageDesc)))
   result.next = t.head
diff --git a/lib/system/ecmasys.nim b/lib/system/ecmasys.nim
index e2ecb370a..7f91feb6b 100755
--- a/lib/system/ecmasys.nim
+++ b/lib/system/ecmasys.nim
@@ -408,7 +408,7 @@ proc NimCopy(x: pointer, ti: PNimType): pointer {.compilerproc.}
 
 proc NimCopyAux(dest, src: Pointer, n: ptr TNimNode) {.exportc.} =
   case n.kind
-  of nkNone: assert(false)
+  of nkNone: sysAssert(false)
   of nkSlot:
     asm "`dest`[`n`.offset] = NimCopy(`src`[`n`.offset], `n`.typ);"
   of nkList:
diff --git a/lib/system/gc.nim b/lib/system/gc.nim
index 29fd2eae5..d1fa98514 100755
--- a/lib/system/gc.nim
+++ b/lib/system/gc.nim
@@ -53,17 +53,20 @@ type
   
   TGcHeap {.final, pure.} = object # this contains the zero count and
                                    # non-zero count table
+    stackBottom: pointer
+    cycleThreshold: int
     zct: TCellSeq            # the zero count table
     decStack: TCellSeq       # cells in the stack that are to decref again
     cycleRoots: TCellSet
     tempStack: TCellSeq      # temporary stack for recursion elimination
     recGcLock: int           # prevent recursion via finalizers; no thread lock
+    region: TMemRegion       # garbage collected region
     stat: TGcStat
 
 var
-  stackBottom {.rtlThreadVar.}: pointer
   gch {.rtlThreadVar.}: TGcHeap
-  cycleThreshold {.rtlThreadVar.}: int
+
+InstantiateForRegion(gch.region)
 
 proc acquire(gch: var TGcHeap) {.inline.} = 
   when hasThreadSupport and hasSharedHeap:
@@ -124,30 +127,30 @@ when traceGC:
     of csAllocated:
       if c in states[csAllocated]:
         writeCell("attempt to alloc an already allocated cell", c)
-        assert(false)
+        sysAssert(false)
       excl(states[csCycFreed], c)
       excl(states[csZctFreed], c)
     of csZctFreed:
       if c in states[csZctFreed]:
         writeCell("attempt to free zct cell twice", c)
-        assert(false)
+        sysAssert(false)
       if c in states[csCycFreed]:
         writeCell("attempt to free with zct, but already freed with cyc", c)
-        assert(false)
+        sysAssert(false)
       if c notin states[csAllocated]:
         writeCell("attempt to free not an allocated cell", c)
-        assert(false)
+        sysAssert(false)
       excl(states[csAllocated], c)
     of csCycFreed:
       if c notin states[csAllocated]:
         writeCell("attempt to free a not allocated cell", c)
-        assert(false)
+        sysAssert(false)
       if c in states[csCycFreed]:
         writeCell("attempt to free cyc cell twice", c)
-        assert(false)
+        sysAssert(false)
       if c in states[csZctFreed]:
         writeCell("attempt to free with cyc, but already freed with zct", c)
-        assert(false)
+        sysAssert(false)
       excl(states[csAllocated], c)
     incl(states[state], c)
 
@@ -216,7 +219,7 @@ proc decRef(c: PCell) {.inline.} =
   when stressGC:
     if c.refcount <% rcIncrement:
       writeCell("broken cell", c)
-  assert(c.refcount >=% rcIncrement)
+  sysAssert(c.refcount >=% rcIncrement)
   #if c.refcount <% rcIncrement: quit("leck mich")
   if --c.refcount:
     rtlAddZCT(c)
@@ -233,7 +236,7 @@ proc nimGCunref(p: pointer) {.compilerProc, inline.} = decRef(usrToCell(p))
 
 proc asgnRef(dest: ppointer, src: pointer) {.compilerProc, inline.} =
   # the code generator calls this proc!
-  assert(not isOnStack(dest))
+  sysAssert(not isOnStack(dest))
   # BUGFIX: first incRef then decRef!
   if src != nil: incRef(usrToCell(src))
   if dest[] != nil: decRef(usrToCell(dest[]))
@@ -267,7 +270,7 @@ proc initGC() =
   when not defined(useNimRtl):
     when traceGC:
       for i in low(TCellState)..high(TCellState): Init(states[i])
-    cycleThreshold = InitialCycleThreshold
+    gch.cycleThreshold = InitialCycleThreshold
     gch.stat.stackScans = 0
     gch.stat.cycleCollections = 0
     gch.stat.maxThreshold = 0
@@ -289,7 +292,7 @@ proc forAllSlotsAux(dest: pointer, n: ptr TNimNode, op: TWalkOp) =
   of nkCase:
     var m = selectBranch(dest, n)
     if m != nil: forAllSlotsAux(dest, m, op)
-  of nkNone: assert(false)
+  of nkNone: sysAssert(false)
 
 proc forAllChildrenAux(dest: Pointer, mt: PNimType, op: TWalkOp) =
   var d = cast[TAddress](dest)
@@ -306,9 +309,9 @@ proc forAllChildrenAux(dest: Pointer, mt: PNimType, op: TWalkOp) =
     else: nil
 
 proc forAllChildren(cell: PCell, op: TWalkOp) =
-  assert(cell != nil)
-  assert(cell.typ != nil)
-  assert cell.typ.kind in {tyRef, tySequence, tyString}
+  sysAssert(cell != nil)
+  sysAssert(cell.typ != nil)
+  sysAssert cell.typ.kind in {tyRef, tySequence, tyString}
   case cell.typ.Kind
   of tyRef: # common case
     forAllChildrenAux(cellToUsr(cell), cell.typ.base, op)
@@ -321,12 +324,7 @@ proc forAllChildren(cell: PCell, op: TWalkOp) =
           GenericSeqSize), cell.typ.base, op)
   else: nil
 
-proc checkCollection {.inline.} =
-  # checks if a collection should be done
-  if gch.recGcLock == 0:
-    collectCT(gch)
-
-proc addNewObjToZCT(res: PCell) {.inline.} =
+proc addNewObjToZCT(res: PCell, gch: var TGcHeap) {.inline.} =
   # we check the last 8 entries (cache line) for a slot that could be reused.
   # In 63% of all cases we succeed here! But we have to optimize the heck
   # out of this small linear search so that ``newObj`` is not slowed down.
@@ -370,14 +368,14 @@ proc addNewObjToZCT(res: PCell) {.inline.} =
         return
     add(gch.zct, res)
 
-proc newObj(typ: PNimType, size: int): pointer {.compilerRtl.} =
+proc newObj(typ: PNimType, size: int, gch: var TGcHeap): pointer =
   # generates a new object and sets its reference counter to 0
   acquire(gch)
-  assert(typ.kind in {tyRef, tyString, tySequence})
-  checkCollection()
-  var res = cast[PCell](rawAlloc(allocator, size + sizeof(TCell)))
+  sysAssert(typ.kind in {tyRef, tyString, tySequence})
+  collectCT(gch)
+  var res = cast[PCell](rawAlloc(gch.region, size + sizeof(TCell)))
   zeroMem(res, size+sizeof(TCell))
-  assert((cast[TAddress](res) and (MemAlign-1)) == 0)
+  sysAssert((cast[TAddress](res) and (MemAlign-1)) == 0)
   # now it is buffered in the ZCT
   res.typ = typ
   when debugGC and not hasThreadSupport:
@@ -385,13 +383,16 @@ proc newObj(typ: PNimType, size: int): pointer {.compilerRtl.} =
       res.filename = framePtr.prev.filename
       res.line = framePtr.prev.line
   res.refcount = rcZct # refcount is zero, but mark it to be in the ZCT  
-  assert(isAllocatedPtr(allocator, res))
+  sysAssert(isAllocatedPtr(gch.region, res))
   # its refcount is zero, so add it to the ZCT:
-  addNewObjToZCT(res)
+  addNewObjToZCT(res, gch)
   when logGC: writeCell("new cell", res)
   gcTrace(res, csAllocated)  
   release(gch)
-  result = cellToUsr(res)
+  result = cellToUsr(res)  
+
+proc newObj(typ: PNimType, size: int): pointer {.compilerRtl.} =
+  result = newObj(typ, size, gch)
 
 proc newSeq(typ: PNimType, len: int): pointer {.compilerRtl.} =
   # `newObj` already uses locks, so no need for them here.
@@ -399,23 +400,22 @@ proc newSeq(typ: PNimType, len: int): pointer {.compilerRtl.} =
   cast[PGenericSeq](result).len = len
   cast[PGenericSeq](result).space = len
 
-proc growObj(old: pointer, newsize: int): pointer {.rtl.} =
+proc growObj(old: pointer, newsize: int, gch: var TGcHeap): pointer =
   acquire(gch)
-  checkCollection()
+  collectCT(gch)
   var ol = usrToCell(old)
-  assert(ol.typ != nil)
-  assert(ol.typ.kind in {tyString, tySequence})
-  var res = cast[PCell](rawAlloc(allocator, newsize + sizeof(TCell)))
+  sysAssert(ol.typ != nil)
+  sysAssert(ol.typ.kind in {tyString, tySequence})
+  var res = cast[PCell](rawAlloc(gch.region, newsize + sizeof(TCell)))
   var elemSize = 1
-  if ol.typ.kind != tyString:
-    elemSize = ol.typ.base.size
+  if ol.typ.kind != tyString: elemSize = ol.typ.base.size
   
   var oldsize = cast[PGenericSeq](old).len*elemSize + GenericSeqSize
   copyMem(res, ol, oldsize + sizeof(TCell))
   zeroMem(cast[pointer](cast[TAddress](res)+% oldsize +% sizeof(TCell)),
           newsize-oldsize)
-  assert((cast[TAddress](res) and (MemAlign-1)) == 0)
-  assert(res.refcount shr rcShift <=% 1)
+  sysAssert((cast[TAddress](res) and (MemAlign-1)) == 0)
+  sysAssert(res.refcount shr rcShift <=% 1)
   #if res.refcount <% rcIncrement:
   #  add(gch.zct, res)
   #else: # XXX: what to do here?
@@ -434,29 +434,32 @@ proc growObj(old: pointer, newsize: int): pointer {.rtl.} =
     writeCell("growObj new cell", res)
   gcTrace(ol, csZctFreed)
   gcTrace(res, csAllocated)
-  when reallyDealloc: rawDealloc(allocator, ol)
+  when reallyDealloc: rawDealloc(gch.region, ol)
   else:
-    assert(ol.typ != nil)
+    sysAssert(ol.typ != nil)
     zeroMem(ol, sizeof(TCell))
   release(gch)
   result = cellToUsr(res)
 
+proc growObj(old: pointer, newsize: int): pointer {.rtl.} =
+  result = growObj(old, newsize, gch)
+
 # ---------------- cycle collector -------------------------------------------
 
 proc doOperation(p: pointer, op: TWalkOp) =
   if p == nil: return
   var c: PCell = usrToCell(p)
-  assert(c != nil)
+  sysAssert(c != nil)
   case op # faster than function pointers because of easy prediction
   of waZctDecRef:
-    assert(c.refcount >=% rcIncrement)
+    sysAssert(c.refcount >=% rcIncrement)
     c.refcount = c.refcount -% rcIncrement
     when logGC: writeCell("decref (from doOperation)", c)
     if c.refcount <% rcIncrement: addZCT(gch.zct, c)
   of waPush:
     add(gch.tempStack, c)
   of waCycleDecRef:
-    assert(c.refcount >=% rcIncrement)
+    sysAssert(c.refcount >=% rcIncrement)
     c.refcount = c.refcount -% rcIncrement
 
 # we now use a much simpler and non-recursive algorithm for cycle removal
@@ -496,20 +499,20 @@ proc collectCycles(gch: var TGcHeap) =
       prepareDealloc(c)
       gcTrace(c, csCycFreed)
       when logGC: writeCell("cycle collector dealloc cell", c)
-      when reallyDealloc: rawDealloc(allocator, c)
+      when reallyDealloc: rawDealloc(gch.region, c)
       else:
-        assert(c.typ != nil)
+        sysAssert(c.typ != nil)
         zeroMem(c, sizeof(TCell))
   Deinit(gch.cycleRoots)
   Init(gch.cycleRoots)
 
-proc gcMark(p: pointer) {.inline.} =
+proc gcMark(gch: var TGcHeap, p: pointer) {.inline.} =
   # the addresses are not as cells on the stack, so turn them to cells:
   var cell = usrToCell(p)
   var c = cast[TAddress](cell)
   if c >% PageSize and (c and (MemAlign-1)) == 0:
     # fast check: does it look like a cell?
-    if isAllocatedPtr(allocator, cell): 
+    if isAllocatedPtr(gch.region, cell): 
       # mark the cell:
       cell.refcount = cell.refcount +% rcIncrement
       add(gch.decStack, cell)
@@ -520,13 +523,13 @@ proc markThreadStacks(gch: var TGcHeap) =
     var it = threadList
     while it != nil:
       # mark registers: 
-      for i in 0 .. high(it.registers): gcMark(it.registers[i])
+      for i in 0 .. high(it.registers): gcMark(gch, it.registers[i])
       var sp = cast[TAddress](it.stackBottom)
       var max = cast[TAddress](it.stackTop)
       # XXX stack direction?
       # XXX unroll this loop:
       while sp <=% max:
-        gcMark(cast[ppointer](sp)[])
+        gcMark(gch, cast[ppointer](sp)[])
         sp = sp +% sizeof(pointer)
       it = it.next
 
@@ -545,24 +548,24 @@ when not defined(useNimRtl):
   proc setStackBottom(theStackBottom: pointer) =
     #c_fprintf(c_stdout, "stack bottom: %p;\n", theStackBottom)
     # the first init must be the one that defines the stack bottom:
-    if stackBottom == nil: stackBottom = theStackBottom
+    if gch.stackBottom == nil: gch.stackBottom = theStackBottom
     else:
       var a = cast[TAddress](theStackBottom) # and not PageMask - PageSize*2
-      var b = cast[TAddress](stackBottom)
+      var b = cast[TAddress](gch.stackBottom)
       when stackIncreases:
-        stackBottom = cast[pointer](min(a, b))
+        gch.stackBottom = cast[pointer](min(a, b))
       else:
-        stackBottom = cast[pointer](max(a, b))
+        gch.stackBottom = cast[pointer](max(a, b))
 
 proc stackSize(): int {.noinline.} =
   var stackTop {.volatile.}: pointer
-  result = abs(cast[int](addr(stackTop)) - cast[int](stackBottom))
+  result = abs(cast[int](addr(stackTop)) - cast[int](gch.stackBottom))
 
 when defined(sparc): # For SPARC architecture.
   proc isOnStack(p: pointer): bool =
     var stackTop {.volatile.}: pointer
     stackTop = addr(stackTop)
-    var b = cast[TAddress](stackBottom)
+    var b = cast[TAddress](gch.stackBottom)
     var a = cast[TAddress](stackTop)
     var x = cast[TAddress](p)
     result = a <=% x and x <=% b
@@ -574,13 +577,13 @@ when defined(sparc): # For SPARC architecture.
       asm  """"ta      0x3   ! ST_FLUSH_WINDOWS\n" """
 
     var
-      max = stackBottom
+      max = gch.stackBottom
       sp: PPointer
       stackTop: array[0..1, pointer]
     sp = addr(stackTop[0])
     # Addresses decrease as the stack grows.
     while sp <= max:
-      gcMark(sp[])
+      gcMark(gch, sp[])
       sp = cast[ppointer](cast[TAddress](sp) +% sizeof(pointer))
 
 elif defined(ELATE):
@@ -593,7 +596,7 @@ elif stackIncreases:
   proc isOnStack(p: pointer): bool =
     var stackTop {.volatile.}: pointer
     stackTop = addr(stackTop)
-    var a = cast[TAddress](stackBottom)
+    var a = cast[TAddress](gch.stackBottom)
     var b = cast[TAddress](stackTop)
     var x = cast[TAddress](p)
     result = a <=% x and x <=% b
@@ -606,12 +609,12 @@ elif stackIncreases:
   proc markStackAndRegisters(gch: var TGcHeap) {.noinline, cdecl.} =
     var registers: C_JmpBuf
     if c_setjmp(registers) == 0'i32: # To fill the C stack with registers.
-      var max = cast[TAddress](stackBottom)
+      var max = cast[TAddress](gch.stackBottom)
       var sp = cast[TAddress](addr(registers)) +% jmpbufSize -% sizeof(pointer)
       # sp will traverse the JMP_BUF as well (jmp_buf size is added,
       # otherwise sp would be below the registers structure).
       while sp >=% max:
-        gcMark(cast[ppointer](sp)[])
+        gcMark(gch, cast[ppointer](sp)[])
         sp = sp -% sizeof(pointer)
 
 else:
@@ -621,7 +624,7 @@ else:
   proc isOnStack(p: pointer): bool =
     var stackTop {.volatile.}: pointer
     stackTop = addr(stackTop)
-    var b = cast[TAddress](stackBottom)
+    var b = cast[TAddress](gch.stackBottom)
     var a = cast[TAddress](stackTop)
     var x = cast[TAddress](p)
     result = a <=% x and x <=% b
@@ -633,22 +636,22 @@ else:
     type PStackSlice = ptr array [0..7, pointer]
     var registers: C_JmpBuf
     if c_setjmp(registers) == 0'i32: # To fill the C stack with registers.
-      var max = cast[TAddress](stackBottom)
+      var max = cast[TAddress](gch.stackBottom)
       var sp = cast[TAddress](addr(registers))
       # loop unrolled:
       while sp <% max - 8*sizeof(pointer):
-        gcMark(cast[PStackSlice](sp)[0])
-        gcMark(cast[PStackSlice](sp)[1])
-        gcMark(cast[PStackSlice](sp)[2])
-        gcMark(cast[PStackSlice](sp)[3])
-        gcMark(cast[PStackSlice](sp)[4])
-        gcMark(cast[PStackSlice](sp)[5])
-        gcMark(cast[PStackSlice](sp)[6])
-        gcMark(cast[PStackSlice](sp)[7])
+        gcMark(gch, cast[PStackSlice](sp)[0])
+        gcMark(gch, cast[PStackSlice](sp)[1])
+        gcMark(gch, cast[PStackSlice](sp)[2])
+        gcMark(gch, cast[PStackSlice](sp)[3])
+        gcMark(gch, cast[PStackSlice](sp)[4])
+        gcMark(gch, cast[PStackSlice](sp)[5])
+        gcMark(gch, cast[PStackSlice](sp)[6])
+        gcMark(gch, cast[PStackSlice](sp)[7])
         sp = sp +% sizeof(pointer)*8
       # last few entries:
       while sp <=% max:
-        gcMark(cast[ppointer](sp)[])
+        gcMark(gch, cast[ppointer](sp)[])
         sp = sp +% sizeof(pointer)
 
 # ----------------------------------------------------------------------------
@@ -664,7 +667,7 @@ proc CollectZCT(gch: var TGcHeap) =
   while L[] > 0:
     var c = gch.zct.d[0]
     # remove from ZCT:
-    assert((c.refcount and colorMask) == rcZct)
+    sysAssert((c.refcount and colorMask) == rcZct)
     c.refcount = c.refcount and not colorMask
     gch.zct.d[0] = gch.zct.d[L[] - 1]
     dec(L[])
@@ -683,41 +686,42 @@ proc CollectZCT(gch: var TGcHeap) =
       # access invalid memory. This is done by prepareDealloc():
       prepareDealloc(c)
       forAllChildren(c, waZctDecRef)
-      when reallyDealloc: rawDealloc(allocator, c)
+      when reallyDealloc: rawDealloc(gch.region, c)
       else:
-        assert(c.typ != nil)
+        sysAssert(c.typ != nil)
         zeroMem(c, sizeof(TCell))
 
 proc unmarkStackAndRegisters(gch: var TGcHeap) = 
   var d = gch.decStack.d
   for i in 0..gch.decStack.len-1:
-    assert isAllocatedPtr(allocator, d[i])
+    sysAssert isAllocatedPtr(gch.region, d[i])
     # decRef(d[i]) inlined: cannot create a cycle and must not acquire lock
     var c = d[i]
     # XXX no need for an atomic dec here:
     if --c.refcount:
       addZCT(gch.zct, c)
-    assert c.typ != nil
+    sysAssert c.typ != nil
   gch.decStack.len = 0
 
 proc collectCT(gch: var TGcHeap) =
-  if gch.zct.len >= ZctThreshold or (cycleGC and
-      getOccupiedMem() >= cycleThreshold) or stressGC:
+  if (gch.zct.len >= ZctThreshold or (cycleGC and
+      getOccupiedMem(gch.region) >= gch.cycleThreshold) or stressGC) and 
+      gch.recGcLock == 0:
     gch.stat.maxStackSize = max(gch.stat.maxStackSize, stackSize())
-    assert(gch.decStack.len == 0)
+    sysAssert(gch.decStack.len == 0)
     markStackAndRegisters(gch)
     markThreadStacks(gch)
     gch.stat.maxStackCells = max(gch.stat.maxStackCells, gch.decStack.len)
     inc(gch.stat.stackScans)
     collectZCT(gch)
     when cycleGC:
-      if getOccupiedMem() >= cycleThreshold or stressGC:
+      if getOccupiedMem() >= gch.cycleThreshold or stressGC:
         collectCycles(gch)
         collectZCT(gch)
         inc(gch.stat.cycleCollections)
-        cycleThreshold = max(InitialCycleThreshold, getOccupiedMem() *
-                             cycleIncrease)
-        gch.stat.maxThreshold = max(gch.stat.maxThreshold, cycleThreshold)
+        gch.cycleThreshold = max(InitialCycleThreshold, getOccupiedMem() *
+                                 cycleIncrease)
+        gch.stat.maxThreshold = max(gch.stat.maxThreshold, gch.cycleThreshold)
     unmarkStackAndRegisters(gch)
 
 when not defined(useNimRtl):
@@ -741,18 +745,18 @@ when not defined(useNimRtl):
     of gcOptimizeTime: nil
 
   proc GC_enableMarkAndSweep() =
-    cycleThreshold = InitialCycleThreshold
+    gch.cycleThreshold = InitialCycleThreshold
 
   proc GC_disableMarkAndSweep() =
-    cycleThreshold = high(cycleThreshold)-1
+    gch.cycleThreshold = high(gch.cycleThreshold)-1
     # set to the max value to suppress the cycle detector
 
   proc GC_fullCollect() =
     acquire(gch)
-    var oldThreshold = cycleThreshold
-    cycleThreshold = 0 # forces cycle collection
+    var oldThreshold = gch.cycleThreshold
+    gch.cycleThreshold = 0 # forces cycle collection
     collectCT(gch)
-    cycleThreshold = oldThreshold
+    gch.cycleThreshold = oldThreshold
     release(gch)
 
   proc GC_getStatistics(): string =
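The threshold now lives on the heap object, but the public knobs behave as before; usage
for reference (a sketch):

  GC_disableMarkAndSweep()   # raises gch.cycleThreshold to suppress the cycle detector
  # ... allocation-heavy phase ...
  GC_enableMarkAndSweep()    # restores InitialCycleThreshold
  GC_fullCollect()           # forces a collection by zeroing the threshold
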
diff --git a/lib/system/inboxes.nim b/lib/system/inboxes.nim
new file mode 100644
index 000000000..8f683f612
--- /dev/null
+++ b/lib/system/inboxes.nim
@@ -0,0 +1,203 @@
+#

+#

+#            Nimrod's Runtime Library

+#        (c) Copyright 2011 Andreas Rumpf

+#

+#    See the file "copying.txt", included in this

+#    distribution, for details about the copyright.

+#

+

+## Message passing for threads. The current implementation is slow and does

+## not work with cyclic data structures. But hey, it's better than nothing.

+

+type

+  pbytes = ptr array[0.. 0xffff, byte]

+  TInbox {.pure, final.} = object ## msg queue for a thread

+    rd, wr, count, mask: int

+    data: pbytes

+    lock: TSysLock

+    cond: TSysCond

+    elemType: PNimType

+    region: TMemRegion

+  PInbox = ptr TInbox

+  TLoadStoreMode = enum mStore, mLoad

+

+proc initInbox(p: pointer) =

+  var inbox = cast[PInbox](p)

+  initSysLock(inbox.lock)

+  initSysCond(inbox.cond)

+  inbox.mask = -1

+

+proc freeInbox(p: pointer) =

+  var inbox = cast[PInbox](p)

+  deallocOsPages(inbox.region)

+  deinitSys(inbox.lock)

+  deinitSysCond(inbox.cond)

+

+proc storeAux(dest, src: Pointer, mt: PNimType, t: PInbox, mode: TLoadStoreMode)
+proc storeAux(dest, src: Pointer, n: ptr TNimNode, t: PInbox,
+              mode: TLoadStoreMode) =
+  var
+    d = cast[TAddress](dest)
+    s = cast[TAddress](src)
+  case n.kind
+  of nkSlot: storeAux(cast[pointer](d +% n.offset),
+                      cast[pointer](s +% n.offset), n.typ, t, mode)
+  of nkList:
+    for i in 0..n.len-1: storeAux(dest, src, n.sons[i], t, mode)
+  of nkCase:
+    copyMem(cast[pointer](d +% n.offset), cast[pointer](s +% n.offset),
+            n.typ.size)
+    var m = selectBranch(src, n)
+    if m != nil: storeAux(dest, src, m, t, mode)
+  of nkNone: sysAssert(false)
+
+proc storeAux(dest, src: Pointer, mt: PNimType, t: PInbox,
+              mode: TLoadStoreMode) =
+  var
+    d = cast[TAddress](dest)
+    s = cast[TAddress](src)
+  sysAssert(mt != nil)
+  case mt.Kind
+  of tyString:
+    if mode == mStore:
+      var x = cast[ppointer](dest)
+      var s2 = cast[ppointer](s)[]
+      if s2 == nil:
+        x[] = nil
+      else:
+        var ss = cast[NimString](s2)
+        var ns = cast[NimString](rawAlloc(t.region, ss.len+1 + GenericSeqSize))
+        copyMem(ns, ss, ss.len+1 + GenericSeqSize)
+        x[] = ns
+    else:
+      var x = cast[ppointer](dest)
+      var s2 = cast[ppointer](s)[]
+      if s2 == nil:
+        unsureAsgnRef(x, s2)
+      else:
+        unsureAsgnRef(x, copyString(cast[NimString](s2)))
+        rawDealloc(t.region, s2)
+  of tySequence:
+    var s2 = cast[ppointer](src)[]
+    var seq = cast[PGenericSeq](s2)
+    var x = cast[ppointer](dest)
+    if s2 == nil:
+      if mode == mStore:
+        x[] = nil
+      else:
+        unsureAsgnRef(x, nil)
+    else:
+      sysAssert(dest != nil)
+      if mode == mStore:
+        x[] = rawAlloc(t.region, seq.len *% mt.base.size +% GenericSeqSize)
+      else:
+        unsureAsgnRef(x, newObj(mt, seq.len * mt.base.size + GenericSeqSize))
+      var dst = cast[taddress](cast[ppointer](dest)[])
+      for i in 0..seq.len-1:
+        storeAux(
+          cast[pointer](dst +% i*% mt.base.size +% GenericSeqSize),
+          cast[pointer](cast[TAddress](s2) +% i *% mt.base.size +%
+                        GenericSeqSize),
+          mt.Base, t, mode)
+      var dstseq = cast[PGenericSeq](dst)
+      dstseq.len = seq.len
+      dstseq.space = seq.len
+      if mode != mStore: rawDealloc(t.region, s2)
+  of tyObject:
+    # copy type field:
+    var pint = cast[ptr PNimType](dest)
+    # XXX use dynamic type here!
+    pint[] = mt
+    storeAux(dest, src, mt.node, t, mode)
+  of tyTuple, tyPureObject:
+    storeAux(dest, src, mt.node, t, mode)
+  of tyArray, tyArrayConstr:
+    for i in 0..(mt.size div mt.base.size)-1:
+      storeAux(cast[pointer](d +% i*% mt.base.size),
+               cast[pointer](s +% i*% mt.base.size), mt.base, t, mode)
+  of tyRef:
+    var s = cast[ppointer](src)[]
+    var x = cast[ppointer](dest)
+    if s == nil:
+      if mode == mStore:
+        x[] = nil
+      else:
+        unsureAsgnRef(x, nil)
+    else:
+      if mode == mStore:
+        x[] = rawAlloc(t.region, mt.base.size)
+      else:
+        # XXX we should use the dynamic type here too, but that is not stored in
+        # the inbox at all --> use source[]'s object type? but how? we need a
+        # tyRef to the object!
+        var obj = newObj(mt.base, mt.base.size)
+        unsureAsgnRef(x, obj)
+      storeAux(x[], s, mt.base, t, mode)
+      if mode != mStore: rawDealloc(t.region, s)
+  else:
+    copyMem(dest, src, mt.size) # copy raw bits
+
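+# The ``storeAux`` procs above implement both directions of the deep copy:
+# with ``mStore`` the GC'ed graph is copied from the sender into the inbox's
+# private memory region; with ``mLoad`` it is copied out of that region onto
+# the receiving thread's GC'ed heap and the region's copy is freed again.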

+proc rawSend(q: PInbox, data: pointer, typ: PNimType) =
+  ## adds a message to the end of the queue `q`.
+  var cap = q.mask+1
+  if q.count >= cap:
+    # start with capacity for 2 entries in the queue:
+    if cap == 0: cap = 1
+    var n = cast[pbytes](rawAlloc0(q.region, cap*2*typ.size))
+    var z = 0
+    var i = q.rd
+    var c = q.count
+    while c > 0:
+      dec c
+      copyMem(addr(n[z*typ.size]), addr(q.data[i*typ.size]), typ.size)
+      i = (i + 1) and q.mask
+      inc z
+    if q.data != nil: rawDealloc(q.region, q.data)
+    q.data = n
+    q.mask = cap*2 - 1
+    q.wr = q.count
+    q.rd = 0
+  storeAux(addr(q.data[q.wr * typ.size]), data, typ, q, mStore)
+  inc q.count
+  q.wr = (q.wr + 1) and q.mask
+
+proc rawRecv(q: PInbox, data: pointer, typ: PNimType) =
+  sysAssert q.count > 0
+  dec q.count
+  storeAux(data, addr(q.data[q.rd * typ.size]), typ, q, mLoad)
+  q.rd = (q.rd + 1) and q.mask
+
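+# The queue's capacity is always a power of two and ``mask == capacity-1``,
+# so ``(i + 1) and q.mask`` wraps an index without an expensive modulo:
+# with capacity 4 (mask == 3) the indices cycle 0, 1, 2, 3, 0, 1, ...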

+template lockInbox(q: expr, action: stmt) =
+  acquireSys(q.lock)
+  action
+  releaseSys(q.lock)
+
+proc send*[TMsg](receiver: var TThread[TMsg], msg: TMsg) =
+  ## sends a message to a thread. `msg` is deeply copied.
+  var q = cast[PInbox](getInBoxMem(receiver))
+  acquireSys(q.lock)
+  var m: TMsg
+  shallowCopy(m, msg)
+  rawSend(q, addr(m), cast[PNimType](getTypeInfo(msg)))
+  releaseSys(q.lock)
+  SignalSysCond(q.cond)
+
+proc recv*[TMsg](): TMsg =
+  ## receives a message from the calling thread's internal message queue.
+  ## This blocks until a message has arrived! You may use ``peek`` to
+  ## avoid the blocking.
+  var q = cast[PInbox](getInBoxMem())
+  acquireSys(q.lock)
+  while q.count <= 0:
+    WaitSysCond(q.cond, q.lock)
+  rawRecv(q, addr(result), cast[PNimType](getTypeInfo(result)))
+  releaseSys(q.lock)
+
+proc peek*(): int =
+  ## returns the current number of messages in the inbox.
+  var q = cast[PInbox](getInBoxMem())
+  lockInbox(q):
+    result = q.count
+
+
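Taken together, ``send``, ``recv`` and ``peek`` give every thread a typed
mailbox. A minimal sketch of how they compose (the names ``TIntMsg`` and
``worker`` are illustrative, not part of the patch):

  type
    TIntMsg = tuple[a: int]

  proc worker(arg: TIntMsg) {.thread.} =
    var m = recv[TIntMsg]()       # blocks until a message arrives
    echo m.a

  var thr: TThread[TIntMsg]
  createThread(thr, worker, (a: 0))
  send(thr, (a: 42))              # deeply copied into thr's inbox
  joinThread(thr)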

diff --git a/lib/system/mmdisp.nim b/lib/system/mmdisp.nim
index d450c520e..e5efff615 100755
--- a/lib/system/mmdisp.nim
+++ b/lib/system/mmdisp.nim
@@ -62,11 +62,10 @@ when defined(boehmgc):
     const boehmLib = "boehmgc.dll"
   elif defined(macosx):
     const boehmLib = "libgc.dylib"
-    
-    proc boehmGCinit {.importc: "GC_init", dynlib: boehmLib.}
   else:
     const boehmLib = "/usr/lib/libgc.so.1"
-
+    
+  proc boehmGCinit {.importc: "GC_init", dynlib: boehmLib.}
   proc boehmGC_disable {.importc: "GC_disable", dynlib: boehmLib.} 
   proc boehmGC_enable {.importc: "GC_enable", dynlib: boehmLib.} 
   proc boehmGCincremental {.
@@ -177,12 +176,20 @@ elif defined(nogc):
   proc asgnRefNoCycle(dest: ppointer, src: pointer) {.compilerproc, inline.} =
     dest[] = src
 
+  var allocator {.rtlThreadVar.}: TMemRegion
+  InstantiateForRegion(allocator)
+
   include "system/cellsets"
 
 else:
   include "system/alloc"
+
+  proc unlockedAlloc(size: int): pointer {.inline.} 
+  proc unlockedAlloc0(size: int): pointer {.inline.} 
+  proc unlockedDealloc(p: pointer) {.inline.} 
+  
   include "system/cellsets"
-  assert(sizeof(TCell) == sizeof(TFreeCell))
+  sysAssert(sizeof(TCell) == sizeof(TFreeCell))
   include "system/gc"
   
 {.pop.}
diff --git a/lib/system/repr.nim b/lib/system/repr.nim
index 256313ebd..6b940ccb4 100755
--- a/lib/system/repr.nim
+++ b/lib/system/repr.nim
@@ -158,7 +158,7 @@ when not defined(useNimRtl):
   proc reprRecordAux(result: var string, p: pointer, n: ptr TNimNode,
                      cl: var TReprClosure) =
     case n.kind
-    of nkNone: assert(false)
+    of nkNone: sysAssert(false)
     of nkSlot:
       add result, $n.name
       add result, " = "
@@ -206,7 +206,7 @@ when not defined(useNimRtl):
       var t = cast[ptr PNimType](p)[]
       reprRecord(result, p, t, cl)
     of tyRef, tyPtr:
-      assert(p != nil)
+      sysAssert(p != nil)
       if cast[ppointer](p)[] == nil: add result, "nil"
       else: reprRef(result, cast[ppointer](p)[], typ, cl)
     of tySequence:
diff --git a/lib/system/syslocks.nim b/lib/system/syslocks.nim
new file mode 100644
index 000000000..c91e83dcd
--- /dev/null
+++ b/lib/system/syslocks.nim
@@ -0,0 +1,101 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2011 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Low level system locks and condition vars.
+
+when defined(Windows):
+  type
+    THandle = int
+    TSysLock {.final, pure.} = object # CRITICAL_SECTION in WinApi
+      DebugInfo: pointer
+      LockCount: int32
+      RecursionCount: int32
+      OwningThread: int
+      LockSemaphore: int
+      Reserved: int32
+          
+    TSysCond = THandle
+          
+  proc InitSysLock(L: var TSysLock) {.stdcall, noSideEffect,
+    dynlib: "kernel32", importc: "InitializeCriticalSection".}
+    ## Initializes the lock `L`.
+
+  proc TryAcquireSysAux(L: var TSysLock): int32 {.stdcall, noSideEffect,
+    dynlib: "kernel32", importc: "TryEnterCriticalSection".}
+    ## Tries to acquire the lock `L`.
+    
+  proc TryAcquireSys(L: var TSysLock): bool {.inline.} = 
+    result = TryAcquireSysAux(L) != 0'i32
+
+  proc AcquireSys(L: var TSysLock) {.stdcall, noSideEffect,
+    dynlib: "kernel32", importc: "EnterCriticalSection".}
+    ## Acquires the lock `L`.
+    
+  proc ReleaseSys(L: var TSysLock) {.stdcall, noSideEffect,
+    dynlib: "kernel32", importc: "LeaveCriticalSection".}
+    ## Releases the lock `L`.
+
+  proc DeinitSys(L: var TSysLock) {.stdcall, noSideEffect,
+    dynlib: "kernel32", importc: "DeleteCriticalSection".}
+
+  proc CreateEvent(lpEventAttributes: pointer, 
+                   bManualReset, bInitialState: int32,
+                   lpName: cstring): TSysCond {.stdcall, noSideEffect,
+    dynlib: "kernel32", importc: "CreateEvent".}
+  
+  proc CloseHandle(hObject: THandle) {.stdcall, noSideEffect,
+    dynlib: "kernel32", importc: "CloseHandle".}
+  proc WaitForSingleObject(hHandle: THandle, dwMilliseconds: int32): int32 {.
+    stdcall, dynlib: "kernel32", importc: "WaitForSingleObject".}
+
+  proc SignalSysCond(hEvent: TSysCond) {.stdcall, noSideEffect,
+    dynlib: "kernel32", importc: "SetEvent".}
+  
+  proc InitSysCond(cond: var TSysCond) {.inline.} =
+    cond = CreateEvent(nil, 0'i32, 0'i32, nil)
+  proc DeinitSysCond(cond: var TSysCond) {.inline.} =
+    CloseHandle(cond)
+  proc WaitSysCond(cond: var TSysCond, lock: var TSysLock) =
+    releaseSys(lock)
+    discard WaitForSingleObject(cond, -1'i32)
+    acquireSys(lock)
+
+else:
+  type
+    TSysLock {.importc: "pthread_mutex_t", pure, final,
+               header: "<sys/types.h>".} = object
+    TSysCond {.importc: "pthread_cond_t", pure, final,
+               header: "<sys/types.h>".} = object
+
+  proc InitSysLock(L: var TSysLock, attr: pointer = nil) {.
+    importc: "pthread_mutex_init", header: "<pthread.h>", noSideEffect.}
+
+  proc AcquireSys(L: var TSysLock) {.noSideEffect,
+    importc: "pthread_mutex_lock", header: "<pthread.h>".}
+  proc TryAcquireSysAux(L: var TSysLock): cint {.noSideEffect,
+    importc: "pthread_mutex_trylock", header: "<pthread.h>".}
+
+  proc TryAcquireSys(L: var TSysLock): bool {.inline.} = 
+    result = TryAcquireSysAux(L) == 0'i32
+
+  proc ReleaseSys(L: var TSysLock) {.noSideEffect,
+    importc: "pthread_mutex_unlock", header: "<pthread.h>".}
+  proc DeinitSys(L: var TSysLock) {.
+    importc: "pthread_mutex_destroy", header: "<pthread.h>".}
+
+  proc InitSysCond(cond: var TSysCond, cond_attr: pointer = nil) {.
+    importc: "pthread_cond_init", header: "<pthread.h>".}
+  proc WaitSysCond(cond: var TSysCond, lock: var TSysLock) {.
+    importc: "pthread_cond_wait", header: "<pthread.h>".}
+  proc SignalSysCond(cond: var TSysCond) {.
+    importc: "pthread_cond_signal", header: "<pthread.h>".}
+  
+  proc DeinitSysCond(cond: var TSysCond) {.
+    importc: "pthread_cond_destroy", header: "<pthread.h>".}
+  
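These primitives follow the usual mutex/condition-variable discipline:
``WaitSysCond`` releases the lock while waiting and re-acquires it before
returning, so a waiter must re-check its predicate in a loop (the ``recv``
proc above does exactly this with ``q.count``). A minimal sketch of that
discipline (these symbols are module-internal; ``ready`` stands in for the
condition being waited on):

  var
    lock: TSysLock
    cond: TSysCond
    ready: bool

  InitSysLock(lock)
  InitSysCond(cond)

  # waiting thread:
  AcquireSys(lock)
  while not ready:            # loop: wakeups may be spurious
    WaitSysCond(cond, lock)
  ReleaseSys(lock)

  # signalling thread:
  AcquireSys(lock)
  ready = true
  ReleaseSys(lock)
  SignalSysCond(cond)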
diff --git a/lib/system/threads.nim b/lib/system/threads.nim
index 86a6a5691..9bb67863b 100755
--- a/lib/system/threads.nim
+++ b/lib/system/threads.nim
@@ -25,8 +25,8 @@
 ##    thr: array [0..4, TThread[tuple[a,b: int]]]
 ##    L: TLock
 ##  
-##  proc threadFunc(interval: tuple[a,b: int]) {.procvar.} = 
-##    for i in interval.a..interval.b: 
+##  proc threadFunc(interval: tuple[a,b: int]) {.thread.} =
+##    for i in interval.a..interval.b:
 ##      Acquire(L) # lock stdout
 ##      echo i
 ##      Release(L)
@@ -41,38 +41,13 @@ const
   maxRegisters = 256 # don't think there is an arch with more registers
   maxLocksPerThread* = 10 ## max number of locks a thread can hold
                           ## at the same time
+  useStackMaskHack = false ## use the stack mask hack for better performance
+  StackGuardSize = 4096
+  ThreadStackMask = 1024*256*sizeof(int)-1
+  ThreadStackSize = ThreadStackMask+1 - StackGuardSize
 
-when defined(Windows):
-  type
-    TSysLock {.final, pure.} = object # CRITICAL_SECTION in WinApi
-      DebugInfo: pointer
-      LockCount: int32
-      RecursionCount: int32
-      OwningThread: int
-      LockSemaphore: int
-      Reserved: int32
-          
-  proc InitSysLock(L: var TSysLock) {.stdcall, noSideEffect,
-    dynlib: "kernel32", importc: "InitializeCriticalSection".}
-    ## Initializes the lock `L`.
-
-  proc TryAcquireSysAux(L: var TSysLock): int32 {.stdcall, noSideEffect,
-    dynlib: "kernel32", importc: "TryEnterCriticalSection".}
-    ## Tries to acquire the lock `L`.
-    
-  proc TryAcquireSys(L: var TSysLock): bool {.inline.} = 
-    result = TryAcquireSysAux(L) != 0'i32
-
-  proc AcquireSys(L: var TSysLock) {.stdcall, noSideEffect,
-    dynlib: "kernel32", importc: "EnterCriticalSection".}
-    ## Acquires the lock `L`.
-    
-  proc ReleaseSys(L: var TSysLock) {.stdcall, noSideEffect,
-    dynlib: "kernel32", importc: "LeaveCriticalSection".}
-    ## Releases the lock `L`.
-
+when defined(windows):
   type
-    THandle = int
     TSysThread = THandle
     TWinThreadProc = proc (x: pointer): int32 {.stdcall.}
 
@@ -95,9 +70,6 @@ when defined(Windows):
                               dwMilliseconds: int32): int32 {.
     stdcall, dynlib: "kernel32", importc: "WaitForMultipleObjects".}
 
-  proc WaitForSingleObject(hHandle: TSysThread, dwMilliseconds: int32): int32 {.
-    stdcall, dynlib: "kernel32", importc: "WaitForSingleObject".}
-
   proc TerminateThread(hThread: TSysThread, dwExitCode: int32): int32 {.
     stdcall, dynlib: "kernel32", importc: "TerminateThread".}
     
@@ -116,24 +88,6 @@ else:
   {.passC: "-pthread".}
 
   type
-    TSysLock {.importc: "pthread_mutex_t", pure, final,
-               header: "<sys/types.h>".} = object
-
-  proc InitSysLock(L: var TSysLock, attr: pointer = nil) {.
-    importc: "pthread_mutex_init", header: "<pthread.h>", noSideEffect.}
-
-  proc AcquireSys(L: var TSysLock) {.noSideEffect,
-    importc: "pthread_mutex_lock", header: "<pthread.h>".}
-  proc TryAcquireSysAux(L: var TSysLock): cint {.noSideEffect,
-    importc: "pthread_mutex_trylock", header: "<pthread.h>".}
-
-  proc TryAcquireSys(L: var TSysLock): bool {.inline.} = 
-    result = TryAcquireSysAux(L) == 0'i32
-
-  proc ReleaseSys(L: var TSysLock) {.noSideEffect,
-    importc: "pthread_mutex_unlock", header: "<pthread.h>".}
-
-  type
     TSysThread {.importc: "pthread_t", header: "<sys/types.h>",
                  final, pure.} = object
     Tpthread_attr {.importc: "pthread_attr_t",
@@ -191,57 +145,71 @@ else:
   proc ThreadVarGetValue(s: TThreadVarSlot): pointer {.inline.} =
     result = pthread_getspecific(s)
 
-const emulatedThreadVars = defined(macosx)
+  when useStackMaskHack:
+    proc pthread_attr_setstack(attr: var TPthread_attr, stackaddr: pointer,
+                               size: int): cint {.
+      importc: "pthread_attr_setstack", header: "<pthread.h>".}
+
+const
+  emulatedThreadVars = true
 
 when emulatedThreadVars:
   # the compiler generates this proc for us, so that we can get the size of
-  # the thread local var block:
+  # the thread local var block; we use this only for sanity checking though
   proc NimThreadVarsSize(): int {.noconv, importc: "NimThreadVarsSize".}
 
-proc ThreadVarsAlloc(size: int): pointer =
-  result = c_malloc(size)
-  zeroMem(result, size)
-proc ThreadVarsDealloc(p: pointer) {.importc: "free", nodecl.}
-
+# we preallocate a fixed size for thread local storage, so that no heap
+# allocations are needed. Currently less than 7K are used on a 64bit machine.
+# We use ``float`` for proper alignment:
 type
+  TThreadLocalStorage = array [0..1_000, float]
+
   PGcThread = ptr TGcThread
   TGcThread {.pure.} = object
     sys: TSysThread
     next, prev: PGcThread
-    stackBottom, stackTop, threadLocalStorage: pointer
+    stackBottom, stackTop: pointer
     stackSize: int
-    locksLen: int
-    locks: array [0..MaxLocksPerThread-1, pointer]
-    registers: array[0..maxRegisters-1, pointer] # register contents for GC
+    inbox: TThreadLocalStorage
+    when emulatedThreadVars and not useStackMaskHack:
+      tls: TThreadLocalStorage
+    else:
+      nil
 
 # XXX it'd be more efficient to not use a global variable for the 
 # thread storage slot, but to rely on the implementation to assign slot 0
 # for us... ;-)
 var globalsSlot = ThreadVarAlloc()
 #const globalsSlot = TThreadVarSlot(0)
-#assert checkSlot.int == globalsSlot.int
-  
-proc ThisThread(): PGcThread {.compilerRtl, inl.} =
-  result = cast[PGcThread](ThreadVarGetValue(globalsSlot))
+#sysAssert checkSlot.int == globalsSlot.int
 
 proc GetThreadLocalVars(): pointer {.compilerRtl, inl.} =
-  result = cast[PGcThread](ThreadVarGetValue(globalsSlot)).threadLocalStorage
+  result = addr(cast[PGcThread](ThreadVarGetValue(globalsSlot)).tls)
+
+when useStackMaskHack:
+  proc MaskStackPointer(offset: int): pointer {.compilerRtl, inl.} =
+    var x {.volatile.}: pointer
+    x = addr(x)
+    result = cast[pointer]((cast[int](x) and not ThreadStackMask) +% 
+      (0) +% offset)
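+  # The hack assumes each thread's stack is allocated 2MB-aligned: on a
+  # 64 bit machine ThreadStackMask is 1024*256*8 - 1, so masking any
+  # address that lives on the stack yields the stack's lowest address;
+  # data stored at a fixed offset from there can then be found without
+  # an OS TLS lookup.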
 
 # create for the main thread. Note: do not insert this data into the list
 # of all threads; it's not to be stopped etc.
 when not defined(useNimRtl):
-  var mainThread: TGcThread
-  
-  ThreadVarSetValue(globalsSlot, addr(mainThread))
-  when emulatedThreadVars:
-    mainThread.threadLocalStorage = ThreadVarsAlloc(NimThreadVarsSize())
-
-  initStackBottom()
-  initGC()
+  when not useStackMaskHack:
+    var mainThread: TGcThread
+    ThreadVarSetValue(globalsSlot, addr(mainThread))
+    initStackBottom()
+    initGC()
   
   var heapLock: TSysLock
   InitSysLock(HeapLock)
 
+  when emulatedThreadVars:
+    if NimThreadVarsSize() > sizeof(TThreadLocalStorage):
+      echo "too large thread local storage size requested"
+      quit 1
+
   var
     threadList: PGcThread
     
@@ -251,11 +219,11 @@ when not defined(useNimRtl):
     t.prev = nil
     t.next = threadList
     if threadList != nil: 
-      assert(threadList.prev == nil)
+      sysAssert(threadList.prev == nil)
       threadList.prev = t
     threadList = t
     ReleaseSys(HeapLock)
-        
+  
   proc unregisterThread(t: PGcThread) =
     # we need to use the GC global lock here!
     AcquireSys(HeapLock)
@@ -270,9 +238,7 @@ when not defined(useNimRtl):
     
   # on UNIX, the GC uses ``SIGFREEZE`` to tell every thread to stop so that
   # the GC can examine the stacks?
-  
-  proc stopTheWord() =
-    nil
+  proc stopTheWord() = nil
     
 # We jump through some hoops here to ensure that Nimrod thread procs can have
 # the Nimrod calling convention. This is needed because thread procs are 
@@ -286,26 +252,33 @@ type
     fn: proc (p: TParam)
     data: TParam
 
+proc initInbox(p: pointer)
+proc freeInbox(p: pointer)
 when not defined(boehmgc) and not hasSharedHeap:
   proc deallocOsPages()
   
 template ThreadProcWrapperBody(closure: expr) =
   ThreadVarSetValue(globalsSlot, closure)
   var t = cast[ptr TThread[TParam]](closure)
-  when emulatedThreadVars:
-    t.threadLocalStorage = ThreadVarsAlloc(NimThreadVarsSize())
+  when useStackMaskHack:
+    var tls: TThreadLocalStorage
   when not defined(boehmgc) and not hasSharedHeap:
     # init the GC for this thread:
     setStackBottom(addr(t))
     initGC()
   t.stackBottom = addr(t)
   registerThread(t)
+  initInbox(addr(t.inbox))
   try:
+    when false:
+      var a = addr(tls)
+      var b = MaskStackPointer(1293920-372736-303104-36864)
+      c_fprintf(c_stdout, "TLS:    %p\nmasked: %p\ndiff:   %ld\n",
+                a, b, cast[int](a) - cast[int](b))
     t.fn(t.data)
   finally:
     # XXX shut-down is not executed when the thread is forced down!
-    when emulatedThreadVars:
-      ThreadVarsDealloc(t.threadLocalStorage)
+    freeInbox(addr(t.inbox))
     unregisterThread(t)
     when defined(deallocOsPages): deallocOsPages()
   
@@ -330,7 +303,7 @@ proc joinThreads*[TParam](t: openArray[TThread[TParam]]) =
   ## waits for every thread in `t` to finish.
   when hostOS == "windows":
     var a: array[0..255, TSysThread]
-    assert a.len >= t.len
+    sysAssert a.len >= t.len
     for i in 0..t.high: a[i] = t[i].sys
     discard WaitForMultipleObjects(t.len, cast[ptr TSysThread](addr(a)), 1, -1)
   else:
@@ -338,7 +311,7 @@ proc joinThreads*[TParam](t: openArray[TThread[TParam]]) =
 
 when false:
   # XXX a thread should really release its heap here somehow:
-  proc destroyThread*[TParam](t: var TThread[TParam]) {.inline.} =
+  proc destroyThread*[TParam](t: var TThread[TParam]) =
     ## forces the thread `t` to terminate. This is potentially dangerous if
     ## you don't have full control over `t` and its acquired resources.
     when hostOS == "windows":
@@ -348,28 +321,32 @@ when false:
     unregisterThread(addr(t))
 
 proc createThread*[TParam](t: var TThread[TParam], 
-                           tp: proc (param: TParam), 
-                           param: TParam,
-                           stackSize = 1024*256*sizeof(int)) {.
-                           magic: "CreateThread".} = 
+                           tp: proc (param: TParam) {.thread.}, 
+                           param: TParam) =
   ## creates a new thread `t` and starts its execution. Entry point is the
   ## proc `tp`. `param` is passed to `tp`.
   t.data = param
   t.fn = tp
-  t.stackSize = stackSize
+  t.stackSize = ThreadStackSize
   when hostOS == "windows":
     var dummyThreadId: int32
-    t.sys = CreateThread(nil, stackSize, threadProcWrapper[TParam],
+    t.sys = CreateThread(nil, ThreadStackSize, threadProcWrapper[TParam],
                          addr(t), 0'i32, dummyThreadId)
     if t.sys <= 0:
       raise newException(EResourceExhausted, "cannot create thread")
   else:
     var a: Tpthread_attr
     pthread_attr_init(a)
-    pthread_attr_setstacksize(a, stackSize)
+    pthread_attr_setstacksize(a, ThreadStackSize)
     if pthread_create(t.sys, a, threadProcWrapper[TParam], addr(t)) != 0:
       raise newException(EResourceExhausted, "cannot create thread")
 
+when useStackMaskHack:
+  proc runMain(tp: proc (dummy: pointer) {.thread.}) {.compilerproc.} =
+    var mainThread: TThread[pointer]
+    createThread(mainThread, tp, nil)
+    joinThread(mainThread)
+
 # --------------------------- lock handling ----------------------------------
 
 type
@@ -380,18 +357,20 @@ const
 
 when nodeadlocks:
   var
-    deadlocksPrevented* = 0  ## counts the number of times a 
+    deadlocksPrevented*: int ## counts the number of times a 
                              ## deadlock has been prevented
+    locksLen {.threadvar.}: int
+    locks {.threadvar.}: array [0..MaxLocksPerThread-1, pointer]
+
+  proc OrderedLocks(): bool = 
+    for i in 0 .. locksLen-2:
+      if locks[i] >= locks[i+1]: return false
+    result = true
 
 proc InitLock*(lock: var TLock) {.inline.} =
   ## Initializes the lock `lock`.
   InitSysLock(lock)
 
-proc OrderedLocks(g: PGcThread): bool = 
-  for i in 0 .. g.locksLen-2:
-    if g.locks[i] >= g.locks[i+1]: return false
-  result = true
-
 proc TryAcquire*(lock: var TLock): bool {.inline.} = 
   ## Tries to acquire the lock `lock`. Returns `true` on success.
   result = TryAcquireSys(lock)
@@ -399,88 +378,93 @@ proc TryAcquire*(lock: var TLock): bool {.inline.} =
     if not result: return
     # we have to add it to the ordered list. Oh, and we might fail if
     # there is no space in the array left ...
-    var g = ThisThread()
-    if g.locksLen >= len(g.locks):
+    if locksLen >= len(locks):
       ReleaseSys(lock)
       raise newException(EResourceExhausted, "cannot acquire additional lock")
     # find the position to add:
     var p = addr(lock)
-    var L = g.locksLen-1
+    var L = locksLen-1
     var i = 0
     while i <= L:
-      assert g.locks[i] != nil
-      if g.locks[i] < p: inc(i) # in correct order
-      elif g.locks[i] == p: return # thread already holds lock
+      sysAssert locks[i] != nil
+      if locks[i] < p: inc(i) # in correct order
+      elif locks[i] == p: return # thread already holds lock
       else:
         # do the crazy stuff here:
         while L >= i:
-          g.locks[L+1] = g.locks[L]
+          locks[L+1] = locks[L]
           dec L
-        g.locks[i] = p
-        inc(g.locksLen)
-        assert OrderedLocks(g)
+        locks[i] = p
+        inc(locksLen)
+        sysAssert OrderedLocks()
         return
     # simply add to the end:
-    g.locks[g.locksLen] = p
-    inc(g.locksLen)
-    assert OrderedLocks(g)
+    locks[locksLen] = p
+    inc(locksLen)
+    sysAssert OrderedLocks()
 
 proc Acquire*(lock: var TLock) =
   ## Acquires the lock `lock`.
   when nodeadlocks:
-    var g = ThisThread()
     var p = addr(lock)
-    var L = g.locksLen-1
+    var L = locksLen-1
     var i = 0
     while i <= L:
-      assert g.locks[i] != nil
-      if g.locks[i] < p: inc(i) # in correct order
-      elif g.locks[i] == p: return # thread already holds lock
+      sysAssert locks[i] != nil
+      if locks[i] < p: inc(i) # in correct order
+      elif locks[i] == p: return # thread already holds lock
       else:
         # do the crazy stuff here:
-        if g.locksLen >= len(g.locks):
+        if locksLen >= len(locks):
           raise newException(EResourceExhausted, 
               "cannot acquire additional lock")
         while L >= i:
-          ReleaseSys(cast[ptr TSysLock](g.locks[L])[])
-          g.locks[L+1] = g.locks[L]
+          ReleaseSys(cast[ptr TSysLock](locks[L])[])
+          locks[L+1] = locks[L]
           dec L
         # acquire the current lock:
         AcquireSys(lock)
-        g.locks[i] = p
-        inc(g.locksLen)
+        locks[i] = p
+        inc(locksLen)
         # acquire old locks in proper order again:
-        L = g.locksLen-1
+        L = locksLen-1
         inc i
         while i <= L:
-          AcquireSys(cast[ptr TSysLock](g.locks[i])[])
+          AcquireSys(cast[ptr TSysLock](locks[i])[])
           inc(i)
         # DANGER: We can only modify this global var if we gained every lock!
         # NO! We need an atomic increment. Crap.
         discard system.atomicInc(deadlocksPrevented, 1)
-        assert OrderedLocks(g)
+        sysAssert OrderedLocks()
         return
         
     # simply add to the end:
-    if g.locksLen >= len(g.locks):
+    if locksLen >= len(locks):
       raise newException(EResourceExhausted, "cannot acquire additional lock")
     AcquireSys(lock)
-    g.locks[g.locksLen] = p
-    inc(g.locksLen)
-    assert OrderedLocks(g)
+    locks[locksLen] = p
+    inc(locksLen)
+    sysAssert OrderedLocks()
   else:
     AcquireSys(lock)
   
 proc Release*(lock: var TLock) =
   ## Releases the lock `lock`.
   when nodeadlocks:
-    var g = ThisThread()
     var p = addr(lock)
-    var L = g.locksLen
+    var L = locksLen
     for i in countdown(L-1, 0):
-      if g.locks[i] == p: 
-        for j in i..L-2: g.locks[j] = g.locks[j+1]
-        dec g.locksLen
+      if locks[i] == p: 
+        for j in i..L-2: locks[j] = locks[j+1]
+        dec locksLen
         break
   ReleaseSys(lock)
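+# Note on the deadlock prevention above: when ``nodeadlocks`` is active,
+# every thread keeps its held locks sorted by address. When ``Acquire``
+# meets a lock that would violate that order, it releases all held locks
+# with greater addresses, takes the new lock, then re-acquires the released
+# ones in ascending order, so all threads always lock in the same global
+# (address) order and cannot deadlock on TLocks.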
 
+# ------------------------ message passing support ---------------------------
+
+proc getInBoxMem*[TMsg](t: var TThread[TMsg]): pointer {.inline.} =
+  result = addr(t.inbox)
+
+proc getInBoxMem*(): pointer {.inline.} =
+  result = addr(cast[PGcThread](ThreadVarGetValue(globalsSlot)).inbox)
+