10 files changed, 470 insertions, 115 deletions
diff --git a/compiler/ccgexprs.nim b/compiler/ccgexprs.nim
index 7fb6af896..34fdf5bf1 100644
--- a/compiler/ccgexprs.nim
+++ b/compiler/ccgexprs.nim
@@ -1636,7 +1636,7 @@ proc genMagicExpr(p: BProc, e: PNode, d: var TLoc, op: TMagic) =
   of mSlurp..mQuoteAst:
     localError(e.info, errXMustBeCompileTime, e.sons[0].sym.name.s)
   of mSpawn:
-    let n = lowerings.wrapProcForSpawn(p.module.module, e.sons[1])
+    let n = lowerings.wrapProcForSpawn(p.module.module, e[1], e.typ, nil, nil)
     expr(p, n, d)
   of mParallel:
     let n = semparallel.liftParallel(p.module.module, e)
diff --git a/compiler/guards.nim b/compiler/guards.nim
index de0ce1dcc..3df3bd1a8 100644
--- a/compiler/guards.nim
+++ b/compiler/guards.nim
@@ -672,12 +672,8 @@ proc simpleSlice*(a, b: PNode): BiggestInt =
   else:
     result = -1
 
-proc proveLe*(m: TModel; a, b: PNode): TImplication =
-  let res = canon(opLe.buildCall(a, b))
-  #echo renderTree(res)
-  # we hardcode lots of axioms here:
-  let a = res[1]
-  let b = res[2]
+proc ple(m: TModel; a, b: PNode): TImplication =  
+  template `<=?`(a,b): expr = ple(m,a,b) == impYes
   #   0 <= 3
   if a.isValue and b.isValue:
     return if leValue(a, b): impYes else: impNo
@@ -692,26 +688,46 @@ proc proveLe*(m: TModel; a, b: PNode): TImplication =
   # x <= x
   if sameTree(a, b): return impYes
 
-  #   x <= x+c  iff 0 <= c
-  if b.getMagic in someAdd and sameTree(a, b[1]):
-    return proveLe(m, zero(), b[2])
+  # 0 <= x.len
+  if b.getMagic in someLen and a.isValue:
+    if a.intVal <= 0: return impYes
+
+  #   x <= y+c  if 0 <= c and x <= y
+  if b.getMagic in someAdd and zero() <=? b[2] and a <=? b[1]: return impYes
+
+  #   x+c <= y  if c <= 0 and x <= y
+  if a.getMagic in someAdd and a[2] <=? zero() and a[1] <=? b: return impYes
 
-  #   x+c <= x  iff c <= 0
-  if a.getMagic in someAdd and sameTree(b, a[1]):
-    return proveLe(m, a[2], zero())
+  #   x <= y*c  if  1 <= c and x <= y  and 0 <= y
+  if b.getMagic in someMul:
+    if a <=? b[1] and one() <=? b[2] and zero() <=? b[1]: return impYes
 
-  #   x <= x*c  if  1 <= c and 0 <= x:
-  if b.getMagic in someMul and sameTree(a, b[1]):
-    if proveLe(m, one(), b[2]) == impYes and proveLe(m, zero(), a) == impYes:
-      return impYes
+  #   x div c <= y   if   1 <= c  and  0 <= y  and x <= y:
+  if a.getMagic in someDiv:
+    if one() <=? a[2] and zero() <=? b and a[1] <=? b: return impYes
 
-  #   x div c <= x   if   1 <= c  and  0 <= x:
-  if a.getMagic in someDiv and sameTree(a[1], b):
-    if proveLe(m, one(), a[2]) == impYes and proveLe(m, zero(), b) == impYes:
-      return impYes
+  # slightly subtle:
+  # x <= max(y, z)  iff x <= y or x <= z
+  # note that 'x <= max(x, z)' is a special case of the above rule
+  if b.getMagic in someMax:
+    if a <=? b[1] or a <=? b[2]: return impYes
+
+  # min(x, y) <= z  iff x <= z or y <= z
+  if a.getMagic in someMin:
+    if a[1] <=? b or a[2] <=? b: return impYes
 
   # use the knowledge base:
-  return doesImply(m, res)
+  return doesImply(m, opLe.buildCall(a, b))
+
+proc proveLe*(m: TModel; a, b: PNode): TImplication =
+  #echo "ROOT ", renderTree(a), " <=? ", b.rendertree
+  let x = canon(opLe.buildCall(a, b))
+  #echo renderTree(res)
+  result = ple(m, x[1], x[2])
+  if result == impUnknown:
+    # try an alternative:  a <= b  iff  not (b < a)  iff  not (b+1 <= a):
+    let y = canon(opLe.buildCall(opAdd.buildCall(b, one()), a))
+    result = ~ple(m, y[1], y[2])
 
 proc addFactLe*(m: var TModel; a, b: PNode) =
   m.add canon(opLe.buildCall(a, b))
diff --git a/compiler/lowerings.nim b/compiler/lowerings.nim
index 704cfbcdd..2a1a8e577 100644
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -13,6 +13,8 @@ const
   genPrefix* = ":tmp"         # prefix for generated names
 
 import ast, astalgo, types, idents, magicsys, msgs, options
+from guards import createMagic
+from trees import getMagic
 
 proc newTupleAccess*(tup: PNode, i: int): PNode =
   result = newNodeIT(nkBracketExpr, tup.info, tup.typ.skipTypes(
@@ -80,19 +82,23 @@ proc newDotExpr(obj, b: PSym): PNode =
   addSon(result, newSymNode(field))
   result.typ = field.typ
 
-proc indirectAccess*(a: PNode, b: PSym, info: TLineInfo): PNode = 
+proc indirectAccess*(a: PNode, b: string, info: TLineInfo): PNode = 
   # returns a[].b as a node
   var deref = newNodeI(nkHiddenDeref, info)
-  deref.typ = a.typ.sons[0]
+  deref.typ = a.typ.skipTypes(abstractInst).sons[0]
   assert deref.typ.kind == tyObject
-  let field = getSymFromList(deref.typ.n, getIdent(b.name.s & $b.id))
-  assert field != nil, b.name.s
+  let field = getSymFromList(deref.typ.n, getIdent(b))
+  assert field != nil, b
   addSon(deref, a)
   result = newNodeI(nkDotExpr, info)
   addSon(result, deref)
   addSon(result, newSymNode(field))
   result.typ = field.typ
 
+proc indirectAccess*(a: PNode, b: PSym, info: TLineInfo): PNode = 
+  # returns a[].b as a node
+  result = indirectAccess(a, b.name.s & $b.id, info)
+
 proc indirectAccess*(a, b: PSym, info: TLineInfo): PNode =
   result = indirectAccess(newSymNode(a), b, info)
 
@@ -102,6 +108,11 @@ proc genAddrOf*(n: PNode): PNode =
   result.typ = newType(tyPtr, n.typ.owner)
   result.typ.rawAddSon(n.typ)
 
+proc genDeref*(n: PNode): PNode =
+  result = newNodeIT(nkHiddenDeref, n.info, 
+                     n.typ.skipTypes(abstractInst).sons[0])
+  result.add n
+
 proc callCodegenProc*(name: string, arg1: PNode; 
                       arg2, arg3: PNode = nil): PNode =
   result = newNodeI(nkCall, arg1.info)
@@ -114,14 +125,83 @@ proc callCodegenProc*(name: string, arg1: PNode;
     if arg2 != nil: result.add arg2
     if arg3 != nil: result.add arg3
 
+# we have 4 cases to consider:
+# - a void proc --> nothing to do
+# - a proc returning GC'ed memory --> requires a future
+# - a proc returning non GC'ed memory --> pass as hidden 'var' parameter
+# - not in a parallel environment --> requires a future for memory safety
+type
+  TSpawnResult = enum
+    srVoid, srFuture, srByVar
+  TFutureKind = enum
+    futInvalid # invalid type T for 'Future[T]'
+    futGC      # Future of a GC'ed type
+    futBlob    # Future of a blob type
+
+proc spawnResult(t: PType; inParallel: bool): TSpawnResult =
+  if t.isEmptyType: srVoid
+  elif inParallel and not containsGarbageCollectedRef(t): srByVar
+  else: srFuture
+
+proc futureKind(t: PType): TFutureKind =
+  if t.skipTypes(abstractInst).kind in {tyRef, tyString, tySequence}: futGC
+  elif containsGarbageCollectedRef(t): futInvalid
+  else: futBlob
+
+discard """
+We generate roughly this:
+
+proc f_wrapper(args) =
+  var a = args.a # copy strings/seqs; thread transfer; not generated for
+                 # the 'parallel' statement
+  var b = args.b
+
+  args.fut = createFuture(thread, sizeof(T)) # optional
+  nimArgsPassingDone() # signal parent that the work is done
+  args.fut.blob = f(a, b, ...)
+  # - or -
+  f(a, b, ...)
+
+stmtList:
+  var scratchObj
+  scratchObj.a = a
+  scratchObj.b = b
+
+  nimSpawn(f_wrapper, addr scratchObj)
+  scratchObj.fut # optional
+
+"""
+
+proc createNimCreateFutureCall(fut, threadParam: PNode): PNode =
+  let size = newNodeIT(nkCall, fut.info, getSysType(tyInt))
+  size.add newSymNode(createMagic("sizeof", mSizeOf))
+  assert fut.typ.kind == tyGenericInst
+  size.add newNodeIT(nkType, fut.info, fut.typ.sons[1])
+
+  let castExpr = newNodeIT(nkCast, fut.info, fut.typ)
+  castExpr.add emptyNode
+  castExpr.add callCodeGenProc("nimCreateFuture", threadParam, size)
+  result = newFastAsgnStmt(fut, castExpr)
+
 proc createWrapperProc(f: PNode; threadParam, argsParam: PSym;
-                       varSection, call, barrier: PNode): PSym =
+                       varSection, call, barrier, fut: PNode): PSym =
   var body = newNodeI(nkStmtList, f.info)
   body.add varSection
   if barrier != nil:
     body.add callCodeGenProc("barrierEnter", barrier)
-  body.add callCodeGenProc("nimArgsPassingDone", newSymNode(threadParam))
-  body.add call
+  if fut != nil:
+    body.add createNimCreateFutureCall(fut, threadParam.newSymNode)
+    if barrier == nil:
+      body.add callCodeGenProc("nimFutureCreateCondVar", fut)
+
+  body.add callCodeGenProc("nimArgsPassingDone", threadParam.newSymNode)
+  if fut != nil:
+    body.add newAsgnStmt(indirectAccess(fut, 
+        if fut.typ.futureKind==futGC: "data" else: "blob", fut.info), call)
+    if barrier == nil:
+      body.add callCodeGenProc("nimFutureSignal", fut)
+  else:
+    body.add call
   if barrier != nil:
     body.add callCodeGenProc("barrierLeave", barrier)
 
@@ -151,10 +231,148 @@ proc createCastExpr(argsParam: PSym; objType: PType): PNode =
   result.typ = newType(tyPtr, objType.owner)
   result.typ.rawAddSon(objType)
 
-proc wrapProcForSpawn*(owner: PSym; n: PNode; barrier: PNode = nil): PNode =
-  result = newNodeI(nkStmtList, n.info)
-  if n.kind notin nkCallKinds or not n.typ.isEmptyType:
-    localError(n.info, "'spawn' takes a call expression of type void")
+proc setupArgsForConcurrency(n: PNode; objType: PType; scratchObj: PSym, 
+                             castExpr, call, varSection, result: PNode) =
+  let formals = n[0].typ.n
+  let tmpName = getIdent(genPrefix)
+  for i in 1 .. <n.len:
+    # we pick n's type here, which hopefully is 'tyArray' and not
+    # 'tyOpenArray':
+    var argType = n[i].typ.skipTypes(abstractInst)
+    if i < formals.len and formals[i].typ.kind == tyVar:
+      localError(n[i].info, "'spawn'ed function cannot have a 'var' parameter")
+    elif containsTyRef(argType):
+      localError(n[i].info, "'spawn'ed function cannot refer to 'ref'/closure")
+
+    let fieldname = if i < formals.len: formals[i].sym.name else: tmpName
+    var field = newSym(skField, fieldname, objType.owner, n.info)
+    field.typ = argType
+    objType.addField(field)
+    result.add newFastAsgnStmt(newDotExpr(scratchObj, field), n[i])
+
+    var temp = newSym(skTemp, tmpName, objType.owner, n.info)
+    temp.typ = argType
+    incl(temp.flags, sfFromGeneric)
+
+    var vpart = newNodeI(nkIdentDefs, n.info, 3)
+    vpart.sons[0] = newSymNode(temp)
+    vpart.sons[1] = ast.emptyNode
+    vpart.sons[2] = indirectAccess(castExpr, field, n.info)
+    varSection.add vpart
+    
+    call.add(newSymNode(temp))
+
+proc getRoot*(n: PNode): PSym =
+  ## ``getRoot`` takes a *path* ``n``. A path is an lvalue expression
+  ## like ``obj.x[i].y``. The *root* of a path is the symbol that can be
+  ## determined as the owner; ``obj`` in the example.
+  case n.kind
+  of nkSym:
+    if n.sym.kind in {skVar, skResult, skTemp, skLet, skForVar}:
+      result = n.sym
+  of nkDotExpr, nkBracketExpr, nkHiddenDeref, nkDerefExpr,
+      nkObjUpConv, nkObjDownConv, nkCheckedFieldExpr:
+    result = getRoot(n.sons[0])
+  of nkHiddenStdConv, nkHiddenSubConv, nkConv:
+    result = getRoot(n.sons[1])
+  of nkCallKinds:
+    if getMagic(n) == mSlice: result = getRoot(n.sons[1])
+  else: discard
+
+proc newIntLit(value: BiggestInt): PNode =
+  result = nkIntLit.newIntNode(value)
+  result.typ = getSysType(tyInt)
+
+proc genHigh(n: PNode): PNode =
+  if skipTypes(n.typ, abstractVar).kind in {tyArrayConstr, tyArray}:
+    result = newIntLit(lastOrd(skipTypes(n.typ, abstractVar)))
+  else:
+    result = newNodeI(nkCall, n.info, 2)
+    result.typ = getSysType(tyInt)
+    result.sons[0] = newSymNode(createMagic("high", mHigh))
+    result.sons[1] = n
+
+proc setupArgsForParallelism(n: PNode; objType: PType; scratchObj: PSym;
+                             castExpr, call, result: PNode) =
+  let formals = n[0].typ.n
+  let tmpName = getIdent(genPrefix)
+  for i in 1 .. <n.len:
+    let n = n[i]
+    let argType = skipTypes(if i < formals.len: formals[i].typ else: n.typ,
+                            abstractInst)
+    if containsTyRef(argType):
+      localError(n.info, "'spawn'ed function cannot refer to 'ref'/closure")
+
+    let fieldname = if i < formals.len: formals[i].sym.name else: tmpName
+    var field = newSym(skField, fieldname, objType.owner, n.info)
+
+    if argType.kind in {tyVarargs, tyOpenArray}:
+      # important special case: we always create a zero-copy slice:
+      let slice = newNodeI(nkCall, n.info, 4)
+      slice.typ = n.typ
+      slice.sons[0] = newSymNode(createMagic("slice", mSlice))
+      var fieldB = newSym(skField, tmpName, objType.owner, n.info)
+      fieldB.typ = getSysType(tyInt)
+      objType.addField(fieldB)
+      
+      if getMagic(n) == mSlice:
+        let a = genAddrOf(n[0])
+        field.typ = a.typ
+        objType.addField(field)
+        result.add newFastAsgnStmt(newDotExpr(scratchObj, field), a)
+
+        var fieldA = newSym(skField, tmpName, objType.owner, n.info)
+        fieldA.typ = getSysType(tyInt)
+        objType.addField(fieldA)
+        result.add newFastAsgnStmt(newDotExpr(scratchObj, fieldA), n[2])
+        result.add newFastAsgnStmt(newDotExpr(scratchObj, fieldB), n[3])
+
+        slice.sons[2] = indirectAccess(castExpr, fieldA, n.info)
+      else:
+        let a = genAddrOf(n)
+        field.typ = a.typ
+        objType.addField(field)
+        result.add newFastAsgnStmt(newDotExpr(scratchObj, field), a)
+        result.add newFastAsgnStmt(newDotExpr(scratchObj, fieldB), genHigh(n))
+
+        slice.sons[2] = newIntLit(0)
+        
+      slice.sons[1] = genDeref(indirectAccess(castExpr, field, n.info))
+      slice.sons[3] = indirectAccess(castExpr, fieldB, n.info)
+      call.add slice
+    elif (let size = computeSize(argType); size < 0 or size > 16) and
+        n.getRoot != nil:
+      # it is more efficient to pass a pointer instead:
+      let a = genAddrOf(n)
+      field.typ = a.typ
+      objType.addField(field)
+      result.add newFastAsgnStmt(newDotExpr(scratchObj, field), a)
+      call.add(genDeref(indirectAccess(castExpr, field, n.info)))
+    else:
+      # boring case
+      field.typ = argType
+      objType.addField(field)
+      result.add newFastAsgnStmt(newDotExpr(scratchObj, field), n)
+      call.add(indirectAccess(castExpr, field, n.info))
+
+proc wrapProcForSpawn*(owner: PSym; n: PNode; retType: PType; 
+                       barrier, dest: PNode = nil): PNode =
+  # if 'barrier' != nil, then it is in a 'parallel' section and we
+  # generate quite different code
+  let spawnKind = spawnResult(retType, barrier!=nil)
+  case spawnKind
+  of srVoid:
+    internalAssert dest == nil
+    result = newNodeI(nkStmtList, n.info)
+  of srFuture:
+    internalAssert dest == nil
+    result = newNodeIT(nkStmtListExpr, n.info, retType)
+  of srByVar:
+    if dest == nil: localError(n.info, "'spawn' must not be discarded")
+    result = newNodeI(nkStmtList, n.info)
+  
+  if n.kind notin nkCallKinds:
+    localError(n.info, "'spawn' takes a call expression")
     return
   if optThreadAnalysis in gGlobalOptions:
     if {tfThread, tfNoSideEffect} * n[0].typ.flags == {}:
@@ -180,7 +398,7 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode; barrier: PNode = nil): PNode =
     varSectionB.addVar(scratchObj.newSymNode)
     result.add varSectionB
 
-  var call = newNodeI(nkCall, n.info)
+  var call = newNodeIT(nkCall, n.info, n.typ)
   var fn = n.sons[0]
   # templates and macros are in fact valid here due to the nature of
   # the transformation:
@@ -200,34 +418,10 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode; barrier: PNode = nil): PNode =
 
   call.add(fn)
   var varSection = newNodeI(nkVarSection, n.info)
-  let formals = n[0].typ.n
-  let tmpName = getIdent(genPrefix)
-  for i in 1 .. <n.len:
-    # we pick n's type here, which hopefully is 'tyArray' and not
-    # 'tyOpenArray':
-    var argType = n[i].typ.skipTypes(abstractInst)
-    if i < formals.len and formals[i].typ.kind == tyVar:
-      localError(n[i].info, "'spawn'ed function cannot have a 'var' parameter")
-    elif containsTyRef(argType):
-      localError(n[i].info, "'spawn'ed function cannot refer to 'ref'/closure")
-
-    let fieldname = if i < formals.len: formals[i].sym.name else: tmpName
-    var field = newSym(skField, fieldname, owner, n.info)
-    field.typ = argType
-    objType.addField(field)
-    result.add newFastAsgnStmt(newDotExpr(scratchObj, field), n[i])
-
-    var temp = newSym(skTemp, tmpName, owner, n.info)
-    temp.typ = argType
-    incl(temp.flags, sfFromGeneric)
-
-    var vpart = newNodeI(nkIdentDefs, n.info, 3)
-    vpart.sons[0] = newSymNode(temp)
-    vpart.sons[1] = ast.emptyNode
-    vpart.sons[2] = indirectAccess(castExpr, field, n.info)
-    varSection.add vpart
-
-    call.add(newSymNode(temp))
+  if barrier.isNil:
+    setupArgsForConcurrency(n, objType, scratchObj, castExpr, call, varSection, result)
+  else: 
+    setupArgsForParallelism(n, objType, scratchObj, castExpr, call, result)
 
   var barrierAsExpr: PNode = nil
   if barrier != nil:
@@ -239,7 +433,17 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode; barrier: PNode = nil): PNode =
     result.add newFastAsgnStmt(newDotExpr(scratchObj, field), barrier)
     barrierAsExpr = indirectAccess(castExpr, field, n.info)
 
+  var futField, futAsExpr: PNode = nil
+  if spawnKind == srFuture:
+    var field = newSym(skField, getIdent"fut", owner, n.info)
+    field.typ = retType
+    objType.addField(field)
+    futField = newDotExpr(scratchObj, field)
+    futAsExpr = indirectAccess(castExpr, field, n.info)
+
   let wrapper = createWrapperProc(fn, threadParam, argsParam, varSection, call,
-                                  barrierAsExpr)
+                                  barrierAsExpr, futAsExpr)
   result.add callCodeGenProc("nimSpawn", wrapper.newSymNode,
                              genAddrOf(scratchObj.newSymNode))
+
+  if spawnKind == srFuture: result.add futField
diff --git a/compiler/semmagic.nim b/compiler/semmagic.nim
index 80e70b8c0..3a6bfcf67 100644
--- a/compiler/semmagic.nim
+++ b/compiler/semmagic.nim
@@ -115,6 +115,12 @@ proc semLocals(c: PContext, n: PNode): PNode =
         if it.typ.skipTypes({tyGenericInst}).kind == tyVar: a = newDeref(a)
         result.add(a)
 
+proc createFuture(c: PContext; t: PType; info: TLineInfo): PType =
+  result = newType(tyGenericInvokation, c.module)
+  addSonSkipIntLit(result, magicsys.getCompilerProc("Future").typ)
+  addSonSkipIntLit(result, t)
+  result = instGenericContainer(c, info, result, allowMetaTypes = false)
+
 proc semShallowCopy(c: PContext, n: PNode, flags: TExprFlags): PNode
 proc magicsAfterOverloadResolution(c: PContext, n: PNode, 
                                    flags: TExprFlags): PNode =
@@ -130,5 +136,9 @@ proc magicsAfterOverloadResolution(c: PContext, n: PNode,
   of mShallowCopy: result = semShallowCopy(c, n, flags)
   of mNBindSym: result = semBindSym(c, n)
   of mLocals: result = semLocals(c, n)
+  of mSpawn:
+    result = n
+    # later passes may transform the type 'Future[T]' back into 'T'
+    if not n[1].typ.isEmptyType:
+      result.typ = createFuture(c, n[1].typ, n.info)
   else: result = n
-
diff --git a/compiler/semparallel.nim b/compiler/semparallel.nim
index 7917cab90..b13542038 100644
--- a/compiler/semparallel.nim
+++ b/compiler/semparallel.nim
@@ -9,8 +9,8 @@
 
 ## Semantic checking for 'parallel'.
 
-# - codegen needs to support mSlice
-# - lowerings must not perform unnecessary copies
+# - codegen needs to support mSlice (+)
+# - lowerings must not perform unnecessary copies (+)
 # - slices should become "nocopy" to openArray (+)
 #   - need to perform bound checks (+)
 #
@@ -19,7 +19,7 @@
 #   - what about 'f(a)'? --> f shouldn't have side effects anyway
 # - passed arrays need to be ensured not to alias
 # - passed slices need to be ensured to be disjoint (+)
-# - output slices need special logic
+# - output slices need special logic (+)
 
 import
   ast, astalgo, idents, lowerings, magicsys, guards, sempass2, msgs,
@@ -94,23 +94,6 @@ proc getSlot(c: var AnalysisCtx; v: PSym): ptr MonotonicVar =
   c.locals[L].v = v
   return addr(c.locals[L])
 
-proc getRoot(n: PNode): PSym =
-  ## ``getRoot`` takes a *path* ``n``. A path is an lvalue expression
-  ## like ``obj.x[i].y``. The *root* of a path is the symbol that can be
-  ## determined as the owner; ``obj`` in the example.
-  case n.kind
-  of nkSym:
-    if n.sym.kind in {skVar, skResult, skTemp, skLet, skForVar}:
-      result = n.sym
-  of nkDotExpr, nkBracketExpr, nkHiddenDeref, nkDerefExpr,
-      nkObjUpConv, nkObjDownConv, nkCheckedFieldExpr:
-    result = getRoot(n.sons[0])
-  of nkHiddenStdConv, nkHiddenSubConv, nkConv:
-    result = getRoot(n.sons[1])
-  of nkCallKinds:
-    if getMagic(n) == mSlice: result = getRoot(n.sons[1])
-  else: discard
-
 proc gatherArgs(c: var AnalysisCtx; n: PNode) =
   for i in 0.. <n.safeLen:
     let root = getRoot n[i]
@@ -184,8 +167,6 @@ proc overlap(m: TModel; x,y,c,d: PNode) =
   of impNo: discard
 
 proc stride(c: AnalysisCtx; n: PNode): BiggestInt =
-  # note: 0 if it cannot be determined is just right because then
-  # we analyse 'i..i' and 'i+0 .. i+0' and these are not disjoint!
   if isLocal(n):
     let s = c.lookupSlot(n.sym)
     if s >= 0 and c.locals[s].stride != nil:
@@ -193,6 +174,20 @@ proc stride(c: AnalysisCtx; n: PNode): BiggestInt =
   else:
     for i in 0 .. <n.safeLen: result += stride(c, n.sons[i])
 
+proc subStride(c: AnalysisCtx; n: PNode): PNode =
+  # substitute with stride:
+  if isLocal(n):
+    let s = c.lookupSlot(n.sym)
+    if s >= 0 and c.locals[s].stride != nil:
+      result = n +@ c.locals[s].stride.intVal
+    else:
+      result = n
+  elif n.safeLen > 0:
+    result = shallowCopy(n)
+    for i in 0 .. <n.len: result.sons[i] = subStride(c, n.sons[i])
+  else:
+    result = n
+
 proc checkSlicesAreDisjoint(c: var AnalysisCtx) =
   # this is the only thing that we need to perform after we have traversed
   # the whole tree so that the strides are available.
@@ -200,7 +195,7 @@ proc checkSlicesAreDisjoint(c: var AnalysisCtx) =
   addLowerBoundAsFacts(c)
   # Every slice used in a loop needs to be disjoint with itself:
   for x,a,b,id,inLoop in items(c.slices):
-    if inLoop: overlap(c.guards, a,b, a+@c.stride(a), b+@c.stride(b))
+    if inLoop: overlap(c.guards, a,b, c.subStride(a), c.subStride(b))
   # Another tricky example is:
   #   while true:
   #     spawn f(a[i])
@@ -283,23 +278,19 @@ proc analyseCall(c: var AnalysisCtx; n: PNode; op: PSym) =
 
 proc analyseCase(c: var AnalysisCtx; n: PNode) =
   analyse(c, n.sons[0])
-  #let oldState = c.locals.len
   let oldFacts = c.guards.len
   for i in 1.. <n.len:
     let branch = n.sons[i]
-    #setLen(c.locals, oldState)
     setLen(c.guards, oldFacts)
     addCaseBranchFacts(c.guards, n, i)
     for i in 0 .. <branch.len:
       analyse(c, branch.sons[i])
-  #setLen(c.locals, oldState)
   setLen(c.guards, oldFacts)
 
 proc analyseIf(c: var AnalysisCtx; n: PNode) =
   analyse(c, n.sons[0].sons[0])
   let oldFacts = c.guards.len
   addFact(c.guards, n.sons[0].sons[0])
-  #let oldState = c.locals.len
 
   analyse(c, n.sons[0].sons[1])
   for i in 1.. <n.len:
@@ -309,10 +300,8 @@ proc analyseIf(c: var AnalysisCtx; n: PNode) =
       addFactNeg(c.guards, n.sons[j].sons[0])
     if branch.len > 1:
       addFact(c.guards, branch.sons[0])
-    #setLen(c.locals, oldState)
     for i in 0 .. <branch.len:
       analyse(c, branch.sons[i])
-  #setLen(c.locals, oldState)
   setLen(c.guards, oldFacts)
 
 proc analyse(c: var AnalysisCtx; n: PNode) =
@@ -390,17 +379,40 @@ proc transformSlices(n: PNode): PNode =
   else:
     result = n
 
+proc transformSpawn(owner: PSym; n, barrier: PNode): PNode
+proc transformSpawnSons(owner: PSym; n, barrier: PNode): PNode =
+  result = shallowCopy(n)
+  for i in 0 .. < n.len:
+    result.sons[i] = transformSpawn(owner, n.sons[i], barrier)
+
 proc transformSpawn(owner: PSym; n, barrier: PNode): PNode =
-  if n.kind in nkCallKinds:
-    if n[0].kind == nkSym:
-      let op = n[0].sym
-      if op.magic == mSpawn:
-        result = transformSlices(n)
-        return wrapProcForSpawn(owner, result[1], barrier)
+  case n.kind
+  of nkVarSection:
+    result = nil
+    for it in n:
+      let b = it.lastSon
+      if getMagic(b) == mSpawn:
+        if it.len != 3: localError(it.info, "invalid context for 'spawn'")
+        let m = transformSlices(b)
+        if result.isNil:
+          result = newNodeI(nkStmtList, n.info)
+          result.add n
+        result.add wrapProcForSpawn(owner, m[1], b.typ, barrier, it[0])
+        it.sons[it.len-1] = emptyNode
+    if result.isNil: result = n
+  of nkAsgn, nkFastAsgn:
+    let b = n[1]
+    if getMagic(b) == mSpawn:
+      let m = transformSlices(b)
+      return wrapProcForSpawn(owner, m[1], b.typ, barrier, n[0])
+    result = transformSpawnSons(owner, n, barrier)
+  of nkCallKinds:
+    if getMagic(n) == mSpawn:
+      result = transformSlices(n)
+      return wrapProcForSpawn(owner, result[1], n.typ, barrier, nil)
+    result = transformSpawnSons(owner, n, barrier)
   elif n.safeLen > 0:
-    result = shallowCopy(n)
-    for i in 0 .. < n.len:
-      result.sons[i] = transformSpawn(owner, n.sons[i], barrier)
+    result = transformSpawnSons(owner, n, barrier)
   else:
     result = n
 
@@ -440,3 +452,4 @@ proc liftParallel*(owner: PSym; n: PNode): PNode =
   result.add callCodeGenProc("openBarrier", barrier)
   result.add transformSpawn(owner, body, barrier)
   result.add callCodeGenProc("closeBarrier", barrier)
+
diff --git a/doc/manual.txt b/doc/manual.txt
index 39e2bad2a..b2e008969 100644
--- a/doc/manual.txt
+++ b/doc/manual.txt
@@ -2748,7 +2748,7 @@ The following builtin procs cannot be overloaded for reasons of implementation
 simplicity (they require specialized semantic checking)::
 
   defined, definedInScope, compiles, low, high, sizeOf, 
-  is, of, echo, shallowCopy, getAst
+  is, of, echo, shallowCopy, getAst, spawn
 
 Thus they act more like keywords than like ordinary identifiers; unlike a 
 keyword however, a redefinition may `shadow`:idx: the definition in 
diff --git a/lib/pure/concurrency/threadpool.nim b/lib/pure/concurrency/threadpool.nim
index 86819d25a..583c60c66 100644
--- a/lib/pure/concurrency/threadpool.nim
+++ b/lib/pure/concurrency/threadpool.nim
@@ -65,6 +65,30 @@ proc closeBarrier*(b: ptr Barrier) {.compilerProc.} =
 # ----------------------------------------------------------------------------
 
 type
+  AwaitInfo = object
+    cv: CondVar
+    idx: int
+
+  RawFuture* = ptr RawFutureObj ## untyped base class for 'Future[T]'
+  RawFutureObj {.inheritable.} = object # \
+    # we allocate this with the thread local allocator; this
+    # is possible since we already need to do the GC_unref
+    # on the owning thread
+    ready, usesCondVar: bool
+    cv: CondVar #\
+    # for 'awaitAny' support
+    ai: ptr AwaitInfo
+    idx: int
+    data: PObject  # we incRef and unref it to keep it alive
+    owner: ptr Worker
+    next: RawFuture
+    align: float64 # a float for proper alignment
+
+  Future* {.compilerProc.} [T] = ptr object of RawFutureObj
+    blob: T  ## the underlying value, if available. Note that usually
+             ## you should not access this field directly! However it can
+             ## sometimes be more efficient than getting the value via ``^``.
+
   WorkerProc = proc (thread, args: pointer) {.nimcall, gcsafe.}
   Worker = object
     taskArrived: CondVar
@@ -75,6 +99,92 @@ type
     ready: bool # put it here for correct alignment!
     initialized: bool # whether it has even been initialized
     shutdown: bool # the pool requests to shut down this worker thread
+    futureLock: TLock
+    head: RawFuture
+
+proc finished*(fut: RawFuture) =
+  ## This MUST be called for every created future to free its associated
+  ## resources. Note that the default reading operation ``^`` is destructive
+  ## and calls ``finished``.
+  doAssert fut.ai.isNil, "future is still attached to an 'awaitAny'"
+  assert fut.next == nil
+  let w = fut.owner
+  acquire(w.futureLock)
+  fut.next = w.head
+  w.head = fut
+  release(w.futureLock)
+
+proc cleanFutures(w: ptr Worker) =
+  var it = w.head
+  acquire(w.futureLock)
+  while it != nil:
+    let nxt = it.next
+    if it.usesCondVar: destroyCondVar(it.cv)
+    if it.data != nil: GC_unref(it.data)
+    dealloc(it)
+    it = nxt
+  w.head = nil
+  release(w.futureLock)
+
+proc nimCreateFuture(owner: pointer; blobSize: int): RawFuture {.
+                     compilerProc.} =
+  result = cast[RawFuture](alloc0(RawFutureObj.sizeof + blobSize))
+  result.owner = cast[ptr Worker](owner)
+
+proc nimFutureCreateCondVar(fut: RawFuture) {.compilerProc.} =
+  fut.cv = createCondVar()
+  fut.usesCondVar = true
+
+proc nimFutureSignal(fut: RawFuture) {.compilerProc.} =
+  assert fut.usesCondVar
+  signal(fut.cv)
+
+proc await*[T](fut: Future[T]) =
+  ## waits until the value for the future arrives.
+  if fut.usesCondVar: await(fut.cv)
+
+proc `^`*[T](fut: Future[T]): T =
+  ## blocks until the value is available and then returns this value. Note
+  ## this reading is destructive for reasons of efficiency and convenience.
+  ## This calls ``finished(fut)``.
+  await(fut)
+  when T is string or T is seq or T is ref:
+    result = cast[T](fut.data)
+  else:
+    result = fut.payload
+  finished(fut)
+
+proc notify*(fut: RawFuture) {.compilerproc.} =
+  if fut.ai != nil:
+    acquire(fut.ai.cv.L)
+    fut.ai.idx = fut.idx
+    inc fut.ai.cv.counter
+    release(fut.ai.cv.L)
+    signal(fut.ai.cv.c)
+  if fut.usesCondVar: signal(fut.cv)
+
+proc awaitAny*(futures: openArray[RawFuture]): int =
+  # awaits any of the given futures. Returns the index of one future for which
+  ## a value arrived. A future only supports one call to 'awaitAny' at the
+  ## same time. That means if you await([a,b]) and await([b,c]) the second
+  ## call will only await 'c'. If there is no future left to be able to wait
+  ## on, -1 is returned.
+  var ai: AwaitInfo
+  ai.cv = createCondVar()
+  var conflicts = 0
+  for i in 0 .. futures.high:
+    if cas(addr futures[i].ai, nil, addr ai):
+      futures[i].idx = i
+    else:
+      inc conflicts
+  if conflicts < futures.len:
+    await(ai.cv)
+    result = ai.idx
+    for i in 0 .. futures.high:
+      discard cas(addr futures[i].ai, addr ai, nil)
+  else:
+    result = -1
+  destroyCondVar(ai.cv)
 
 proc nimArgsPassingDone(p: pointer) {.compilerProc.} =
   let w = cast[ptr Worker](p)
@@ -99,6 +209,7 @@ proc slave(w: ptr Worker) {.thread.} =
     await(w.taskArrived)
     assert(not w.ready)
     w.f(w, w.data)
+    if w.head != nil: w.cleanFutures
     if w.shutdown:
       w.shutdown = false
       atomicDec currentPoolSize
@@ -119,6 +230,7 @@ var
 proc activateThread(i: int) {.noinline.} =
   workersData[i].taskArrived = createCondVar()
   workersData[i].taskStarted = createCondVar()
+  initLock workersData[i].futureLock
   workersData[i].initialized = true
   createThread(workers[i], slave, addr(workersData[i]))
 
diff --git a/lib/system/atomics.nim b/lib/system/atomics.nim
index c6c603b19..96246ba01 100644
--- a/lib/system/atomics.nim
+++ b/lib/system/atomics.nim
@@ -209,12 +209,12 @@ when defined(windows) and not defined(gcc):
   proc interlockedCompareExchange(p: pointer; exchange, comparand: int32): int32
     {.importc: "InterlockedCompareExchange", header: "<windows.h>", cdecl.}
 
-  proc cas*[T: bool|int](p: ptr T; oldValue, newValue: T): bool =
+  proc cas*[T: bool|int|ptr](p: ptr T; oldValue, newValue: T): bool =
     interlockedCompareExchange(p, newValue.int32, oldValue.int32) != 0
-
+  # XXX fix for 64 bit build
 else:
   # this is valid for GCC and Intel C++
-  proc cas*[T: bool|int](p: ptr T; oldValue, newValue: T): bool
+  proc cas*[T: bool|int|ptr](p: ptr T; oldValue, newValue: T): bool
     {.importc: "__sync_bool_compare_and_swap", nodecl.}
   # XXX is this valid for 'int'?
 
diff --git a/tests/parallel/tdisjoint_slice1.nim b/tests/parallel/tdisjoint_slice1.nim
index 2ca96d6ae..c1d0e52f8 100644
--- a/tests/parallel/tdisjoint_slice1.nim
+++ b/tests/parallel/tdisjoint_slice1.nim
@@ -1,20 +1,20 @@
+discard """
+  outputsub: "EVEN 28"
+"""
 
 import threadpool
 
-proc f(a: openArray[int]) =
-  for x in a: echo x
-
-proc f(a: int) = echo a
+proc odd(a: int) =  echo "ODD  ", a
+proc even(a: int) = echo "EVEN ", a
 
 proc main() =
   var a: array[0..30, int]
+  for i in low(a)..high(a): a[i] = i
   parallel:
-    #spawn f(a[0..15])
-    #spawn f(a[16..30])
     var i = 0
     while i <= 29:
-      spawn f(a[i])
-      spawn f(a[i+1])
+      spawn even(a[i])
+      spawn odd(a[i+1])
       inc i, 2
       # is correct here
 
diff --git a/tests/parallel/tinvalid_array_bounds.nim b/tests/parallel/tinvalid_array_bounds.nim
index 337fae729..4c6065fd6 100644
--- a/tests/parallel/tinvalid_array_bounds.nim
+++ b/tests/parallel/tinvalid_array_bounds.nim
@@ -1,5 +1,5 @@
 discard """
-  errormsg: "cannot prove: i + 1 <= 30"
+  errormsg: "can prove: i + 1 > 30"
   line: 21
 """