#
#
#            Nim's Runtime Library
#        (c) Copyright 2015 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Implements Nim's 'spawn'.

when not compileOption("threads"):
  {.error: "Threadpool requires --threads:on option.".}

import cpuinfo, cpuload, locks

{.push stackTrace:off.}

type
  Semaphore = object
    c: Cond
    L: Lock
    counter: int

proc initSemaphore(cv: var Semaphore) =
  initCond(cv.c)
  initLock(cv.L)

proc destroySemaphore(cv: var Semaphore) {.inline.} =
  deinitCond(cv.c)
  deinitLock(cv.L)

proc await(cv: var Semaphore) =
  acquire(cv.L)
  while cv.counter <= 0:
    wait(cv.c, cv.L)
  dec cv.counter
  release(cv.L)

proc signal(cv: var Semaphore) =
  acquire(cv.L)
  inc cv.counter
  release(cv.L)
  signal(cv.c)

const CacheLineSize = 32 # true for most archs

type
  Barrier {.compilerProc.} = object
    entered: int
    cv: Semaphore # Semaphore takes 3 words at least
    when sizeof(int) < 8:
      cacheAlign: array[CacheLineSize-4*sizeof(int), byte]
    left: int
    cacheAlign2: array[CacheLineSize-sizeof(int), byte]
    interest: bool ## whether the master is interested in the "all done" event

proc barrierEnter(b: ptr Barrier) {.compilerProc, inline.} =
  # due to the signaling between threads, it is ensured we are the only
  # one with access to 'entered' so we don't need 'atomicInc' here:
  inc b.entered
  # also we need no 'fence' instructions here as soon as 'nimArgsPassingDone'
  # is called, which already performs a fence for us.

proc barrierLeave(b: ptr Barrier) {.compilerProc, inline.} =
  atomicInc b.left
  when not defined(x86): fence()
  # We may not have seen the final value of b.entered yet,
  # so we need to check for >= instead of ==.
  if b.interest and b.left >= b.entered: signal(b.cv)

proc openBarrier(b: ptr Barrier) {.compilerProc, inline.} =
  b.entered = 0
  b.left = 0
  b.interest = false

proc closeBarrier(b: ptr Barrier) {.compilerProc.} =
  fence()
  if b.left != b.entered:
    b.cv.initSemaphore()
    fence()
    b.interest = true
    fence()
    while b.left != b.entered: await(b.cv)
    destroySemaphore(b.cv)

{.pop.}

# ----------------------------------------------------------------------------

type
  foreign* = object ## a region that indicates the pointer comes from a
                    ## foreign thread heap.

  AwaitInfo = object
    cv: Semaphore
    idx: int

  FlowVarBase* = ref FlowVarBaseObj ## untyped base class for 'FlowVar[T]'
  FlowVarBaseObj = object of RootObj
    ready, usesSemaphore, awaited: bool
    cv: Semaphore #\
    # for 'awaitAny' support
    ai: ptr AwaitInfo
    idx: int
    data: pointer  # we incRef and unref it to keep it alive; note this MUST NOT
                   # be RootRef here otherwise the wrong GC keeps track of it!
    owner: pointer # ptr Worker

  FlowVarObj[T] = object of FlowVarBaseObj
    blob: T

  FlowVar*{.compilerProc.}[T] = ref FlowVarObj[T] ## a data flow variable

  ToFreeQueue = object
    len: int
    lock: Lock
    empty: Semaphore
    data: array[128, pointer]

  WorkerProc = proc (thread, args: pointer) {.nimcall, gcsafe.}
  Worker = object
    taskArrived: Semaphore
    taskStarted: Semaphore #\
    # task data:
    f: WorkerProc
    data: pointer
    ready: bool # put it here for correct alignment!
    initialized: bool # whether it has even been initialized
    shutdown: bool # the pool requests to shut down this worker thread
    q: ToFreeQueue
    readyForTask: Semaphore
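# From user code, the FlowVar machinery above is typically driven as in the
# following sketch (illustrative only; `slowAdd` is a hypothetical gcsafe proc,
# not part of this module; compile with --threads:on):
#
#   import threadpool
#
#   proc slowAdd(a, b: int): int = a + b
#
#   let fv = spawn slowAdd(2, 3)   # returns a FlowVar[int]
#   echo ^fv                       # `^` blocks until the worker has written
#                                  # the result into the FlowVar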
proc await*(fv: FlowVarBase) =
  ## waits until the value for the flowVar arrives. Usually it is not necessary
  ## to call this explicitly.
  if fv.usesSemaphore and not fv.awaited:
    fv.awaited = true
    await(fv.cv)
    destroySemaphore(fv.cv)

proc selectWorker(w: ptr Worker; fn: WorkerProc; data: pointer): bool =
  if cas(addr w.ready, true, false):
    w.data = data
    w.f = fn
    signal(w.taskArrived)
    await(w.taskStarted)
    result = true

proc cleanFlowVars(w: ptr Worker) =
  let q = addr(w.q)
  acquire(q.lock)
  for i in 0 ..< q.len:
    GC_unref(cast[RootRef](q.data[i]))
    #echo "GC_unref"
  q.len = 0
  release(q.lock)

proc wakeupWorkerToProcessQueue(w: ptr Worker) =
  # we have to ensure it's us who wakes up the owning thread.
  # This is quite horrible code, but it runs so rarely that it doesn't matter:
  while not cas(addr w.ready, true, false):
    cpuRelax()
    discard
  w.data = nil
  w.f = proc (w, a: pointer) {.nimcall.} =
    let w = cast[ptr Worker](w)
    cleanFlowVars(w)
    signal(w.q.empty)
  signal(w.taskArrived)

proc attach(fv: FlowVarBase; i: int): bool =
  acquire(fv.cv.L)
  if fv.cv.counter <= 0:
    fv.idx = i
    result = true
  else:
    result = false
  release(fv.cv.L)

proc finished(fv: FlowVarBase) =
  doAssert fv.ai.isNil, "flowVar is still attached to an 'awaitAny'"
  # we have to protect against the rare cases where the owner of the flowVar
  # simply disregards the flowVar and yet the "flowVar" has not yet written
  # anything to it:
  await(fv)
  if fv.data.isNil: return
  let owner = cast[ptr Worker](fv.owner)
  let q = addr(owner.q)
  acquire(q.lock)
  while not (q.len < q.data.len):
    #echo "EXHAUSTED!"
    release(q.lock)
    wakeupWorkerToProcessQueue(owner)
    await(q.empty)
    acquire(q.lock)
  q.data[q.len] = cast[pointer](fv.data)
  inc q.len
  release(q.lock)
  fv.data = nil

proc fvFinalizer[T](fv: FlowVar[T]) = finished(fv)

proc nimCreateFlowVar[T](): FlowVar[T] {.compilerProc.} =
  new(result, fvFinalizer)

proc nimFlowVarCreateSemaphore(fv: FlowVarBase) {.compilerProc.} =
  fv.cv.initSemaphore()
  fv.usesSemaphore = true

proc nimFlowVarSignal(fv: FlowVarBase) {.compilerProc.} =
  if fv.ai != nil:
    acquire(fv.ai.cv.L)
    fv.ai.idx = fv.idx
    inc fv.ai.cv.counter
    release(fv.ai.cv.L)
    signal(fv.ai.cv.c)
  if fv.usesSemaphore:
    signal(fv.cv)

proc awaitAndThen*[T](fv: FlowVar[T]; action: proc (x: T) {.closure.}) =
  ## blocks until the ``fv`` is available and then passes its value
  ## to ``action``. Note that due to Nim's parameter passing semantics this
  ## means that ``T`` doesn't need to be copied and so ``awaitAndThen`` can
  ## sometimes be more efficient than ``^``.
  await(fv)
  when T is string or T is seq:
    action(cast[T](fv.data))
  elif T is ref:
    {.error: "'awaitAndThen' not available for FlowVar[ref]".}
  else:
    action(fv.blob)
  finished(fv)

proc unsafeRead*[T](fv: FlowVar[ref T]): foreign ptr T =
  ## blocks until the value is available and then returns this value.
  await(fv)
  result = cast[foreign ptr T](fv.data)

proc `^`*[T](fv: FlowVar[ref T]): ref T =
  ## blocks until the value is available and then returns this value.
  await(fv)
  let src = cast[ref T](fv.data)
  deepCopy result, src

proc `^`*[T](fv: FlowVar[T]): T =
  ## blocks until the value is available and then returns this value.
  await(fv)
  when T is string or T is seq:
    # XXX closures? deepCopy?
    result = cast[T](fv.data)
  else:
    result = fv.blob
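# The read accessors above differ mainly in how the value leaves the FlowVar:
# `^` returns the value (deep-copying it for refs), while `awaitAndThen` hands
# it to a callback without an extra copy. A minimal sketch (illustrative only;
# `buildGreeting` is a hypothetical gcsafe proc, not part of this module):
#
#   import threadpool
#
#   proc buildGreeting(name: string): string = "hello, " & name
#
#   let fv = spawn buildGreeting("threadpool")
#   awaitAndThen(fv, proc (x: string) =
#     echo x)   # runs once the value is available; no extra copy of `x`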
proc awaitAny*(flowVars: openArray[FlowVarBase]): int =
  ## awaits any of the given flowVars. Returns the index of one flowVar for
  ## which a value arrived. A flowVar only supports one call to 'awaitAny' at
  ## the same time. That means if you ``awaitAny([a, b])`` and
  ## ``awaitAny([b, c])``, the second call will only await 'c'. If there is no
  ## flowVar left to wait on, -1 is returned.
  ## **Note**: This results in non-deterministic behaviour and should be avoided.
  var ai: AwaitInfo
  ai.cv.initSemaphore()
  var conflicts = 0
  result = -1
  for i in 0 .. flowVars.high:
    if cas(addr flowVars[i].ai, nil, addr ai):
      if not attach(flowVars[i], i):
        result = i
        break
    else:
      inc conflicts
  if conflicts < flowVars.len:
    if result < 0:
      await(ai.cv)
      result = ai.idx
    for i in 0 .. flowVars.high:
      discard cas(addr flowVars[i].ai, addr ai, nil)
  destroySemaphore(ai.cv)

proc isReady*(fv: FlowVarBase): bool =
  ## Determines whether the specified ``FlowVarBase``'s value is available.
  ##
  ## If ``true``, awaiting ``fv`` will not block.
  if fv.usesSemaphore and not fv.awaited:
    acquire(fv.cv.L)
    result = fv.cv.counter > 0
    release(fv.cv.L)
  else:
    result = true
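# `isReady` allows non-blocking polling before committing to a blocking read.
# A minimal sketch (illustrative only; `simulate` is a hypothetical gcsafe
# proc, not part of this module):
#
#   import threadpool, os
#
#   proc simulate(steps: int): int = steps * 2
#
#   let fv = spawn simulate(1000)
#   while not fv.isReady:
#     sleep(10)   # do other work instead of blocking in `^`
#   echo ^fv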
proc nimArgsPassingDone(p: pointer) {.compilerProc.} =
  let w = cast[ptr Worker](p)
  signal(w.taskStarted)

const
  MaxThreadPoolSize* = 256 ## maximal size of the thread pool. 256 threads
                           ## should be good enough for anybody ;-)
  MaxDistinguishedThread* = 32 ## maximal number of "distinguished" threads.

type
  ThreadId* = range[0..MaxDistinguishedThread-1]

var
  currentPoolSize: int
  maxPoolSize = MaxThreadPoolSize
  minPoolSize = 4
  gSomeReady: Semaphore
  readyWorker: ptr Worker

# A workaround for recursion deadlock issue
# https://github.com/nim-lang/Nim/issues/4597
var
  numSlavesLock: Lock
  numSlavesRunning {.guard: numSlavesLock.}: int
  numSlavesWaiting {.guard: numSlavesLock.}: int
  isSlave {.threadvar.}: bool

numSlavesLock.initLock

gSomeReady.initSemaphore()

proc slave(w: ptr Worker) {.thread.} =
  isSlave = true
  while true:
    when declared(atomicStoreN):
      atomicStoreN(addr(w.ready), true, ATOMIC_SEQ_CST)
    else:
      w.ready = true
    readyWorker = w
    signal(gSomeReady)
    await(w.taskArrived)
    # XXX Somebody needs to look into this (why does this assertion fail
    # in Visual Studio?)
    when not defined(vcc): assert(not w.ready)

    withLock numSlavesLock:
      inc numSlavesRunning

    w.f(w, w.data)

    withLock numSlavesLock:
      dec numSlavesRunning

    if w.q.len != 0: w.cleanFlowVars
    if w.shutdown:
      w.shutdown = false
      atomicDec currentPoolSize

proc distinguishedSlave(w: ptr Worker) {.thread.} =
  while true:
    when declared(atomicStoreN):
      atomicStoreN(addr(w.ready), true, ATOMIC_SEQ_CST)
    else:
      w.ready = true
    signal(w.readyForTask)
    await(w.taskArrived)
    assert(not w.ready)
    w.f(w, w.data)
    if w.q.len != 0: w.cleanFlowVars

var
  workers: array[MaxThreadPoolSize, Thread[ptr Worker]]
  workersData: array[MaxThreadPoolSize, Worker]

  distinguished: array[MaxDistinguishedThread, Thread[ptr Worker]]
  distinguishedData: array[MaxDistinguishedThread, Worker]

when defined(nimPinToCpu):
  var gCpus: Natural

proc setMinPoolSize*(size: range[1..MaxThreadPoolSize]) =
  ## sets the minimal thread pool size. The default value of this is 4.
  minPoolSize = size

proc setMaxPoolSize*(size: range[1..MaxThreadPoolSize]) =
  ## sets the maximal thread pool size. The default value of this
  ## is ``MaxThreadPoolSize``.
  maxPoolSize = size
  if currentPoolSize > maxPoolSize:
    for i in maxPoolSize..currentPoolSize-1:
      let w = addr(workersData[i])
      w.shutdown = true

when defined(nimRecursiveSpawn):
  var localThreadId {.threadvar.}: int

proc activateWorkerThread(i: int) {.noinline.} =
  workersData[i].taskArrived.initSemaphore()
  workersData[i].taskStarted.initSemaphore()
  workersData[i].initialized = true
  workersData[i].q.empty.initSemaphore()
  initLock(workersData[i].q.lock)
  createThread(workers[i], slave, addr(workersData[i]))
  when defined(nimRecursiveSpawn):
    localThreadId = i+1
  when defined(nimPinToCpu):
    if gCpus > 0: pinToCpu(workers[i], i mod gCpus)

proc activateDistinguishedThread(i: int) {.noinline.} =
  distinguishedData[i].taskArrived.initSemaphore()
  distinguishedData[i].taskStarted.initSemaphore()
  distinguishedData[i].initialized = true
  distinguishedData[i].q.empty.initSemaphore()
  initLock(distinguishedData[i].q.lock)
  distinguishedData[i].readyForTask.initSemaphore()
  createThread(distinguished[i], distinguishedSlave, addr(distinguishedData[i]))

proc setup() =
  let p = countProcessors()
  when defined(nimPinToCpu):
    gCpus = p
  currentPoolSize = min(p, MaxThreadPoolSize)
  readyWorker = addr(workersData[0])
  for i in 0..<currentPoolSize: activateWorkerThread(i)

proc preferSpawn*(): bool =
  ## Use this proc to determine quickly if a 'spawn' or a direct call is
  ## preferable. If it returns 'true' a 'spawn' may make sense. In general
  ## it is not necessary to call this directly; use 'spawnX' instead.
  result = gSomeReady.counter > 0

proc spawn*(call: typed): void {.magic: "Spawn".}
  ## always spawns a new task, so that the 'call' is never executed on
  ## the calling thread. 'call' has to be proc call 'p(...)' where 'p'
  ## is gcsafe and has a return type that is either 'void' or compatible
  ## with ``FlowVar[T]``.

proc pinnedSpawn*(id: ThreadId; call: typed): void {.magic: "Spawn".}
  ## always spawns a new task on the worker thread with ``id``, so that
  ## the 'call' is **always** executed on that thread. 'call' has to be
  ## proc call 'p(...)' where 'p' is gcsafe and has a return type that is
  ## either 'void' or compatible with ``FlowVar[T]``.

template spawnX*(call): void =
  ## spawns a new task if a CPU core is ready, otherwise executes the
  ## call in the calling thread. Usually it is advised to
  ## use 'spawn' in order to not block the producer for an unknown
  ## amount of time. 'call' has to be proc call 'p(...)' where 'p'
  ## is gcsafe and has a return type that is either 'void' or compatible
  ## with ``FlowVar[T]``.
  (if preferSpawn(): spawn call else: call)

proc parallel*(body: untyped) {.magic: "Parallel".}
  ## a parallel section can be used to execute a block in parallel. ``body``
  ## has to be in a DSL that is a particular subset of the language. Please
  ## refer to the manual for further information.
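# A `parallel` section sketch in the spirit of the manual's pi example
# (illustrative only; it assumes the restrictions the manual places on the
# `parallel` DSL):
#
#   import threadpool, math
#
#   proc term(k: float): float = 4 * math.pow(-1, k) / (2*k + 1)
#
#   proc computePi(n: int): float =
#     var ch = newSeq[float](n+1)
#     parallel:
#       for k in 0..ch.high:
#         ch[k] = spawn term(float(k))   # each slot is filled by one task
#     for k in 0..ch.high:
#       result += ch[k]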
var
  state: ThreadPoolState
  stateLock: Lock

initLock stateLock

proc nimSpawn3(fn: WorkerProc; data: pointer) {.compilerProc.} =
  # implementation of 'spawn' that is used by the code generator.
  while true:
    if selectWorker(readyWorker, fn, data): return
    for i in 0..<currentPoolSize:
      if selectWorker(addr(workersData[i]), fn, data): return

    # determine what to do, but keep in mind this is expensive too:
    # state.calls < maxPoolSize: warmup phase
    # (state.calls and 127) == 0: periodic check
    if state.calls < maxPoolSize or (state.calls and 127) == 0:
      # ensure the call to 'advice' is atomic:
      if tryAcquire(stateLock):
        if currentPoolSize < maxPoolSize:
          case advice(state)
          of doNothing: discard
          of doCreateThread:
            if not workersData[currentPoolSize].initialized:
              activateWorkerThread(currentPoolSize)
            let w = addr(workersData[currentPoolSize])
            atomicInc currentPoolSize
            if selectWorker(w, fn, data):
              release(stateLock)
              return
            # else we didn't succeed but some other thread did, so do nothing.
          of doShutdownThread:
            if currentPoolSize > minPoolSize:
              let w = addr(workersData[currentPoolSize-1])
              w.shutdown = true
            # we don't free anything here. Too dangerous.
        release(stateLock)
      # else the acquire failed, but this means some
      # other thread succeeded, so we don't need to do anything here.
    when defined(nimRecursiveSpawn):
      if localThreadId > 0:
        # we are a worker thread, so instead of waiting for something which
        # might as well never happen (see tparallel_quicksort), we run the task
        # on the current thread instead.
        var self = addr(workersData[localThreadId-1])
        fn(self, data)
        await(self.taskStarted)
        return

    if isSlave:
      # Run under lock until the `numSlavesWaiting` increment to avoid a
      # race (otherwise the two last threads might start waiting together)
      withLock numSlavesLock:
        if numSlavesRunning <= numSlavesWaiting + 1:
          # All the other slaves are waiting.
          # If we wait now, we're deadlocked until
          # an external spawn happens!
          if currentPoolSize < maxPoolSize:
            if not workersData[currentPoolSize].initialized:
              activateWorkerThread(currentPoolSize)
            let w = addr(workersData[currentPoolSize])
            atomicInc currentPoolSize
            if selectWorker(w, fn, data):
              return
          else:
            # There is no place in the pool. We're deadlocked.
            # echo "Deadlock!"
            discard

        inc numSlavesWaiting

    await(gSomeReady)

    if isSlave:
      withLock numSlavesLock:
        dec numSlavesWaiting

var distinguishedLock: Lock
initLock distinguishedLock

proc nimSpawn4(fn: WorkerProc; data: pointer; id: ThreadId) {.compilerProc.} =
  acquire(distinguishedLock)
  if not distinguishedData[id].initialized:
    activateDistinguishedThread(id)
  release(distinguishedLock)
  while true:
    if selectWorker(addr(distinguishedData[id]), fn, data): break
    await(distinguishedData[id].readyForTask)

proc sync*() =
  ## a simple barrier to wait for all spawn'ed tasks. If you need more elaborate
  ## waiting, you have to use an explicit barrier.
  var toRelease = 0
  while true:
    var allReady = true
    for i in 0 ..< currentPoolSize:
      if not allReady: break
      allReady = allReady and workersData[i].ready
    if allReady: break
    await(gSomeReady)
    inc toRelease

  for i in 0 ..< toRelease:
    signal(gSomeReady)

setup()
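# `sync` complements `spawn` for tasks that return nothing. A minimal sketch
# (illustrative only; `log` is a hypothetical gcsafe proc, not part of this
# module):
#
#   import threadpool
#
#   proc log(msg: string) = echo msg
#
#   for i in 0..3:
#     spawn log("task " & $i)   # 'void' return type: no FlowVar to await
#   sync()                      # block until every spawned task has finished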