From 7ac1fc81fd119dba3739e05a89ad63fcee405ac1 Mon Sep 17 00:00:00 2001 From: c-blake Date: Mon, 31 Dec 2018 08:52:51 -0500 Subject: Resolve things raised in https://github.com/nim-lang/Nim/issues/10081 ? (#10084) * Resolve things raised in https://github.com/nim-lang/Nim/issues/10081 ? CDF is a standard ident in all things related to random numbers/sampling, and full words "cumulativeDistributionFunction" would be silly long, in this case, IMO. We use lowercase `cdf` to make it not look like a type, remove all looping from `sample` letting callers do it. Besides just side-stepping any `sampleSize` name choice, callers may want to filter out samples anyway which this makes slightly simpler. Also add two variants of `cumsum`, value return and in-place update distinguished by the var-ness of the first argument. Add tests for `int` and `float` for both `cumsum` and the new `sample`. (The sample tests exercise the value return mode of `cumsum`.) Functionality pre-this-PR `sample(a, w)` is now the almost as simple `for i in 0.. `cumsummed` to honor NEP1 style. Re-instate `cumsum` as the in-place transformation. Test both in `tests/stdlib/tmath.nim` and use `cumsummed` in the example code for sample since that's a simpler example. * Fix requests from https://github.com/nim-lang/Nim/pull/10084 : example in lib/pure/math.nim and comment whitespace in lib/pure/random.nim --- lib/pure/random.nim | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'lib/pure/random.nim') diff --git a/lib/pure/random.nim b/lib/pure/random.nim index 26e6740ea..ee728ad4a 100644 --- a/lib/pure/random.nim +++ b/lib/pure/random.nim @@ -175,27 +175,26 @@ proc sample*[T](a: openArray[T]): T = ## returns a random element from openArray ``a`` using non-thread-safe state. result = a[rand(a.low..a.high)] -proc sample*[T, U](r: var Rand; a: openArray[T], w: openArray[U], n=1): seq[T] = - ## Return a sample (with replacement) of size ``n`` from elements of ``a`` - ## according to convertible-to-``float``, not necessarily normalized, and - ## non-negative weights ``w``. Uses state in ``r``. Must have sum ``w > 0.0``. - assert(w.len == a.len) - var cdf = newSeq[float](a.len) # The *unnormalized* CDF - var tot = 0.0 # Unnormalized is fine if we sample up to tot - for i, w in w: - assert(w >= 0) - tot += float(w) - cdf[i] = tot - assert(tot > 0.0) # Need at least one non-zero weight - for i in 0 ..< n: - result.add(a[cdf.upperBound(r.rand(tot))]) - -proc sample*[T, U](a: openArray[T], w: openArray[U], n=1): seq[T] = - ## Return a sample (with replacement) of size ``n`` from elements of ``a`` - ## according to convertible-to-``float``, not necessarily normalized, and - ## non-negative weights ``w``. Uses default non-thread-safe state. - state.sample(a, w, n) - +proc sample*[T, U](r: var Rand; a: openArray[T], cdf: openArray[U]): T = + ## Sample one element from openArray ``a`` when it has cumulative distribution + ## function (CDF) ``cdf`` (not necessarily normalized, any type of elements + ## convertible to ``float``). Uses state in ``r``. E.g.: + ## + ## .. code-block:: nim + ## let val = [ "a", "b", "c", "d" ] # some values + ## var cnt = [1, 2, 3, 4] # histogram of counts + ## echo r.sample(val, cnt.cumsummed) # echo a sample + assert(cdf.len == a.len) # Two basic sanity checks. + assert(float(cdf[^1]) > 0.0) + #While we could check cdf[i-1] <= cdf[i] for i in 1..cdf.len, that could get + #awfully expensive even in debugging modes. + let u = r.rand(float(cdf[^1])) + a[cdf.upperBound(U(u))] + +proc sample*[T, U](a: openArray[T], cdf: openArray[U]): T = + ## Like ``sample(var Rand; openArray[T], openArray[U])``, but uses default + ## non-thread-safe state. + state.sample(a, cdf) proc initRand*(seed: int64): Rand = ## Creates a new ``Rand`` state from ``seed``. -- cgit 1.4.1-2-gfad0