summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: konsumlamm <44230978+konsumlamm@users.noreply.github.com> 2021-01-19 08:40:09 +0100
committer: GitHub <noreply@github.com> 2021-01-19 08:40:09 +0100
commit: bd5ce5b351ec59d844dfdff462f23a2365fb1d61 (patch)
tree: 7448d1191f3a0dcfb427b9ab912be9162ace389c
parent: 2e5254ff279d8013c01f6f56064fda713e0a6e4d (diff)
download: Nim-bd5ce5b351ec59d844dfdff462f23a2365fb1d61.tar.gz
Improve documentation for stats (#16742)
* Improve documentation for stats

* Address nits

* Update lib/pure/stats.nim

Co-authored-by: Andreas Rumpf <rumpf_a@web.de>
-rw-r--r-- lib/pure/stats.nim 166
1 files changed, 79 insertions, 87 deletions
diff --git a/lib/pure/stats.nim b/lib/pure/stats.nim
index af6cffef2..6e2d5fecd 100644
--- a/lib/pure/stats.nim
+++ b/lib/pure/stats.nim
@@ -9,65 +9,72 @@
 
 ## Statistical analysis framework for performing
 ## basic statistical analysis of data.
-## The data is analysed in a single pass, when a data value
-## is pushed to the ``RunningStat`` or ``RunningRegress`` objects
+## The data is analysed in a single pass, as each value
+## is pushed to a `RunningStat` or `RunningRegress` object.
 ##
-## ``RunningStat`` calculates for a single data set
+## `RunningStat` calculates for a single data set
 ## - n (data count)
-## - min  (smallest value)
-## - max  (largest value)
+## - min (smallest value)
+## - max (largest value)
 ## - sum
 ## - mean
 ## - variance
-## - varianceS (sample var)
+## - varianceS (sample variance)
 ## - standardDeviation
-## - standardDeviationS  (sample stddev)
+## - standardDeviationS (sample standard deviation)
 ## - skewness (the third statistical moment)
 ## - kurtosis (the fourth statistical moment)
 ##
-## ``RunningRegress`` calculates for two sets of data
-## - n
+## `RunningRegress` calculates for two sets of data
+## - n (data count)
 ## - slope
 ## - intercept
 ## - correlation
 ##
-## Procs have been provided to calculate statistics on arrays and sequences.
+## Procs are provided to calculate statistics on `openArray`s.
 ##
 ## However, if more than a single statistical calculation is required, it is more
-## efficient to push the data once to the RunningStat object, and
-## call the numerous statistical procs for the RunningStat object.
-##
-## .. code-block:: Nim
-##
-##  var rs: RunningStat
-##  rs.push(MySeqOfData)
-##  rs.mean()
-##  rs.variance()
-##  rs.skewness()
-##  rs.kurtosis()
+## efficient to push the data once to a `RunningStat` object and then
+## call the required statistical procs on that same object:
+
+runnableExamples:
+  from std/math import almostEqual
+
+  template `~=`(a, b: float): bool = almostEqual(a, b)
+
+  var statistics: RunningStat  ## Must be var
+  statistics.push(@[1.0, 2.0, 1.0, 4.0, 1.0, 4.0, 1.0, 2.0])
+  doAssert statistics.n == 8
+  doAssert statistics.mean() ~= 2.0
+  doAssert statistics.variance() ~= 1.5
+  doAssert statistics.varianceS() ~= 1.714285714285715
+  doAssert statistics.skewness() ~= 0.8164965809277261
+  doAssert statistics.skewnessS() ~= 1.018350154434631
+  doAssert statistics.kurtosis() ~= -1.0
+  doAssert statistics.kurtosisS() ~= -0.7000000000000008
 
-from math import FloatClass, sqrt, pow, round
+from std/math import FloatClass, sqrt, pow, round
 
 {.push debugger: off.} # the user does not want to trace a part
                        # of the standard library!
 {.push checks: off, line_dir: off, stack_trace: off.}
 
 type
-  RunningStat* = object           ## an accumulator for statistical data
-    n*: int                       ## number of pushed data
+  RunningStat* = object           ## An accumulator for statistical data.
+    n*: int                       ## amount of pushed data
     min*, max*, sum*: float       ## self-explaining
     mom1, mom2, mom3, mom4: float ## statistical moments, mom1 is mean
 
-
-  RunningRegress* = object ## an accumulator for regression calculations
-    n*: int                ## number of pushed data
-    x_stats*: RunningStat  ## stats for first set of data
-    y_stats*: RunningStat  ## stats for second set of data
+  RunningRegress* = object ## An accumulator for regression calculations.
+    n*: int                ## amount of pushed data
+    x_stats*: RunningStat  ## stats for the first set of data
+    y_stats*: RunningStat  ## stats for the second set of data
     s_xy: float            ## accumulated data for combined xy
 
 # ----------- RunningStat --------------------------
+
 proc clear*(s: var RunningStat) =
-  ## reset `s`
+  ## Resets `s`.
   s.n = 0
   s.min = toBiggestFloat(int.high)
   s.max = 0.0
@@ -78,7 +85,7 @@ proc clear*(s: var RunningStat) =
   s.mom4 = 0.0
 
 proc push*(s: var RunningStat, x: float) =
-  ## pushes a value `x` for processing
+  ## Pushes a value `x` for processing.
   if s.n == 0: s.min = x
   inc(s.n)
   # See Knuth TAOCP vol 2, 3rd edition, page 232
@@ -97,63 +104,63 @@ proc push*(s: var RunningStat, x: float) =
   s.mom1 += delta_n
 
 proc push*(s: var RunningStat, x: int) =
-  ## pushes a value `x` for processing.
+  ## Pushes a value `x` for processing.
   ##
-  ## `x` is simply converted to ``float``
+  ## `x` is simply converted to `float`
   ## and the other push operation is called.
   s.push(toFloat(x))
 
 proc push*(s: var RunningStat, x: openArray[float|int]) =
-  ## pushes all values of `x` for processing.
+  ## Pushes all values of `x` for processing.
   ##
-  ## Int values of `x` are simply converted to ``float`` and
+  ## Int values of `x` are simply converted to `float` and
   ## the other push operation is called.
   for val in x:
     s.push(val)
 
 proc mean*(s: RunningStat): float =
-  ## computes the current mean of `s`
+  ## Computes the current mean of `s`.
   result = s.mom1
 
 proc variance*(s: RunningStat): float =
-  ## computes the current population variance of `s`
+  ## Computes the current population variance of `s`.
   result = s.mom2 / toFloat(s.n)
 
 proc varianceS*(s: RunningStat): float =
-  ## computes the current sample variance of `s`
+  ## Computes the current sample variance of `s`.
   if s.n > 1: result = s.mom2 / toFloat(s.n - 1)
 
 proc standardDeviation*(s: RunningStat): float =
-  ## computes the current population standard deviation of `s`
+  ## Computes the current population standard deviation of `s`.
   result = sqrt(variance(s))
 
 proc standardDeviationS*(s: RunningStat): float =
-  ## computes the current sample standard deviation of `s`
+  ## Computes the current sample standard deviation of `s`.
   result = sqrt(varianceS(s))
 
 proc skewness*(s: RunningStat): float =
-  ## computes the current population skewness of `s`
+  ## Computes the current population skewness of `s`.
   result = sqrt(toFloat(s.n)) * s.mom3 / pow(s.mom2, 1.5)
 
 proc skewnessS*(s: RunningStat): float =
-  ## computes the current sample skewness of `s`
+  ## Computes the current sample skewness of `s`.
   let s2 = skewness(s)
   result = sqrt(toFloat(s.n*(s.n-1)))*s2 / toFloat(s.n-2)
 
 proc kurtosis*(s: RunningStat): float =
-  ## computes the current population kurtosis of `s`
+  ## Computes the current population kurtosis of `s`.
   result = toFloat(s.n) * s.mom4 / (s.mom2 * s.mom2) - 3.0
 
 proc kurtosisS*(s: RunningStat): float =
-  ## computes the current sample kurtosis of `s`
+  ## Computes the current sample kurtosis of `s`.
   result = toFloat(s.n-1) / toFloat((s.n-2)*(s.n-3)) *
               (toFloat(s.n+1)*kurtosis(s) + 6)
 
 proc `+`*(a, b: RunningStat): RunningStat =
-  ## combine two RunningStats.
+  ## Combines two `RunningStat`s.
   ##
-  ## Useful if performing parallel analysis of data series
-  ## and need to re-combine parallel result sets
+  ## Useful when performing parallel analysis of data series
+  ## and needing to re-combine parallel result sets.
   result.clear()
   result.n = a.n + b.n
 
@@ -178,11 +185,11 @@ proc `+`*(a, b: RunningStat): RunningStat =
   result.min = min(a.min, b.min)
 
 proc `+=`*(a: var RunningStat, b: RunningStat) {.inline.} =
-  ## add a second RunningStats `b` to `a`
+  ## Adds the `RunningStat` `b` to `a`.
   a = a + b
 
 proc `$`*(a: RunningStat): string =
-  ## produces a string representation of the ``RunningStat``. The exact
+  ## Produces a string representation of the `RunningStat`. The exact
   ## format is currently unspecified and subject to change. Currently
   ## it contains:
   ##
@@ -199,56 +206,57 @@ proc `$`*(a: RunningStat): string =
   result.add ")"
 
 # ---------------------- standalone array/seq stats ---------------------
+
 proc mean*[T](x: openArray[T]): float =
-  ## computes the mean of `x`
+  ## Computes the mean of `x`.
   var rs: RunningStat
   rs.push(x)
   result = rs.mean()
 
 proc variance*[T](x: openArray[T]): float =
-  ## computes the population variance of `x`
+  ## Computes the population variance of `x`.
   var rs: RunningStat
   rs.push(x)
   result = rs.variance()
 
 proc varianceS*[T](x: openArray[T]): float =
-  ## computes the sample variance of `x`
+  ## Computes the sample variance of `x`.
   var rs: RunningStat
   rs.push(x)
   result = rs.varianceS()
 
 proc standardDeviation*[T](x: openArray[T]): float =
-  ## computes the population standardDeviation of `x`
+  ## Computes the population standard deviation of `x`.
   var rs: RunningStat
   rs.push(x)
   result = rs.standardDeviation()
 
 proc standardDeviationS*[T](x: openArray[T]): float =
-  ## computes the sample standardDeviation of `x`
+  ## Computes the sample standard deviation of `x`.
   var rs: RunningStat
   rs.push(x)
   result = rs.standardDeviationS()
 
 proc skewness*[T](x: openArray[T]): float =
-  ## computes the population skewness of `x`
+  ## Computes the population skewness of `x`.
   var rs: RunningStat
   rs.push(x)
   result = rs.skewness()
 
 proc skewnessS*[T](x: openArray[T]): float =
-  ## computes the sample skewness of `x`
+  ## Computes the sample skewness of `x`.
   var rs: RunningStat
   rs.push(x)
   result = rs.skewnessS()
 
 proc kurtosis*[T](x: openArray[T]): float =
-  ## computes the population kurtosis of `x`
+  ## Computes the population kurtosis of `x`.
   var rs: RunningStat
   rs.push(x)
   result = rs.kurtosis()
 
 proc kurtosisS*[T](x: openArray[T]): float =
-  ## computes the sample kurtosis of `x`
+  ## Computes the sample kurtosis of `x`.
   var rs: RunningStat
   rs.push(x)
   result = rs.kurtosisS()
@@ -256,14 +264,14 @@ proc kurtosisS*[T](x: openArray[T]): float =
 # ---------------------- Running Regression -----------------------------
 
 proc clear*(r: var RunningRegress) =
-  ## reset `r`
+  ## Resets `r`.
   r.x_stats.clear()
   r.y_stats.clear()
   r.s_xy = 0.0
   r.n = 0
 
 proc push*(r: var RunningRegress, x, y: float) =
-  ## pushes two values `x` and `y` for processing
+  ## Pushes two values `x` and `y` for processing.
   r.s_xy += (r.x_stats.mean() - x)*(r.y_stats.mean() - y) *
                 toFloat(r.n) / toFloat(r.n + 1)
   r.x_stats.push(x)
@@ -271,38 +279,38 @@ proc push*(r: var RunningRegress, x, y: float) =
   inc(r.n)
 
 proc push*(r: var RunningRegress, x, y: int) {.inline.} =
-  ## pushes two values `x` and `y` for processing.
+  ## Pushes two values `x` and `y` for processing.
   ##
-  ## `x` and `y` are converted to ``float``
+  ## `x` and `y` are converted to `float`
   ## and the other push operation is called.
   r.push(toFloat(x), toFloat(y))
 
 proc push*(r: var RunningRegress, x, y: openArray[float|int]) =
-  ## pushes two sets of values `x` and `y` for processing.
+  ## Pushes two sets of values `x` and `y` for processing.
   assert(x.len == y.len)
   for i in 0..<x.len:
     r.push(x[i], y[i])
 
 proc slope*(r: RunningRegress): float =
-  ## computes the current slope of `r`
+  ## Computes the current slope of `r`.
   let s_xx = r.x_stats.varianceS()*toFloat(r.n - 1)
   result = r.s_xy / s_xx
 
 proc intercept*(r: RunningRegress): float =
-  ## computes the current intercept of `r`
+  ## Computes the current intercept of `r`.
   result = r.y_stats.mean() - r.slope()*r.x_stats.mean()
 
 proc correlation*(r: RunningRegress): float =
-  ## computes the current correlation of the two data
-  ## sets pushed into `r`
+  ## Computes the current correlation of the two data
+  ## sets pushed into `r`.
   let t = r.x_stats.standardDeviation() * r.y_stats.standardDeviation()
   result = r.s_xy / (toFloat(r.n) * t)
 
 proc `+`*(a, b: RunningRegress): RunningRegress =
-  ## combine two `RunningRegress` objects.
+  ## Combines two `RunningRegress` objects.
   ##
-  ## Useful if performing parallel analysis of data series
-  ## and need to re-combine parallel result sets
+  ## Useful when performing parallel analysis of data series
+  ## and needing to re-combine parallel result sets.
   result.clear()
   result.x_stats = a.x_stats + b.x_stats
   result.y_stats = a.y_stats + b.y_stats
@@ -314,24 +322,8 @@ proc `+`*(a, b: RunningRegress): RunningRegress =
       toFloat(a.n*b.n)*delta_x*delta_y/toFloat(result.n)
 
 proc `+=`*(a: var RunningRegress, b: RunningRegress) =
-  ## add RunningRegress `b` to `a`
+  ## Adds the `RunningRegress` `b` to `a`.
   a = a + b
 
 {.pop.}
 {.pop.}
-
-
-runnableExamples:
-  static:
-    block:
-      var statistics: RunningStat  ## Must be "var"
-      statistics.push(@[1.0, 2.0, 1.0, 4.0, 1.0, 4.0, 1.0, 2.0])
-      doAssert statistics.n == 8
-      template `===`(a, b: float): bool = (abs(a - b) < 1e-9)
-      doAssert statistics.mean() === 2.0
-      doAssert statistics.variance() === 1.5
-      doAssert statistics.varianceS() === 1.714285714285715
-      doAssert statistics.skewness() === 0.8164965809277261
-      doAssert statistics.skewnessS() === 1.018350154434631
-      doAssert statistics.kurtosis() === -1.0
-      doAssert statistics.kurtosisS() === -0.7000000000000008