summary refs log tree commit diff stats
path: root/lib/pure/stats.nim
blob: 0da1d4594df4951e5d45d0c711824600df63838e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
#
#
#            Nim's Runtime Library
#        (c) Copyright 2015 Nim contributors
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Statistical analysis framework for performing
## basic statistical analysis of data.
## The data is analysed in a single pass, when a data value
## is pushed to the ``RunningStat`` or ``RunningRegress`` objects
##
## ``RunningStat`` calculates for a single data set
## - n (data count)
## - min  (smallest value)
## - max  (largest value)
## - sum
## - mean
## - variance
## - varianceS (sample var)
## - standardDeviation
## - standardDeviationS  (sample stddev)
## - skewness (the third statistical moment)
## - kurtosis (the fourth statistical moment)
##
## ``RunningRegress`` calculates for two sets of data
## - n
## - slope
## - intercept
## - correlation
##
## Procs have been provided to calculate statistics on arrays and sequences.
##
## However, if more than a single statistical calculation is required, it is more
## efficient to push the data once to the RunningStat object, and
## call the numerous statistical procs for the RunningStat object.
##
## .. code-block:: Nim
##
##  var rs: RunningStat
##  rs.push(MySeqOfData)
##  rs.mean()
##  rs.variance()
##  rs.skewness()
##  rs.kurtosis()

from math import FloatClass, sqrt, pow, round

{.push debugger:off .} # the user does not want to trace a part
                       # of the standard library!
{.push checks:off, line_dir:off, stack_trace:off.}

type
  RunningStat* = object             ## an accumulator for statistical data
    n*: int                         ## number of pushed data
    min*, max*, sum*: float         ## self-explaining
    mom1, mom2, mom3, mom4: float   ## statistical moments, mom1 is mean


  RunningRegress* = object  ## an accumulator for regression calculations
    n*: int                 ## number of pushed data
    x_stats*: RunningStat   ## stats for first set of data
    y_stats*: RunningStat   ## stats for second set of data
    s_xy: float             ## accumulated data for combined xy

# ----------- RunningStat --------------------------
proc clear*(s: var RunningStat) =
  ## reset `s`
  s.n = 0
  s.min = toBiggestFloat(int.high)
  s.max = 0.0
  s.sum = 0.0
  s.mom1 = 0.0
  s.mom2 = 0.0
  s.mom3 = 0.0
  s.mom4 = 0.0

proc push*(s: var RunningStat, x: float) =
  ## pushes a value `x` for processing
  if s.n == 0: s.min = x
  inc(s.n)
  # See Knuth TAOCP vol 2, 3rd edition, page 232
  if s.min > x: s.min = x
  if s.max < x: s.max = x
  s.sum += x
  let n = toFloat(s.n)
  let delta = x - s.mom1
  let delta_n = delta / toFloat(s.n)
  let delta_n2 = delta_n * delta_n
  let term1 = delta * delta_n * toFloat(s.n - 1)
  s.mom4 += term1 * delta_n2 * (n*n - 3*n + 3) +
              6*delta_n2*s.mom2 - 4*delta_n*s.mom3
  s.mom3 += term1 * delta_n * (n - 2) - 3*delta_n*s.mom2
  s.mom2 += term1
  s.mom1 += delta_n

proc push*(s: var RunningStat, x: int) =
  ## pushes a value `x` for processing.
  ##
  ## `x` is simply converted to ``float``
  ## and the other push operation is called.
  s.push(toFloat(x))

proc push*(s: var RunningStat, x: openarray[float|int]) =
  ## pushes all values of `x` for processing.
  ##
  ## Int values of `x` are simply converted to ``float`` and
  ## the other push operation is called.
  for val in x:
    s.push(val)

proc mean*(s: RunningStat): float =
  ## computes the current mean of `s`
  result = s.mom1

proc variance*(s: RunningStat): float =
  ## computes the current population variance of `s`
  result = s.mom2 / toFloat(s.n)

proc varianceS*(s: RunningStat): float =
  ## computes the current sample variance of `s`
  if s.n > 1: result = s.mom2 / toFloat(s.n - 1)

proc standardDeviation*(s: RunningStat): float =
  ## computes the current population standard deviation of `s`
  result = sqrt(variance(s))

proc standardDeviationS*(s: RunningStat): float =
  ## computes the current sample standard deviation of `s`
  result = sqrt(varianceS(s))

proc skewness*(s: RunningStat): float =
  ## computes the current population skewness of `s`
  result = sqrt(toFloat(s.n)) * s.mom3 / pow(s.mom2, 1.5)

proc skewnessS*(s: RunningStat): float =
  ## computes the current sample skewness of `s`
  let s2 = skewness(s)
  result = sqrt(toFloat(s.n*(s.n-1)))*s2 / toFloat(s.n-2)

proc kurtosis*(s: RunningStat): float =
  ## computes the current population kurtosis of `s`
  result = toFloat(s.n) * s.mom4 / (s.mom2 * s.mom2) - 3.0

proc kurtosisS*(s: RunningStat): float =
  ## computes the current sample kurtosis of `s`
  result = toFloat(s.n-1) / toFloat((s.n-2)*(s.n-3)) *
              (toFloat(s.n+1)*kurtosis(s) + 6)

proc `+`*(a, b: RunningStat): RunningStat =
  ## combine two RunningStats.
  ##
  ## Useful if performing parallel analysis of data series
  ## and need to re-combine parallel result sets
  result.clear()
  result.n = a.n + b.n

  let delta = b.mom1 - a.mom1
  let delta2 = delta*delta
  let delta3 = delta*delta2
  let delta4 = delta2*delta2
  let n = toFloat(result.n)

  result.mom1 = (a.n.float*a.mom1 + b.n.float*b.mom1) / n
  result.mom2 = a.mom2 + b.mom2 + delta2 * a.n.float * b.n.float / n
  result.mom3 = a.mom3 + b.mom3 +
                delta3 * a.n.float * b.n.float * (a.n.float - b.n.float)/(n*n);
  result.mom3 += 3.0*delta * (a.n.float*b.mom2 - b.n.float*a.mom2) / n
  result.mom4 = a.mom4 + b.mom4 +
            delta4*a.n.float*b.n.float * toFloat(a.n*a.n - a.n*b.n + b.n*b.n) /
                (n*n*n)
  result.mom4 += 6.0*delta2 * (a.n.float*a.n.float*b.mom2 + b.n.float*b.n.float*a.mom2) /
                (n*n) +
                4.0*delta*(a.n.float*b.mom3 - b.n.float*a.mom3) / n
  result.max = max(a.max, b.max)
  result.min = min(a.min, b.min)

proc `+=`*(a: var RunningStat, b: RunningStat) {.inline.} =
  ## add a second RunningStats `b` to `a`
  a = a + b

proc `$`*(a: RunningStat): string =
  ## produces a string representation of the ``RunningStat``. The exact
  ## format is currently unspecified and subject to change. Currently
  ## it contains:
  ##
  ## - the number of probes
  ## - min, max values
  ## - sum, mean and standard deviation.
  result = "RunningStat(\n"
  result.add "  number of probes: " & $a.n & "\n"
  result.add "  max: " & $a.max & "\n"
  result.add "  min: " & $a.min & "\n"
  result.add "  sum: " & $a.sum & "\n"
  result.add "  mean: " & $a.mean & "\n"
  result.add "  std deviation: " & $a.standardDeviation & "\n"
  result.add ")"

# ---------------------- standalone array/seq stats ---------------------
proc mean*[T](x: openArray[T]): float =
  ## computes the mean of `x`
  var rs: RunningStat
  rs.push(x)
  result = rs.mean()

proc variance*[T](x: openArray[T]): float =
  ## computes the population variance of `x`
  var rs: RunningStat
  rs.push(x)
  result = rs.variance()

proc varianceS*[T](x: openArray[T]): float =
  ## computes the sample variance of `x`
  var rs: RunningStat
  rs.push(x)
  result = rs.varianceS()

proc standardDeviation*[T](x: openArray[T]): float =
  ## computes the population standardDeviation of `x`
  var rs: RunningStat
  rs.push(x)
  result = rs.standardDeviation()

proc standardDeviationS*[T](x: openArray[T]): float =
  ## computes the sample standardDeviation of `x`
  var rs: RunningStat
  rs.push(x)
  result = rs.standardDeviationS()

proc skewness*[T](x: openArray[T]): float =
  ## computes the population skewness of `x`
  var rs: RunningStat
  rs.push(x)
  result = rs.skewness()

proc skewnessS*[T](x: openArray[T]): float =
  ## computes the sample skewness of `x`
  var rs: RunningStat
  rs.push(x)
  result = rs.skewnessS()

proc kurtosis*[T](x: openArray[T]): float =
  ## computes the population kurtosis of `x`
  var rs: RunningStat
  rs.push(x)
  result = rs.kurtosis()

proc kurtosisS*[T](x: openArray[T]): float =
  ## computes the sample kurtosis of `x`
  var rs: RunningStat
  rs.push(x)
  result = rs.kurtosisS()

# ---------------------- Running Regression -----------------------------

proc clear*(r: var RunningRegress) =
  ## reset `r`
  r.x_stats.clear()
  r.y_stats.clear()
  r.s_xy = 0.0
  r.n = 0

proc push*(r: var RunningRegress, x, y: float) =
  ## pushes two values `x` and `y` for processing
  r.s_xy += (r.x_stats.mean() - x)*(r.y_stats.mean() - y)*
                toFloat(r.n) / toFloat(r.n + 1)
  r.x_stats.push(x)
  r.y_stats.push(y)
  inc(r.n)

proc push*(r: var RunningRegress, x, y: int) {.inline.} =
  ## pushes two values `x` and `y` for processing.
  ##
  ## `x` and `y` are converted to ``float``
  ## and the other push operation is called.
  r.push(toFloat(x), toFloat(y))

proc push*(r: var RunningRegress, x, y: openarray[float|int]) =
  ## pushes two sets of values `x` and `y` for processing.
  assert(x.len == y.len)
  for i in 0..<x.len:
    r.push(x[i], y[i])

proc slope*(r: RunningRegress): float =
  ## computes the current slope of `r`
  let s_xx = r.x_stats.varianceS()*toFloat(r.n - 1)
  result = r.s_xy / s_xx

proc intercept*(r: RunningRegress): float =
  ## computes the current intercept of `r`
  result = r.y_stats.mean() - r.slope()*r.x_stats.mean()

proc correlation*(r: RunningRegress): float =
  ## computes the current correlation of the two data
  ## sets pushed into `r`
  let t = r.x_stats.standardDeviation() * r.y_stats.standardDeviation()
  result = r.s_xy / ( toFloat(r.n) * t )

proc `+`*(a, b: RunningRegress): RunningRegress =
  ## combine two `RunningRegress` objects.
  ##
  ## Useful if performing parallel analysis of data series
  ## and need to re-combine parallel result sets
  result.clear()
  result.x_stats = a.x_stats + b.x_stats
  result.y_stats = a.y_stats + b.y_stats
  result.n = a.n + b.n

  let delta_x = b.x_stats.mean() - a.x_stats.mean()
  let delta_y = b.y_stats.mean() - a.y_stats.mean()
  result.s_xy = a.s_xy + b.s_xy +
      toFloat(a.n*b.n)*delta_x*delta_y/toFloat(result.n)

proc `+=`*(a: var RunningRegress, b: RunningRegress) =
  ## add RunningRegress `b` to `a`
  a = a + b

{.pop.}
{.pop.}

when isMainModule:
  proc clean(x: float): float =
    result = round(1.0e8*x).float * 1.0e-8

  var rs: RunningStat
  rs.push(@[1.0, 2.0, 1.0, 4.0, 1.0, 4.0, 1.0, 2.0])
  doAssert(rs.n == 8)
  doAssert(clean(rs.mean) == 2.0)
  doAssert(clean(rs.variance()) == 1.5)
  doAssert(clean(rs.varianceS()) == 1.71428571)
  doAssert(clean(rs.skewness()) == 0.81649658)
  doAssert(clean(rs.skewnessS()) == 1.01835015)
  doAssert(clean(rs.kurtosis()) == -1.0)
  doAssert(clean(rs.kurtosisS()) == -0.7000000000000001)

  var rs1, rs2: RunningStat
  rs1.push(@[1.0, 2.0, 1.0, 4.0])
  rs2.push(@[1.0, 4.0, 1.0, 2.0])
  let rs3 = rs1 + rs2
  doAssert(clean(rs3.mom2) == clean(rs.mom2))
  doAssert(clean(rs3.mom3) == clean(rs.mom3))
  doAssert(clean(rs3.mom4) == clean(rs.mom4))
  rs1 += rs2
  doAssert(clean(rs1.mom2) == clean(rs.mom2))
  doAssert(clean(rs1.mom3) == clean(rs.mom3))
  doAssert(clean(rs1.mom4) == clean(rs.mom4))
  rs1.clear()
  rs1.push(@[1.0, 2.2, 1.4, 4.9])
  doAssert(rs1.sum == 9.5)
  doAssert(rs1.mean() == 2.375)

  when not defined(cpu32):
    # XXX For some reason on 32bit CPUs these results differ
    var rr: RunningRegress
    rr.push(@[0.0,1.0,2.8,3.0,4.0], @[0.0,1.0,2.3,3.0,4.0])
    doAssert(rr.slope() == 0.9695585996955861)
    doAssert(rr.intercept() == -0.03424657534246611)
    doAssert(rr.correlation() == 0.9905100362239381)
    var rr1, rr2: RunningRegress
    rr1.push(@[0.0,1.0], @[0.0,1.0])
    rr2.push(@[2.8,3.0,4.0], @[2.3,3.0,4.0])
    let rr3 = rr1 + rr2
    doAssert(rr3.correlation() == rr.correlation())
    doAssert(clean(rr3.slope()) == clean(rr.slope()))
    doAssert(clean(rr3.intercept()) == clean(rr.intercept()))