summary refs log tree commit diff stats
path: root/lib/pure/collections
diff options
context:
space:
mode:
authorCharles Blake <cblake@csail.mit.edu>2015-02-07 09:37:17 -0500
committerCharles Blake <cblake@csail.mit.edu>2015-02-07 09:37:17 -0500
commit42f8f1cd1fe491c19362a4b03f89952ea6e160bc (patch)
tree2db3cbcd2a6c6abdde85e34aa75241eab4a69765 /lib/pure/collections
parent8e685585a6a650d14f81bef5fbdfae10ac60a33a (diff)
downloadNim-42f8f1cd1fe491c19362a4b03f89952ea6e160bc.tar.gz
Fix unnecessarily slow set building from openArray.
The estimation of the initialSize as simply array len + 10 was too small for
for all but the smallest sets.  It would not elide/skip one final enlarge().
That last one is actually always the most expensive enlarge().  Indeed, in a
series where one to start from tiny and build up the table..that last one is
about 50% of all the enlarging time in general.  So, this simple and reasonable
optimization (compared to just starting at 64) was only helping about half as
much as it could.

Introduce a rightSize() proc to be the inverse to mustRehash().  Export it
to clients since pre-sizing is externally useful in set construction and the
current mustRehash rules are opaque and beyond the control of clients.

Also add test module logic to check that rightSize() and mustRehash() are
inverses in the appropriate sense..not really in a block/assertion throwing
unit test since this is a peformance nice-to-have issue rather than about
basic correctness.  (Also, fix a too vs. two typo in doc comment.)
Diffstat (limited to 'lib/pure/collections')
-rw-r--r--lib/pure/collections/sets.nim20
1 files changed, 17 insertions, 3 deletions
diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim
index 72fd5cd81..3e462f73f 100644
--- a/lib/pure/collections/sets.nim
+++ b/lib/pure/collections/sets.nim
@@ -114,6 +114,15 @@ proc mustRehash(length, counter: int): bool {.inline.} =
   assert(length > counter)
   result = (length * 2 < counter * 3) or (length - counter < 4)
 
+proc rightSize*(count: int): int {.inline.} =
+  ## Return the value of `initialSize` to support `count` items.
+  ##
+  ## If more items are expected to be added, simply add that
+  ## expected extra amount to the parameter before calling this.
+  ##
+  ## Internally, we want mustRehash(rightSize(x), x) == false.
+  result = nextPowerOfTwo(count * 3 div 2  +  4)
+
 proc nextTry(h, maxHash: THash): THash {.inline.} =
   result = (h + 1) and maxHash
 
@@ -343,7 +352,7 @@ proc toSet*[A](keys: openArray[A]): HashSet[A] =
   ##   var numbers = toSet([1, 2, 3, 4, 5])
   ##   assert numbers.contains(2)
   ##   assert numbers.contains(4)
-  result = initSet[A](nextPowerOfTwo(keys.len+10))
+  result = initSet[A](rightSize(keys.len))
   for key in items(keys): result.incl(key)
 
 template dollarImpl(): stmt {.dirty.} =
@@ -708,7 +717,7 @@ proc containsOrIncl*[A](s: var OrderedSet[A], key: A): bool =
 proc init*[A](s: var OrderedSet[A], initialSize=64) =
   ## Initializes an ordered hash set.
   ##
-  ## The `initialSize` parameter needs to be a power of too. You can use
+  ## The `initialSize` parameter needs to be a power of two. You can use
   ## `math.nextPowerOfTwo() <math.html#nextPowerOfTwo>`_ to guarantee that at
   ## runtime. All set variables have to be initialized before you can use them
   ## with other procs from this module with the exception of `isValid()
@@ -751,7 +760,7 @@ proc toOrderedSet*[A](keys: openArray[A]): OrderedSet[A] =
   ##   var numbers = toOrderedSet([1, 2, 3, 4, 5])
   ##   assert numbers.contains(2)
   ##   assert numbers.contains(4)
-  result = initOrderedSet[A](nextPowerOfTwo(keys.len+10))
+  result = initOrderedSet[A](rightSize(keys.len))
   for key in items(keys): result.incl(key)
 
 proc `$`*[A](s: OrderedSet[A]): string =
@@ -954,6 +963,11 @@ proc testModule() =
     b.incl(2)
     assert b.len == 1
 
+  for i in 0 .. 32:
+    var s = rightSize(i)
+    if s <= i or mustRehash(s, i):
+      echo "performance issue: rightSize() will not elide enlarge() at ", i
+
   echo "Micro tests run successfully."
 
 when isMainModule and not defined(release): testModule()