Fix unnecessarily slow set building from openArray.

The estimation of the initialSize as simply array len + 10 was too small for for all but the smallest sets. It would not elide/skip one final enlarge(). That last one is actually always the most expensive enlarge(). Indeed, in a series where one to start from tiny and build up the table..that last one is about 50% of all the enlarging time in general. So, this simple and reasonable optimization (compared to just starting at 64) was only helping about half as much as it could. Introduce a rightSize() proc to be the inverse to mustRehash(). Export it to clients since pre-sizing is externally useful in set construction and the current mustRehash rules are opaque and beyond the control of clients. Also add test module logic to check that rightSize() and mustRehash() are inverses in the appropriate sense..not really in a block/assertion throwing unit test since this is a peformance nice-to-have issue rather than about basic correctness. (Also, fix a too vs. two typo in doc comment.)
author: Charles Blake <cblake@csail.mit.edu> 2015-02-07 09:37:17 -0500
committer: Charles Blake <cblake@csail.mit.edu> 2015-02-07 09:37:17 -0500
commit: 42f8f1cd1fe491c19362a4b03f89952ea6e160bc (patch)
tree: 2db3cbcd2a6c6abdde85e34aa75241eab4a69765 /lib/pure/collections
parent: 8e685585a6a650d14f81bef5fbdfae10ac60a33a (diff)
download: Nim-42f8f1cd1fe491c19362a4b03f89952ea6e160bc.tar.gz
1 files changed, 17 insertions, 3 deletions
diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim
index 72fd5cd81..3e462f73f 100644
--- a/lib/pure/collections/sets.nim
+++ b/lib/pure/collections/sets.nim
@@ -114,6 +114,15 @@ proc mustRehash(length, counter: int): bool {.inline.} =
   assert(length > counter)
   result = (length * 2 < counter * 3) or (length - counter < 4)
 
+proc rightSize*(count: int): int {.inline.} =
+  ## Return the value of `initialSize` to support `count` items.
+  ##
+  ## If more items are expected to be added, simply add that
+  ## expected extra amount to the parameter before calling this.
+  ##
+  ## Internally, we want mustRehash(rightSize(x), x) == false.
+  result = nextPowerOfTwo(count * 3 div 2  +  4)
+
 proc nextTry(h, maxHash: THash): THash {.inline.} =
   result = (h + 1) and maxHash
 
@@ -343,7 +352,7 @@ proc toSet*[A](keys: openArray[A]): HashSet[A] =
   ##   var numbers = toSet([1, 2, 3, 4, 5])
   ##   assert numbers.contains(2)
   ##   assert numbers.contains(4)
-  result = initSet[A](nextPowerOfTwo(keys.len+10))
+  result = initSet[A](rightSize(keys.len))
   for key in items(keys): result.incl(key)
 
 template dollarImpl(): stmt {.dirty.} =
@@ -708,7 +717,7 @@ proc containsOrIncl*[A](s: var OrderedSet[A], key: A): bool =
 proc init*[A](s: var OrderedSet[A], initialSize=64) =
   ## Initializes an ordered hash set.
   ##
-  ## The `initialSize` parameter needs to be a power of too. You can use
+  ## The `initialSize` parameter needs to be a power of two. You can use
   ## `math.nextPowerOfTwo() <math.html#nextPowerOfTwo>`_ to guarantee that at
   ## runtime. All set variables have to be initialized before you can use them
   ## with other procs from this module with the exception of `isValid()
@@ -751,7 +760,7 @@ proc toOrderedSet*[A](keys: openArray[A]): OrderedSet[A] =
   ##   var numbers = toOrderedSet([1, 2, 3, 4, 5])
   ##   assert numbers.contains(2)
   ##   assert numbers.contains(4)
-  result = initOrderedSet[A](nextPowerOfTwo(keys.len+10))
+  result = initOrderedSet[A](rightSize(keys.len))
   for key in items(keys): result.incl(key)
 
 proc `$`*[A](s: OrderedSet[A]): string =
@@ -954,6 +963,11 @@ proc testModule() =
     b.incl(2)
     assert b.len == 1
 
+  for i in 0 .. 32:
+    var s = rightSize(i)
+    if s <= i or mustRehash(s, i):
+      echo "performance issue: rightSize() will not elide enlarge() at ", i
+
   echo "Micro tests run successfully."
 
 when isMainModule and not defined(release): testModule()
author	Charles Blake <cblake@csail.mit.edu>	2015-02-07 09:37:17 -0500
committer	Charles Blake <cblake@csail.mit.edu>	2015-02-07 09:37:17 -0500
commit	42f8f1cd1fe491c19362a4b03f89952ea6e160bc (patch)
tree	2db3cbcd2a6c6abdde85e34aa75241eab4a69765 /lib/pure/collections
parent	8e685585a6a650d14f81bef5fbdfae10ac60a33a (diff)
download	Nim-42f8f1cd1fe491c19362a4b03f89952ea6e160bc.tar.gz