diff options
author | Charles Blake <cblake@csail.mit.edu> | 2015-02-07 09:37:17 -0500 |
---|---|---|
committer | Charles Blake <cblake@csail.mit.edu> | 2015-02-07 09:37:17 -0500 |
commit | 42f8f1cd1fe491c19362a4b03f89952ea6e160bc (patch) | |
tree | 2db3cbcd2a6c6abdde85e34aa75241eab4a69765 /lib/pure/collections | |
parent | 8e685585a6a650d14f81bef5fbdfae10ac60a33a (diff) | |
download | Nim-42f8f1cd1fe491c19362a4b03f89952ea6e160bc.tar.gz |
Fix unnecessarily slow set building from openArray.
The estimation of the initialSize as simply array len + 10 was too small for for all but the smallest sets. It would not elide/skip one final enlarge(). That last one is actually always the most expensive enlarge(). Indeed, in a series where one to start from tiny and build up the table..that last one is about 50% of all the enlarging time in general. So, this simple and reasonable optimization (compared to just starting at 64) was only helping about half as much as it could. Introduce a rightSize() proc to be the inverse to mustRehash(). Export it to clients since pre-sizing is externally useful in set construction and the current mustRehash rules are opaque and beyond the control of clients. Also add test module logic to check that rightSize() and mustRehash() are inverses in the appropriate sense..not really in a block/assertion throwing unit test since this is a peformance nice-to-have issue rather than about basic correctness. (Also, fix a too vs. two typo in doc comment.)
Diffstat (limited to 'lib/pure/collections')
-rw-r--r-- | lib/pure/collections/sets.nim | 20 |
1 files changed, 17 insertions, 3 deletions
diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim index 72fd5cd81..3e462f73f 100644 --- a/lib/pure/collections/sets.nim +++ b/lib/pure/collections/sets.nim @@ -114,6 +114,15 @@ proc mustRehash(length, counter: int): bool {.inline.} = assert(length > counter) result = (length * 2 < counter * 3) or (length - counter < 4) +proc rightSize*(count: int): int {.inline.} = + ## Return the value of `initialSize` to support `count` items. + ## + ## If more items are expected to be added, simply add that + ## expected extra amount to the parameter before calling this. + ## + ## Internally, we want mustRehash(rightSize(x), x) == false. + result = nextPowerOfTwo(count * 3 div 2 + 4) + proc nextTry(h, maxHash: THash): THash {.inline.} = result = (h + 1) and maxHash @@ -343,7 +352,7 @@ proc toSet*[A](keys: openArray[A]): HashSet[A] = ## var numbers = toSet([1, 2, 3, 4, 5]) ## assert numbers.contains(2) ## assert numbers.contains(4) - result = initSet[A](nextPowerOfTwo(keys.len+10)) + result = initSet[A](rightSize(keys.len)) for key in items(keys): result.incl(key) template dollarImpl(): stmt {.dirty.} = @@ -708,7 +717,7 @@ proc containsOrIncl*[A](s: var OrderedSet[A], key: A): bool = proc init*[A](s: var OrderedSet[A], initialSize=64) = ## Initializes an ordered hash set. ## - ## The `initialSize` parameter needs to be a power of too. You can use + ## The `initialSize` parameter needs to be a power of two. You can use ## `math.nextPowerOfTwo() <math.html#nextPowerOfTwo>`_ to guarantee that at ## runtime. All set variables have to be initialized before you can use them ## with other procs from this module with the exception of `isValid() @@ -751,7 +760,7 @@ proc toOrderedSet*[A](keys: openArray[A]): OrderedSet[A] = ## var numbers = toOrderedSet([1, 2, 3, 4, 5]) ## assert numbers.contains(2) ## assert numbers.contains(4) - result = initOrderedSet[A](nextPowerOfTwo(keys.len+10)) + result = initOrderedSet[A](rightSize(keys.len)) for key in items(keys): result.incl(key) proc `$`*[A](s: OrderedSet[A]): string = @@ -954,6 +963,11 @@ proc testModule() = b.incl(2) assert b.len == 1 + for i in 0 .. 32: + var s = rightSize(i) + if s <= i or mustRehash(s, i): + echo "performance issue: rightSize() will not elide enlarge() at ", i + echo "Micro tests run successfully." when isMainModule and not defined(release): testModule() |