diff options
-rw-r--r-- | lib/pure/collections/hashcommon.nim | 87 | ||||
-rw-r--r-- | lib/pure/collections/intsets.nim | 36 | ||||
-rw-r--r-- | lib/pure/collections/setimpl.nim | 23 | ||||
-rw-r--r-- | lib/pure/collections/sets.nim | 40 | ||||
-rw-r--r-- | lib/pure/collections/sharedtables.nim | 6 | ||||
-rw-r--r-- | lib/pure/collections/tableimpl.nim | 47 | ||||
-rw-r--r-- | lib/pure/collections/tables.nim | 112 | ||||
-rw-r--r-- | lib/pure/hashes.nim | 32 | ||||
-rw-r--r-- | lib/pure/testutils.nim | 15 | ||||
-rw-r--r-- | tests/arc/trepr.nim | 2 | ||||
-rw-r--r-- | tests/collections/ttables.nim | 31 | ||||
-rw-r--r-- | tests/collections/ttablesthreads.nim | 5 |
12 files changed, 258 insertions, 178 deletions
diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index d9a914afc..bfc09ec3c 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -18,34 +18,87 @@ when not defined(nimHasDefault): var v: T v +const freeMarker = 0 +const deletedMarker = -1 + +type UHash = uint + # hcode for real keys cannot be zero. hcode==0 signifies an empty slot. These # two procs retain clarity of that encoding without the space cost of an enum. -proc isEmpty(hcode: Hash): bool {.inline.} = - result = hcode == 0 +proc isFilledAndValid(hcode: Hash): bool {.inline.} = + result = hcode != 0 and hcode != deletedMarker + # performance: we could use bit magic if needed proc isFilled(hcode: Hash): bool {.inline.} = result = hcode != 0 -proc nextTry(h, maxHash: Hash): Hash {.inline.} = - result = (h + 1) and maxHash -proc mustRehash(length, counter: int): bool {.inline.} = - assert(length > counter) - result = (length * 2 < counter * 3) or (length - counter < 4) +proc translateBits(a: UHash, numBitsMask: int): UHash {.inline.} = + result = (a shr numBitsMask) or (a shl (UHash.sizeof * 8 - numBitsMask)) + +proc nextTry(h, maxHash: Hash, perturb: var UHash): Hash {.inline.} = + # FACTOR between hashcommon.nextTry, intsets.nextTry + # an optimization would be to use `(h + 1) and maxHash` for a few iterations + # and then switch to the formula below, to get "best of both worlds": good + # cache locality, except when a collision cluster is detected (ie, large number + # of iterations). + const PERTURB_SHIFT = 5 # consider tying this to `numBitsMask = fastLog2(t.dataLen)` + result = cast[Hash]((5*cast[uint](h) + 1 + perturb) and cast[uint](maxHash)) + perturb = perturb shr PERTURB_SHIFT + +proc mustRehash[T](t: T): bool {.inline.} = + # FACTOR between hashcommon.mustRehash, intsets.mustRehash + let counter2 = t.counter + t.countDeleted + let length = t.dataLen + assert(length > counter2) + result = (length * 2 < counter2 * 3) or (length - counter2 < 4) # synchronize with `rightSize` + +proc rightSize*(count: Natural): int {.inline.} = + ## Return the value of `initialSize` to support `count` items. + ## + ## If more items are expected to be added, simply add that + ## expected extra amount to the parameter before calling this. + ## + ## Internally, we want `mustRehash(t) == false` for t that was just resized. + # Make sure to synchronize with `mustRehash` + result = nextPowerOfTwo(count * 3 div 2 + 4) + +template getPerturb(t: typed, hc: Hash): UHash = + # we can't use `fastLog2(dataLen(t))` because importing `bitops` would cause codegen errors + # so we use a practical value of half the bit width (eg 64 / 2 = 32 on 64bit machines) + let numBitsMask = sizeof(Hash) * 4 # ie, sizeof(Hash) * 8 / 2 + # this makes a major difference for cases like #13393; it causes the bits + # that were masked out in 1st position so they'll be masked in instead, and + # influence the recursion in nextTry earlier rather than later. + translateBits(cast[uint](hc), numBitsMask) template rawGetKnownHCImpl() {.dirty.} = if t.dataLen == 0: return -1 var h: Hash = hc and maxHash(t) # start with real hash value - while isFilled(t.data[h].hcode): - # Compare hc THEN key with boolean short circuit. This makes the common case - # zero ==key's for missing (e.g.inserts) and exactly one ==key for present. - # It does slow down succeeding lookups by one extra Hash cmp&and..usually - # just a few clock cycles, generally worth it for any non-integer-like A. - if t.data[h].hcode == hc and t.data[h].key == key: - return h - h = nextTry(h, maxHash(t)) - result = -1 - h # < 0 => MISSING; insert idx = -1 - result + var perturb = t.getPerturb(hc) + var deletedIndex = -1 + while true: + if isFilledAndValid(t.data[h].hcode): + # Compare hc THEN key with boolean short circuit. This makes the common case + # zero ==key's for missing (e.g.inserts) and exactly one ==key for present. + # It does slow down succeeding lookups by one extra Hash cmp&and..usually + # just a few clock cycles, generally worth it for any non-integer-like A. + # performance: we optimize this: depending on type(key), skip hc comparison + if t.data[h].hcode == hc and t.data[h].key == key: + return h + h = nextTry(h, maxHash(t), perturb) + elif t.data[h].hcode == deletedMarker: + if deletedIndex == -1: + deletedIndex = h + h = nextTry(h, maxHash(t), perturb) + else: + break + if deletedIndex == -1: + result = -1 - h # < 0 => MISSING; insert idx = -1 - result + else: + # we prefer returning a (in fact the 1st found) deleted index + result = -1 - deletedIndex proc rawGetKnownHC[X, A](t: X, key: A, hc: Hash): int {.inline.} = rawGetKnownHCImpl() @@ -54,6 +107,8 @@ template genHashImpl(key, hc: typed) = hc = hash(key) if hc == 0: # This almost never taken branch should be very predictable. hc = 314159265 # Value doesn't matter; Any non-zero favorite is fine. + elif hc == deletedMarker: + hc = 214159261 template genHash(key: typed): Hash = var res: Hash diff --git a/lib/pure/collections/intsets.nim b/lib/pure/collections/intsets.nim index 8338ceaca..7ca379783 100644 --- a/lib/pure/collections/intsets.nim +++ b/lib/pure/collections/intsets.nim @@ -46,30 +46,40 @@ type IntSet* = object ## An efficient set of `int` implemented as a sparse bit set. elems: int # only valid for small numbers counter, max: int + countDeleted: int head: PTrunk data: TrunkSeq a: array[0..33, int] # profiling shows that 34 elements are enough -proc mustRehash(length, counter: int): bool {.inline.} = - assert(length > counter) - result = (length * 2 < counter * 3) or (length - counter < 4) +proc mustRehash[T](t: T): bool {.inline.} = + # FACTOR between hashcommon.mustRehash, intsets.mustRehash + let counter2 = t.counter + t.countDeleted + let length = t.max + 1 + assert length > counter2 + result = (length * 2 < counter2 * 3) or (length - counter2 < 4) -proc nextTry(h, maxHash: Hash): Hash {.inline.} = - result = ((5 * h) + 1) and maxHash +proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} = + # FACTOR between hashcommon.nextTry, intsets.nextTry + const PERTURB_SHIFT = 5 + var perturb2 = cast[uint](perturb) shr PERTURB_SHIFT + perturb = cast[Hash](perturb2) + result = ((5*h) + 1 + perturb) and maxHash proc intSetGet(t: IntSet, key: int): PTrunk = var h = key and t.max + var perturb = key while t.data[h] != nil: if t.data[h].key == key: return t.data[h] - h = nextTry(h, t.max) + h = nextTry(h, t.max, perturb) result = nil proc intSetRawInsert(t: IntSet, data: var TrunkSeq, desc: PTrunk) = var h = desc.key and t.max + var perturb = desc.key while data[h] != nil: assert(data[h] != desc) - h = nextTry(h, t.max) + h = nextTry(h, t.max, perturb) assert(data[h] == nil) data[h] = desc @@ -84,14 +94,16 @@ proc intSetEnlarge(t: var IntSet) = proc intSetPut(t: var IntSet, key: int): PTrunk = var h = key and t.max + var perturb = key while t.data[h] != nil: if t.data[h].key == key: return t.data[h] - h = nextTry(h, t.max) - if mustRehash(t.max + 1, t.counter): intSetEnlarge(t) + h = nextTry(h, t.max, perturb) + if mustRehash(t): intSetEnlarge(t) inc(t.counter) h = key and t.max - while t.data[h] != nil: h = nextTry(h, t.max) + perturb = key + while t.data[h] != nil: h = nextTry(h, t.max, perturb) assert(t.data[h] == nil) new(result) result.next = t.head @@ -100,6 +112,7 @@ proc intSetPut(t: var IntSet, key: int): PTrunk = t.data[h] = result proc bitincl(s: var IntSet, key: int) {.inline.} = + var ret: PTrunk var t = intSetPut(s, `shr`(key, TrunkShift)) var u = key and TrunkMask t.bits[u shr IntShift] = t.bits[u shr IntShift] or @@ -393,7 +406,8 @@ proc assign*(dest: var IntSet, src: IntSet) = var it = src.head while it != nil: var h = it.key and dest.max - while dest.data[h] != nil: h = nextTry(h, dest.max) + var perturb = it.key + while dest.data[h] != nil: h = nextTry(h, dest.max, perturb) assert(dest.data[h] == nil) var n: PTrunk new(n) diff --git a/lib/pure/collections/setimpl.nim b/lib/pure/collections/setimpl.nim index d2d1490ff..f8950e354 100644 --- a/lib/pure/collections/setimpl.nim +++ b/lib/pure/collections/setimpl.nim @@ -48,7 +48,7 @@ template inclImpl() {.dirty.} = var hc: Hash var index = rawGet(s, key, hc) if index < 0: - if mustRehash(len(s.data), s.counter): + if mustRehash(s): enlarge(s) index = rawGetKnownHC(s, key, hc) rawInsert(s, s.data, key, hc, -1 - index) @@ -62,17 +62,12 @@ template containsOrInclImpl() {.dirty.} = if index >= 0: result = true else: - if mustRehash(len(s.data), s.counter): + if mustRehash(s): enlarge(s) index = rawGetKnownHC(s, key, hc) rawInsert(s, s.data, key, hc, -1 - index) inc(s.counter) -template doWhile(a, b) = - while true: - b - if not a: break - proc exclImpl[A](s: var HashSet[A], key: A): bool {.inline.} = var hc: Hash var i = rawGet(s, key, hc) @@ -82,17 +77,9 @@ proc exclImpl[A](s: var HashSet[A], key: A): bool {.inline.} = if i >= 0: result = false dec(s.counter) - while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1 - var j = i # The correctness of this depends on (h+1) in nextTry, - var r = j # though may be adaptable to other simple sequences. - s.data[i].hcode = 0 # mark current EMPTY - s.data[i].key = default(type(s.data[i].key)) - doWhile((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)): - i = (i + 1) and msk # increment mod table size - if isEmpty(s.data[i].hcode): # end of collision cluster; So all done - return - r = s.data[i].hcode and msk # "home" location of key@i - s.data[j] = move(s.data[i]) # data[i] will be marked EMPTY next loop + inc(s.countDeleted) + s.data[i].hcode = deletedMarker + s.data[i].key = default(type(s.data[i].key)) template dollarImpl() {.dirty.} = result = "{" diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim index 9dd72cb56..9f800e6d1 100644 --- a/lib/pure/collections/sets.nim +++ b/lib/pure/collections/sets.nim @@ -68,6 +68,7 @@ type ## before calling other procs on it. data: KeyValuePairSeq[A] counter: int + countDeleted: int type OrderedKeyValuePair[A] = tuple[ @@ -80,15 +81,13 @@ type ## <#initOrderedSet,int>`_ before calling other procs on it. data: OrderedKeyValuePairSeq[A] counter, first, last: int + countDeleted: int const defaultInitialSize* = 64 include setimpl -proc rightSize*(count: Natural): int {.inline.} - - # --------------------------------------------------------------------- # ------------------------------ HashSet ------------------------------ # --------------------------------------------------------------------- @@ -250,7 +249,7 @@ iterator items*[A](s: HashSet[A]): A = ## echo b ## # --> {(a: 1, b: 3), (a: 0, b: 4)} for h in 0 .. high(s.data): - if isFilled(s.data[h].hcode): yield s.data[h].key + if isFilledAndValid(s.data[h].hcode): yield s.data[h].key proc containsOrIncl*[A](s: var HashSet[A], key: A): bool = ## Includes `key` in the set `s` and tells if `key` was already in `s`. @@ -342,7 +341,7 @@ proc pop*[A](s: var HashSet[A]): A = doAssertRaises(KeyError, echo s.pop) for h in 0 .. high(s.data): - if isFilled(s.data[h].hcode): + if isFilledAndValid(s.data[h].hcode): result = s.data[h].key excl(s, result) return result @@ -593,16 +592,6 @@ proc `$`*[A](s: HashSet[A]): string = ## # --> {no, esc'aping, is " provided} dollarImpl() -proc rightSize*(count: Natural): int {.inline.} = - ## Return the value of `initialSize` to support `count` items. - ## - ## If more items are expected to be added, simply add that - ## expected extra amount to the parameter before calling this. - ## - ## Internally, we want `mustRehash(rightSize(x), x) == false`. - result = nextPowerOfTwo(count * 3 div 2 + 4) - - proc initSet*[A](initialSize = defaultInitialSize): HashSet[A] {.deprecated: "Deprecated since v0.20, use 'initHashSet'".} = initHashSet[A](initialSize) @@ -634,7 +623,7 @@ template forAllOrderedPairs(yieldStmt: untyped) {.dirty.} = var idx = 0 while h >= 0: var nxt = s.data[h].next - if isFilled(s.data[h].hcode): + if isFilledAndValid(s.data[h].hcode): yieldStmt inc(idx) h = nxt @@ -868,7 +857,7 @@ proc `==`*[A](s, t: OrderedSet[A]): bool = while h >= 0 and g >= 0: var nxh = s.data[h].next var nxg = t.data[g].next - if isFilled(s.data[h].hcode) and isFilled(t.data[g].hcode): + if isFilledAndValid(s.data[h].hcode) and isFilledAndValid(t.data[g].hcode): if s.data[h].key == t.data[g].key: inc compared else: @@ -1146,10 +1135,19 @@ when isMainModule and not defined(release): b.incl(2) assert b.len == 1 - for i in 0 .. 32: - var s = rightSize(i) - if s <= i or mustRehash(s, i): - echo "performance issue: rightSize() will not elide enlarge() at ", i + block: + type FakeTable = object + dataLen: int + counter: int + countDeleted: int + + var t: FakeTable + for i in 0 .. 32: + var s = rightSize(i) + t.dataLen = s + t.counter = i + doAssert s > i and not mustRehash(t), + "performance issue: rightSize() will not elide enlarge() at: " & $i block missingOrExcl: var s = toOrderedSet([2, 3, 6, 7]) diff --git a/lib/pure/collections/sharedtables.nim b/lib/pure/collections/sharedtables.nim index 1a6148752..0fbbdb3eb 100644 --- a/lib/pure/collections/sharedtables.nim +++ b/lib/pure/collections/sharedtables.nim @@ -25,6 +25,7 @@ type SharedTable*[A, B] = object ## generic hash SharedTable data: KeyValuePairSeq[A, B] counter, dataLen: int + countDeleted: int lock: Lock template maxHash(t): untyped = t.dataLen-1 @@ -32,7 +33,7 @@ template maxHash(t): untyped = t.dataLen-1 include tableimpl template st_maybeRehashPutImpl(enlarge) {.dirty.} = - if mustRehash(t.dataLen, t.counter): + if mustRehash(t): enlarge(t) index = rawGetKnownHC(t, key, hc) index = -1 - index # important to transform for mgetOrPutImpl @@ -49,9 +50,10 @@ proc enlarge[A, B](t: var SharedTable[A, B]) = for i in 0..<oldSize: let eh = n[i].hcode if isFilled(eh): + var perturb = t.getPerturb(eh) var j: Hash = eh and maxHash(t) while isFilled(t.data[j].hcode): - j = nextTry(j, maxHash(t)) + j = nextTry(j, maxHash(t), perturb) rawInsert(t, t.data, n[i].key, n[i].val, eh, j) deallocShared(n) diff --git a/lib/pure/collections/tableimpl.nim b/lib/pure/collections/tableimpl.nim index 85bffd60f..aabaeeeb3 100644 --- a/lib/pure/collections/tableimpl.nim +++ b/lib/pure/collections/tableimpl.nim @@ -14,13 +14,20 @@ include hashcommon template rawGetDeepImpl() {.dirty.} = # Search algo for unconditional add genHashImpl(key, hc) var h: Hash = hc and maxHash(t) - while isFilled(t.data[h].hcode): - h = nextTry(h, maxHash(t)) + var perturb = t.getPerturb(hc) + while true: + let hcode = t.data[h].hcode + if hcode == deletedMarker or hcode == freeMarker: + break + else: + h = nextTry(h, maxHash(t), perturb) result = h -template rawInsertImpl() {.dirty.} = +template rawInsertImpl(t) {.dirty.} = data[h].key = key data[h].val = val + if data[h].hcode == deletedMarker: + t.countDeleted.dec data[h].hcode = hc proc rawGetDeep[X, A](t: X, key: A, hc: var Hash): int {.inline.} = @@ -28,7 +35,7 @@ proc rawGetDeep[X, A](t: X, key: A, hc: var Hash): int {.inline.} = proc rawInsert[X, A, B](t: var X, data: var KeyValuePairSeq[A, B], key: A, val: B, hc: Hash, h: Hash) = - rawInsertImpl() + rawInsertImpl(t) template checkIfInitialized() = when compiles(defaultInitialSize): @@ -37,15 +44,14 @@ template checkIfInitialized() = template addImpl(enlarge) {.dirty.} = checkIfInitialized() - if mustRehash(t.dataLen, t.counter): enlarge(t) + if mustRehash(t): enlarge(t) var hc: Hash var j = rawGetDeep(t, key, hc) rawInsert(t, t.data, key, val, hc, j) inc(t.counter) template maybeRehashPutImpl(enlarge) {.dirty.} = - checkIfInitialized() - if mustRehash(t.dataLen, t.counter): + if mustRehash(t): enlarge(t) index = rawGetKnownHC(t, key, hc) index = -1 - index # important to transform for mgetOrPutImpl @@ -82,24 +88,11 @@ template delImplIdx(t, i) = let msk = maxHash(t) if i >= 0: dec(t.counter) - block outer: - while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1 - var j = i # The correctness of this depends on (h+1) in nextTry, - var r = j # though may be adaptable to other simple sequences. - t.data[i].hcode = 0 # mark current EMPTY - t.data[i].key = default(type(t.data[i].key)) - t.data[i].val = default(type(t.data[i].val)) - while true: - i = (i + 1) and msk # increment mod table size - if isEmpty(t.data[i].hcode): # end of collision cluster; So all done - break outer - r = t.data[i].hcode and msk # "home" location of key@i - if not ((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)): - break - when defined(js): - t.data[j] = t.data[i] - else: - t.data[j] = move(t.data[i]) # data[j] will be marked EMPTY next loop + inc(t.countDeleted) + t.data[i].hcode = deletedMarker + t.data[i].key = default(type(t.data[i].key)) + t.data[i].val = default(type(t.data[i].val)) + # mustRehash + enlarge not needed because counter+countDeleted doesn't change template delImpl() {.dirty.} = var hc: Hash @@ -123,8 +116,8 @@ template initImpl(result: typed, size: int) = result.last = -1 template insertImpl() = # for CountTable - if t.dataLen == 0: initImpl(t, defaultInitialSize) - if mustRehash(len(t.data), t.counter): enlarge(t) + checkIfInitialized() + if mustRehash(t): enlarge(t) ctRawInsert(t, t.data, key, val) inc(t.counter) diff --git a/lib/pure/collections/tables.nim b/lib/pure/collections/tables.nim index a0cc73ad6..bf08eaa92 100644 --- a/lib/pure/collections/tables.nim +++ b/lib/pure/collections/tables.nim @@ -233,6 +233,7 @@ type ## For creating an empty Table, use `initTable proc<#initTable,int>`_. data: KeyValuePairSeq[A, B] counter: int + countDeleted: int TableRef*[A, B] = ref Table[A, B] ## Ref version of `Table<#Table>`_. ## ## For creating a new empty TableRef, use `newTable proc @@ -250,8 +251,6 @@ template dataLen(t): untyped = len(t.data) include tableimpl -proc rightSize*(count: Natural): int {.inline.} - template get(t, key): untyped = ## retrieves the value at ``t[key]``. The value can be modified. ## If ``key`` is not in ``t``, the ``KeyError`` exception is raised. @@ -267,14 +266,16 @@ template get(t, key): untyped = proc enlarge[A, B](t: var Table[A, B]) = var n: KeyValuePairSeq[A, B] - newSeq(n, len(t.data) * growthFactor) + newSeq(n, t.counter.rightSize) swap(t.data, n) + t.countDeleted = 0 for i in countup(0, high(n)): let eh = n[i].hcode - if isFilled(eh): + if isFilledAndValid(eh): var j: Hash = eh and maxHash(t) + var perturb = t.getPerturb(eh) while isFilled(t.data[j].hcode): - j = nextTry(j, maxHash(t)) + j = nextTry(j, maxHash(t), perturb) when defined(js): rawInsert(t, t.data, n[i].key, n[i].val, eh, j) else: @@ -579,15 +580,6 @@ proc `==`*[A, B](s, t: Table[A, B]): bool = equalsImpl(s, t) -proc rightSize*(count: Natural): int {.inline.} = - ## Return the value of ``initialSize`` to support ``count`` items. - ## - ## If more items are expected to be added, simply add that - ## expected extra amount to the parameter before calling this. - ## - ## Internally, we want mustRehash(rightSize(x), x) == false. - result = nextPowerOfTwo(count * 3 div 2 + 4) - proc indexBy*[A, B, C](collection: A, index: proc(x: B): C): Table[C, B] = ## Index the collection with the proc provided. # TODO: As soon as supported, change collection: A to collection: A[B] @@ -670,7 +662,7 @@ iterator pairs*[A, B](t: Table[A, B]): (A, B) = ## # value: [1, 5, 7, 9] let L = len(t) for h in 0 .. high(t.data): - if isFilled(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): yield (t.data[h].key, t.data[h].val) assert(len(t) == L, "the length of the table changed while iterating over it") @@ -692,7 +684,7 @@ iterator mpairs*[A, B](t: var Table[A, B]): (A, var B) = let L = len(t) for h in 0 .. high(t.data): - if isFilled(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): yield (t.data[h].key, t.data[h].val) assert(len(t) == L, "the length of the table changed while iterating over it") @@ -713,7 +705,7 @@ iterator keys*[A, B](t: Table[A, B]): A = let L = len(t) for h in 0 .. high(t.data): - if isFilled(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): yield t.data[h].key assert(len(t) == L, "the length of the table changed while iterating over it") @@ -734,7 +726,7 @@ iterator values*[A, B](t: Table[A, B]): B = let L = len(t) for h in 0 .. high(t.data): - if isFilled(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): yield t.data[h].val assert(len(t) == L, "the length of the table changed while iterating over it") @@ -756,36 +748,53 @@ iterator mvalues*[A, B](t: var Table[A, B]): var B = let L = len(t) for h in 0 .. high(t.data): - if isFilled(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): yield t.data[h].val assert(len(t) == L, "the length of the table changed while iterating over it") +template hasKeyOrPutCache(cache, h): bool = + # using `IntSet` would be an option worth considering to avoid quadratic + # behavior in case user misuses Table with lots of duplicate keys; but it + # has overhead in the common case of small number of duplicates. + # However: when lots of duplicates are used, all operations would be slow + # anyway because the full `hash(key)` is identical for these, which makes + # `nextTry` follow the exact same path for each key, resulting in large + # collision clusters. Alternatives could involve modifying the hash/retrieval + # based on duplicate key count. + var ret = false + for hi in cache: + if hi == h: + ret = true + break + if not ret: cache.add h + ret + iterator allValues*[A, B](t: Table[A, B]; key: A): B = ## Iterates over any value in the table ``t`` that belongs to the given ``key``. ## ## Used if you have a table with duplicate keys (as a result of using ## `add proc<#add,Table[A,B],A,B>`_). ## - ## **Examples:** - ## - ## .. code-block:: - ## var a = {'a': 3, 'b': 5}.toTable - ## for i in 1..3: - ## a.add('z', 10*i) - ## echo a # {'a': 3, 'b': 5, 'z': 10, 'z': 20, 'z': 30} - ## - ## for v in a.allValues('z'): - ## echo v - ## # 10 - ## # 20 - ## # 30 - var h: Hash = genHash(key) and high(t.data) + runnableExamples: + import testutils + var a = {'a': 3, 'b': 5}.toTable + for i in 1..3: a.add('z', 10*i) + doAssert a.sortedPairs == @[('a', 3), ('b', 5), ('z', 10), ('z', 20), ('z', 30)] + doAssert sortedItems(a.allValues('z')) == @[10, 20, 30] + + let hc = genHash(key) + var h: Hash = hc and high(t.data) let L = len(t) - while isFilled(t.data[h].hcode): - if t.data[h].key == key: - yield t.data[h].val - assert(len(t) == L, "the length of the table changed while iterating over it") - h = nextTry(h, high(t.data)) + var perturb = t.getPerturb(hc) + + var num = 0 + var cache: seq[Hash] + while isFilled(t.data[h].hcode): # `isFilledAndValid` would be incorrect, see test for `allValues` + if t.data[h].hcode == hc and t.data[h].key == key: + if not hasKeyOrPutCache(cache, h): + yield t.data[h].val + assert(len(t) == L, "the length of the table changed while iterating over it") + h = nextTry(h, high(t.data), perturb) @@ -1219,6 +1228,8 @@ type ## <#initOrderedTable,int>`_. data: OrderedKeyValuePairSeq[A, B] counter, first, last: int + countDeleted: int + OrderedTableRef*[A, B] = ref OrderedTable[A, B] ## Ref version of ## `OrderedTable<#OrderedTable>`_. ## @@ -1240,7 +1251,7 @@ proc rawGet[A, B](t: OrderedTable[A, B], key: A, hc: var Hash): int = proc rawInsert[A, B](t: var OrderedTable[A, B], data: var OrderedKeyValuePairSeq[A, B], key: A, val: B, hc: Hash, h: Hash) = - rawInsertImpl() + rawInsertImpl(t) data[h].next = -1 if t.first < 0: t.first = h if t.last >= 0: data[t.last].next = h @@ -1248,18 +1259,20 @@ proc rawInsert[A, B](t: var OrderedTable[A, B], proc enlarge[A, B](t: var OrderedTable[A, B]) = var n: OrderedKeyValuePairSeq[A, B] - newSeq(n, len(t.data) * growthFactor) + newSeq(n, t.counter.rightSize) var h = t.first t.first = -1 t.last = -1 swap(t.data, n) + t.countDeleted = 0 while h >= 0: var nxt = n[h].next let eh = n[h].hcode - if isFilled(eh): + if isFilledAndValid(eh): var j: Hash = eh and maxHash(t) + var perturb = t.getPerturb(eh) while isFilled(t.data[j].hcode): - j = nextTry(j, maxHash(t)) + j = nextTry(j, maxHash(t), perturb) rawInsert(t, t.data, n[h].key, n[h].val, n[h].hcode, j) h = nxt @@ -2211,6 +2224,7 @@ type ## <#initCountTable,int>`_. data: seq[tuple[key: A, val: int]] counter: int + countDeleted: int isSorted: bool CountTableRef*[A] = ref CountTable[A] ## Ref version of ## `CountTable<#CountTable>`_. @@ -2223,8 +2237,10 @@ type proc ctRawInsert[A](t: CountTable[A], data: var seq[tuple[key: A, val: int]], key: A, val: int) = - var h: Hash = hash(key) and high(data) - while data[h].val != 0: h = nextTry(h, high(data)) + let hc = hash(key) + var perturb = t.getPerturb(hc) + var h: Hash = hc and high(data) + while data[h].val != 0: h = nextTry(h, high(data), perturb) # TODO: handle deletedMarker data[h].key = key data[h].val = val @@ -2251,10 +2267,12 @@ proc remove[A](t: var CountTable[A], key: A) = proc rawGet[A](t: CountTable[A], key: A): int = if t.data.len == 0: return -1 - var h: Hash = hash(key) and high(t.data) # start with real hash value - while t.data[h].val != 0: + let hc = hash(key) + var perturb = t.getPerturb(hc) + var h: Hash = hc and high(t.data) # start with real hash value + while t.data[h].val != 0: # TODO: may need to handle t.data[h].hcode == deletedMarker? if t.data[h].key == key: return h - h = nextTry(h, high(t.data)) + h = nextTry(h, high(t.data), perturb) result = -1 - h # < 0 => MISSING; insert idx = -1 - result template ctget(t, key, default: untyped): untyped = diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim index ac8498517..b55a7865d 100644 --- a/lib/pure/hashes.nim +++ b/lib/pure/hashes.nim @@ -112,38 +112,14 @@ proc hash*[T: proc](x: T): Hash {.inline.} = else: result = hash(pointer(x)) -const - prime = uint(11) - -proc hash*(x: int): Hash {.inline.} = +proc hash*(x: int|int64|uint|uint64|char|Ordinal): Hash {.inline.} = ## Efficient hashing of integers. - result = cast[Hash](cast[uint](x) * prime) - -proc hash*(x: int64): Hash {.inline.} = - ## Efficient hashing of `int64` integers. - result = cast[Hash](cast[uint](x) * prime) - -proc hash*(x: uint): Hash {.inline.} = - ## Efficient hashing of unsigned integers. - result = cast[Hash](x * prime) - -proc hash*(x: uint64): Hash {.inline.} = - ## Efficient hashing of `uint64` integers. - result = cast[Hash](cast[uint](x) * prime) - -proc hash*(x: char): Hash {.inline.} = - ## Efficient hashing of characters. - result = cast[Hash](cast[uint](ord(x)) * prime) - -proc hash*[T: Ordinal](x: T): Hash {.inline.} = - ## Efficient hashing of other ordinal types (e.g. enums). - result = cast[Hash](cast[uint](ord(x)) * prime) + cast[Hash](ord(x)) proc hash*(x: float): Hash {.inline.} = ## Efficient hashing of floats. - var y = x + 1.0 - result = cast[ptr Hash](addr(y))[] - + var y = x + 0.0 # for denormalization + result = hash(cast[ptr Hash](addr(y))[]) # Forward declarations before methods that hash containers. This allows # containers to contain other containers diff --git a/lib/pure/testutils.nim b/lib/pure/testutils.nim new file mode 100644 index 000000000..bdf90b616 --- /dev/null +++ b/lib/pure/testutils.nim @@ -0,0 +1,15 @@ +## Utilities to help writing tests +## +## Unstable, experimental API. + +import std/[sequtils, algorithm] + +proc sortedPairs*[T](t: T): auto = + ## helps when writing tests involving tables in a way that's robust to + ## changes in hashing functions / table implementation. + toSeq(t.pairs).sorted + +template sortedItems*(t: untyped): untyped = + ## helps when writing tests involving tables in a way that's robust to + ## changes in hashing functions / table implementation. + sorted(toSeq(t)) diff --git a/tests/arc/trepr.nim b/tests/arc/trepr.nim index 7a92112ed..391622ff6 100644 --- a/tests/arc/trepr.nim +++ b/tests/arc/trepr.nim @@ -1,7 +1,7 @@ discard """ cmd: "nim c --gc:arc $file" nimout: '''(a: true, n: doAssert) -Table[system.string, trepr.MyType](data: @[], counter: 0) +Table[system.string, trepr.MyType](data: @[], counter: 0, countDeleted: 0) nil ''' """ diff --git a/tests/collections/ttables.nim b/tests/collections/ttables.nim index 9eccf345a..7459b3f13 100644 --- a/tests/collections/ttables.nim +++ b/tests/collections/ttables.nim @@ -8,7 +8,12 @@ And we get here ''' joinable: false """ -import hashes, sequtils, tables, algorithm +import hashes, sequtils, tables, algorithm, testutils + +block tableDollar: + # other tests should use `sortedPairs` to be robust to future table/hash + # implementation changes + doAssert ${1: 'a', 2: 'b'}.toTable in ["{1: 'a', 2: 'b'}", "{2: 'b', 1: 'a'}"] # test should not be joined because it takes too long. block tableadds: @@ -188,6 +193,23 @@ block ttables2: run1() echo "2" +block allValues: + var t: Table[int, string] + var key = 0 + let n = 1000 + for i in 0..<n: t.add(i, $i) + const keys = [0, -1, 12] + for i in 0..1: + for key in keys: + t.add(key, $key & ":" & $i) + for i in 0..<n: + if i notin keys: + t.del(i) + doAssert t.sortedPairs == @[(-1, "-1:0"), (-1, "-1:1"), (0, "0"), (0, "0:0"), (0, "0:1"), (12, "12"), (12, "12:0"), (12, "12:1")] + doAssert sortedItems(t.allValues(0)) == @["0", "0:0", "0:1"] + doAssert sortedItems(t.allValues(-1)) == @["-1:0", "-1:1"] + doAssert sortedItems(t.allValues(12)) == @["12", "12:0", "12:1"] + doAssert sortedItems(t.allValues(1)) == @[] block tablesref: const @@ -232,8 +254,8 @@ block tablesref: for x in 0..1: for y in 0..1: assert t[(x,y)] == $x & $y - assert($t == - "{(x: 1, y: 1): \"11\", (x: 0, y: 0): \"00\", (x: 0, y: 1): \"01\", (x: 1, y: 0): \"10\"}") + assert t.sortedPairs == + @[((x: 0, y: 0), "00"), ((x: 0, y: 1), "01"), ((x: 1, y: 0), "10"), ((x: 1, y: 1), "11")] block tableTest2: var t = newTable[string, float]() @@ -340,7 +362,7 @@ block tablesref: block anonZipTest: let keys = @['a','b','c'] let values = @[1, 2, 3] - doAssert "{'c': 3, 'a': 1, 'b': 2}" == $ toTable zip(keys, values) + doAssert zip(keys, values).toTable.sortedPairs == @[('a', 1), ('b', 2), ('c', 3)] block clearTableTest: var t = newTable[string, float]() @@ -369,3 +391,4 @@ block tablesref: orderedTableSortTest() echo "3" + diff --git a/tests/collections/ttablesthreads.nim b/tests/collections/ttablesthreads.nim index 5553b31ef..757a8cebe 100644 --- a/tests/collections/ttablesthreads.nim +++ b/tests/collections/ttablesthreads.nim @@ -3,7 +3,7 @@ discard """ output: '''true''' """ -import hashes, tables, sharedtables +import hashes, tables, sharedtables, testutils const data = { @@ -47,8 +47,7 @@ block tableTest1: for x in 0..1: for y in 0..1: assert t[(x,y)] == $x & $y - assert($t == - "{(x: 1, y: 1): \"11\", (x: 0, y: 0): \"00\", (x: 0, y: 1): \"01\", (x: 1, y: 0): \"10\"}") + assert t.sortedPairs == @[((x: 0, y: 0), "00"), ((x: 0, y: 1), "01"), ((x: 1, y: 0), "10"), ((x: 1, y: 1), "11")] block tableTest2: var t = initTable[string, float]() |