diff options
author | Miran <narimiran@disroot.org> | 2019-09-01 00:04:10 +0200 |
---|---|---|
committer | Andreas Rumpf <rumpf_a@web.de> | 2019-09-01 00:04:10 +0200 |
commit | ab48d7901e4626120ff430e597cbb32d007e9e0b (patch) | |
tree | 7503ba743de13726c93f793e80c1cfa01c37a21d | |
parent | 35268c500f2143e757a71846b246a1ecb31c72f8 (diff) | |
download | Nim-ab48d7901e4626120ff430e597cbb32d007e9e0b.tar.gz |
hashes: implement murmur3 (#12022)
* hashes: implement murmur3 * refactoring; there is only one murmurHash and it works at compile-time via VM hooks * fixes JS tests * makes toOpenArrayByte work with C++ * make it bootstrap in C++ mode for 0.20
-rw-r--r-- | changelog.md | 2 | ||||
-rw-r--r-- | compiler/ccgcalls.nim | 14 | ||||
-rw-r--r-- | compiler/condsyms.nim | 1 | ||||
-rw-r--r-- | compiler/vmops.nim | 29 | ||||
-rw-r--r-- | lib/pure/hashes.nim | 196 | ||||
-rw-r--r-- | lib/system.nim | 10 | ||||
-rw-r--r-- | tests/parallel/tsendtwice.nim | 7 |
7 files changed, 202 insertions, 57 deletions
diff --git a/changelog.md b/changelog.md index 35395afd3..57820c81f 100644 --- a/changelog.md +++ b/changelog.md @@ -50,7 +50,7 @@ type - Added `os.delEnv` and `nimscript.delEnv`. (#11466) -- Enable Oid usage in hashtables. (#11472) +- Enabled Oid usage in hashtables. (#11472) - Added `unsafeColumnAt` procs, that return unsafe cstring from InstantRow. (#11647) diff --git a/compiler/ccgcalls.nim b/compiler/ccgcalls.nim index 4efd33d1e..53ebc2806 100644 --- a/compiler/ccgcalls.nim +++ b/compiler/ccgcalls.nim @@ -100,21 +100,23 @@ proc openArrayLoc(p: BProc, n: PNode): Rope = if optBoundsCheck in p.options: genBoundsCheck(p, a, b, c) let ty = skipTypes(a.t, abstractVar+{tyPtr}) + let dest = getTypeDesc(p.module, n.typ.sons[0]) case ty.kind of tyArray: let first = toInt64(firstOrd(p.config, ty)) if first == 0: - result = "($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c)] + result = "($4*)(($1)+($2)), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dest] else: - result = "($1)+(($2)-($4)), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), intLiteral(first)] - of tyOpenArray, tyVarargs, tyUncheckedArray: - result = "($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c)] + result = "($5*)($1)+(($2)-($4)), ($3)-($2)+1" % + [rdLoc(a), rdLoc(b), rdLoc(c), intLiteral(first), dest] + of tyOpenArray, tyVarargs, tyUncheckedArray, tyCString: + result = "($4*)($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dest] of tyString, tySequence: if skipTypes(n.typ, abstractInst).kind == tyVar and not compileToCpp(p.module): - result = "(*$1)$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p)] + result = "($5*)(*$1)$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p), dest] else: - result = "$1$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p)] + result = "($5*)$1$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p), dest] else: internalError(p.config, "openArrayLoc: " & typeToString(a.t)) else: diff --git a/compiler/condsyms.nim b/compiler/condsyms.nim index 15a625472..00ce352f2 100644 --- a/compiler/condsyms.nim +++ b/compiler/condsyms.nim @@ -97,4 +97,5 @@ proc initDefines*(symbols: StringTableRef) = defineSymbol("nimFixedOwned") defineSymbol("nimHasStyleChecks") + defineSymbol("nimToOpenArrayCString") defineSymbol("nimHasUsed") diff --git a/compiler/vmops.nim b/compiler/vmops.nim index 792feb0be..b0325b5b0 100644 --- a/compiler/vmops.nim +++ b/compiler/vmops.nim @@ -17,6 +17,8 @@ from os import getEnv, existsEnv, dirExists, fileExists, putEnv, walkDir, getApp from md5 import getMD5 from sighashes import symBodyDigest +from hashes import hash + template mathop(op) {.dirty.} = registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`) @@ -157,3 +159,30 @@ proc registerAdditionalOps*(c: PCtx) = stackTrace(c, PStackFrame(prc: c.prc.sym, comesFrom: 0, next: nil), c.exceptionInstr, "isExported() requires a symbol. '" & $n & "' is of kind '" & $n.kind & "'", n.info) setResult(a, sfExported in n.sym.flags) + + proc hashVmImpl(a: VmArgs) = + var res = hashes.hash(a.getString(0), a.getInt(1).int, a.getInt(2).int) + if c.config.cmd == cmdCompileToJS: + # emulate JS's terrible integers: + res = cast[int32](res) + setResult(a, res) + + registerCallback c, "stdlib.hashes.hashVmImpl", hashVmImpl + + proc hashVmImplByte(a: VmArgs) = + # nkBracket[...] + let sPos = a.getInt(1).int + let ePos = a.getInt(2).int + let arr = a.getNode(0) + var bytes = newSeq[byte](arr.len) + for i in 0 ..< arr.len: + bytes[i] = byte(arr[i].intVal and 0xff) + + var res = hashes.hash(bytes, sPos, ePos) + if c.config.cmd == cmdCompileToJS: + # emulate JS's terrible integers: + res = cast[int32](res) + setResult(a, res) + + registerCallback c, "stdlib.hashes.hashVmImplByte", hashVmImplByte + registerCallback c, "stdlib.hashes.hashVmImplChar", hashVmImplByte diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim index 6af515609..c1338376e 100644 --- a/lib/pure/hashes.nim +++ b/lib/pure/hashes.nim @@ -49,9 +49,6 @@ type ## always have a size of a power of two and can use the ``and`` ## operator instead of ``mod`` for truncation of the hash value. -const - IntSize = sizeof(int) - proc `!&`*(h: Hash, val: int): Hash {.inline.} = ## Mixes a hash value `h` with `val` to produce a new hash value. ## @@ -108,13 +105,12 @@ proc hash*(x: pointer): Hash {.inline.} = else: result = cast[Hash](cast[uint](x) shr 3) # skip the alignment -when not defined(booting): - proc hash*[T: proc](x: T): Hash {.inline.} = - ## Efficient hashing of proc vars. Closures are supported too. - when T is "closure": - result = hash(rawProc(x)) !& hash(rawEnv(x)) - else: - result = hash(pointer(x)) +proc hash*[T: proc](x: T): Hash {.inline.} = + ## Efficient hashing of proc vars. Closures are supported too. + when T is "closure": + result = hash(rawProc(x)) !& hash(rawEnv(x)) + else: + result = hash(pointer(x)) proc hash*(x: int): Hash {.inline.} = ## Efficient hashing of integers. @@ -151,27 +147,87 @@ proc hash*(x: float): Hash {.inline.} = proc hash*[A](x: openArray[A]): Hash proc hash*[A](x: set[A]): Hash -template bytewiseHashing(result: Hash, x: typed, start, stop: int) = - for i in start .. stop: - result = result !& hash(x[i]) - result = !$result -template hashImpl(result: Hash, x: typed, start, stop: int) = +when defined(JS): + proc imul(a, b: uint32): uint32 = + # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Math/imul + let mask = 0xffff'u32 + var + aHi = (a shr 16) and mask + aLo = a and mask + bHi = (b shr 16) and mask + bLo = b and mask + result = (aLo * bLo) + (aHi * bLo + aLo * bHi) shl 16 +else: + template imul(a, b: uint32): untyped = a * b + +proc rotl32(x: uint32, r: int): uint32 {.inline.} = + (x shl r) or (x shr (32 - r)) + +proc murmurHash(x: openArray[byte]): Hash = + # https://github.com/PeterScott/murmur3/blob/master/murmur3.c + const + c1 = 0xcc9e2d51'u32 + c2 = 0x1b873593'u32 + n1 = 0xe6546b64'u32 + m1 = 0x85ebca6b'u32 + m2 = 0xc2b2ae35'u32 let - elementSize = sizeof(x[start]) - stepSize = IntSize div elementSize - var i = start - while i <= stop+1 - stepSize: - var n = 0 - when nimvm: - # we cannot cast in VM, so we do it manually - for j in countdown(stepSize-1, 0): - n = (n shl (8*elementSize)) or ord(x[i+j]) + size = len(x) + stepSize = 4 # 32-bit + n = size div stepSize + var + h1: uint32 + i = 0 + + # body + while i < n * stepSize: + var k1: uint32 + when defined(js): + var j = stepSize + while j > 0: + dec j + k1 = (k1 shl 8) or (ord(x[i+j])).uint32 else: - n = cast[ptr Hash](unsafeAddr x[i])[] - result = result !& n - i += stepSize - bytewiseHashing(result, x, i, stop) # hash the remaining elements and finish + k1 = cast[ptr uint32](unsafeAddr x[i])[] + inc i, stepSize + + k1 = imul(k1, c1) + k1 = rotl32(k1, 15) + k1 = imul(k1, c2) + + h1 = h1 xor k1 + h1 = rotl32(h1, 13) + h1 = h1*5 + n1 + + # tail + var k1: uint32 + var rem = size mod stepSize + while rem > 0: + dec rem + k1 = (k1 shl 8) or (ord(x[i+rem])).uint32 + k1 = imul(k1, c1) + k1 = rotl32(k1, 15) + k1 = imul(k1, c2) + h1 = h1 xor k1 + + # finalization + h1 = h1 xor size.uint32 + h1 = h1 xor (h1 shr 16) + h1 = imul(h1, m1) + h1 = h1 xor (h1 shr 13) + h1 = imul(h1, m2) + h1 = h1 xor (h1 shr 16) + return cast[Hash](h1) + +proc hashVmImpl(x: string, sPos, ePos: int): Hash = + doAssert false, "implementation override in compiler/vmops.nim" + +proc hashVmImplChar(x: openArray[char], sPos, ePos: int): Hash = + doAssert false, "implementation override in compiler/vmops.nim" + +proc hashVmImplByte(x: openArray[byte], sPos, ePos: int): Hash = + doAssert false, "implementation override in compiler/vmops.nim" proc hash*(x: string): Hash = ## Efficient hashing of strings. @@ -182,7 +238,16 @@ proc hash*(x: string): Hash = runnableExamples: doAssert hash("abracadabra") != hash("AbracadabrA") - hashImpl(result, x, 0, high(x)) + when not defined(nimToOpenArrayCString): + result = 0 + for c in x: + result = result !& ord(c) + result = !$result + else: + when nimvm: + result = hashVmImpl(x, 0, high(x)) + else: + result = murmurHash(toOpenArrayByte(x, 0, high(x))) proc hash*(x: cstring): Hash = ## Efficient hashing of null-terminated strings. @@ -191,7 +256,19 @@ proc hash*(x: cstring): Hash = doAssert hash(cstring"AbracadabrA") == hash("AbracadabrA") doAssert hash(cstring"abracadabra") != hash(cstring"AbracadabrA") - hashImpl(result, x, 0, high(x)) + when not defined(nimToOpenArrayCString): + result = 0 + var i = 0 + while x[i] != '\0': + result = result !& ord(x[i]) + inc i + result = !$result + else: + when not defined(JS) and defined(nimToOpenArrayCString): + murmurHash(toOpenArrayByte(x, 0, x.high)) + else: + let xx = $x + murmurHash(toOpenArrayByte(xx, 0, high(xx))) proc hash*(sBuf: string, sPos, ePos: int): Hash = ## Efficient hashing of a string buffer, from starting @@ -202,7 +279,13 @@ proc hash*(sBuf: string, sPos, ePos: int): Hash = var a = "abracadabra" doAssert hash(a, 0, 3) == hash(a, 7, 10) - hashImpl(result, sBuf, sPos, ePos) + when not defined(nimToOpenArrayCString): + result = 0 + for i in sPos..ePos: + result = result !& ord(sBuf[i]) + result = !$result + else: + murmurHash(toOpenArrayByte(sBuf, sPos, ePos)) proc hashIgnoreStyle*(x: string): Hash = ## Efficient hashing of strings; style is ignored. @@ -300,12 +383,20 @@ proc hash*[T: tuple](x: T): Hash = result = result !& hash(f) result = !$result + proc hash*[A](x: openArray[A]): Hash = ## Efficient hashing of arrays and sequences. - when A is char|SomeInteger: - hashImpl(result, x, 0, x.high) + when A is byte: + result = murmurHash(x) + elif A is char: + when nimvm: + result = hashVmImplChar(x, 0, x.high) + else: + result = murmurHash(toOpenArrayByte(x, 0, x.high)) else: - bytewiseHashing(result, x, 0, x.high) + for a in x: + result = result !& hash(a) + result = !$result proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash = ## Efficient hashing of portions of arrays and sequences, from starting @@ -316,10 +407,20 @@ proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash = let a = [1, 2, 5, 1, 2, 6] doAssert hash(a, 0, 1) == hash(a, 3, 4) - when A is char|SomeInteger: - hashImpl(result, aBuf, sPos, ePos) + when A is byte: + when nimvm: + result = hashVmImplByte(aBuf, sPos, ePos) + else: + result = murmurHash(toOpenArray(aBuf, sPos, ePos)) + elif A is char: + when nimvm: + result = hashVmImplChar(aBuf, sPos, ePos) + else: + result = murmurHash(toOpenArrayByte(aBuf, sPos, ePos)) else: - bytewiseHashing(result, aBuf, sPos, ePos) + for i in sPos .. ePos: + result = result !& hash(aBuf[i]) + result = !$result proc hash*[A](x: set[A]): Hash = ## Efficient hashing of sets. @@ -334,11 +435,15 @@ when isMainModule: a = "" b = newSeq[char]() c = newSeq[int]() + d = cstring"" + e = "abcd" doAssert hash(a) == 0 doAssert hash(b) == 0 doAssert hash(c) == 0 + doAssert hash(d) == 0 doAssert hashIgnoreCase(a) == 0 doAssert hashIgnoreStyle(a) == 0 + doAssert hash(e, 3, 2) == 0 block sameButDifferent: doAssert hash("aa bb aaaa1234") == hash("aa bb aaaa1234", 0, 13) doAssert hash("aa bb aaaa1234") == hash(cstring"aa bb aaaa1234") @@ -346,14 +451,14 @@ when isMainModule: doAssert hashIgnoreStyle("aa_bb_AAaa1234") == hashIgnoreCase("aaBBAAAa1234") block smallSize: # no multibyte hashing let - xx = @['H','e','l','l','o'] - ii = @[72'i8, 101, 108, 108, 111] - ss = "Hello" + xx = @['H','i'] + ii = @[72'u8, 105] + ss = "Hi" doAssert hash(xx) == hash(ii) doAssert hash(xx) == hash(ss) doAssert hash(xx) == hash(xx, 0, xx.high) doAssert hash(ss) == hash(ss, 0, ss.high) - block largeSize: # longer than 8 characters, should trigger multibyte hashing + block largeSize: # longer than 4 characters let xx = @['H','e','l','l','o'] xxl = @['H','e','l','l','o','w','e','e','n','s'] @@ -362,9 +467,6 @@ when isMainModule: doAssert hash(xxl) == hash(xxl, 0, xxl.high) doAssert hash(ssl) == hash(ssl, 0, ssl.high) doAssert hash(xx) == hash(xxl, 0, 4) - block misc: - let - a = [1'u8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4] - b = [1'i8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4] - doAssert hash(a) == hash(b) - doAssert hash(a, 2, 5) == hash(b, 2, 5) + doAssert hash(xx) == hash(ssl, 0, 4) + doAssert hash(xx, 0, 3) == hash(xxl, 0, 3) + doAssert hash(xx, 0, 3) == hash(ssl, 0, 3) diff --git a/lib/system.nim b/lib/system.nim index 16a5e03fe..edab33412 100644 --- a/lib/system.nim +++ b/lib/system.nim @@ -4501,6 +4501,11 @@ when defined(nimconfig): when not defined(js): proc toOpenArray*[T](x: ptr UncheckedArray[T]; first, last: int): openArray[T] {. magic: "Slice".} + when defined(nimToOpenArrayCString): + proc toOpenArray*(x: cstring; first, last: int): openArray[char] {. + magic: "Slice".} + proc toOpenArrayByte*(x: cstring; first, last: int): openArray[byte] {. + magic: "Slice".} proc toOpenArray*[T](x: seq[T]; first, last: int): openArray[T] {. magic: "Slice".} @@ -4510,8 +4515,13 @@ proc toOpenArray*[I, T](x: array[I, T]; first, last: I): openArray[T] {. magic: "Slice".} proc toOpenArray*(x: string; first, last: int): openArray[char] {. magic: "Slice".} + proc toOpenArrayByte*(x: string; first, last: int): openArray[byte] {. magic: "Slice".} +proc toOpenArrayByte*(x: openArray[char]; first, last: int): openArray[byte] {. + magic: "Slice".} +proc toOpenArrayByte*(x: seq[char]; first, last: int): openArray[byte] {. + magic: "Slice".} type ForLoopStmt* {.compilerproc.} = object ## \ diff --git a/tests/parallel/tsendtwice.nim b/tests/parallel/tsendtwice.nim index 2f60b904f..0b3ce15a5 100644 --- a/tests/parallel/tsendtwice.nim +++ b/tests/parallel/tsendtwice.nim @@ -1,11 +1,12 @@ discard """ - output: '''ob @[] + output: '''ob2 @[] +ob @[] ob3 @[] -ob2 @[] 3 +ob2 @[] ob @[] ob3 @[] -ob2 @[]''' +''' cmd: "nim c -r --threads:on $file" """ |