summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorMiran <narimiran@disroot.org>2019-09-01 00:04:10 +0200
committerAndreas Rumpf <rumpf_a@web.de>2019-09-01 00:04:10 +0200
commitab48d7901e4626120ff430e597cbb32d007e9e0b (patch)
tree7503ba743de13726c93f793e80c1cfa01c37a21d
parent35268c500f2143e757a71846b246a1ecb31c72f8 (diff)
downloadNim-ab48d7901e4626120ff430e597cbb32d007e9e0b.tar.gz
hashes: implement murmur3 (#12022)
* hashes: implement murmur3
* refactoring; there is only one murmurHash and it works at compile-time via VM hooks
* fixes JS tests
* makes toOpenArrayByte work with C++
* make it bootstrap in C++ mode for 0.20
-rw-r--r--changelog.md2
-rw-r--r--compiler/ccgcalls.nim14
-rw-r--r--compiler/condsyms.nim1
-rw-r--r--compiler/vmops.nim29
-rw-r--r--lib/pure/hashes.nim196
-rw-r--r--lib/system.nim10
-rw-r--r--tests/parallel/tsendtwice.nim7
7 files changed, 202 insertions, 57 deletions
diff --git a/changelog.md b/changelog.md
index 35395afd3..57820c81f 100644
--- a/changelog.md
+++ b/changelog.md
@@ -50,7 +50,7 @@ type
 
 - Added `os.delEnv` and `nimscript.delEnv`. (#11466)
 
-- Enable Oid usage in hashtables. (#11472)
+- Enabled Oid usage in hashtables. (#11472)
 
 - Added `unsafeColumnAt` procs, that return unsafe cstring from InstantRow. (#11647)
 
diff --git a/compiler/ccgcalls.nim b/compiler/ccgcalls.nim
index 4efd33d1e..53ebc2806 100644
--- a/compiler/ccgcalls.nim
+++ b/compiler/ccgcalls.nim
@@ -100,21 +100,23 @@ proc openArrayLoc(p: BProc, n: PNode): Rope =
     if optBoundsCheck in p.options:
       genBoundsCheck(p, a, b, c)
     let ty = skipTypes(a.t, abstractVar+{tyPtr})
+    let dest = getTypeDesc(p.module, n.typ.sons[0])
     case ty.kind
     of tyArray:
       let first = toInt64(firstOrd(p.config, ty))
       if first == 0:
-        result = "($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c)]
+        result = "($4*)(($1)+($2)), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dest]
       else:
-        result = "($1)+(($2)-($4)), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), intLiteral(first)]
-    of tyOpenArray, tyVarargs, tyUncheckedArray:
-      result = "($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c)]
+        result = "($5*)($1)+(($2)-($4)), ($3)-($2)+1" %
+          [rdLoc(a), rdLoc(b), rdLoc(c), intLiteral(first), dest]
+    of tyOpenArray, tyVarargs, tyUncheckedArray, tyCString:
+      result = "($4*)($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dest]
     of tyString, tySequence:
       if skipTypes(n.typ, abstractInst).kind == tyVar and
           not compileToCpp(p.module):
-        result = "(*$1)$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p)]
+        result = "($5*)(*$1)$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p), dest]
       else:
-        result = "$1$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p)]
+        result = "($5*)$1$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p), dest]
     else:
       internalError(p.config, "openArrayLoc: " & typeToString(a.t))
   else:
diff --git a/compiler/condsyms.nim b/compiler/condsyms.nim
index 15a625472..00ce352f2 100644
--- a/compiler/condsyms.nim
+++ b/compiler/condsyms.nim
@@ -97,4 +97,5 @@ proc initDefines*(symbols: StringTableRef) =
 
   defineSymbol("nimFixedOwned")
   defineSymbol("nimHasStyleChecks")
+  defineSymbol("nimToOpenArrayCString")
   defineSymbol("nimHasUsed")
diff --git a/compiler/vmops.nim b/compiler/vmops.nim
index 792feb0be..b0325b5b0 100644
--- a/compiler/vmops.nim
+++ b/compiler/vmops.nim
@@ -17,6 +17,8 @@ from os import getEnv, existsEnv, dirExists, fileExists, putEnv, walkDir, getApp
 from md5 import getMD5
 from sighashes import symBodyDigest
 
+from hashes import hash
+
 template mathop(op) {.dirty.} =
   registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)
 
@@ -157,3 +159,30 @@ proc registerAdditionalOps*(c: PCtx) =
       stackTrace(c, PStackFrame(prc: c.prc.sym, comesFrom: 0, next: nil), c.exceptionInstr,
                   "isExported() requires a symbol. '" & $n & "' is of kind '" & $n.kind & "'", n.info)
     setResult(a, sfExported in n.sym.flags)
+
+  proc hashVmImpl(a: VmArgs) =
+    var res = hashes.hash(a.getString(0), a.getInt(1).int, a.getInt(2).int)
+    if c.config.cmd == cmdCompileToJS:
+      # emulate JS's terrible integers:
+      res = cast[int32](res)
+    setResult(a, res)
+
+  registerCallback c, "stdlib.hashes.hashVmImpl", hashVmImpl
+
+  proc hashVmImplByte(a: VmArgs) =
+    # nkBracket[...]
+    let sPos = a.getInt(1).int
+    let ePos = a.getInt(2).int
+    let arr = a.getNode(0)
+    var bytes = newSeq[byte](arr.len)
+    for i in 0 ..< arr.len:
+      bytes[i] = byte(arr[i].intVal and 0xff)
+
+    var res = hashes.hash(bytes, sPos, ePos)
+    if c.config.cmd == cmdCompileToJS:
+      # emulate JS's terrible integers:
+      res = cast[int32](res)
+    setResult(a, res)
+
+  registerCallback c, "stdlib.hashes.hashVmImplByte", hashVmImplByte
+  registerCallback c, "stdlib.hashes.hashVmImplChar", hashVmImplByte
diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim
index 6af515609..c1338376e 100644
--- a/lib/pure/hashes.nim
+++ b/lib/pure/hashes.nim
@@ -49,9 +49,6 @@ type
                ## always have a size of a power of two and can use the ``and``
                ## operator instead of ``mod`` for truncation of the hash value.
 
-const
-  IntSize = sizeof(int)
-
 proc `!&`*(h: Hash, val: int): Hash {.inline.} =
   ## Mixes a hash value `h` with `val` to produce a new hash value.
   ##
@@ -108,13 +105,12 @@ proc hash*(x: pointer): Hash {.inline.} =
   else:
     result = cast[Hash](cast[uint](x) shr 3) # skip the alignment
 
-when not defined(booting):
-  proc hash*[T: proc](x: T): Hash {.inline.} =
-    ## Efficient hashing of proc vars. Closures are supported too.
-    when T is "closure":
-      result = hash(rawProc(x)) !& hash(rawEnv(x))
-    else:
-      result = hash(pointer(x))
+proc hash*[T: proc](x: T): Hash {.inline.} =
+  ## Efficient hashing of proc vars. Closures are supported too.
+  when T is "closure":
+    result = hash(rawProc(x)) !& hash(rawEnv(x))
+  else:
+    result = hash(pointer(x))
 
 proc hash*(x: int): Hash {.inline.} =
   ## Efficient hashing of integers.
@@ -151,27 +147,87 @@ proc hash*(x: float): Hash {.inline.} =
 proc hash*[A](x: openArray[A]): Hash
 proc hash*[A](x: set[A]): Hash
 
-template bytewiseHashing(result: Hash, x: typed, start, stop: int) =
-  for i in start .. stop:
-    result = result !& hash(x[i])
-  result = !$result
 
-template hashImpl(result: Hash, x: typed, start, stop: int) =
+when defined(JS):
+  proc imul(a, b: uint32): uint32 =
+    # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Math/imul
+    let mask = 0xffff'u32
+    var
+      aHi = (a shr 16) and mask
+      aLo = a and mask
+      bHi = (b shr 16) and mask
+      bLo = b and mask
+    result = (aLo * bLo) + (aHi * bLo + aLo * bHi) shl 16
+else:
+  template imul(a, b: uint32): untyped = a * b
+
+proc rotl32(x: uint32, r: int): uint32 {.inline.} =
+  (x shl r) or (x shr (32 - r))
+
+proc murmurHash(x: openArray[byte]): Hash =
+  # https://github.com/PeterScott/murmur3/blob/master/murmur3.c
+  const
+    c1 = 0xcc9e2d51'u32
+    c2 = 0x1b873593'u32
+    n1 = 0xe6546b64'u32
+    m1 = 0x85ebca6b'u32
+    m2 = 0xc2b2ae35'u32
   let
-    elementSize = sizeof(x[start])
-    stepSize = IntSize div elementSize
-  var i = start
-  while i <= stop+1 - stepSize:
-    var n = 0
-    when nimvm:
-      # we cannot cast in VM, so we do it manually
-      for j in countdown(stepSize-1, 0):
-        n = (n shl (8*elementSize)) or ord(x[i+j])
+    size = len(x)
+    stepSize = 4 # 32-bit
+    n = size div stepSize
+  var
+    h1: uint32
+    i = 0
+
+  # body
+  while i < n * stepSize:
+    var k1: uint32
+    when defined(js):
+      var j = stepSize
+      while j > 0:
+        dec j
+        k1 = (k1 shl 8) or (ord(x[i+j])).uint32
     else:
-      n = cast[ptr Hash](unsafeAddr x[i])[]
-    result = result !& n
-    i += stepSize
-  bytewiseHashing(result, x, i, stop) # hash the remaining elements and finish
+      k1 = cast[ptr uint32](unsafeAddr x[i])[]
+    inc i, stepSize
+
+    k1 = imul(k1, c1)
+    k1 = rotl32(k1, 15)
+    k1 = imul(k1, c2)
+
+    h1 = h1 xor k1
+    h1 = rotl32(h1, 13)
+    h1 = h1*5 + n1
+
+  # tail
+  var k1: uint32
+  var rem = size mod stepSize
+  while rem > 0:
+    dec rem
+    k1 = (k1 shl 8) or (ord(x[i+rem])).uint32
+  k1 = imul(k1, c1)
+  k1 = rotl32(k1, 15)
+  k1 = imul(k1, c2)
+  h1 = h1 xor k1
+
+  # finalization
+  h1 = h1 xor size.uint32
+  h1 = h1 xor (h1 shr 16)
+  h1 = imul(h1, m1)
+  h1 = h1 xor (h1 shr 13)
+  h1 = imul(h1, m2)
+  h1 = h1 xor (h1 shr 16)
+  return cast[Hash](h1)
+
+proc hashVmImpl(x: string, sPos, ePos: int): Hash =
+  doAssert false, "implementation override in compiler/vmops.nim"
+
+proc hashVmImplChar(x: openArray[char], sPos, ePos: int): Hash =
+  doAssert false, "implementation override in compiler/vmops.nim"
+
+proc hashVmImplByte(x: openArray[byte], sPos, ePos: int): Hash =
+  doAssert false, "implementation override in compiler/vmops.nim"
 
 proc hash*(x: string): Hash =
   ## Efficient hashing of strings.
@@ -182,7 +238,16 @@ proc hash*(x: string): Hash =
   runnableExamples:
     doAssert hash("abracadabra") != hash("AbracadabrA")
 
-  hashImpl(result, x, 0, high(x))
+  when not defined(nimToOpenArrayCString):
+    result = 0
+    for c in x:
+      result = result !& ord(c)
+    result = !$result
+  else:
+    when nimvm:
+      result = hashVmImpl(x, 0, high(x))
+    else:
+      result = murmurHash(toOpenArrayByte(x, 0, high(x)))
 
 proc hash*(x: cstring): Hash =
   ## Efficient hashing of null-terminated strings.
@@ -191,7 +256,19 @@ proc hash*(x: cstring): Hash =
     doAssert hash(cstring"AbracadabrA") == hash("AbracadabrA")
     doAssert hash(cstring"abracadabra") != hash(cstring"AbracadabrA")
 
-  hashImpl(result, x, 0, high(x))
+  when not defined(nimToOpenArrayCString):
+    result = 0
+    var i = 0
+    while x[i] != '\0':
+      result = result !& ord(x[i])
+      inc i
+    result = !$result
+  else:
+    when not defined(JS) and defined(nimToOpenArrayCString):
+      murmurHash(toOpenArrayByte(x, 0, x.high))
+    else:
+      let xx = $x
+      murmurHash(toOpenArrayByte(xx, 0, high(xx)))
 
 proc hash*(sBuf: string, sPos, ePos: int): Hash =
   ## Efficient hashing of a string buffer, from starting
@@ -202,7 +279,13 @@ proc hash*(sBuf: string, sPos, ePos: int): Hash =
     var a = "abracadabra"
     doAssert hash(a, 0, 3) == hash(a, 7, 10)
 
-  hashImpl(result, sBuf, sPos, ePos)
+  when not defined(nimToOpenArrayCString):
+    result = 0
+    for i in sPos..ePos:
+      result = result !& ord(sBuf[i])
+    result = !$result
+  else:
+    murmurHash(toOpenArrayByte(sBuf, sPos, ePos))
 
 proc hashIgnoreStyle*(x: string): Hash =
   ## Efficient hashing of strings; style is ignored.
@@ -300,12 +383,20 @@ proc hash*[T: tuple](x: T): Hash =
     result = result !& hash(f)
   result = !$result
 
+
 proc hash*[A](x: openArray[A]): Hash =
   ## Efficient hashing of arrays and sequences.
-  when A is char|SomeInteger:
-    hashImpl(result, x, 0, x.high)
+  when A is byte:
+    result = murmurHash(x)
+  elif A is char:
+    when nimvm:
+      result = hashVmImplChar(x, 0, x.high)
+    else:
+      result = murmurHash(toOpenArrayByte(x, 0, x.high))
   else:
-    bytewiseHashing(result, x, 0, x.high)
+    for a in x:
+      result = result !& hash(a)
+    result = !$result
 
 proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =
   ## Efficient hashing of portions of arrays and sequences, from starting
@@ -316,10 +407,20 @@ proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =
     let a = [1, 2, 5, 1, 2, 6]
     doAssert hash(a, 0, 1) == hash(a, 3, 4)
 
-  when A is char|SomeInteger:
-    hashImpl(result, aBuf, sPos, ePos)
+  when A is byte:
+    when nimvm:
+      result = hashVmImplByte(aBuf, sPos, ePos)
+    else:
+      result = murmurHash(toOpenArray(aBuf, sPos, ePos))
+  elif A is char:
+    when nimvm:
+      result = hashVmImplChar(aBuf, sPos, ePos)
+    else:
+      result = murmurHash(toOpenArrayByte(aBuf, sPos, ePos))
   else:
-    bytewiseHashing(result, aBuf, sPos, ePos)
+    for i in sPos .. ePos:
+      result = result !& hash(aBuf[i])
+    result = !$result
 
 proc hash*[A](x: set[A]): Hash =
   ## Efficient hashing of sets.
@@ -334,11 +435,15 @@ when isMainModule:
       a = ""
       b = newSeq[char]()
       c = newSeq[int]()
+      d = cstring""
+      e = "abcd"
     doAssert hash(a) == 0
     doAssert hash(b) == 0
     doAssert hash(c) == 0
+    doAssert hash(d) == 0
     doAssert hashIgnoreCase(a) == 0
     doAssert hashIgnoreStyle(a) == 0
+    doAssert hash(e, 3, 2) == 0
   block sameButDifferent:
     doAssert hash("aa bb aaaa1234") == hash("aa bb aaaa1234", 0, 13)
     doAssert hash("aa bb aaaa1234") == hash(cstring"aa bb aaaa1234")
@@ -346,14 +451,14 @@ when isMainModule:
     doAssert hashIgnoreStyle("aa_bb_AAaa1234") == hashIgnoreCase("aaBBAAAa1234")
   block smallSize: # no multibyte hashing
     let
-      xx = @['H','e','l','l','o']
-      ii = @[72'i8, 101, 108, 108, 111]
-      ss = "Hello"
+      xx = @['H','i']
+      ii = @[72'u8, 105]
+      ss = "Hi"
     doAssert hash(xx) == hash(ii)
     doAssert hash(xx) == hash(ss)
     doAssert hash(xx) == hash(xx, 0, xx.high)
     doAssert hash(ss) == hash(ss, 0, ss.high)
-  block largeSize: # longer than 8 characters, should trigger multibyte hashing
+  block largeSize: # longer than 4 characters
     let
       xx = @['H','e','l','l','o']
       xxl = @['H','e','l','l','o','w','e','e','n','s']
@@ -362,9 +467,6 @@ when isMainModule:
     doAssert hash(xxl) == hash(xxl, 0, xxl.high)
     doAssert hash(ssl) == hash(ssl, 0, ssl.high)
     doAssert hash(xx) == hash(xxl, 0, 4)
-  block misc:
-    let
-      a = [1'u8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4]
-      b = [1'i8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4]
-    doAssert hash(a) == hash(b)
-    doAssert hash(a, 2, 5) == hash(b, 2, 5)
+    doAssert hash(xx) == hash(ssl, 0, 4)
+    doAssert hash(xx, 0, 3) == hash(xxl, 0, 3)
+    doAssert hash(xx, 0, 3) == hash(ssl, 0, 3)
diff --git a/lib/system.nim b/lib/system.nim
index 16a5e03fe..edab33412 100644
--- a/lib/system.nim
+++ b/lib/system.nim
@@ -4501,6 +4501,11 @@ when defined(nimconfig):
 when not defined(js):
   proc toOpenArray*[T](x: ptr UncheckedArray[T]; first, last: int): openArray[T] {.
     magic: "Slice".}
+  when defined(nimToOpenArrayCString):
+    proc toOpenArray*(x: cstring; first, last: int): openArray[char] {.
+      magic: "Slice".}
+    proc toOpenArrayByte*(x: cstring; first, last: int): openArray[byte] {.
+      magic: "Slice".}
 
 proc toOpenArray*[T](x: seq[T]; first, last: int): openArray[T] {.
   magic: "Slice".}
@@ -4510,8 +4515,13 @@ proc toOpenArray*[I, T](x: array[I, T]; first, last: I): openArray[T] {.
   magic: "Slice".}
 proc toOpenArray*(x: string; first, last: int): openArray[char] {.
   magic: "Slice".}
+
 proc toOpenArrayByte*(x: string; first, last: int): openArray[byte] {.
   magic: "Slice".}
+proc toOpenArrayByte*(x: openArray[char]; first, last: int): openArray[byte] {.
+  magic: "Slice".}
+proc toOpenArrayByte*(x: seq[char]; first, last: int): openArray[byte] {.
+  magic: "Slice".}
 
 type
   ForLoopStmt* {.compilerproc.} = object ## \
diff --git a/tests/parallel/tsendtwice.nim b/tests/parallel/tsendtwice.nim
index 2f60b904f..0b3ce15a5 100644
--- a/tests/parallel/tsendtwice.nim
+++ b/tests/parallel/tsendtwice.nim
@@ -1,11 +1,12 @@
 discard """
-  output: '''ob @[]
+  output: '''ob2 @[]
+ob @[]
 ob3 @[]
-ob2 @[]
 3
+ob2 @[]
 ob @[]
 ob3 @[]
-ob2 @[]'''
+'''
   cmd: "nim c -r --threads:on $file"
 """