Better doc search (#8260)

* Modified the doc generation to produce a custom data attribute to allow for better search functionality * Implemented fuzzy matching for the Nim Doc search instead of the simple regex match. * Fix to the WordBoundry state transition from code review with @Varriount. Also removed silly testing template that is no longer used. * Update fuzzysearch.nim * Update fuzzysearch.nim * Update fuzzysearch.nim * Update dochack.nim * Update dochack.nim
author: Ray Imber <rayimber@gmail.com> 2018-07-20 02:58:42 -0700
committer: Varriount <Varriount@users.noreply.github.com> 2018-07-20 04:58:42 -0500
commit: 060871e64ac9d665430d4d9ae912bf7a379ee976 (patch)
tree: 12147c746c10df31987eda9c61ce7337dc5a3350 /tools
parent: f92d61b1f4e193bd17e1f76722d26f1f0605ad17 (diff)
download: Nim-060871e64ac9d665430d4d9ae912bf7a379ee976.tar.gz
2 files changed, 159 insertions, 16 deletions
diff --git a/tools/dochack/dochack.nim b/tools/dochack/dochack.nim
index 79a0e7482..8cc27b6eb 100644
--- a/tools/dochack/dochack.nim
+++ b/tools/dochack/dochack.nim
@@ -1,6 +1,5 @@
-
-
 import karax
+import fuzzysearch
 
 proc findNodeWith(x: Element; tag, content: cstring): Element =
   if x.nodeName == tag and x.textContent == content:
@@ -88,11 +87,11 @@ proc toHtml(x: TocEntry; isRoot=false): Element =
   if ul.len != 0: result.add ul
   if result.len == 0: result = nil
 
-proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} =
-  {.emit: """
-     var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
-     return new RegExp("\\b" + escaped + "\\b").test(`a`);
-  """.}
+#proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} =
+  #{.emit: """
+     #var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
+     #return new RegExp("\\b" + escaped + "\\b").test(`a`);
+  #""".}
 
 proc isWhitespace(text: cstring): bool {.asmNoStackFrame.} =
   {.emit: """
@@ -252,24 +251,29 @@ proc dosearch(value: cstring): Element =
 
     `stuff` = doc.documentElement;
     """.}
-    db = stuff.getElementsByClass"reference external"
+    db = stuff.getElementsByClass"reference"
     contents = @[]
     for ahref in db:
-      contents.add ahref.textContent.normalize
+      contents.add ahref.getAttribute("data-doc-search-tag")
   let ul = tree("UL")
   result = tree("DIV")
   result.setClass"search_results"
   var matches: seq[(Element, int)] = @[]
-  let key = value.normalize
   for i in 0..<db.len:
     let c = contents[i]
-    if c.containsWord(key):
-      matches.add((db[i], -(30_000 - c.len)))
-    elif c.contains(key):
-      matches.add((db[i], c.len))
+    if c == "Examples" or c == "PEG construction":
+    # Some manual exclusions.
+    # Ideally these should be fixed in the index to be more
+    # descriptive of what they are.
+      continue
+    let (score, matched) = fuzzymatch(value, c)
+    if matched:
+      matches.add((db[i], score))
+
   matches.sort do (a, b: auto) -> int:
-    a[1] - b[1]
-  for i in 0..min(<matches.len, 19):
+    b[1] - a[1]
+  for i in 0 ..< min(matches.len, 19):
+    matches[i][0].innerHTML = matches[i][0].getAttribute("data-doc-search-tag")
     ul.add(tree("LI", matches[i][0]))
   if ul.len == 0:
     result.add tree("B", text"no search results")
diff --git a/tools/dochack/fuzzysearch.nim b/tools/dochack/fuzzysearch.nim
new file mode 100644
index 000000000..e6b1ea3cd
--- /dev/null
+++ b/tools/dochack/fuzzysearch.nim
@@ -0,0 +1,139 @@
+# A Fuzzy Match implementation inspired by the sublime text fuzzy match algorithm
+# as described here: https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
+# Heavily modified to provide more subjectively useful results
+# for on the Nim manual.
+#
+import strutils
+import math
+import macros
+
+
+const
+  MaxUnmatchedLeadingChar = 3
+  ## Maximum number of times the penalty for unmatched leading chars is applied.
+
+  HeadingScaleFactor = 0.5
+  ## The score from before the colon Char is multiplied by this.
+  ## This is to weight function signatures and descriptions over module titles.
+
+
+type 
+  ScoreCard = enum 
+    StartMatch           = -100 ## Start matching.
+    LeadingCharDiff      = -3   ## An unmatched, leading character was found.
+    CharDiff             = -1   ## An unmatched character was found.
+    CharMatch            = 0    ## A matched character was found.
+    ConsecutiveMatch     = 5    ## A consecutive match was found.
+    LeadingCharMatch     = 10   ## The character matches the begining of the
+                                ## string or the first character of a word
+                                ## or camel case boundry.
+    WordBoundryMatch     = 20   ## The last ConsecutiveCharMatch that
+                                ## immediately precedes the end of the string,
+                                ## end of the pattern, or a LeadingCharMatch.
+
+
+proc fuzzyMatch*(pattern, str: cstring) : tuple[score: int, matched: bool] =
+  var
+    scoreState = StartMatch
+    headerMatched = false
+    unmatchedLeadingCharCount = 0
+    consecutiveMatchCount = 0
+    strIndex = 0
+    patIndex = 0
+    score = 0
+
+  template transition(nextState) =
+    scoreState = nextState
+    score += ord(scoreState)
+
+  while (strIndex < str.len) and (patIndex < pattern.len):
+    var
+      patternChar = pattern[patIndex].toLowerAscii
+      strChar     = str[strIndex].toLowerAscii
+
+    # Ignore certain characters
+    if patternChar in {'_', ' ', '.'}:
+      patIndex += 1
+      continue
+    if strChar in {'_', ' ', '.'}:
+      strIndex += 1
+      continue
+    
+    # Since this algorithm will be used to search against Nim documentation,
+    # the below logic prioritizes headers.
+    if not headerMatched and strChar == ':':
+      headerMatched = true
+      scoreState = StartMatch
+      score = toInt(floor(HeadingScaleFactor * toFloat(score)))
+      patIndex = 0
+      strIndex += 1
+      continue
+
+    if strChar == patternChar:
+      case scoreState 
+      of StartMatch, WordBoundryMatch:
+        scoreState = LeadingCharMatch
+
+      of CharMatch:
+        transition(ConsecutiveMatch)
+
+      of LeadingCharMatch, ConsecutiveMatch:
+        consecutiveMatchCount += 1
+        scoreState = ConsecutiveMatch
+        score += ord(ConsecutiveMatch) * consecutiveMatchCount
+
+        if scoreState == LeadingCharMatch:
+          score += ord(LeadingCharMatch)
+          
+        var onBoundary = (patIndex == high(pattern))
+        if not onBoundary:
+          let
+            nextPatternChar = toLowerAscii(pattern[patIndex + 1])
+            nextStrChar     = toLowerAscii(str[strIndex + 1])
+
+          onBoundary = (
+            nextStrChar notin {'a'..'z'} and
+            nextStrChar != nextPatternChar
+          )
+        
+        if onBoundary:
+          transition(WordBoundryMatch)
+
+      of CharDiff, LeadingCharDiff:
+        var isLeadingChar = (
+          str[strIndex - 1] notin Letters or
+          str[strIndex - 1] in {'a'..'z'} and
+          str[strIndex] in {'A'..'Z'}
+        )
+
+        if isLeadingChar:
+          scoreState = LeadingCharMatch
+          #a non alpha or a camel case transition counts as a leading char.
+          # Transition the state, but don't give the bonus yet; wait until we verify a consecutive match.
+        else:
+          transition(CharMatch)
+      patIndex += 1
+
+    else:
+      case scoreState 
+      of StartMatch:
+        transition(LeadingCharDiff)
+
+      of ConsecutiveMatch:
+        transition(CharDiff)
+        consecutiveMatchCount = 0
+
+      of LeadingCharDiff:
+        if unmatchedLeadingCharCount < MaxUnmatchedLeadingChar:
+          transition(LeadingCharDiff)
+        unmatchedLeadingCharCount += 1
+
+      else:
+        transition(CharDiff)
+
+    strIndex += 1
+
+  result = (
+    score:   max(0, score),
+    matched: (score > 0),
+  )
author	Ray Imber <rayimber@gmail.com>	2018-07-20 02:58:42 -0700
committer	Varriount <Varriount@users.noreply.github.com>	2018-07-20 04:58:42 -0500
commit	060871e64ac9d665430d4d9ae912bf7a379ee976 (patch)
tree	12147c746c10df31987eda9c61ce7337dc5a3350 /tools
parent	f92d61b1f4e193bd17e1f76722d26f1f0605ad17 (diff)
download	Nim-060871e64ac9d665430d4d9ae912bf7a379ee976.tar.gz