summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--lib/packages/docutils/rstgen.nim12
-rw-r--r--tools/dochack/dochack.nim36
-rw-r--r--tools/dochack/fuzzysearch.nim139
3 files changed, 167 insertions, 20 deletions
diff --git a/lib/packages/docutils/rstgen.nim b/lib/packages/docutils/rstgen.nim
index ef456f093..43a429a17 100644
--- a/lib/packages/docutils/rstgen.nim
+++ b/lib/packages/docutils/rstgen.nim
@@ -449,10 +449,11 @@ proc generateSymbolIndex(symbols: seq[IndexEntry]): string =
         desc = if not symbols[j].linkDesc.isNil: symbols[j].linkDesc else: ""
       if desc.len > 0:
         result.addf("""<li><a class="reference external"
-          title="$3" href="$1">$2</a></li>
+          title="$3" data-doc-search-tag="$2" href="$1">$2</a></li>
           """, [url, text, desc])
       else:
-        result.addf("""<li><a class="reference external" href="$1">$2</a></li>
+        result.addf("""<li><a class="reference external" 
+          data-doc-search-tag="$2" href="$1">$2</a></li>
           """, [url, text])
       inc j
     result.add("</ul></dd>\n")
@@ -493,6 +494,7 @@ proc generateDocumentationTOC(entries: seq[IndexEntry]): string =
   # Build a list of levels and extracted titles to make processing easier.
   var
     titleRef: string
+    titleTag: string
     levels: seq[tuple[level: int, text: string]]
     L = 0
     level = 1
@@ -519,10 +521,12 @@ proc generateDocumentationTOC(entries: seq[IndexEntry]): string =
     let link = entries[L].link
     if link.isDocumentationTitle:
       titleRef = link
+      titleTag = levels[L].text
     else:
       result.add(level.indentToLevel(levels[L].level))
-      result.add("<li><a href=\"" & link & "\">" &
-        levels[L].text & "</a></li>\n")
+      result.addf("""<li><a class="reference" data-doc-search-tag="$1" href="$2"> 
+        $3</a></li>
+        """, [titleTag & " : " & levels[L].text, link, levels[L].text])
     inc L
   result.add(level.indentToLevel(1) & "</ul>\n")
   assert(not titleRef.isNil,
diff --git a/tools/dochack/dochack.nim b/tools/dochack/dochack.nim
index 79a0e7482..8cc27b6eb 100644
--- a/tools/dochack/dochack.nim
+++ b/tools/dochack/dochack.nim
@@ -1,6 +1,5 @@
-
-
 import karax
+import fuzzysearch
 
 proc findNodeWith(x: Element; tag, content: cstring): Element =
   if x.nodeName == tag and x.textContent == content:
@@ -88,11 +87,11 @@ proc toHtml(x: TocEntry; isRoot=false): Element =
   if ul.len != 0: result.add ul
   if result.len == 0: result = nil
 
-proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} =
-  {.emit: """
-     var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
-     return new RegExp("\\b" + escaped + "\\b").test(`a`);
-  """.}
+#proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} =
+  #{.emit: """
+     #var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
+     #return new RegExp("\\b" + escaped + "\\b").test(`a`);
+  #""".}
 
 proc isWhitespace(text: cstring): bool {.asmNoStackFrame.} =
   {.emit: """
@@ -252,24 +251,29 @@ proc dosearch(value: cstring): Element =
 
     `stuff` = doc.documentElement;
     """.}
-    db = stuff.getElementsByClass"reference external"
+    db = stuff.getElementsByClass"reference"
     contents = @[]
     for ahref in db:
-      contents.add ahref.textContent.normalize
+      contents.add ahref.getAttribute("data-doc-search-tag")
   let ul = tree("UL")
   result = tree("DIV")
   result.setClass"search_results"
   var matches: seq[(Element, int)] = @[]
-  let key = value.normalize
   for i in 0..<db.len:
     let c = contents[i]
-    if c.containsWord(key):
-      matches.add((db[i], -(30_000 - c.len)))
-    elif c.contains(key):
-      matches.add((db[i], c.len))
+    if c == "Examples" or c == "PEG construction":
+    # Some manual exclusions.
+    # Ideally these should be fixed in the index to be more
+    # descriptive of what they are.
+      continue
+    let (score, matched) = fuzzymatch(value, c)
+    if matched:
+      matches.add((db[i], score))
+
   matches.sort do (a, b: auto) -> int:
-    a[1] - b[1]
-  for i in 0..min(<matches.len, 19):
+    b[1] - a[1]
+  for i in 0 ..< min(matches.len, 19):
+    matches[i][0].innerHTML = matches[i][0].getAttribute("data-doc-search-tag")
     ul.add(tree("LI", matches[i][0]))
   if ul.len == 0:
     result.add tree("B", text"no search results")
diff --git a/tools/dochack/fuzzysearch.nim b/tools/dochack/fuzzysearch.nim
new file mode 100644
index 000000000..e6b1ea3cd
--- /dev/null
+++ b/tools/dochack/fuzzysearch.nim
@@ -0,0 +1,139 @@
+# A Fuzzy Match implementation inspired by the sublime text fuzzy match algorithm
+# as described here: https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
+# Heavily modified to provide more subjectively useful results
+# for on the Nim manual.
+#
+import strutils
+import math
+import macros
+
+
+const
+  MaxUnmatchedLeadingChar = 3
+  ## Maximum number of times the penalty for unmatched leading chars is applied.
+
+  HeadingScaleFactor = 0.5
+  ## The score from before the colon Char is multiplied by this.
+  ## This is to weight function signatures and descriptions over module titles.
+
+
+type 
+  ScoreCard = enum 
+    StartMatch           = -100 ## Start matching.
+    LeadingCharDiff      = -3   ## An unmatched, leading character was found.
+    CharDiff             = -1   ## An unmatched character was found.
+    CharMatch            = 0    ## A matched character was found.
+    ConsecutiveMatch     = 5    ## A consecutive match was found.
+    LeadingCharMatch     = 10   ## The character matches the begining of the
+                                ## string or the first character of a word
+                                ## or camel case boundry.
+    WordBoundryMatch     = 20   ## The last ConsecutiveCharMatch that
+                                ## immediately precedes the end of the string,
+                                ## end of the pattern, or a LeadingCharMatch.
+
+
+proc fuzzyMatch*(pattern, str: cstring) : tuple[score: int, matched: bool] =
+  var
+    scoreState = StartMatch
+    headerMatched = false
+    unmatchedLeadingCharCount = 0
+    consecutiveMatchCount = 0
+    strIndex = 0
+    patIndex = 0
+    score = 0
+
+  template transition(nextState) =
+    scoreState = nextState
+    score += ord(scoreState)
+
+  while (strIndex < str.len) and (patIndex < pattern.len):
+    var
+      patternChar = pattern[patIndex].toLowerAscii
+      strChar     = str[strIndex].toLowerAscii
+
+    # Ignore certain characters
+    if patternChar in {'_', ' ', '.'}:
+      patIndex += 1
+      continue
+    if strChar in {'_', ' ', '.'}:
+      strIndex += 1
+      continue
+    
+    # Since this algorithm will be used to search against Nim documentation,
+    # the below logic prioritizes headers.
+    if not headerMatched and strChar == ':':
+      headerMatched = true
+      scoreState = StartMatch
+      score = toInt(floor(HeadingScaleFactor * toFloat(score)))
+      patIndex = 0
+      strIndex += 1
+      continue
+
+    if strChar == patternChar:
+      case scoreState 
+      of StartMatch, WordBoundryMatch:
+        scoreState = LeadingCharMatch
+
+      of CharMatch:
+        transition(ConsecutiveMatch)
+
+      of LeadingCharMatch, ConsecutiveMatch:
+        consecutiveMatchCount += 1
+        scoreState = ConsecutiveMatch
+        score += ord(ConsecutiveMatch) * consecutiveMatchCount
+
+        if scoreState == LeadingCharMatch:
+          score += ord(LeadingCharMatch)
+          
+        var onBoundary = (patIndex == high(pattern))
+        if not onBoundary:
+          let
+            nextPatternChar = toLowerAscii(pattern[patIndex + 1])
+            nextStrChar     = toLowerAscii(str[strIndex + 1])
+
+          onBoundary = (
+            nextStrChar notin {'a'..'z'} and
+            nextStrChar != nextPatternChar
+          )
+        
+        if onBoundary:
+          transition(WordBoundryMatch)
+
+      of CharDiff, LeadingCharDiff:
+        var isLeadingChar = (
+          str[strIndex - 1] notin Letters or
+          str[strIndex - 1] in {'a'..'z'} and
+          str[strIndex] in {'A'..'Z'}
+        )
+
+        if isLeadingChar:
+          scoreState = LeadingCharMatch
+          #a non alpha or a camel case transition counts as a leading char.
+          # Transition the state, but don't give the bonus yet; wait until we verify a consecutive match.
+        else:
+          transition(CharMatch)
+      patIndex += 1
+
+    else:
+      case scoreState 
+      of StartMatch:
+        transition(LeadingCharDiff)
+
+      of ConsecutiveMatch:
+        transition(CharDiff)
+        consecutiveMatchCount = 0
+
+      of LeadingCharDiff:
+        if unmatchedLeadingCharCount < MaxUnmatchedLeadingChar:
+          transition(LeadingCharDiff)
+        unmatchedLeadingCharCount += 1
+
+      else:
+        transition(CharDiff)
+
+    strIndex += 1
+
+  result = (
+    score:   max(0, score),
+    matched: (score > 0),
+  )