summary refs log tree commit diff stats
path: root/tools/dochack/fuzzysearch.nim
diff options
context:
space:
mode:
Diffstat (limited to 'tools/dochack/fuzzysearch.nim')
-rw-r--r--tools/dochack/fuzzysearch.nim141
1 files changed, 141 insertions, 0 deletions
diff --git a/tools/dochack/fuzzysearch.nim b/tools/dochack/fuzzysearch.nim
new file mode 100644
index 000000000..fecbaf4f0
--- /dev/null
+++ b/tools/dochack/fuzzysearch.nim
@@ -0,0 +1,141 @@
+# A Fuzzy Match implementation inspired by the sublime text fuzzy match algorithm
+# as described here: https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
+# Heavily modified to provide more subjectively useful results
+# for on the Nim manual.
+#
+import strutils
+import math
+
+
+const
+  MaxUnmatchedLeadingChar = 3
+  ## Maximum number of times the penalty for unmatched leading chars is applied.
+
+  HeadingScaleFactor = 0.5
+  ## The score from before the colon Char is multiplied by this.
+  ## This is to weight function signatures and descriptions over module titles.
+
+
+type
+  ScoreCard = enum
+    StartMatch           = -100 ## Start matching.
+    LeadingCharDiff      = -3   ## An unmatched, leading character was found.
+    CharDiff             = -1   ## An unmatched character was found.
+    CharMatch            = 0    ## A matched character was found.
+    ConsecutiveMatch     = 5    ## A consecutive match was found.
+    LeadingCharMatch     = 10   ## The character matches the beginning of the
+                                ## string or the first character of a word
+                                ## or camel case boundary.
+    WordBoundryMatch     = 20   ## The last ConsecutiveCharMatch that
+                                ## immediately precedes the end of the string,
+                                ## end of the pattern, or a LeadingCharMatch.
+
+
+proc fuzzyMatch*(pattern, str: cstring) : tuple[score: int, matched: bool] =
+  var
+    scoreState = StartMatch
+    headerMatched = false
+    unmatchedLeadingCharCount = 0
+    consecutiveMatchCount = 0
+    strIndex = 0
+    patIndex = 0
+    score = 0
+
+  template transition(nextState) =
+    scoreState = nextState
+    score += ord(scoreState)
+
+  while (strIndex < str.len) and (patIndex < pattern.len):
+    var
+      patternChar = pattern[patIndex].toLowerAscii
+      strChar     = str[strIndex].toLowerAscii
+
+    # Ignore certain characters
+    if patternChar in {'_', ' ', '.'}:
+      patIndex += 1
+      continue
+    if strChar in {'_', ' ', '.'}:
+      strIndex += 1
+      continue
+
+    # Since this algorithm will be used to search against Nim documentation,
+    # the below logic prioritizes headers.
+    if not headerMatched and strChar == ':':
+      headerMatched = true
+      scoreState = StartMatch
+      score = int(floor(HeadingScaleFactor * float(score)))
+      patIndex = 0
+      strIndex += 1
+      continue
+
+    if strChar == patternChar:
+      case scoreState
+      of StartMatch, WordBoundryMatch:
+        scoreState = LeadingCharMatch
+
+      of CharMatch:
+        transition(ConsecutiveMatch)
+
+      of LeadingCharMatch, ConsecutiveMatch:
+        consecutiveMatchCount += 1
+        scoreState = ConsecutiveMatch
+        score += ord(ConsecutiveMatch) * consecutiveMatchCount
+
+        if scoreState == LeadingCharMatch:
+          score += ord(LeadingCharMatch)
+
+        var onBoundary = (patIndex == high(pattern))
+        if not onBoundary and strIndex < high(str):
+          let
+            nextPatternChar = toLowerAscii(pattern[patIndex + 1])
+            nextStrChar     = toLowerAscii(str[strIndex + 1])
+
+          onBoundary = (
+            nextStrChar notin {'a'..'z'} and
+            nextStrChar != nextPatternChar
+          )
+
+        if onBoundary:
+          transition(WordBoundryMatch)
+
+      of CharDiff, LeadingCharDiff:
+        var isLeadingChar = (
+          str[strIndex - 1] notin Letters or
+          str[strIndex - 1] in {'a'..'z'} and
+          str[strIndex] in {'A'..'Z'}
+        )
+
+        if isLeadingChar:
+          scoreState = LeadingCharMatch
+          #a non alpha or a camel case transition counts as a leading char.
+          # Transition the state, but don't give the bonus yet; wait until we verify a consecutive match.
+        else:
+          transition(CharMatch)
+      patIndex += 1
+
+    else:
+      case scoreState
+      of StartMatch:
+        transition(LeadingCharDiff)
+
+      of ConsecutiveMatch:
+        transition(CharDiff)
+        consecutiveMatchCount = 0
+
+      of LeadingCharDiff:
+        if unmatchedLeadingCharCount < MaxUnmatchedLeadingChar:
+          transition(LeadingCharDiff)
+        unmatchedLeadingCharCount += 1
+
+      else:
+        transition(CharDiff)
+
+    strIndex += 1
+
+  if patIndex == pattern.len and (strIndex == str.len or str[strIndex] notin Letters):
+    score += 10
+
+  result = (
+    score:   max(0, score),
+    matched: (score > 0),
+  )