tools/dochack/fuzzysearch.nim


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139

# A Fuzzy Match implementation inspired by the sublime text fuzzy match algorithm
# as described here: https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
# Heavily modified to provide more subjectively useful results
# for on the Nim manual.
#
import strutils
import math
import macros


const
  MaxUnmatchedLeadingChar = 3
  ## Maximum number of times the penalty for unmatched leading chars is applied.

  HeadingScaleFactor = 0.5
  ## The score from before the colon Char is multiplied by this.
  ## This is to weight function signatures and descriptions over module titles.


type 
  ScoreCard = enum 
    StartMatch           = -100 ## Start matching.
    LeadingCharDiff      = -3   ## An unmatched, leading character was found.
    CharDiff             = -1   ## An unmatched character was found.
    CharMatch            = 0    ## A matched character was found.
    ConsecutiveMatch     = 5    ## A consecutive match was found.
    LeadingCharMatch     = 10   ## The character matches the begining of the
                                ## string or the first character of a word
                                ## or camel case boundry.
    WordBoundryMatch     = 20   ## The last ConsecutiveCharMatch that
                                ## immediately precedes the end of the string,
                                ## end of the pattern, or a LeadingCharMatch.


proc fuzzyMatch*(pattern, str: cstring) : tuple[score: int, matched: bool] =
  var
    scoreState = StartMatch
    headerMatched = false
    unmatchedLeadingCharCount = 0
    consecutiveMatchCount = 0
    strIndex = 0
    patIndex = 0
    score = 0

  template transition(nextState) =
    scoreState = nextState
    score += ord(scoreState)

  while (strIndex < str.len) and (patIndex < pattern.len):
    var
      patternChar = pattern[patIndex].toLowerAscii
      strChar     = str[strIndex].toLowerAscii

    # Ignore certain characters
    if patternChar in {'_', ' ', '.'}:
      patIndex += 1
      continue
    if strChar in {'_', ' ', '.'}:
      strIndex += 1
      continue
    
    # Since this algorithm will be used to search against Nim documentation,
    # the below logic prioritizes headers.
    if not headerMatched and strChar == ':':
      headerMatched = true
      scoreState = StartMatch
      score = toInt(floor(HeadingScaleFactor * toFloat(score)))
      patIndex = 0
      strIndex += 1
      continue

    if strChar == patternChar:
      case scoreState 
      of StartMatch, WordBoundryMatch:
        scoreState = LeadingCharMatch

      of CharMatch:
        transition(ConsecutiveMatch)

      of LeadingCharMatch, ConsecutiveMatch:
        consecutiveMatchCount += 1
        scoreState = ConsecutiveMatch
        score += ord(ConsecutiveMatch) * consecutiveMatchCount

        if scoreState == LeadingCharMatch:
          score += ord(LeadingCharMatch)
          
        var onBoundary = (patIndex == high(pattern))
        if not onBoundary and strIndex < high(str):
          let
            nextPatternChar = toLowerAscii(pattern[patIndex + 1])
            nextStrChar     = toLowerAscii(str[strIndex + 1])

          onBoundary = (
            nextStrChar notin {'a'..'z'} and
            nextStrChar != nextPatternChar
          )
        
        if onBoundary:
          transition(WordBoundryMatch)

      of CharDiff, LeadingCharDiff:
        var isLeadingChar = (
          str[strIndex - 1] notin Letters or
          str[strIndex - 1] in {'a'..'z'} and
          str[strIndex] in {'A'..'Z'}
        )

        if isLeadingChar:
          scoreState = LeadingCharMatch
          #a non alpha or a camel case transition counts as a leading char.
          # Transition the state, but don't give the bonus yet; wait until we verify a consecutive match.
        else:
          transition(CharMatch)
      patIndex += 1

    else:
      case scoreState 
      of StartMatch:
        transition(LeadingCharDiff)

      of ConsecutiveMatch:
        transition(CharDiff)
        consecutiveMatchCount = 0

      of LeadingCharDiff:
        if unmatchedLeadingCharCount < MaxUnmatchedLeadingChar:
          transition(LeadingCharDiff)
        unmatchedLeadingCharCount += 1

      else:
        transition(CharDiff)

    strIndex += 1

  result = (
    score:   max(0, score),
    matched: (score > 0),
  )