diff options
author | Ray Imber <rayimber@gmail.com> | 2018-07-20 02:58:42 -0700 |
---|---|---|
committer | Varriount <Varriount@users.noreply.github.com> | 2018-07-20 04:58:42 -0500 |
commit | 060871e64ac9d665430d4d9ae912bf7a379ee976 (patch) | |
tree | 12147c746c10df31987eda9c61ce7337dc5a3350 /tools | |
parent | f92d61b1f4e193bd17e1f76722d26f1f0605ad17 (diff) | |
download | Nim-060871e64ac9d665430d4d9ae912bf7a379ee976.tar.gz |
Better doc search (#8260)
* Modified the doc generation to produce a custom data attribute to allow for better search functionality * Implemented fuzzy matching for the Nim Doc search instead of the simple regex match. * Fix to the WordBoundry state transition from code review with @Varriount. Also removed silly testing template that is no longer used. * Update fuzzysearch.nim * Update fuzzysearch.nim * Update fuzzysearch.nim * Update dochack.nim * Update dochack.nim
Diffstat (limited to 'tools')
-rw-r--r-- | tools/dochack/dochack.nim | 36 | ||||
-rw-r--r-- | tools/dochack/fuzzysearch.nim | 139 |
2 files changed, 159 insertions, 16 deletions
diff --git a/tools/dochack/dochack.nim b/tools/dochack/dochack.nim index 79a0e7482..8cc27b6eb 100644 --- a/tools/dochack/dochack.nim +++ b/tools/dochack/dochack.nim @@ -1,6 +1,5 @@ - - import karax +import fuzzysearch proc findNodeWith(x: Element; tag, content: cstring): Element = if x.nodeName == tag and x.textContent == content: @@ -88,11 +87,11 @@ proc toHtml(x: TocEntry; isRoot=false): Element = if ul.len != 0: result.add ul if result.len == 0: result = nil -proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} = - {.emit: """ - var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&"); - return new RegExp("\\b" + escaped + "\\b").test(`a`); - """.} +#proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} = + #{.emit: """ + #var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&"); + #return new RegExp("\\b" + escaped + "\\b").test(`a`); + #""".} proc isWhitespace(text: cstring): bool {.asmNoStackFrame.} = {.emit: """ @@ -252,24 +251,29 @@ proc dosearch(value: cstring): Element = `stuff` = doc.documentElement; """.} - db = stuff.getElementsByClass"reference external" + db = stuff.getElementsByClass"reference" contents = @[] for ahref in db: - contents.add ahref.textContent.normalize + contents.add ahref.getAttribute("data-doc-search-tag") let ul = tree("UL") result = tree("DIV") result.setClass"search_results" var matches: seq[(Element, int)] = @[] - let key = value.normalize for i in 0..<db.len: let c = contents[i] - if c.containsWord(key): - matches.add((db[i], -(30_000 - c.len))) - elif c.contains(key): - matches.add((db[i], c.len)) + if c == "Examples" or c == "PEG construction": + # Some manual exclusions. + # Ideally these should be fixed in the index to be more + # descriptive of what they are. + continue + let (score, matched) = fuzzymatch(value, c) + if matched: + matches.add((db[i], score)) + matches.sort do (a, b: auto) -> int: - a[1] - b[1] - for i in 0..min(<matches.len, 19): + b[1] - a[1] + for i in 0 ..< min(matches.len, 19): + matches[i][0].innerHTML = matches[i][0].getAttribute("data-doc-search-tag") ul.add(tree("LI", matches[i][0])) if ul.len == 0: result.add tree("B", text"no search results") diff --git a/tools/dochack/fuzzysearch.nim b/tools/dochack/fuzzysearch.nim new file mode 100644 index 000000000..e6b1ea3cd --- /dev/null +++ b/tools/dochack/fuzzysearch.nim @@ -0,0 +1,139 @@ +# A Fuzzy Match implementation inspired by the sublime text fuzzy match algorithm +# as described here: https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb +# Heavily modified to provide more subjectively useful results +# for on the Nim manual. +# +import strutils +import math +import macros + + +const + MaxUnmatchedLeadingChar = 3 + ## Maximum number of times the penalty for unmatched leading chars is applied. + + HeadingScaleFactor = 0.5 + ## The score from before the colon Char is multiplied by this. + ## This is to weight function signatures and descriptions over module titles. + + +type + ScoreCard = enum + StartMatch = -100 ## Start matching. + LeadingCharDiff = -3 ## An unmatched, leading character was found. + CharDiff = -1 ## An unmatched character was found. + CharMatch = 0 ## A matched character was found. + ConsecutiveMatch = 5 ## A consecutive match was found. + LeadingCharMatch = 10 ## The character matches the begining of the + ## string or the first character of a word + ## or camel case boundry. + WordBoundryMatch = 20 ## The last ConsecutiveCharMatch that + ## immediately precedes the end of the string, + ## end of the pattern, or a LeadingCharMatch. + + +proc fuzzyMatch*(pattern, str: cstring) : tuple[score: int, matched: bool] = + var + scoreState = StartMatch + headerMatched = false + unmatchedLeadingCharCount = 0 + consecutiveMatchCount = 0 + strIndex = 0 + patIndex = 0 + score = 0 + + template transition(nextState) = + scoreState = nextState + score += ord(scoreState) + + while (strIndex < str.len) and (patIndex < pattern.len): + var + patternChar = pattern[patIndex].toLowerAscii + strChar = str[strIndex].toLowerAscii + + # Ignore certain characters + if patternChar in {'_', ' ', '.'}: + patIndex += 1 + continue + if strChar in {'_', ' ', '.'}: + strIndex += 1 + continue + + # Since this algorithm will be used to search against Nim documentation, + # the below logic prioritizes headers. + if not headerMatched and strChar == ':': + headerMatched = true + scoreState = StartMatch + score = toInt(floor(HeadingScaleFactor * toFloat(score))) + patIndex = 0 + strIndex += 1 + continue + + if strChar == patternChar: + case scoreState + of StartMatch, WordBoundryMatch: + scoreState = LeadingCharMatch + + of CharMatch: + transition(ConsecutiveMatch) + + of LeadingCharMatch, ConsecutiveMatch: + consecutiveMatchCount += 1 + scoreState = ConsecutiveMatch + score += ord(ConsecutiveMatch) * consecutiveMatchCount + + if scoreState == LeadingCharMatch: + score += ord(LeadingCharMatch) + + var onBoundary = (patIndex == high(pattern)) + if not onBoundary: + let + nextPatternChar = toLowerAscii(pattern[patIndex + 1]) + nextStrChar = toLowerAscii(str[strIndex + 1]) + + onBoundary = ( + nextStrChar notin {'a'..'z'} and + nextStrChar != nextPatternChar + ) + + if onBoundary: + transition(WordBoundryMatch) + + of CharDiff, LeadingCharDiff: + var isLeadingChar = ( + str[strIndex - 1] notin Letters or + str[strIndex - 1] in {'a'..'z'} and + str[strIndex] in {'A'..'Z'} + ) + + if isLeadingChar: + scoreState = LeadingCharMatch + #a non alpha or a camel case transition counts as a leading char. + # Transition the state, but don't give the bonus yet; wait until we verify a consecutive match. + else: + transition(CharMatch) + patIndex += 1 + + else: + case scoreState + of StartMatch: + transition(LeadingCharDiff) + + of ConsecutiveMatch: + transition(CharDiff) + consecutiveMatchCount = 0 + + of LeadingCharDiff: + if unmatchedLeadingCharCount < MaxUnmatchedLeadingChar: + transition(LeadingCharDiff) + unmatchedLeadingCharCount += 1 + + else: + transition(CharDiff) + + strIndex += 1 + + result = ( + score: max(0, score), + matched: (score > 0), + ) |