diff options
-rw-r--r-- | lib/packages/docutils/rstgen.nim | 12 | ||||
-rw-r--r-- | tools/dochack/dochack.nim | 36 | ||||
-rw-r--r-- | tools/dochack/fuzzysearch.nim | 139 |
3 files changed, 167 insertions, 20 deletions
diff --git a/lib/packages/docutils/rstgen.nim b/lib/packages/docutils/rstgen.nim index ef456f093..43a429a17 100644 --- a/lib/packages/docutils/rstgen.nim +++ b/lib/packages/docutils/rstgen.nim @@ -449,10 +449,11 @@ proc generateSymbolIndex(symbols: seq[IndexEntry]): string = desc = if not symbols[j].linkDesc.isNil: symbols[j].linkDesc else: "" if desc.len > 0: result.addf("""<li><a class="reference external" - title="$3" href="$1">$2</a></li> + title="$3" data-doc-search-tag="$2" href="$1">$2</a></li> """, [url, text, desc]) else: - result.addf("""<li><a class="reference external" href="$1">$2</a></li> + result.addf("""<li><a class="reference external" + data-doc-search-tag="$2" href="$1">$2</a></li> """, [url, text]) inc j result.add("</ul></dd>\n") @@ -493,6 +494,7 @@ proc generateDocumentationTOC(entries: seq[IndexEntry]): string = # Build a list of levels and extracted titles to make processing easier. var titleRef: string + titleTag: string levels: seq[tuple[level: int, text: string]] L = 0 level = 1 @@ -519,10 +521,12 @@ proc generateDocumentationTOC(entries: seq[IndexEntry]): string = let link = entries[L].link if link.isDocumentationTitle: titleRef = link + titleTag = levels[L].text else: result.add(level.indentToLevel(levels[L].level)) - result.add("<li><a href=\"" & link & "\">" & - levels[L].text & "</a></li>\n") + result.addf("""<li><a class="reference" data-doc-search-tag="$1" href="$2"> + $3</a></li> + """, [titleTag & " : " & levels[L].text, link, levels[L].text]) inc L result.add(level.indentToLevel(1) & "</ul>\n") assert(not titleRef.isNil, diff --git a/tools/dochack/dochack.nim b/tools/dochack/dochack.nim index 79a0e7482..8cc27b6eb 100644 --- a/tools/dochack/dochack.nim +++ b/tools/dochack/dochack.nim @@ -1,6 +1,5 @@ - - import karax +import fuzzysearch proc findNodeWith(x: Element; tag, content: cstring): Element = if x.nodeName == tag and x.textContent == content: @@ -88,11 +87,11 @@ proc toHtml(x: TocEntry; isRoot=false): Element = if ul.len != 0: result.add ul if result.len == 0: result = nil -proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} = - {.emit: """ - var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&"); - return new RegExp("\\b" + escaped + "\\b").test(`a`); - """.} +#proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} = + #{.emit: """ + #var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&"); + #return new RegExp("\\b" + escaped + "\\b").test(`a`); + #""".} proc isWhitespace(text: cstring): bool {.asmNoStackFrame.} = {.emit: """ @@ -252,24 +251,29 @@ proc dosearch(value: cstring): Element = `stuff` = doc.documentElement; """.} - db = stuff.getElementsByClass"reference external" + db = stuff.getElementsByClass"reference" contents = @[] for ahref in db: - contents.add ahref.textContent.normalize + contents.add ahref.getAttribute("data-doc-search-tag") let ul = tree("UL") result = tree("DIV") result.setClass"search_results" var matches: seq[(Element, int)] = @[] - let key = value.normalize for i in 0..<db.len: let c = contents[i] - if c.containsWord(key): - matches.add((db[i], -(30_000 - c.len))) - elif c.contains(key): - matches.add((db[i], c.len)) + if c == "Examples" or c == "PEG construction": + # Some manual exclusions. + # Ideally these should be fixed in the index to be more + # descriptive of what they are. + continue + let (score, matched) = fuzzymatch(value, c) + if matched: + matches.add((db[i], score)) + matches.sort do (a, b: auto) -> int: - a[1] - b[1] - for i in 0..min(<matches.len, 19): + b[1] - a[1] + for i in 0 ..< min(matches.len, 19): + matches[i][0].innerHTML = matches[i][0].getAttribute("data-doc-search-tag") ul.add(tree("LI", matches[i][0])) if ul.len == 0: result.add tree("B", text"no search results") diff --git a/tools/dochack/fuzzysearch.nim b/tools/dochack/fuzzysearch.nim new file mode 100644 index 000000000..e6b1ea3cd --- /dev/null +++ b/tools/dochack/fuzzysearch.nim @@ -0,0 +1,139 @@ +# A Fuzzy Match implementation inspired by the sublime text fuzzy match algorithm +# as described here: https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb +# Heavily modified to provide more subjectively useful results +# for on the Nim manual. +# +import strutils +import math +import macros + + +const + MaxUnmatchedLeadingChar = 3 + ## Maximum number of times the penalty for unmatched leading chars is applied. + + HeadingScaleFactor = 0.5 + ## The score from before the colon Char is multiplied by this. + ## This is to weight function signatures and descriptions over module titles. + + +type + ScoreCard = enum + StartMatch = -100 ## Start matching. + LeadingCharDiff = -3 ## An unmatched, leading character was found. + CharDiff = -1 ## An unmatched character was found. + CharMatch = 0 ## A matched character was found. + ConsecutiveMatch = 5 ## A consecutive match was found. + LeadingCharMatch = 10 ## The character matches the begining of the + ## string or the first character of a word + ## or camel case boundry. + WordBoundryMatch = 20 ## The last ConsecutiveCharMatch that + ## immediately precedes the end of the string, + ## end of the pattern, or a LeadingCharMatch. + + +proc fuzzyMatch*(pattern, str: cstring) : tuple[score: int, matched: bool] = + var + scoreState = StartMatch + headerMatched = false + unmatchedLeadingCharCount = 0 + consecutiveMatchCount = 0 + strIndex = 0 + patIndex = 0 + score = 0 + + template transition(nextState) = + scoreState = nextState + score += ord(scoreState) + + while (strIndex < str.len) and (patIndex < pattern.len): + var + patternChar = pattern[patIndex].toLowerAscii + strChar = str[strIndex].toLowerAscii + + # Ignore certain characters + if patternChar in {'_', ' ', '.'}: + patIndex += 1 + continue + if strChar in {'_', ' ', '.'}: + strIndex += 1 + continue + + # Since this algorithm will be used to search against Nim documentation, + # the below logic prioritizes headers. + if not headerMatched and strChar == ':': + headerMatched = true + scoreState = StartMatch + score = toInt(floor(HeadingScaleFactor * toFloat(score))) + patIndex = 0 + strIndex += 1 + continue + + if strChar == patternChar: + case scoreState + of StartMatch, WordBoundryMatch: + scoreState = LeadingCharMatch + + of CharMatch: + transition(ConsecutiveMatch) + + of LeadingCharMatch, ConsecutiveMatch: + consecutiveMatchCount += 1 + scoreState = ConsecutiveMatch + score += ord(ConsecutiveMatch) * consecutiveMatchCount + + if scoreState == LeadingCharMatch: + score += ord(LeadingCharMatch) + + var onBoundary = (patIndex == high(pattern)) + if not onBoundary: + let + nextPatternChar = toLowerAscii(pattern[patIndex + 1]) + nextStrChar = toLowerAscii(str[strIndex + 1]) + + onBoundary = ( + nextStrChar notin {'a'..'z'} and + nextStrChar != nextPatternChar + ) + + if onBoundary: + transition(WordBoundryMatch) + + of CharDiff, LeadingCharDiff: + var isLeadingChar = ( + str[strIndex - 1] notin Letters or + str[strIndex - 1] in {'a'..'z'} and + str[strIndex] in {'A'..'Z'} + ) + + if isLeadingChar: + scoreState = LeadingCharMatch + #a non alpha or a camel case transition counts as a leading char. + # Transition the state, but don't give the bonus yet; wait until we verify a consecutive match. + else: + transition(CharMatch) + patIndex += 1 + + else: + case scoreState + of StartMatch: + transition(LeadingCharDiff) + + of ConsecutiveMatch: + transition(CharDiff) + consecutiveMatchCount = 0 + + of LeadingCharDiff: + if unmatchedLeadingCharCount < MaxUnmatchedLeadingChar: + transition(LeadingCharDiff) + unmatchedLeadingCharCount += 1 + + else: + transition(CharDiff) + + strIndex += 1 + + result = ( + score: max(0, score), + matched: (score > 0), + ) |