diff options
Diffstat (limited to 'lib/packages/docutils/highlite.nim')
-rw-r--r-- | lib/packages/docutils/highlite.nim | 207 |
1 files changed, 159 insertions, 48 deletions
diff --git a/lib/packages/docutils/highlite.nim b/lib/packages/docutils/highlite.nim index 8c91e0a8e..f8376f46c 100644 --- a/lib/packages/docutils/highlite.nim +++ b/lib/packages/docutils/highlite.nim @@ -11,11 +11,9 @@ ## Currently only few languages are supported, other languages may be added. ## The interface supports one language nested in another. ## -## **Note:** Import `packages/docutils/highlite` to use this module -## ## You can use this to build your own syntax highlighting, check this example: ## -## .. code::nim +## ```Nim ## let code = """for x in $int.high: echo x.ord mod 2 == 0""" ## var toknizr: GeneralTokenizer ## initGeneralTokenizer(toknizr, code) @@ -33,21 +31,43 @@ ## else: ## echo toknizr.kind # All the kinds of tokens can be processed here. ## echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1) +## ``` ## ## The proc `getSourceLanguage` can get the language `enum` from a string: -## -## .. code::nim +## ```Nim ## for l in ["C", "c++", "jAvA", "Nim", "c#"]: echo getSourceLanguage(l) +## ``` +## +## There is also a `Cmd` pseudo-language supported, which is a simple generic +## shell/cmdline tokenizer (UNIX shell/Powershell/Windows Command): +## no escaping, no programming language constructs besides variable definition +## at the beginning of line. It supports these operators: +## ```Cmd +## & && | || ( ) '' "" ; # for comments +## ``` ## +## Instead of escaping always use quotes like here +## `nimgrep --ext:'nim|nims' file.name`:cmd: shows how to input ``|``. +## Any argument that contains ``.`` or ``/`` or ``\`` will be treated +## as a file or directory. +## +## In addition to `Cmd` there is also `Console` language for +## displaying interactive sessions. +## Lines with a command should start with ``$``, other lines are considered +## as program output. 
import - strutils -from algorithm import binarySearch + std/strutils +from std/algorithm import binarySearch + +when defined(nimPreviewSlimSystem): + import std/[assertions, syncio] + type SourceLanguage* = enum langNone, langNim, langCpp, langCsharp, langC, langJava, - langYaml, langPython + langYaml, langPython, langCmd, langConsole TokenClass* = enum gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber, gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit, @@ -55,7 +75,7 @@ type gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression, gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler, gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel, - gtReference, gtOther + gtReference, gtPrompt, gtProgramOutput, gtProgram, gtOption, gtOther GeneralTokenizer* = object of RootObj kind*: TokenClass start*, length*: int @@ -66,14 +86,20 @@ type const sourceLanguageToStr*: array[SourceLanguage, string] = ["none", - "Nim", "C++", "C#", "C", "Java", "Yaml", "Python"] + "Nim", "C++", "C#", "C", "Java", "Yaml", "Python", "Cmd", "Console"] + sourceLanguageToAlpha*: array[SourceLanguage, string] = ["none", + "Nim", "cpp", "csharp", "C", "Java", "Yaml", "Python", "Cmd", "Console"] + ## list of languages spelled with alphabetic characters tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace", "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber", "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit", "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment", "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData", "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink", - "Label", "Reference", "Other"] + "Label", "Reference", "Prompt", "ProgramOutput", + # start from lower-case if there is a corresponding RST role (see rst.nim) + "program", "option", + "Other"] # The following list comes from doc/keywords.txt, make sure it is # 
synchronized with this array by running the module itself as a test case. @@ -90,9 +116,11 @@ const "xor", "yield"] proc getSourceLanguage*(name: string): SourceLanguage = - for i in countup(succ(low(SourceLanguage)), high(SourceLanguage)): + for i in succ(low(SourceLanguage)) .. high(SourceLanguage): if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0: return i + if cmpIgnoreStyle(name, sourceLanguageToAlpha[i]) == 0: + return i result = langNone proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) = @@ -102,9 +130,7 @@ proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) = g.length = 0 g.state = low(TokenClass) g.lang = low(SourceLanguage) - var pos = 0 # skip initial whitespace: - while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos) - g.pos = pos + g.pos = 0 proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: string) = initGeneralTokenizer(g, cstring(buf)) @@ -165,9 +191,6 @@ const proc isKeyword(x: openArray[string], y: string): int = binarySearch(x, y) -proc isKeywordIgnoreCase(x: openArray[string], y: string): int = - binarySearch(x, y, cmpIgnoreCase) - proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) = const hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'} @@ -177,31 +200,33 @@ proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) = var pos = g.pos g.start = g.pos if g.state == gtStringLit: - g.kind = gtStringLit - while true: + if g.buf[pos] == '\\': + g.kind = gtEscapeSequence + inc(pos) case g.buf[pos] - of '\\': - g.kind = gtEscapeSequence + of 'x', 'X': inc(pos) + if g.buf[pos] in hexChars: inc(pos) + if g.buf[pos] in hexChars: inc(pos) + of '0'..'9': + while g.buf[pos] in {'0'..'9'}: inc(pos) + of '\0': + g.state = gtNone + else: inc(pos) + else: + g.kind = gtStringLit + while true: case g.buf[pos] - of 'x', 'X': + of '\\': + break + of '\0', '\r', '\n': + g.state = gtNone + break + of '\"': inc(pos) - if g.buf[pos] in hexChars: inc(pos) - if g.buf[pos] in hexChars: inc(pos) 
- of '0'..'9': - while g.buf[pos] in {'0'..'9'}: inc(pos) - of '\0': g.state = gtNone + break else: inc(pos) - break - of '\0', '\r', '\n': - g.state = gtNone - break - of '\"': - inc(pos) - g.state = gtNone - break - else: inc(pos) else: case g.buf[pos] of ' ', '\t'..'\r': @@ -299,17 +324,18 @@ proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) = pos = nimNumber(g, pos) of '\'': inc(pos) - g.kind = gtCharLit - while true: - case g.buf[pos] - of '\0', '\r', '\n': - break - of '\'': - inc(pos) - break - of '\\': - inc(pos, 2) - else: inc(pos) + if g.kind != gtPunctuation: + g.kind = gtCharLit + while true: + case g.buf[pos] + of '\0', '\r', '\n': + break + of '\'': + inc(pos) + break + of '\\': + inc(pos, 2) + else: inc(pos) of '\"': inc(pos) if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'): @@ -473,6 +499,9 @@ proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string], of '\0': break else: inc(pos) + else: + g.kind = gtOperator + while g.buf[pos] in OpChars: inc(pos) of '#': inc(pos) if hasPreprocessor in flags: @@ -900,6 +929,74 @@ proc pythonNextToken(g: var GeneralTokenizer) = "with", "yield"] nimNextToken(g, keywords) +proc cmdNextToken(g: var GeneralTokenizer, dollarPrompt = false) = + var pos = g.pos + g.start = g.pos + if g.state == low(TokenClass): + g.state = if dollarPrompt: gtPrompt else: gtProgram + case g.buf[pos] + of ' ', '\t'..'\r': + g.kind = gtWhitespace + while g.buf[pos] in {' ', '\t'..'\r'}: + if g.buf[pos] == '\n': + g.state = if dollarPrompt: gtPrompt else: gtProgram + inc(pos) + of '\'', '"': + g.kind = gtOption + let q = g.buf[pos] + inc(pos) + while g.buf[pos] notin {q, '\0'}: + inc(pos) + if g.buf[pos] == q: inc(pos) + of '#': + g.kind = gtComment + while g.buf[pos] notin {'\n', '\0'}: + inc(pos) + of '&', '|': + g.kind = gtOperator + inc(pos) + if g.buf[pos] == g.buf[pos-1]: inc(pos) + g.state = gtProgram + of '(': + g.kind = gtOperator + g.state = gtProgram + inc(pos) + of ')': + g.kind = 
gtOperator + inc(pos) + of ';': + g.state = gtProgram + g.kind = gtOperator + inc(pos) + of '\0': g.kind = gtEof + elif dollarPrompt and g.state == gtPrompt: + if g.buf[pos] == '$' and g.buf[pos+1] in {' ', '\t'}: + g.kind = gtPrompt + inc pos, 2 + g.state = gtProgram + else: + g.kind = gtProgramOutput + while g.buf[pos] notin {'\n', '\0'}: + inc(pos) + else: + if g.state == gtProgram: + g.kind = gtProgram + g.state = gtOption + else: + g.kind = gtOption + while g.buf[pos] notin {' ', '\t'..'\r', '&', '|', '(', ')', '\'', '"', '\0'}: + if g.buf[pos] == ';' and g.buf[pos+1] == ' ': + # (check space because ';' can be used inside arguments in Win bat) + break + if g.kind == gtOption and g.buf[pos] in {'/', '\\', '.'}: + g.kind = gtIdentifier # for file/dir name + elif g.kind == gtProgram and g.buf[pos] == '=': + g.kind = gtIdentifier # for env variable setting at beginning of line + g.state = gtProgram + inc(pos) + g.length = pos - g.pos + g.pos = pos + proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) = g.lang = lang case lang @@ -911,6 +1008,20 @@ proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) = of langJava: javaNextToken(g) of langYaml: yamlNextToken(g) of langPython: pythonNextToken(g) + of langCmd: cmdNextToken(g) + of langConsole: cmdNextToken(g, dollarPrompt=true) + +proc tokenize*(text: string, lang: SourceLanguage): seq[(string, TokenClass)] = + var g: GeneralTokenizer + initGeneralTokenizer(g, text) + var prevPos = 0 + while true: + getNextToken(g, lang) + if g.kind == gtEof: + break + var s = text[prevPos ..< g.pos] + result.add (s, g.kind) + prevPos = g.pos when isMainModule: var keywords: seq[string] |