diff options
Diffstat (limited to 'lib/packages/docutils/highlite.nim')
-rw-r--r-- | lib/packages/docutils/highlite.nim | 345 |
1 files changed, 234 insertions, 111 deletions
diff --git a/lib/packages/docutils/highlite.nim b/lib/packages/docutils/highlite.nim index 796c17d7d..f8376f46c 100644 --- a/lib/packages/docutils/highlite.nim +++ b/lib/packages/docutils/highlite.nim @@ -11,11 +11,9 @@ ## Currently only few languages are supported, other languages may be added. ## The interface supports one language nested in another. ## -## **Note:** Import ``packages/docutils/highlite`` to use this module -## ## You can use this to build your own syntax highlighting, check this example: ## -## .. code::nim +## ```Nim ## let code = """for x in $int.high: echo x.ord mod 2 == 0""" ## var toknizr: GeneralTokenizer ## initGeneralTokenizer(toknizr, code) @@ -33,18 +31,43 @@ ## else: ## echo toknizr.kind # All the kinds of tokens can be processed here. ## echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1) +## ``` ## -## The proc ``getSourceLanguage`` can get the language ``enum`` from a string: -## -## .. code::nim +## The proc `getSourceLanguage` can get the language `enum` from a string: +## ```Nim ## for l in ["C", "c++", "jAvA", "Nim", "c#"]: echo getSourceLanguage(l) +## ``` +## +## There is also a `Cmd` pseudo-language supported, which is a simple generic +## shell/cmdline tokenizer (UNIX shell/Powershell/Windows Command): +## no escaping, no programming language constructs besides variable definition +## at the beginning of line. It supports these operators: +## ```Cmd +## & && | || ( ) '' "" ; # for comments +## ``` +## +## Instead of escaping always use quotes like here +## `nimgrep --ext:'nim|nims' file.name`:cmd: shows how to input ``|``. +## Any argument that contains ``.`` or ``/`` or ``\`` will be treated +## as a file or directory. ## +## In addition to `Cmd` there is also `Console` language for +## displaying interactive sessions. +## Lines with a command should start with ``$``, other lines are considered +## as program output. import - strutils -from algorithm import binarySearch + std/strutils +from std/algorithm import binarySearch + +when defined(nimPreviewSlimSystem): + import std/[assertions, syncio] + type + SourceLanguage* = enum + langNone, langNim, langCpp, langCsharp, langC, langJava, + langYaml, langPython, langCmd, langConsole TokenClass* = enum gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber, gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit, @@ -52,28 +75,31 @@ type gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression, gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler, gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel, - gtReference, gtOther + gtReference, gtPrompt, gtProgramOutput, gtProgram, gtOption, gtOther GeneralTokenizer* = object of RootObj kind*: TokenClass start*, length*: int buf: cstring pos: int state: TokenClass - - SourceLanguage* = enum - langNone, langNim, langCpp, langCsharp, langC, langJava, - langYaml + lang: SourceLanguage const sourceLanguageToStr*: array[SourceLanguage, string] = ["none", - "Nim", "C++", "C#", "C", "Java", "Yaml"] + "Nim", "C++", "C#", "C", "Java", "Yaml", "Python", "Cmd", "Console"] + sourceLanguageToAlpha*: array[SourceLanguage, string] = ["none", + "Nim", "cpp", "csharp", "C", "Java", "Yaml", "Python", "Cmd", "Console"] + ## list of languages spelled with alpabetic characters tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace", "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber", "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit", "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment", "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData", "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink", - "Label", "Reference", "Other"] + "Label", "Reference", "Prompt", "ProgramOutput", + # start from lower-case if there is a corresponding RST role (see rst.nim) + "program", "option", + "Other"] # The following list comes from doc/keywords.txt, make sure it is # synchronized with this array by running the module itself as a test case. @@ -90,9 +116,11 @@ const "xor", "yield"] proc getSourceLanguage*(name: string): SourceLanguage = - for i in countup(succ(low(SourceLanguage)), high(SourceLanguage)): + for i in succ(low(SourceLanguage)) .. high(SourceLanguage): if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0: return i + if cmpIgnoreStyle(name, sourceLanguageToAlpha[i]) == 0: + return i result = langNone proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) = @@ -101,9 +129,8 @@ proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) = g.start = 0 g.length = 0 g.state = low(TokenClass) - var pos = 0 # skip initial whitespace: - while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos) - g.pos = pos + g.lang = low(SourceLanguage) + g.pos = 0 proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: string) = initGeneralTokenizer(g, cstring(buf)) @@ -161,7 +188,10 @@ const OpChars = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', '|', '=', '%', '&', '$', '@', '~', ':'} -proc nimNextToken(g: var GeneralTokenizer) = +proc isKeyword(x: openArray[string], y: string): int = + binarySearch(x, y) + +proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) = const hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'} octChars = {'0'..'7', '_'} @@ -170,36 +200,38 @@ proc nimNextToken(g: var GeneralTokenizer) = var pos = g.pos g.start = g.pos if g.state == gtStringLit: - g.kind = gtStringLit - while true: + if g.buf[pos] == '\\': + g.kind = gtEscapeSequence + inc(pos) case g.buf[pos] - of '\\': - g.kind = gtEscapeSequence + of 'x', 'X': inc(pos) + if g.buf[pos] in hexChars: inc(pos) + if g.buf[pos] in hexChars: inc(pos) + of '0'..'9': + while g.buf[pos] in {'0'..'9'}: inc(pos) + of '\0': + g.state = gtNone + else: inc(pos) + else: + g.kind = gtStringLit + while true: case g.buf[pos] - of 'x', 'X': + of '\\': + break + of '\0', '\r', '\n': + g.state = gtNone + break + of '\"': inc(pos) - if g.buf[pos] in hexChars: inc(pos) - if g.buf[pos] in hexChars: inc(pos) - of '0'..'9': - while g.buf[pos] in {'0'..'9'}: inc(pos) - of '\0': g.state = gtNone + break else: inc(pos) - break - of '\0', '\x0D', '\x0A': - g.state = gtNone - break - of '\"': - inc(pos) - g.state = gtNone - break - else: inc(pos) else: case g.buf[pos] - of ' ', '\x09'..'\x0D': + of ' ', '\t'..'\r': g.kind = gtWhitespace - while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos) + while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos) of '#': g.kind = gtComment inc(pos) @@ -207,7 +239,7 @@ proc nimNextToken(g: var GeneralTokenizer) = if g.buf[pos] == '#': inc(pos) isDoc = true - if g.buf[pos] == '[': + if g.buf[pos] == '[' and g.lang == langNim: g.kind = gtLongComment var nesting = 0 while true: @@ -236,7 +268,7 @@ proc nimNextToken(g: var GeneralTokenizer) = else: inc pos else: - while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) + while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos) of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF': var id = "" while g.buf[pos] in SymChars + {'_'}: @@ -260,12 +292,15 @@ proc nimNextToken(g: var GeneralTokenizer) = else: g.kind = gtRawData inc(pos) - while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): + while not (g.buf[pos] in {'\0', '\n', '\r'}): if g.buf[pos] == '"' and g.buf[pos+1] != '"': break inc(pos) if g.buf[pos] == '\"': inc(pos) else: - g.kind = nimGetKeyword(id) + if g.lang == langNim: + g.kind = nimGetKeyword(id) + elif isKeyword(keywords, id) >= 0: + g.kind = gtKeyword of '0': inc(pos) case g.buf[pos] @@ -289,17 +324,18 @@ proc nimNextToken(g: var GeneralTokenizer) = pos = nimNumber(g, pos) of '\'': inc(pos) - g.kind = gtCharLit - while true: - case g.buf[pos] - of '\0', '\x0D', '\x0A': - break - of '\'': - inc(pos) - break - of '\\': - inc(pos, 2) - else: inc(pos) + if g.kind != gtPunctuation: + g.kind = gtCharLit + while true: + case g.buf[pos] + of '\0', '\r', '\n': + break + of '\'': + inc(pos) + break + of '\\': + inc(pos, 2) + else: inc(pos) of '\"': inc(pos) if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'): @@ -320,7 +356,7 @@ proc nimNextToken(g: var GeneralTokenizer) = g.kind = gtStringLit while true: case g.buf[pos] - of '\0', '\x0D', '\x0A': + of '\0', '\r', '\n': break of '\"': inc(pos) @@ -394,12 +430,6 @@ proc generalStrLit(g: var GeneralTokenizer, position: int): int = inc(pos) result = pos -proc isKeyword(x: openArray[string], y: string): int = - binarySearch(x, y) - -proc isKeywordIgnoreCase(x: openArray[string], y: string): int = - binarySearch(x, y, cmpIgnoreCase) - type TokenizerFlag = enum hasPreprocessor, hasNestedComments @@ -432,7 +462,7 @@ proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string], g.state = gtNone else: inc(pos) break - of '\0', '\x0D', '\x0A': + of '\0', '\r', '\n': g.state = gtNone break of '\"': @@ -442,14 +472,14 @@ proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string], else: inc(pos) else: case g.buf[pos] - of ' ', '\x09'..'\x0D': + of ' ', '\t'..'\r': g.kind = gtWhitespace - while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos) + while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos) of '/': inc(pos) if g.buf[pos] == '/': g.kind = gtComment - while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): inc(pos) + while not (g.buf[pos] in {'\0', '\n', '\r'}): inc(pos) elif g.buf[pos] == '*': g.kind = gtLongComment var nested = 0 @@ -469,6 +499,9 @@ proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string], of '\0': break else: inc(pos) + else: + g.kind = gtOperator + while g.buf[pos] in OpChars: inc(pos) of '#': inc(pos) if hasPreprocessor in flags: @@ -589,9 +622,9 @@ proc javaNextToken(g: var GeneralTokenizer) = proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) = g.kind = gtStringLit - while g.buf[pos] notin {'\0', '\x09'..'\x0D', ',', ']', '}'}: + while g.buf[pos] notin {'\0', '\t'..'\r', ',', ']', '}'}: if g.buf[pos] == ':' and - g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}: + g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}: break inc(pos) @@ -604,14 +637,14 @@ proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) = while g.buf[pos] in {'0'..'9'}: inc(pos) else: yamlPlainStrLit(g, pos) if g.kind == gtNone: - if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}: + if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}: g.kind = gtDecNumber elif g.buf[pos] == '.': inc(pos) if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos) else: while g.buf[pos] in {'0'..'9'}: inc(pos) - if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}: + if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}: g.kind = gtFloatNumber if g.kind == gtNone: if g.buf[pos] in {'e', 'E'}: @@ -620,13 +653,13 @@ proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) = if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos) else: while g.buf[pos] in {'0'..'9'}: inc(pos) - if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}: + if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}: g.kind = gtFloatNumber else: yamlPlainStrLit(g, pos) else: yamlPlainStrLit(g, pos) - while g.buf[pos] notin {'\0', ',', ']', '}', '\x0A', '\x0D'}: + while g.buf[pos] notin {'\0', ',', ']', '}', '\n', '\r'}: inc(pos) - if g.buf[pos] notin {'\x09'..'\x0D', ' ', ',', ']', '}'}: + if g.buf[pos] notin {'\t'..'\r', ' ', ',', ']', '}'}: yamlPlainStrLit(g, pos) break # theoretically, we would need to parse indentation (like with block scalars) @@ -651,19 +684,16 @@ proc yamlNextToken(g: var GeneralTokenizer) = of 'x': inc(pos) for i in 1..2: - {.unroll.} if g.buf[pos] in hexChars: inc(pos) break of 'u': inc(pos) for i in 1..4: - {.unroll.} if g.buf[pos] in hexChars: inc(pos) break of 'U': inc(pos) for i in 1..8: - {.unroll.} if g.buf[pos] in hexChars: inc(pos) break else: inc(pos) @@ -698,13 +728,13 @@ proc yamlNextToken(g: var GeneralTokenizer) = while g.buf[pos] in {' ', '\t'}: inc(pos) of '#': g.kind = gtComment - while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) - of '\x0A', '\x0D': discard + while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos) + of '\n', '\r': discard else: # illegal here. just don't parse a block scalar g.kind = gtNone g.state = gtOther - if g.buf[pos] in {'\x0A', '\x0D'} and g.state == gtCommand: + if g.buf[pos] in {'\n', '\r'} and g.state == gtCommand: g.state = gtLongStringLit elif g.state == gtLongStringLit: # beware, this is the only token where we actually have to parse @@ -713,10 +743,10 @@ proc yamlNextToken(g: var GeneralTokenizer) = g.kind = gtLongStringLit # first, we have to find the parent indentation of the block scalar, so that # we know when to stop - assert g.buf[pos] in {'\x0A', '\x0D'} + assert g.buf[pos] in {'\n', '\r'} var lookbehind = pos - 1 var headerStart = -1 - while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}: + while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}: if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}: headerStart = lookbehind dec(lookbehind) @@ -727,12 +757,12 @@ proc yamlNextToken(g: var GeneralTokenizer) = # when the header is alone in a line, this line does not show the parent's # indentation, so we must go further. search the first previous line with # non-whitespace content. - while lookbehind >= 0 and g.buf[lookbehind] in {'\x0A', '\x0D'}: + while lookbehind >= 0 and g.buf[lookbehind] in {'\n', '\r'}: dec(lookbehind) while lookbehind >= 0 and g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind) # now, find the beginning of the line... - while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}: + while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}: dec(lookbehind) # ... and its indentation indentation = 1 @@ -740,7 +770,7 @@ proc yamlNextToken(g: var GeneralTokenizer) = if lookbehind == -1: indentation = 0 # top level elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and g.buf[lookbehind + 3] == '-' and - g.buf[lookbehind + 4] in {'\x09'..'\x0D', ' '}: + g.buf[lookbehind + 4] in {'\t'..'\r', ' '}: # this is a document start, therefore, we are at top level indentation = 0 # because lookbehind was at newline char when calculating indentation, we're @@ -748,7 +778,7 @@ proc yamlNextToken(g: var GeneralTokenizer) = let parentIndentation = indentation - 1 # find first content - while g.buf[pos] in {' ', '\x0A', '\x0D'}: + while g.buf[pos] in {' ', '\n', '\r'}: if g.buf[pos] == ' ': inc(indentation) else: indentation = 0 inc(pos) @@ -765,12 +795,12 @@ proc yamlNextToken(g: var GeneralTokenizer) = if (indentation < minIndentation and g.buf[pos] == '#') or (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and g.buf[pos + 2] == '.' and - g.buf[pos + 3] in {'\0', '\x09'..'\x0D', ' '}): + g.buf[pos + 3] in {'\0', '\t'..'\r', ' '}): # comment after end of block scalar, or end of document break minIndentation = min(indentation, minIndentation) - while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) - while g.buf[pos] in {' ', '\x0A', '\x0D'}: + while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos) + while g.buf[pos] in {' ', '\n', '\r'}: if g.buf[pos] == ' ': inc(indentation) else: indentation = 0 inc(pos) @@ -779,30 +809,29 @@ proc yamlNextToken(g: var GeneralTokenizer) = elif g.state == gtOther: # gtOther means 'inside YAML document' case g.buf[pos] - of ' ', '\x09'..'\x0D': + of ' ', '\t'..'\r': g.kind = gtWhitespace - while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos) + while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos) of '#': g.kind = gtComment inc(pos) - while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) + while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos) of '-': inc(pos) - if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}: + if g.buf[pos] in {'\0', ' ', '\t'..'\r'}: g.kind = gtPunctuation elif g.buf[pos] == '-' and - (pos == 1 or g.buf[pos - 2] in {'\x0A', '\x0D'}): # start of line + (pos == 1 or g.buf[pos - 2] in {'\n', '\r'}): # start of line inc(pos) - if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}: + if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}: inc(pos) g.kind = gtKeyword else: yamlPossibleNumber(g, pos) else: yamlPossibleNumber(g, pos) of '.': - if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}: + if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}: inc(pos) for i in 1..2: - {.unroll.} if g.buf[pos] != '.': break inc(pos) if pos == g.start + 3: @@ -812,12 +841,12 @@ proc yamlNextToken(g: var GeneralTokenizer) = else: yamlPlainStrLit(g, pos) of '?': inc(pos) - if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}: + if g.buf[pos] in {'\0', ' ', '\t'..'\r'}: g.kind = gtPunctuation else: yamlPlainStrLit(g, pos) of ':': inc(pos) - if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', '\'', '\"'} or + if g.buf[pos] in {'\0', '\t'..'\r', ' ', '\'', '\"'} or (pos > 0 and g.buf[pos - 2] in {'}', ']', '\"', '\''}): g.kind = gtPunctuation else: yamlPlainStrLit(g, pos) @@ -836,7 +865,7 @@ proc yamlNextToken(g: var GeneralTokenizer) = inc(pos) if g.buf[pos] == '<': # literal tag (e.g. `!<tag:yaml.org,2002:str>`) - while g.buf[pos] notin {'\0', '>', '\x09'..'\x0D', ' '}: inc(pos) + while g.buf[pos] notin {'\0', '>', '\t'..'\r', ' '}: inc(pos) if g.buf[pos] == '>': inc(pos) else: while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos) @@ -845,17 +874,17 @@ proc yamlNextToken(g: var GeneralTokenizer) = # prefixed tag (e.g. `!!str`) inc(pos) while g.buf[pos] notin - {'\0', '\x09'..'\x0D', ' ', ',', '[', ']', '{', '}'}: inc(pos) - of '\0', '\x09'..'\x0D', ' ': discard + {'\0', '\t'..'\r', ' ', ',', '[', ']', '{', '}'}: inc(pos) + of '\0', '\t'..'\r', ' ': discard else: # local tag (e.g. `!nim:system:int`) - while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos) + while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos) of '&': g.kind = gtLabel - while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos) + while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos) of '*': g.kind = gtReference - while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos) + while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos) of '|', '>': # this can lead to incorrect tokenization when | or > appear inside flow # content. checking whether we're inside flow content is not @@ -871,18 +900,18 @@ proc yamlNextToken(g: var GeneralTokenizer) = # outside document case g.buf[pos] of '%': - if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}: + if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}: g.kind = gtDirective - while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) + while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos) else: g.state = gtOther yamlPlainStrLit(g, pos) - of ' ', '\x09'..'\x0D': + of ' ', '\t'..'\r': g.kind = gtWhitespace - while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos) + while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos) of '#': g.kind = gtComment - while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) + while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos) of '\0': g.kind = gtEof else: g.kind = gtNone @@ -890,7 +919,86 @@ proc yamlNextToken(g: var GeneralTokenizer) = g.length = pos - g.pos g.pos = pos +proc pythonNextToken(g: var GeneralTokenizer) = + const + keywords: array[0..34, string] = [ + "False", "None", "True", "and", "as", "assert", "async", "await", + "break", "class", "continue", "def", "del", "elif", "else", "except", + "finally", "for", "from", "global", "if", "import", "in", "is", "lambda", + "nonlocal", "not", "or", "pass", "raise", "return", "try", "while", + "with", "yield"] + nimNextToken(g, keywords) + +proc cmdNextToken(g: var GeneralTokenizer, dollarPrompt = false) = + var pos = g.pos + g.start = g.pos + if g.state == low(TokenClass): + g.state = if dollarPrompt: gtPrompt else: gtProgram + case g.buf[pos] + of ' ', '\t'..'\r': + g.kind = gtWhitespace + while g.buf[pos] in {' ', '\t'..'\r'}: + if g.buf[pos] == '\n': + g.state = if dollarPrompt: gtPrompt else: gtProgram + inc(pos) + of '\'', '"': + g.kind = gtOption + let q = g.buf[pos] + inc(pos) + while g.buf[pos] notin {q, '\0'}: + inc(pos) + if g.buf[pos] == q: inc(pos) + of '#': + g.kind = gtComment + while g.buf[pos] notin {'\n', '\0'}: + inc(pos) + of '&', '|': + g.kind = gtOperator + inc(pos) + if g.buf[pos] == g.buf[pos-1]: inc(pos) + g.state = gtProgram + of '(': + g.kind = gtOperator + g.state = gtProgram + inc(pos) + of ')': + g.kind = gtOperator + inc(pos) + of ';': + g.state = gtProgram + g.kind = gtOperator + inc(pos) + of '\0': g.kind = gtEof + elif dollarPrompt and g.state == gtPrompt: + if g.buf[pos] == '$' and g.buf[pos+1] in {' ', '\t'}: + g.kind = gtPrompt + inc pos, 2 + g.state = gtProgram + else: + g.kind = gtProgramOutput + while g.buf[pos] notin {'\n', '\0'}: + inc(pos) + else: + if g.state == gtProgram: + g.kind = gtProgram + g.state = gtOption + else: + g.kind = gtOption + while g.buf[pos] notin {' ', '\t'..'\r', '&', '|', '(', ')', '\'', '"', '\0'}: + if g.buf[pos] == ';' and g.buf[pos+1] == ' ': + # (check space because ';' can be used inside arguments in Win bat) + break + if g.kind == gtOption and g.buf[pos] in {'/', '\\', '.'}: + g.kind = gtIdentifier # for file/dir name + elif g.kind == gtProgram and g.buf[pos] == '=': + g.kind = gtIdentifier # for env variable setting at beginning of line + g.state = gtProgram + inc(pos) + g.length = pos - g.pos + g.pos = pos + proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) = + g.lang = lang case lang of langNone: assert false of langNim: nimNextToken(g) @@ -899,13 +1007,28 @@ proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) = of langC: cNextToken(g) of langJava: javaNextToken(g) of langYaml: yamlNextToken(g) + of langPython: pythonNextToken(g) + of langCmd: cmdNextToken(g) + of langConsole: cmdNextToken(g, dollarPrompt=true) + +proc tokenize*(text: string, lang: SourceLanguage): seq[(string, TokenClass)] = + var g: GeneralTokenizer + initGeneralTokenizer(g, text) + var prevPos = 0 + while true: + getNextToken(g, lang) + if g.kind == gtEof: + break + var s = text[prevPos ..< g.pos] + result.add (s, g.kind) + prevPos = g.pos when isMainModule: var keywords: seq[string] # Try to work running in both the subdir or at the root. for filename in ["doc/keywords.txt", "../../../doc/keywords.txt"]: try: - let input = string(readFile(filename)) + let input = readFile(filename) keywords = input.splitWhitespace() break except: |