diff options
-rw-r--r-- | lib/packages/docutils/highlite.nim | 298 |
1 files changed, 296 insertions, 2 deletions
# YAML tokenizer added to lib/packages/docutils/highlite.nim by the diff
# 1bc0af1b6..488b85545. The other hunks of that diff touch definitions that
# are only partly visible here and are therefore not reproduced as code:
#   * enum `SourceLanguage` gains the member `langYaml`
#   * `sourceLanguageToStr` gains the entry "Yaml"
#   * `getNextToken` gains the branch `of langYaml: yamlNextToken(g)`

const
  yamlNewlines = {'\x0A', '\x0D'}
  yamlEol = {'\0', '\x0A', '\x0D'}              # line break or end of input
  yamlSpace = {' ', '\x09'..'\x0D'}
  yamlSpaceOrEnd = {'\0', ' ', '\x09'..'\x0D'}  # whitespace or end of input

proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) =
  ## Scans a plain (unquoted) scalar starting at `pos` and classifies the
  ## token as `gtStringLit`.
  ##
  ## `pos` is advanced until it rests on NUL, on a character in
  ## ``'\x09'..'\x0D'``, on one of ``,`` ``]`` ``}``, or on a ``:`` that is
  ## directly followed by whitespace or NUL. A plain space does not end the
  ## scalar.
  g.kind = gtStringLit
  while g.buf[pos] notin {'\0', '\x09'..'\x0D', ',', ']', '}'}:
    if g.buf[pos] == ':' and g.buf[pos + 1] in yamlSpaceOrEnd:
      break
    inc(pos)

proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) =
  ## Scans content that starts like a number. Sets `g.kind` to `gtDecNumber`
  ## or `gtFloatNumber` when a complete number is directly followed by
  ## whitespace or NUL; in every other case the scan is handed over to
  ## `yamlPlainStrLit`, which yields `gtStringLit`.
  ##
  ## NOTE(review): an exponent is only recognised after a fractional part
  ## (`1.0e5`, not `1e5`), and a number followed by `,`, `]` or `}` ends up as
  ## `gtStringLit` -- confirm whether that is intended.
  g.kind = gtNone   # gtNone means "still undecided" while scanning
  if g.buf[pos] == '-': inc(pos)
  if g.buf[pos] == '0':
    inc(pos)
  elif g.buf[pos] in '1'..'9':
    inc(pos)
    while g.buf[pos] in {'0'..'9'}: inc(pos)
  else: yamlPlainStrLit(g, pos)
  if g.kind == gtNone:
    if g.buf[pos] in yamlSpaceOrEnd: g.kind = gtDecNumber
    elif g.buf[pos] == '.':
      # fractional part
      inc(pos)
      if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
      else:
        while g.buf[pos] in {'0'..'9'}: inc(pos)
        if g.buf[pos] in yamlSpaceOrEnd: g.kind = gtFloatNumber
      if g.kind == gtNone:
        # digits after the dot, but the token goes on: exponent or plain text
        if g.buf[pos] in {'e', 'E'}:
          inc(pos)
          if g.buf[pos] in {'-', '+'}: inc(pos)
          if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
          else:
            while g.buf[pos] in {'0'..'9'}: inc(pos)
            if g.buf[pos] in yamlSpaceOrEnd: g.kind = gtFloatNumber
            else: yamlPlainStrLit(g, pos)
        else: yamlPlainStrLit(g, pos)
    else: yamlPlainStrLit(g, pos)

proc yamlNextToken(g: var GeneralTokenizer) =
  ## Reads the next YAML token from `g.buf` starting at `g.pos`; sets
  ## `g.kind`, `g.start` and `g.length` and advances `g.pos` past the token.
  ##
  ## `g.state` carries the context between calls:
  ## * `gtStringLit`: inside a double-quoted scalar
  ## * `gtCharLit`: inside a single-quoted scalar
  ## * `gtCommand`: inside a block scalar header
  ## * `gtLongStringLit`: at the line break before block scalar content
  ## * `gtOther`: inside a YAML document
  ## * anything else: outside a document (directives, comments, whitespace)
  ##
  ## Some calls produce a token of length zero whose only purpose is to
  ## switch `g.state`.
  const
    hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  var pos = g.pos
  g.start = g.pos
  if g.state == gtStringLit:
    g.kind = gtStringLit
    while true:
      case g.buf[pos]
      of '\\':
        # text in front of the escape is emitted as its own token first
        if pos != g.pos: break
        g.kind = gtEscapeSequence
        inc(pos)
        case g.buf[pos]
        of '\0':
          # backslash is the last character: stay on the terminator so that
          # the scan never leaves the buffer
          discard
        of 'x', 'u', 'U':
          # \xHH, \uHHHH, \UHHHHHHHH: consume at most that many hex digits
          let maxDigits =
            if g.buf[pos] == 'x': 2
            elif g.buf[pos] == 'u': 4
            else: 8
          inc(pos)
          for i in 1..maxDigits:
            if g.buf[pos] in hexChars: inc(pos)
        else: inc(pos)
        break
      of '\0':
        g.state = gtOther
        break
      of '\"':
        inc(pos)
        g.state = gtOther
        break
      else: inc(pos)
  elif g.state == gtCharLit:
    # abusing gtCharLit as single-quoted string lit
    g.kind = gtStringLit
    while true:
      case g.buf[pos]
      of '\0':
        # unterminated scalar: stop at the end of the input
        g.state = gtOther
        break
      of '\'':
        if g.buf[pos + 1] == '\'':
          # '' is an escaped quote; text in front of it is emitted as its own
          # token first, like for escapes in double-quoted scalars
          if pos != g.pos: break
          inc(pos, 2)
          g.kind = gtEscapeSequence
        else:
          # closing quote
          inc(pos)
          g.state = gtOther
        break
      else: inc(pos)
  elif g.state == gtCommand:
    # gtCommand means 'block scalar header'
    case g.buf[pos]
    of ' ', '\t':
      g.kind = gtWhitespace
      while g.buf[pos] in {' ', '\t'}: inc(pos)
    of '#':
      g.kind = gtComment
      while g.buf[pos] notin yamlEol: inc(pos)
    of '\x0A', '\x0D':
      # zero-length token; g.kind keeps the value of the previous token
      discard
    else:
      # illegal here. just don't parse a block scalar
      g.kind = gtNone
      g.state = gtOther
    if g.buf[pos] in yamlNewlines and g.state == gtCommand:
      # header line is finished, the content follows
      g.state = gtLongStringLit
  elif g.state == gtLongStringLit:
    # beware, this is the only token where we actually have to parse
    # indentation.

    g.kind = gtLongStringLit
    # first, we have to find the parent indentation of the block scalar, so
    # that we know when to stop
    assert g.buf[pos] in yamlNewlines
    var lookbehind = pos - 1
    var headerStart = -1
    # walk back to the line break in front of the header line (or to -1) and
    # remember the leftmost | or > on that line
    while lookbehind >= 0 and g.buf[lookbehind] notin yamlNewlines:
      if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}:
        headerStart = lookbehind
      dec(lookbehind)
    assert headerStart != -1
    var indentation = 1
    while g.buf[lookbehind + indentation] == ' ': inc(indentation)
    if g.buf[lookbehind + indentation] in {'|', '>'}:
      # when the header is alone in a line, this line does not show the
      # parent's indentation, so we must go further. search the first previous
      # line with non-whitespace content.
      while lookbehind >= 0 and g.buf[lookbehind] in yamlNewlines:
        dec(lookbehind)
        while lookbehind >= 0 and
            g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind)
      # now, find the beginning of the line...
      while lookbehind >= 0 and g.buf[lookbehind] notin yamlNewlines:
        dec(lookbehind)
      # ... and its indentation
      indentation = 1
      while g.buf[lookbehind + indentation] == ' ': inc(indentation)
    # NOTE(review): this also treats `key: |` on the very first line of the
    # buffer as top level, so such a scalar is not ended by a following line
    # at column 0 -- confirm whether that is intended.
    if lookbehind == -1: indentation = 0 # top level
    elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and
        g.buf[lookbehind + 3] == '-' and
        g.buf[lookbehind + 4] in yamlSpace:
      # this is a document start, therefore, we are at top level
      indentation = 0
    # because lookbehind was at newline char when calculating indentation,
    # we're off by one. fix that. top level's parent will have indentation
    # of -1.
    let parentIndentation = indentation - 1

    # find first content
    while g.buf[pos] in {' ', '\x0A', '\x0D'}:
      if g.buf[pos] == ' ': inc(indentation)
      else: indentation = 0
      inc(pos)
    var minIndentation = indentation

    # for stupid edge cases, we must check whether an explicit indentation
    # depth is given at the header.
    while g.buf[headerStart] in {'>', '|', '+', '-'}: inc(headerStart)
    if g.buf[headerStart] in {'0'..'9'}:
      minIndentation = min(minIndentation, ord(g.buf[headerStart]) - ord('0'))

    # process content lines
    while indentation > parentIndentation and g.buf[pos] != '\0':
      if (indentation < minIndentation and g.buf[pos] == '#') or
          (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and
          g.buf[pos + 2] == '.' and
          g.buf[pos + 3] in yamlSpaceOrEnd):
        # comment after end of block scalar, or end of document
        break
      minIndentation = min(indentation, minIndentation)
      # consume the rest of this line ...
      while g.buf[pos] notin yamlEol: inc(pos)
      # ... and measure the indentation of the next non-empty one
      while g.buf[pos] in {' ', '\x0A', '\x0D'}:
        if g.buf[pos] == ' ': inc(indentation)
        else: indentation = 0
        inc(pos)

    g.state = gtOther
  elif g.state == gtOther:
    # gtOther means 'inside YAML document'
    case g.buf[pos]
    of ' ', '\x09'..'\x0D':
      g.kind = gtWhitespace
      while g.buf[pos] in yamlSpace: inc(pos)
    of '#':
      g.kind = gtComment
      inc(pos)
      while g.buf[pos] notin yamlEol: inc(pos)
    of '-':
      inc(pos)
      if g.buf[pos] in yamlSpaceOrEnd:
        # sequence entry indicator
        g.kind = gtPunctuation
      elif g.buf[pos] == '-' and
          (pos == 1 or g.buf[pos - 2] in yamlNewlines): # start of line
        inc(pos)
        if g.buf[pos] == '-' and g.buf[pos + 1] in yamlSpaceOrEnd:
          # document start marker `---`
          inc(pos)
          g.kind = gtKeyword
        else: yamlPossibleNumber(g, pos)
      else: yamlPossibleNumber(g, pos)
    of '.':
      if pos == 0 or g.buf[pos - 1] in yamlNewlines:
        inc(pos)
        for i in 1..2:
          if g.buf[pos] != '.': break
          inc(pos)
        if pos == g.start + 3:
          # `...` at the start of a line: the following content is handled as
          # being outside of a document
          g.kind = gtKeyword
          g.state = gtNone
        else: yamlPlainStrLit(g, pos)
      else: yamlPlainStrLit(g, pos)
    of '?':
      inc(pos)
      if g.buf[pos] in yamlSpaceOrEnd:
        g.kind = gtPunctuation
      else: yamlPlainStrLit(g, pos)
    of ':':
      inc(pos)
      # pos - 2 is the character in front of the colon; it only exists when
      # the colon is not the first character of the buffer
      if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', '\'', '\"'} or
          (pos >= 2 and g.buf[pos - 2] in {'}', ']', '\"', '\''}):
        g.kind = gtPunctuation
      else: yamlPlainStrLit(g, pos)
    of '[', ']', '{', '}', ',':
      inc(pos)
      g.kind = gtPunctuation
    of '\"':
      # the opening quote is a token of its own; the content follows in
      # state gtStringLit
      inc(pos)
      g.state = gtStringLit
      g.kind = gtNone
    of '\'':
      # same as above for single-quoted scalars
      inc(pos)
      g.state = gtCharLit
      g.kind = gtNone
    of '!':
      g.kind = gtTagStart
      inc(pos)
      if g.buf[pos] == '<':
        # literal tag (e.g. `!<tag:yaml.org,2002:str>`)
        while g.buf[pos] notin {'\0', '>', '\x09'..'\x0D', ' '}: inc(pos)
        if g.buf[pos] == '>': inc(pos)
      else:
        while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos)
        case g.buf[pos]
        of '!':
          # prefixed tag (e.g. `!!str`)
          inc(pos)
          while g.buf[pos] notin
              {'\0', '\x09'..'\x0D', ' ', ',', '[', ']', '{', '}'}: inc(pos)
        of '\0', '\x09'..'\x0D', ' ': discard
        else:
          # local tag (e.g. `!nim:system:int`)
          while g.buf[pos] notin yamlSpaceOrEnd: inc(pos)
    of '&':
      g.kind = gtLabel
      while g.buf[pos] notin yamlSpaceOrEnd: inc(pos)
    of '*':
      g.kind = gtReference
      while g.buf[pos] notin yamlSpaceOrEnd: inc(pos)
    of '|', '>':
      # this can lead to incorrect tokenization when | or > appear inside flow
      # content. checking whether we're inside flow content is not
      # chomsky type-3, so we won't do that here.
      g.kind = gtCommand
      g.state = gtCommand
      inc(pos)
      while g.buf[pos] in {'0'..'9', '+', '-'}: inc(pos)
    of '0'..'9': yamlPossibleNumber(g, pos)
    of '\0': g.kind = gtEOF
    else: yamlPlainStrLit(g, pos)
  else:
    # outside document
    case g.buf[pos]
    of '%':
      if pos == 0 or g.buf[pos - 1] in yamlNewlines:
        # directive: extends to the end of the line
        g.kind = gtDirective
        while g.buf[pos] notin yamlEol: inc(pos)
      else:
        g.state = gtOther
        yamlPlainStrLit(g, pos)
    of ' ', '\x09'..'\x0D':
      g.kind = gtWhitespace
      while g.buf[pos] in yamlSpace: inc(pos)
    of '#':
      g.kind = gtComment
      while g.buf[pos] notin yamlEol: inc(pos)
    of '\0': g.kind = gtEOF
    else:
      # any other content starts a document: emit a zero-length token and
      # let the next call handle the character in state gtOther
      g.kind = gtNone
      g.state = gtOther
  g.length = pos - g.pos
  g.pos = pos