diff options
author | Andreas Rumpf <rumpf_a@web.de> | 2016-05-12 14:59:00 +0200 |
---|---|---|
committer | Andreas Rumpf <rumpf_a@web.de> | 2016-05-12 14:59:00 +0200 |
commit | 4b1348402504f9b874def5f94638ded2a12f2965 (patch) | |
tree | d8f0aaba20a528c3b9fd61fc22292dfd900ee558 | |
parent | 81ebb969220377221de524253fff2df0d0807a6c (diff) | |
parent | 6fe916fc77c717700dd47451c498e5c99928ba63 (diff) | |
download | Nim-4b1348402504f9b874def5f94638ded2a12f2965.tar.gz |
Merge pull request #4025 from flyx/highlight-yaml
YAML highlighting support for doctools/highlite
-rw-r--r-- | lib/packages/docutils/highlite.nim | 309 | ||||
-rw-r--r-- | tests/stdlib/trstgen.nim | 139 |
2 files changed, 446 insertions, 2 deletions
diff --git a/lib/packages/docutils/highlite.nim b/lib/packages/docutils/highlite.nim index 1bc0af1b6..9de25f82b 100644 --- a/lib/packages/docutils/highlite.nim +++ b/lib/packages/docutils/highlite.nim @@ -31,13 +31,14 @@ type state: TokenClass SourceLanguage* = enum - langNone, langNim, langNimrod, langCpp, langCsharp, langC, langJava + langNone, langNim, langNimrod, langCpp, langCsharp, langC, langJava, + langYaml {.deprecated: [TSourceLanguage: SourceLanguage, TTokenClass: TokenClass, TGeneralTokenizer: GeneralTokenizer].} const sourceLanguageToStr*: array[SourceLanguage, string] = ["none", - "Nim", "Nimrod", "C++", "C#", "C", "Java"] + "Nim", "Nimrod", "C++", "C#", "C", "Java", "Yaml"] tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace", "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber", "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit", @@ -578,6 +579,309 @@ proc javaNextToken(g: var GeneralTokenizer) = "try", "void", "volatile", "while"] clikeNextToken(g, keywords, {}) +proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) = + g.kind = gtStringLit + while g.buf[pos] notin {'\0', '\x09'..'\x0D', ',', ']', '}'}: + if g.buf[pos] == ':' and + g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}: + break + inc(pos) + +proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) = + g.kind = gtNone + if g.buf[pos] == '-': inc(pos) + if g.buf[pos] == '0': inc(pos) + elif g.buf[pos] in '1'..'9': + inc(pos) + while g.buf[pos] in {'0'..'9'}: inc(pos) + else: yamlPlainStrLit(g, pos) + if g.kind == gtNone: + if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}: + g.kind = gtDecNumber + elif g.buf[pos] == '.': + inc(pos) + if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos) + else: + while g.buf[pos] in {'0'..'9'}: inc(pos) + if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}: + g.kind = gtFloatNumber + if g.kind == gtNone: + if g.buf[pos] in {'e', 'E'}: + inc(pos) + if g.buf[pos] in {'-', '+'}: inc(pos) + if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos) + else: + while g.buf[pos] in {'0'..'9'}: inc(pos) + if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}: + g.kind = gtFloatNumber + else: yamlPlainStrLit(g, pos) + else: yamlPlainStrLit(g, pos) + while g.buf[pos] notin {'\0', ',', ']', '}', '\x0A', '\x0D'}: + inc(pos) + if g.buf[pos] notin {'\x09'..'\x0D', ' ', ',', ']', '}'}: + yamlPlainStrLit(g, pos) + break + # theoretically, we would need to parse indentation (like with block scalars) + # because of possible multiline flow scalars that start with number-like + # content, but that is far too troublesome. I think it is fine that the + # highlighter is sloppy here. + +proc yamlNextToken(g: var GeneralTokenizer) = + const + hexChars = {'0'..'9', 'A'..'F', 'a'..'f'} + var pos = g.pos + g.start = g.pos + if g.state == gtStringLit: + g.kind = gtStringLit + while true: + case g.buf[pos] + of '\\': + if pos != g.pos: break + g.kind = gtEscapeSequence + inc(pos) + case g.buf[pos] + of 'x': + inc(pos) + for i in 1..2: + {.unroll.} + if g.buf[pos] in hexChars: inc(pos) + break + of 'u': + inc(pos) + for i in 1..4: + {.unroll.} + if g.buf[pos] in hexChars: inc(pos) + break + of 'U': + inc(pos) + for i in 1..8: + {.unroll.} + if g.buf[pos] in hexChars: inc(pos) + break + else: inc(pos) + break + of '\0': + g.state = gtOther + break + of '\"': + inc(pos) + g.state = gtOther + break + else: inc(pos) + elif g.state == gtCharLit: + # abusing gtCharLit as single-quoted string lit + g.kind = gtStringLit + inc(pos) # skip the starting ' + while true: + case g.buf[pos] + of '\'': + inc(pos) + if g.buf[pos] == '\'': + inc(pos) + g.kind = gtEscapeSequence + else: g.state = gtOther + break + else: inc(pos) + elif g.state == gtCommand: + # gtCommand means 'block scalar header' + case g.buf[pos] + of ' ', '\t': + g.kind = gtWhitespace + while g.buf[pos] in {' ', '\t'}: inc(pos) + of '#': + g.kind = gtComment + while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) + of '\x0A', '\x0D': discard + else: + # illegal here. just don't parse a block scalar + g.kind = gtNone + g.state = gtOther + if g.buf[pos] in {'\x0A', '\x0D'} and g.state == gtCommand: + g.state = gtLongStringLit + elif g.state == gtLongStringLit: + # beware, this is the only token where we actually have to parse + # indentation. + + g.kind = gtLongStringLit + # first, we have to find the parent indentation of the block scalar, so that + # we know when to stop + assert g.buf[pos] in {'\x0A', '\x0D'} + var lookbehind = pos - 1 + var headerStart = -1 + while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}: + if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}: + headerStart = lookbehind + dec(lookbehind) + assert headerStart != -1 + var indentation = 1 + while g.buf[lookbehind + indentation] == ' ': inc(indentation) + if g.buf[lookbehind + indentation] in {'|', '>'}: + # when the header is alone in a line, this line does not show the parent's + # indentation, so we must go further. search the first previous line with + # non-whitespace content. + while lookbehind >= 0 and g.buf[lookbehind] in {'\x0A', '\x0D'}: + dec(lookbehind) + while lookbehind >= 0 and + g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind) + # now, find the beginning of the line... + while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}: + dec(lookbehind) + # ... and its indentation + indentation = 1 + while g.buf[lookbehind + indentation] == ' ': inc(indentation) + if lookbehind == -1: indentation = 0 # top level + elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and + g.buf[lookbehind + 3] == '-' and + g.buf[lookbehind + 4] in {'\x09'..'\x0D', ' '}: + # this is a document start, therefore, we are at top level + indentation = 0 + # because lookbehind was at newline char when calculating indentation, we're + # off by one. fix that. top level's parent will have indentation of -1. + let parentIndentation = indentation - 1 + + # find first content + while g.buf[pos] in {' ', '\x0A', '\x0D'}: + if g.buf[pos] == ' ': inc(indentation) + else: indentation = 0 + inc(pos) + var minIndentation = indentation + + # for stupid edge cases, we must check whether an explicit indentation depth + # is given at the header. + while g.buf[headerStart] in {'>', '|', '+', '-'}: inc(headerStart) + if g.buf[headerStart] in {'0'..'9'}: + minIndentation = min(minIndentation, ord(g.buf[headerStart]) - ord('0')) + + # process content lines + while indentation > parentIndentation and g.buf[pos] != '\0': + if (indentation < minIndentation and g.buf[pos] == '#') or + (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and + g.buf[pos + 2] == '.' and + g.buf[pos + 3] in {'\0', '\x09'..'\x0D', ' '}): + # comment after end of block scalar, or end of document + break + minIndentation = min(indentation, minIndentation) + while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) + while g.buf[pos] in {' ', '\x0A', '\x0D'}: + if g.buf[pos] == ' ': inc(indentation) + else: indentation = 0 + inc(pos) + + g.state = gtOther + elif g.state == gtOther: + # gtOther means 'inside YAML document' + case g.buf[pos] + of ' ', '\x09'..'\x0D': + g.kind = gtWhitespace + while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos) + of '#': + g.kind = gtComment + inc(pos) + while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) + of '-': + inc(pos) + if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}: + g.kind = gtPunctuation + elif g.buf[pos] == '-' and + (pos == 1 or g.buf[pos - 2] in {'\x0A', '\x0D'}): # start of line + inc(pos) + if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}: + inc(pos) + g.kind = gtKeyword + else: yamlPossibleNumber(g, pos) + else: yamlPossibleNumber(g, pos) + of '.': + if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}: + inc(pos) + for i in 1..2: + {.unroll.} + if g.buf[pos] != '.': break + inc(pos) + if pos == g.start + 3: + g.kind = gtKeyword + g.state = gtNone + else: yamlPlainStrLit(g, pos) + else: yamlPlainStrLit(g, pos) + of '?': + inc(pos) + if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}: + g.kind = gtPunctuation + else: yamlPlainStrLit(g, pos) + of ':': + inc(pos) + if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', '\'', '\"'} or + (pos > 0 and g.buf[pos - 2] in {'}', ']', '\"', '\''}): + g.kind = gtPunctuation + else: yamlPlainStrLit(g, pos) + of '[', ']', '{', '}', ',': + inc(pos) + g.kind = gtPunctuation + of '\"': + inc(pos) + g.state = gtStringLit + g.kind = gtStringLit + of '\'': + g.state = gtCharLit + g.kind = gtNone + of '!': + g.kind = gtTagStart + inc(pos) + if g.buf[pos] == '<': + # literal tag (e.g. `!<tag:yaml.org,2002:str>`) + while g.buf[pos] notin {'\0', '>', '\x09'..'\x0D', ' '}: inc(pos) + if g.buf[pos] == '>': inc(pos) + else: + while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos) + case g.buf[pos] + of '!': + # prefixed tag (e.g. `!!str`) + inc(pos) + while g.buf[pos] notin + {'\0', '\x09'..'\x0D', ' ', ',', '[', ']', '{', '}'}: inc(pos) + of '\0', '\x09'..'\x0D', ' ': discard + else: + # local tag (e.g. `!nim:system:int`) + while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos) + of '&': + g.kind = gtLabel + while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos) + of '*': + g.kind = gtReference + while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos) + of '|', '>': + # this can lead to incorrect tokenization when | or > appear inside flow + # content. checking whether we're inside flow content is not + # chomsky type-3, so we won't do that here. + g.kind = gtCommand + g.state = gtCommand + inc(pos) + while g.buf[pos] in {'0'..'9', '+', '-'}: inc(pos) + of '0'..'9': yamlPossibleNumber(g, pos) + of '\0': g.kind = gtEOF + else: yamlPlainStrLit(g, pos) + else: + # outside document + case g.buf[pos] + of '%': + if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}: + g.kind = gtDirective + while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) + else: + g.state = gtOther + yamlPlainStrLit(g, pos) + of ' ', '\x09'..'\x0D': + g.kind = gtWhitespace + while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos) + of '#': + g.kind = gtComment + while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos) + of '\0': g.kind = gtEOF + else: + g.kind = gtNone + g.state = gtOther + g.length = pos - g.pos + g.pos = pos + proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) = case lang of langNone: assert false @@ -586,6 +890,7 @@ proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) = of langCsharp: csharpNextToken(g) of langC: cNextToken(g) of langJava: javaNextToken(g) + of langYaml: yamlNextToken(g) when isMainModule: var keywords: seq[string] diff --git a/tests/stdlib/trstgen.nim b/tests/stdlib/trstgen.nim new file mode 100644 index 000000000..c702ccc2a --- /dev/null +++ b/tests/stdlib/trstgen.nim @@ -0,0 +1,139 @@ +# tests for rstgen module. + +import ../../lib/packages/docutils/rstgen +import unittest + +suite "YAML syntax highlighting": + test "Basics": + let input = """.. code-block:: yaml + %YAML 1.2 + --- + a string: string + a list: + - item 1 + - item 2 + a map: + ? key + : value + ...""" + let output = rstTohtml(input, {}, defaultConfig()) + assert output == """<pre class = "listing"><span class="Directive">%YAML 1.2</span> +<span class="Keyword">---</span> +<span class="StringLit">a string</span><span class="Punctuation">:</span> <span class="StringLit">string</span> +<span class="StringLit">a list</span><span class="Punctuation">:</span> + <span class="Punctuation">-</span> <span class="StringLit">item 1</span> + <span class="Punctuation">-</span> <span class="StringLit">item 2</span> +<span class="StringLit">a map</span><span class="Punctuation">:</span> +<span class="Punctuation">?</span> <span class="StringLit">key</span> +<span class="Punctuation">:</span> <span class="StringLit">value</span> +<span class="Keyword">...</span></pre>""" + + test "Block scalars": + let input = """.. code-block:: yaml + a literal block scalar: | + some text + # not a comment + # a comment, since less indented + # another comment + a folded block scalar: >2 + some text + # not a comment since indented as specified + # a comment + another literal block scalar: + |+ # comment after header + allowed, since more indented than parent""" + let output = rstToHtml(input, {}, defaultConfig()) + assert output == """<pre class = "listing"><span class="StringLit">a literal block scalar</span><span class="Punctuation">:</span> <span class="Command">|</span><span class="Command"></span><span class="LongStringLit"> + some text + # not a comment + </span><span class="Comment"># a comment, since less indented</span> + <span class="Comment"># another comment</span> +<span class="StringLit">a folded block scalar</span><span class="Punctuation">:</span> <span class="Command">>2</span><span class="Command"></span><span class="LongStringLit"> + some text + # not a comment since indented as specified + </span><span class="Comment"># a comment</span> +<span class="StringLit">another literal block scalar</span><span class="Punctuation">:</span> + <span class="Command">|+</span> <span class="Comment"># comment after header</span><span class="LongStringLit"> + allowed, since more indented than parent</span></pre>""" + + test "Directives": + let input = """.. code-block:: yaml + %YAML 1.2 + --- + %not a directive + ... + %a directive + ... + a string + % not a directive + ... + %TAG ! !foo:""" + let output = rstToHtml(input, {}, defaultConfig()) + assert output == """<pre class = "listing"><span class="Directive">%YAML 1.2</span> +<span class="Keyword">---</span> +<span class="StringLit">%not a directive</span> +<span class="Keyword">...</span> +<span class="Directive">%a directive</span> +<span class="Keyword">...</span> +<span class="StringLit">a string</span> +<span class="StringLit">% not a directive</span> +<span class="Keyword">...</span> +<span class="Directive">%TAG ! !foo:</span></pre>""" + + test "Flow Style and Numbers": + let input = """.. code-block:: yaml + { + "quoted string": 42, + 'single quoted string': false, + [ list, "with", 'entries' ]: 73.32e-73, + more numbers: [-783, 11e78], + not numbers: [ 42e, 0023, +32.37, 8 ball] + }""" + let output = rstToHtml(input, {}, defaultConfig()) + assert output == """<pre class = "listing"><span class="Punctuation">{</span> + <span class="StringLit">"</span><span class="StringLit">quoted string"</span><span class="Punctuation">:</span> <span class="DecNumber">42</span><span class="Punctuation">,</span> + <span class="StringLit">'single quoted string'</span><span class="Punctuation">:</span> <span class="StringLit">false</span><span class="Punctuation">,</span> + <span class="Punctuation">[</span> <span class="StringLit">list</span><span class="Punctuation">,</span> <span class="StringLit">"</span><span class="StringLit">with"</span><span class="Punctuation">,</span> <span class="StringLit">'entries'</span> <span class="Punctuation">]</span><span class="Punctuation">:</span> <span class="FloatNumber">73.32e-73</span><span class="Punctuation">,</span> + <span class="StringLit">more numbers</span><span class="Punctuation">:</span> <span class="Punctuation">[</span><span class="DecNumber">-783</span><span class="Punctuation">,</span> <span class="FloatNumber">11e78</span><span class="Punctuation">]</span><span class="Punctuation">,</span> + <span class="StringLit">not numbers</span><span class="Punctuation">:</span> <span class="Punctuation">[</span> <span class="StringLit">42e</span><span class="Punctuation">,</span> <span class="StringLit">0023</span><span class="Punctuation">,</span> <span class="StringLit">+32.37</span><span class="Punctuation">,</span> <span class="StringLit">8 ball</span><span class="Punctuation">]</span> +<span class="Punctuation">}</span></pre>""" + + test "Anchors, Aliases, Tags": + let input = """.. code-block:: yaml + --- !!map + !!str string: !<tag:yaml.org,2002:int> 42 + ? &anchor !!seq []: + : !localtag foo + alias: *anchor + """ + let output = rstToHtml(input, {}, defaultConfig()) + assert output == """<pre class = "listing"><span class="Keyword">---</span> <span class="TagStart">!!map</span> +<span class="TagStart">!!str</span> <span class="StringLit">string</span><span class="Punctuation">:</span> <span class="TagStart">!<tag:yaml.org,2002:int></span> <span class="DecNumber">42</span> +<span class="Punctuation">?</span> <span class="Label">&anchor</span> <span class="TagStart">!!seq</span> <span class="Punctuation">[</span><span class="Punctuation">]</span><span class="Punctuation">:</span> +<span class="Punctuation">:</span> <span class="TagStart">!localtag</span> <span class="StringLit">foo</span> +<span class="StringLit">alias</span><span class="Punctuation">:</span> <span class="Reference">*anchor</span></pre>""" + + test "Edge cases": + let input = """.. code-block:: yaml + ... + %a string: + a:string:not:a:map + ... + not a list: + -2 + -3 + -4 + example.com/not/a#comment: + ?not a map key + """ + let output = rstToHtml(input, {}, defaultConfig()) + assert output == """<pre class = "listing"><span class="Keyword">...</span> + <span class="StringLit">%a string</span><span class="Punctuation">:</span> + <span class="StringLit">a:string:not:a:map</span> +<span class="Keyword">...</span> +<span class="StringLit">not a list</span><span class="Punctuation">:</span> + <span class="DecNumber">-2</span> + <span class="DecNumber">-3</span> + <span class="DecNumber">-4</span> +<span class="StringLit">example.com/not/a#comment</span><span class="Punctuation">:</span> + <span class="StringLit">?not a map key</span></pre>""" \ No newline at end of file |