Merge pull request #4025 from flyx/highlight-yaml

YAML highlighting support for doctools/highlite
author: Andreas Rumpf <rumpf_a@web.de> 2016-05-12 14:59:00 +0200
committer: Andreas Rumpf <rumpf_a@web.de> 2016-05-12 14:59:00 +0200
commit: 4b1348402504f9b874def5f94638ded2a12f2965 (patch)
tree: d8f0aaba20a528c3b9fd61fc22292dfd900ee558
parent: 81ebb969220377221de524253fff2df0d0807a6c (diff)
parent: 6fe916fc77c717700dd47451c498e5c99928ba63 (diff)
download: Nim-4b1348402504f9b874def5f94638ded2a12f2965.tar.gz
2 files changed, 446 insertions, 2 deletions
diff --git a/lib/packages/docutils/highlite.nim b/lib/packages/docutils/highlite.nim
index 1bc0af1b6..9de25f82b 100644
--- a/lib/packages/docutils/highlite.nim
+++ b/lib/packages/docutils/highlite.nim
@@ -31,13 +31,14 @@ type
     state: TokenClass
 
   SourceLanguage* = enum
-    langNone, langNim, langNimrod, langCpp, langCsharp, langC, langJava
+    langNone, langNim, langNimrod, langCpp, langCsharp, langC, langJava,
+    langYaml
 {.deprecated: [TSourceLanguage: SourceLanguage, TTokenClass: TokenClass,
               TGeneralTokenizer: GeneralTokenizer].}
 
 const
   sourceLanguageToStr*: array[SourceLanguage, string] = ["none",
-    "Nim", "Nimrod", "C++", "C#", "C", "Java"]
+    "Nim", "Nimrod", "C++", "C#", "C", "Java", "Yaml"]
   tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace",
     "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber",
     "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit",
@@ -578,6 +579,309 @@ proc javaNextToken(g: var GeneralTokenizer) =
       "try", "void", "volatile", "while"]
   clikeNextToken(g, keywords, {})
 
+proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) =
+  ## Marks the token as gtStringLit and moves `pos` to the end of a plain
+  ## scalar: NUL, \x09..\x0D, `,`, `]`, `}` or a `:` followed by a blank.
+  g.kind = gtStringLit
+  while g.buf[pos] notin {'\0', '\x09'..'\x0D', ',', ']', '}'} and
+      not (g.buf[pos] == ':' and g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}):
+    inc(pos)
+
+proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) =
+  ## Scans a scalar that begins like a number. Sets `g.kind` to gtDecNumber
+  ## or gtFloatNumber, or to gtStringLit (via `yamlPlainStrLit`) as soon as
+  ## the text turns out not to be a number.
+  const numberEnd = {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}
+  g.kind = gtNone # gtNone means 'not decided yet' below
+  if g.buf[pos] == '-': inc(pos)
+  if g.buf[pos] == '0': inc(pos)
+  elif g.buf[pos] in '1'..'9':
+    inc(pos)
+    while g.buf[pos] in {'0'..'9'}: inc(pos)
+  else: yamlPlainStrLit(g, pos)
+  if g.kind == gtNone:
+    if g.buf[pos] in numberEnd: g.kind = gtDecNumber
+    elif g.buf[pos] == '.':
+      inc(pos)
+      if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
+      else:
+        while g.buf[pos] in {'0'..'9'}: inc(pos)
+        if g.buf[pos] in numberEnd: g.kind = gtFloatNumber
+    if g.kind == gtNone:
+      if g.buf[pos] in {'e', 'E'}:
+        inc(pos)
+        if g.buf[pos] in {'-', '+'}: inc(pos)
+        if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
+        else:
+          while g.buf[pos] in {'0'..'9'}: inc(pos)
+          if g.buf[pos] in numberEnd: g.kind = gtFloatNumber
+          else: yamlPlainStrLit(g, pos)
+      else: yamlPlainStrLit(g, pos)
+  # A number may only be followed by blanks up to the end of the line or a
+  # flow indicator; anything else (e.g. `8 ball`) makes it a plain scalar.
+  # Only number candidates are checked, so a plain scalar that stopped at
+  # `: ` keeps its end. Multiline flow scalars are deliberately ignored.
+  while g.kind != gtStringLit and
+      g.buf[pos] notin {'\0', ',', ']', '}', '\x0A', '\x0D'}:
+    inc(pos)
+    if g.buf[pos] notin numberEnd: yamlPlainStrLit(g, pos)
+
+proc yamlNextToken(g: var GeneralTokenizer) =
+  ## Scans the next YAML token starting at `g.pos` and stores its class in
+  ## `g.kind` and its extent in `g.start` / `g.length`; `g.pos` is advanced.
+  ## `g.state` carries the lexer mode between calls: gtStringLit (inside a
+  ## double-quoted string), gtCharLit (inside a single-quoted string),
+  ## gtCommand (block scalar header), gtLongStringLit (block scalar content),
+  ## gtOther (inside a document); every other value means 'outside document'.
+  ## Tokens may be empty (length 0) when a call only switches the mode.
+  const
+    hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
+  var pos = g.pos
+  g.start = g.pos
+  if g.state == gtStringLit:
+    g.kind = gtStringLit
+    while true:
+      case g.buf[pos]
+      of '\\':
+        if pos != g.pos: break # emit the text before the escape first
+        g.kind = gtEscapeSequence
+        inc(pos)
+        case g.buf[pos]
+        of 'x', 'u', 'U':
+          # \xHH, \uHHHH and \UHHHHHHHH: accept up to 2, 4 or 8 hex digits
+          let escKind = g.buf[pos]
+          let maxDigits = if escKind == 'x': 2 elif escKind == 'u': 4 else: 8
+          inc(pos)
+          for i in 1..maxDigits:
+            if g.buf[pos] in hexChars: inc(pos)
+        of '\0': discard # dangling backslash: do not step past the end
+        else: inc(pos)
+        break
+      of '\0':
+        g.state = gtOther
+        break
+      of '\"':
+        inc(pos)
+        g.state = gtOther
+        break
+      else: inc(pos)
+  elif g.state == gtCharLit:
+    # abusing gtCharLit as single-quoted string lit. The opening ' is still
+    # unconsumed only on first entry (previous kind gtNone), not after a ''.
+    if g.kind == gtNone: inc(pos)
+    g.kind = gtStringLit
+    while g.buf[pos] notin {'\0', '\''}: inc(pos)
+    if g.buf[pos] == '\'':
+      inc(pos)
+      if g.buf[pos] == '\'':
+        inc(pos)
+        g.kind = gtEscapeSequence # '' is an escaped quote; string goes on
+      else: g.state = gtOther
+    else: g.state = gtOther # unterminated string: stop at end of input
+  elif g.state == gtCommand:
+    # gtCommand means 'block scalar header'
+    case g.buf[pos]
+    of ' ', '\t':
+      g.kind = gtWhitespace
+      while g.buf[pos] in {' ', '\t'}: inc(pos)
+    of '#':
+      g.kind = gtComment
+      while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
+    of '\x0A', '\x0D': discard
+    else:
+      # illegal here. just don't parse a block scalar
+      g.kind = gtNone
+      g.state = gtOther
+    if g.buf[pos] in {'\x0A', '\x0D'} and g.state == gtCommand:
+      g.state = gtLongStringLit
+  elif g.state == gtLongStringLit:
+    # beware, this is the only token where we actually have to parse
+    # indentation.
+
+    g.kind = gtLongStringLit
+    # first, we have to find the parent indentation of the block scalar, so that
+    # we know when to stop
+    assert g.buf[pos] in {'\x0A', '\x0D'}
+    var lookbehind = pos - 1
+    var headerStart = -1
+    while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}:
+      if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}:
+        headerStart = lookbehind
+      dec(lookbehind)
+    assert headerStart != -1
+    var indentation = 1
+    while g.buf[lookbehind + indentation] == ' ': inc(indentation)
+    if g.buf[lookbehind + indentation] in {'|', '>'}:
+      # when the header is alone in a line, this line does not show the parent's
+      # indentation, so we must go further. search the first previous line with
+      # non-whitespace content.
+      while lookbehind >= 0 and g.buf[lookbehind] in {'\x0A', '\x0D'}:
+        dec(lookbehind)
+        while lookbehind >= 0 and
+            g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind)
+      # now, find the beginning of the line...
+      while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}:
+        dec(lookbehind)
+      # ... and its indentation
+      indentation = 1
+      while g.buf[lookbehind + indentation] == ' ': inc(indentation)
+    if lookbehind == -1: indentation = 0 # top level
+    elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and
+        g.buf[lookbehind + 3] == '-' and
+        g.buf[lookbehind + 4] in {'\x09'..'\x0D', ' '}:
+      # this is a document start, therefore, we are at top level
+      indentation = 0
+    # because lookbehind was at newline char when calculating indentation, we're
+    # off by one. fix that. top level's parent will have indentation of -1.
+    let parentIndentation = indentation - 1
+
+    # find first content
+    while g.buf[pos] in {' ', '\x0A', '\x0D'}:
+      if g.buf[pos] == ' ': inc(indentation)
+      else: indentation = 0
+      inc(pos)
+    var minIndentation = indentation
+
+    # for stupid edge cases, we must check whether an explicit indentation depth
+    # is given at the header.
+    while g.buf[headerStart] in {'>', '|', '+', '-'}: inc(headerStart)
+    if g.buf[headerStart] in {'0'..'9'}:
+      minIndentation = min(minIndentation, ord(g.buf[headerStart]) - ord('0'))
+
+    # process content lines
+    while indentation > parentIndentation and g.buf[pos] != '\0':
+      if (indentation < minIndentation and g.buf[pos] == '#') or
+          (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and
+          g.buf[pos + 2] == '.' and
+          g.buf[pos + 3] in {'\0', '\x09'..'\x0D', ' '}):
+        # comment after end of block scalar, or end of document
+        break
+      minIndentation = min(indentation, minIndentation)
+      while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
+      while g.buf[pos] in {' ', '\x0A', '\x0D'}:
+        if g.buf[pos] == ' ': inc(indentation)
+        else: indentation = 0
+        inc(pos)
+
+    g.state = gtOther
+  elif g.state == gtOther:
+    # gtOther means 'inside YAML document'
+    case g.buf[pos]
+    of ' ', '\x09'..'\x0D':
+      g.kind = gtWhitespace
+      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
+    of '#':
+      g.kind = gtComment
+      inc(pos)
+      while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
+    of '-':
+      inc(pos)
+      if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}:
+        g.kind = gtPunctuation
+      elif g.buf[pos] == '-' and
+          (pos == 1 or g.buf[pos - 2] in {'\x0A', '\x0D'}): # start of line
+        inc(pos)
+        if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}:
+          inc(pos)
+          g.kind = gtKeyword
+        else: yamlPossibleNumber(g, pos)
+      else: yamlPossibleNumber(g, pos)
+    of '.':
+      if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}:
+        inc(pos)
+        for i in 1..2:
+          {.unroll.}
+          if g.buf[pos] != '.': break
+          inc(pos)
+        if pos == g.start + 3:
+          g.kind = gtKeyword
+          g.state = gtNone # document end: back to 'outside document'
+        else: yamlPlainStrLit(g, pos)
+      else: yamlPlainStrLit(g, pos)
+    of '?':
+      inc(pos)
+      if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}:
+        g.kind = gtPunctuation
+      else: yamlPlainStrLit(g, pos)
+    of ':':
+      inc(pos)
+      # g.buf[pos - 2] is the char before the ':'; it only exists if pos > 1
+      if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', '\'', '\"'} or
+          (pos > 1 and g.buf[pos - 2] in {'}', ']', '\"', '\''}):
+        g.kind = gtPunctuation
+      else: yamlPlainStrLit(g, pos)
+    of '[', ']', '{', '}', ',':
+      inc(pos)
+      g.kind = gtPunctuation
+    of '\"':
+      inc(pos)
+      g.state = gtStringLit
+      g.kind = gtStringLit
+    of '\'':
+      # empty gtNone token; the gtCharLit branch consumes the opening quote
+      g.state = gtCharLit
+      g.kind = gtNone
+    of '!':
+      g.kind = gtTagStart
+      inc(pos)
+      if g.buf[pos] == '<':
+        # literal tag (e.g. `!<tag:yaml.org,2002:str>`)
+        while g.buf[pos] notin {'\0', '>', '\x09'..'\x0D', ' '}: inc(pos)
+        if g.buf[pos] == '>': inc(pos)
+      else:
+        while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos)
+        case g.buf[pos]
+        of '!':
+          # prefixed tag (e.g. `!!str`)
+          inc(pos)
+          while g.buf[pos] notin
+              {'\0', '\x09'..'\x0D', ' ', ',', '[', ']', '{', '}'}: inc(pos)
+        of '\0', '\x09'..'\x0D', ' ': discard
+        else:
+          # local tag (e.g. `!nim:system:int`)
+          while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
+    of '&':
+      g.kind = gtLabel
+      while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
+    of '*':
+      g.kind = gtReference
+      while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
+    of '|', '>':
+      # this can lead to incorrect tokenization when | or > appear inside flow
+      # content. checking whether we're inside flow content is not
+      # chomsky type-3, so we won't do that here.
+      g.kind = gtCommand
+      g.state = gtCommand
+      inc(pos)
+      while g.buf[pos] in {'0'..'9', '+', '-'}: inc(pos)
+    of '0'..'9': yamlPossibleNumber(g, pos)
+    of '\0': g.kind = gtEOF
+    else: yamlPlainStrLit(g, pos)
+  else:
+    # outside document
+    case g.buf[pos]
+    of '%':
+      if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}:
+        g.kind = gtDirective
+        while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
+      else:
+        g.state = gtOther
+        yamlPlainStrLit(g, pos)
+    of ' ', '\x09'..'\x0D':
+      g.kind = gtWhitespace
+      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
+    of '#':
+      g.kind = gtComment
+      while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
+    of '\0': g.kind = gtEOF
+    else:
+      # content starts a document: empty token now, re-scan it as gtOther
+      g.kind = gtNone
+      g.state = gtOther
+  # publish the extent of the token and continue after it on the next call
+  g.length = pos - g.pos
+  g.pos = pos
+
 proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
   case lang
   of langNone: assert false
@@ -586,6 +890,7 @@ proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
   of langCsharp: csharpNextToken(g)
   of langC: cNextToken(g)
   of langJava: javaNextToken(g)
+  of langYaml: yamlNextToken(g)
 
 when isMainModule:
   var keywords: seq[string]
diff --git a/tests/stdlib/trstgen.nim b/tests/stdlib/trstgen.nim
new file mode 100644
index 000000000..c702ccc2a
--- /dev/null
+++ b/tests/stdlib/trstgen.nim
@@ -0,0 +1,139 @@
+# tests for rstgen module.
+
+import ../../lib/packages/docutils/rstgen
+import unittest
+
+suite "YAML syntax highlighting": # renders yaml code-blocks, compares HTML
+  test "Basics":
+    let input = """.. code-block:: yaml
+    %YAML 1.2
+    ---
+    a string: string
+    a list:
+      - item 1
+      - item 2
+    a map:
+    ? key
+    : value
+    ..."""
+    let output = rstToHtml(input, {}, defaultConfig())
+    check output == """<pre class = "listing"><span class="Directive">%YAML 1.2</span>
+<span class="Keyword">---</span>
+<span class="StringLit">a string</span><span class="Punctuation">:</span> <span class="StringLit">string</span>
+<span class="StringLit">a list</span><span class="Punctuation">:</span>
+  <span class="Punctuation">-</span> <span class="StringLit">item 1</span>
+  <span class="Punctuation">-</span> <span class="StringLit">item 2</span>
+<span class="StringLit">a map</span><span class="Punctuation">:</span>
+<span class="Punctuation">?</span> <span class="StringLit">key</span>
+<span class="Punctuation">:</span> <span class="StringLit">value</span>
+<span class="Keyword">...</span></pre>"""
+
+  test "Block scalars":
+    let input = """.. code-block:: yaml
+    a literal block scalar: |
+      some text
+      # not a comment
+     # a comment, since less indented
+      # another comment
+    a folded block scalar: >2
+       some text
+      # not a comment since indented as specified
+     # a comment
+    another literal block scalar:
+      |+ # comment after header
+     allowed, since more indented than parent"""
+    let output = rstToHtml(input, {}, defaultConfig())
+    check output == """<pre class = "listing"><span class="StringLit">a literal block scalar</span><span class="Punctuation">:</span> <span class="Command">|</span><span class="Command"></span><span class="LongStringLit">
+  some text
+  # not a comment
+ </span><span class="Comment"># a comment, since less indented</span>
+  <span class="Comment"># another comment</span>
+<span class="StringLit">a folded block scalar</span><span class="Punctuation">:</span> <span class="Command">&gt;2</span><span class="Command"></span><span class="LongStringLit">
+   some text
+  # not a comment since indented as specified
+ </span><span class="Comment"># a comment</span>
+<span class="StringLit">another literal block scalar</span><span class="Punctuation">:</span>
+  <span class="Command">|+</span> <span class="Comment"># comment after header</span><span class="LongStringLit">
+ allowed, since more indented than parent</span></pre>"""
+
+  test "Directives":
+    let input = """.. code-block:: yaml
+    %YAML 1.2
+    ---
+    %not a directive
+    ...
+    %a directive
+    ...
+    a string
+    % not a directive
+    ...
+    %TAG ! !foo:"""
+    let output = rstToHtml(input, {}, defaultConfig())
+    check output == """<pre class = "listing"><span class="Directive">%YAML 1.2</span>
+<span class="Keyword">---</span>
+<span class="StringLit">%not a directive</span>
+<span class="Keyword">...</span>
+<span class="Directive">%a directive</span>
+<span class="Keyword">...</span>
+<span class="StringLit">a string</span>
+<span class="StringLit">% not a directive</span>
+<span class="Keyword">...</span>
+<span class="Directive">%TAG ! !foo:</span></pre>"""
+
+  test "Flow Style and Numbers":
+    let input = """.. code-block:: yaml
+    {
+      "quoted string": 42,
+      'single quoted string': false,
+      [ list, "with", 'entries' ]: 73.32e-73,
+      more numbers: [-783, 11e78],
+      not numbers: [ 42e, 0023, +32.37, 8 ball]
+    }"""
+    let output = rstToHtml(input, {}, defaultConfig())
+    check output == """<pre class = "listing"><span class="Punctuation">{</span>
+  <span class="StringLit">&quot;</span><span class="StringLit">quoted string&quot;</span><span class="Punctuation">:</span> <span class="DecNumber">42</span><span class="Punctuation">,</span>
+  <span class="StringLit">'single quoted string'</span><span class="Punctuation">:</span> <span class="StringLit">false</span><span class="Punctuation">,</span>
+  <span class="Punctuation">[</span> <span class="StringLit">list</span><span class="Punctuation">,</span> <span class="StringLit">&quot;</span><span class="StringLit">with&quot;</span><span class="Punctuation">,</span> <span class="StringLit">'entries'</span> <span class="Punctuation">]</span><span class="Punctuation">:</span> <span class="FloatNumber">73.32e-73</span><span class="Punctuation">,</span>
+  <span class="StringLit">more numbers</span><span class="Punctuation">:</span> <span class="Punctuation">[</span><span class="DecNumber">-783</span><span class="Punctuation">,</span> <span class="FloatNumber">11e78</span><span class="Punctuation">]</span><span class="Punctuation">,</span>
+  <span class="StringLit">not numbers</span><span class="Punctuation">:</span> <span class="Punctuation">[</span> <span class="StringLit">42e</span><span class="Punctuation">,</span> <span class="StringLit">0023</span><span class="Punctuation">,</span> <span class="StringLit">+32.37</span><span class="Punctuation">,</span> <span class="StringLit">8 ball</span><span class="Punctuation">]</span>
+<span class="Punctuation">}</span></pre>"""
+
+  test "Anchors, Aliases, Tags":
+    let input = """.. code-block:: yaml
+    --- !!map
+    !!str string: !<tag:yaml.org,2002:int> 42
+    ? &anchor !!seq []:
+    : !localtag foo
+    alias: *anchor
+    """
+    let output = rstToHtml(input, {}, defaultConfig())
+    check output == """<pre class = "listing"><span class="Keyword">---</span> <span class="TagStart">!!map</span>
+<span class="TagStart">!!str</span> <span class="StringLit">string</span><span class="Punctuation">:</span> <span class="TagStart">!&lt;tag:yaml.org,2002:int&gt;</span> <span class="DecNumber">42</span>
+<span class="Punctuation">?</span> <span class="Label">&amp;anchor</span> <span class="TagStart">!!seq</span> <span class="Punctuation">[</span><span class="Punctuation">]</span><span class="Punctuation">:</span>
+<span class="Punctuation">:</span> <span class="TagStart">!localtag</span> <span class="StringLit">foo</span>
+<span class="StringLit">alias</span><span class="Punctuation">:</span> <span class="Reference">*anchor</span></pre>"""
+
+  test "Edge cases":
+    let input = """.. code-block:: yaml
+    ...
+     %a string:
+      a:string:not:a:map
+    ...
+    not a list:
+      -2
+      -3
+      -4
+    example.com/not/a#comment:
+      ?not a map key
+    """
+    let output = rstToHtml(input, {}, defaultConfig())
+    check output == """<pre class = "listing"><span class="Keyword">...</span>
+ <span class="StringLit">%a string</span><span class="Punctuation">:</span>
+  <span class="StringLit">a:string:not:a:map</span>
+<span class="Keyword">...</span>
+<span class="StringLit">not a list</span><span class="Punctuation">:</span>
+  <span class="DecNumber">-2</span>
+  <span class="DecNumber">-3</span>
+  <span class="DecNumber">-4</span>
+<span class="StringLit">example.com/not/a#comment</span><span class="Punctuation">:</span>
+  <span class="StringLit">?not a map key</span></pre>"""
\ No newline at end of file
author	Andreas Rumpf <rumpf_a@web.de>	2016-05-12 14:59:00 +0200
committer	Andreas Rumpf <rumpf_a@web.de>	2016-05-12 14:59:00 +0200
commit	4b1348402504f9b874def5f94638ded2a12f2965 (patch)
tree	d8f0aaba20a528c3b9fd61fc22292dfd900ee558
parent	81ebb969220377221de524253fff2df0d0807a6c (diff)
parent	6fe916fc77c717700dd47451c498e5c99928ba63 (diff)
download	Nim-4b1348402504f9b874def5f94638ded2a12f2965.tar.gz