more strict RST inline markup parsing (#17827)

* more strict RST inline markup parsing * add test for unexpected bonus * introduce `roPreferMarkdown`
author: Andrey Makarov <ph.makarov@gmail.com> 2021-04-29 18:31:15 +0300
committer: GitHub <noreply@github.com> 2021-04-29 17:31:15 +0200
commit: 16405083485967a395b8d677833bc26040881b21 (patch)
tree: a60fa53745e1ac48c2dc189347c1ad470a0d701b /lib
parent: e61381a293ab4986427c5ce8968c6c031c83f34f (diff)
download: Nim-16405083485967a395b8d677833bc26040881b21.tar.gz
1 files changed, 41 insertions, 11 deletions
diff --git a/lib/packages/docutils/rst.nim b/lib/packages/docutils/rst.nim
index cb65791ff..6ad466716 100644
--- a/lib/packages/docutils/rst.nim
+++ b/lib/packages/docutils/rst.nim
@@ -11,9 +11,9 @@
 ##                rst
 ## ==================================
 ##
-## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-## Nim-flavored reStructuredText
-## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+## Nim-flavored reStructuredText and Markdown
+## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 ##
 ## This module implements a `reStructuredText`:idx: (RST) parser.
 ## A large subset is implemented with some limitations_ and
@@ -177,6 +177,8 @@ type
     roSupportRawDirective,    ## support the ``raw`` directive (don't support
                               ## it for sandboxing)
     roSupportMarkdown,        ## support additional features of Markdown
+    roPreferMarkdown,         ## parse as Markdown (keeping RST as "extension"
+                              ## to Markdown) -- implies `roSupportMarkdown`
     roNimFile                 ## set for Nim files where default interpreted
                               ## text role should be :nim:
 
@@ -277,6 +279,7 @@ type
     line*, col*, baseIndent*: int
     skipPounds*: bool
     adornmentLine*: bool
+    escapeNext*: bool
 
 proc getThing(L: var Lexer, tok: var Token, s: set[char]) =
   tok.kind = tkWord
@@ -314,10 +317,18 @@ proc getPunctAdornment(L: var Lexer, tok: var Token) =
   tok.col = L.col
   var pos = L.bufpos
   let c = L.buf[pos]
-  while true:
+  if not L.escapeNext and (c != '\\' or L.adornmentLine):
+    while true:
+      tok.symbol.add(L.buf[pos])
+      inc pos
+      if L.buf[pos] != c: break
+  elif L.escapeNext:
     tok.symbol.add(L.buf[pos])
     inc pos
-    if L.buf[pos] != c: break
+  else:  # not L.escapeNext and c == '\\' and not L.adornmentLine
+    tok.symbol.add '\\'
+    inc pos
+    L.escapeNext = true
   inc L.col, pos - L.bufpos
   L.bufpos = pos
   if tok.symbol == "\\": tok.kind = tkPunct
@@ -429,7 +440,9 @@ proc getTokens(buffer: string, skipPounds: bool, tokens: var TokenSeq): int =
   while true:
     inc length
     setLen(tokens, length)
+    let toEscape = L.escapeNext
     rawGetTok(L, tokens[length - 1])
+    if toEscape: L.escapeNext = false
     if tokens[length - 1].kind == tkEof: break
   if tokens[0].kind == tkWhite:
     # BUGFIX
@@ -981,16 +994,24 @@ proc expect(p: var RstParser, tok: string) =
   if currentTok(p).symbol == tok: inc p.idx
   else: rstMessage(p, meExpected, tok)
 
-proc isInlineMarkupEnd(p: RstParser, markup: string): bool =
+proc isInlineMarkupEnd(p: RstParser, markup: string, exact: bool): bool =
   # rst rules: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
-  result = currentTok(p).symbol == markup
+  if exact:
+    result = currentTok(p).symbol == markup
+  else:
+    result = currentTok(p).symbol.endsWith markup
+    if (not result) and markup == "``":
+      # check that escaping may have splitted `` to 2 tokens ` and `
+      result = currentTok(p).symbol == "`" and prevTok(p).symbol == "`"
   if not result: return
   # Rule 2:
   result = prevTok(p).kind notin {tkIndent, tkWhite}
   if not result: return
   # Rule 7:
   result = nextTok(p).kind in {tkIndent, tkWhite, tkEof} or
-      markup in ["``", "`"] and nextTok(p).kind in {tkIndent, tkWhite, tkWord, tkEof} or
+      (roPreferMarkdown in p.s.options and
+        markup in ["``", "`"] and
+        nextTok(p).kind in {tkIndent, tkWhite, tkWord, tkEof}) or
       nextTok(p).symbol[0] in
       {'\'', '\"', ')', ']', '}', '>', '-', '/', '\\', ':', '.', ',', ';', '!', '?', '_'}
   if not result: return
@@ -1130,7 +1151,8 @@ proc toOtherRole(n: PRstNode, kind: RstNodeKind, roleName: string): PRstNode =
 proc parsePostfix(p: var RstParser, n: PRstNode): PRstNode =
   var newKind = n.kind
   var newSons = n.sons
-  if isInlineMarkupEnd(p, "_") or isInlineMarkupEnd(p, "__"):
+  if isInlineMarkupEnd(p, "_", exact=true) or
+      isInlineMarkupEnd(p, "__", exact=true):
     inc p.idx
     if p.tok[p.idx-2].symbol == "`" and p.tok[p.idx-3].symbol == ">":
       var a = newRstNode(rnInner)
@@ -1215,7 +1237,7 @@ proc parseWordOrRef(p: var RstParser, father: PRstNode) =
     inc p.idx
     while currentTok(p).kind in {tkWord, tkPunct}:
       if currentTok(p).kind == tkPunct:
-        if isInlineMarkupEnd(p, "_"):
+        if isInlineMarkupEnd(p, "_", exact=true):
           isRef = true
           break
         if not validRefnamePunct(currentTok(p).symbol):
@@ -1253,7 +1275,15 @@ proc parseUntil(p: var RstParser, father: PRstNode, postfix: string,
   while true:
     case currentTok(p).kind
     of tkPunct:
-      if isInlineMarkupEnd(p, postfix):
+      if isInlineMarkupEnd(p, postfix, exact=false):
+        let l = currentTok(p).symbol.len
+        if l > postfix.len:
+          # handle cases like *emphasis with stars****. (It's valid RST!)
+          father.add newLeaf(currentTok(p).symbol[0 ..< l - postfix.len])
+        elif postfix == "``" and currentTok(p).symbol == "`" and
+            prevTok(p).symbol == "`":
+          # handle cases like ``literal\`` - delete ` already added after \
+          father.sons.setLen(father.sons.len - 1)
         inc p.idx
         break
       else:
author	Andrey Makarov <ph.makarov@gmail.com>	2021-04-29 18:31:15 +0300
committer	GitHub <noreply@github.com>	2021-04-29 17:31:15 +0200
commit	16405083485967a395b8d677833bc26040881b21 (patch)
tree	a60fa53745e1ac48c2dc189347c1ad470a0d701b /lib
parent	e61381a293ab4986427c5ce8968c6c031c83f34f (diff)
download	Nim-16405083485967a395b8d677833bc26040881b21.tar.gz