Diffstat (limited to 'lib/packages/docutils/highlite.nim')
-rw-r--r--  lib/packages/docutils/highlite.nim | 207
1 file changed, 159 insertions(+), 48 deletions(-)
diff --git a/lib/packages/docutils/highlite.nim b/lib/packages/docutils/highlite.nim
index 8c91e0a8e..f8376f46c 100644
--- a/lib/packages/docutils/highlite.nim
+++ b/lib/packages/docutils/highlite.nim
@@ -11,11 +11,9 @@
 ## Currently only a few languages are supported; other languages may be added.
 ## The interface supports one language nested in another.
 ##
-## **Note:** Import `packages/docutils/highlite` to use this module
-##
 ## You can use this to build your own syntax highlighting; check this example:
 ##
-## .. code::nim
+##   ```Nim
 ##   let code = """for x in $int.high: echo x.ord mod 2 == 0"""
 ##   var toknizr: GeneralTokenizer
 ##   initGeneralTokenizer(toknizr, code)
@@ -33,21 +31,43 @@
 ##     else:
 ##       echo toknizr.kind # All the kinds of tokens can be processed here.
 ##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
+##   ```
 ##
 ## The proc `getSourceLanguage` can get the language `enum` from a string:
-##
-## .. code::nim
+##   ```Nim
 ##   for l in ["C", "c++", "jAvA", "Nim", "c#"]: echo getSourceLanguage(l)
+##   ```
+##
+## There is also a `Cmd` pseudo-language: a simple generic
+## shell/cmdline tokenizer (UNIX shell/PowerShell/Windows Command) with
+## no escaping and no programming language constructs besides variable
+## definitions at the beginning of a line. It supports these operators:
+##   ```Cmd
+##   &  &&  |  ||  (  )  ''  ""  ;  # for comments
+##   ```
 ##
+## Instead of escaping, always use quotes, as here:
+## `nimgrep --ext:'nim|nims' file.name`:cmd: shows how to input ``|``.
+## Any argument that contains ``.``, ``/``, or ``\`` will be treated
+## as a file or directory name.
+##
+## In addition to `Cmd` there is also `Console` language for
+## displaying interactive sessions.
+## Lines with a command should start with ``$``; other lines are treated
+## as program output.
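
As a quick illustration of the two new pseudo-languages, here is a minimal sketch using the `tokenize` helper introduced at the end of this diff (the import path follows the module's own documentation; the exact classes emitted are inferred from `cmdNextToken` below):

```nim
import packages/docutils/highlite

# A two-line Console session: the "$ " prompt, the program name, its
# arguments, and the output line each receive their own token class
# (gtPrompt, gtProgram, gtOption/gtIdentifier, gtProgramOutput).
let session = "$ nim c file.nim\nHint: done"
for (s, kind) in tokenize(session, langConsole):
  if kind != gtWhitespace:
    echo kind, ": ", s
```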
 
 import
-  strutils
-from algorithm import binarySearch
+  std/strutils
+from std/algorithm import binarySearch
+
+when defined(nimPreviewSlimSystem):
+  import std/[assertions, syncio]
+
 
 type
   SourceLanguage* = enum
     langNone, langNim, langCpp, langCsharp, langC, langJava,
-    langYaml, langPython
+    langYaml, langPython, langCmd, langConsole
   TokenClass* = enum
     gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber,
     gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit,
@@ -55,7 +75,7 @@ type
     gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression,
     gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler,
     gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel,
-    gtReference, gtOther
+    gtReference, gtPrompt, gtProgramOutput, gtProgram, gtOption, gtOther
   GeneralTokenizer* = object of RootObj
     kind*: TokenClass
     start*, length*: int
@@ -66,14 +86,20 @@ type
 
 const
   sourceLanguageToStr*: array[SourceLanguage, string] = ["none",
-    "Nim", "C++", "C#", "C", "Java", "Yaml", "Python"]
+    "Nim", "C++", "C#", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
+  sourceLanguageToAlpha*: array[SourceLanguage, string] = ["none",
+    "Nim", "cpp", "csharp", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
+    ## list of languages spelled with alphabetic characters
   tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace",
     "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber",
     "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit",
     "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment",
     "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData",
     "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink",
-    "Label", "Reference", "Other"]
+    "Label", "Reference", "Prompt", "ProgramOutput",
+    # these start with lower-case because they have a corresponding RST role (see rst.nim)
+    "program", "option",
+    "Other"]
 
   # The following list comes from doc/keywords.txt, make sure it is
   # synchronized with this array by running the module itself as a test case.
@@ -90,9 +116,11 @@ const
     "xor", "yield"]
 
 proc getSourceLanguage*(name: string): SourceLanguage =
-  for i in countup(succ(low(SourceLanguage)), high(SourceLanguage)):
+  for i in succ(low(SourceLanguage)) .. high(SourceLanguage):
     if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0:
       return i
+    if cmpIgnoreStyle(name, sourceLanguageToAlpha[i]) == 0:
+      return i
   result = langNone
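
A small usage sketch (example values are mine, but both arrays appear above): the lookup is style-insensitive and now accepts either spelling.

```nim
import packages/docutils/highlite

# Both the display name and the alphabetic-only alias map to the same
# enum value; unknown names fall back to langNone.
assert getSourceLanguage("c++") == langCpp   # matches sourceLanguageToStr
assert getSourceLanguage("CPP") == langCpp   # matches sourceLanguageToAlpha
assert getSourceLanguage("unknown") == langNone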
 
 proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) =
@@ -102,9 +130,7 @@ proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) =
   g.length = 0
   g.state = low(TokenClass)
   g.lang = low(SourceLanguage)
-  var pos = 0                     # skip initial whitespace:
-  while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
-  g.pos = pos
+  g.pos = 0
 
 proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: string) =
   initGeneralTokenizer(g, cstring(buf))
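
One behavioral consequence of dropping the initial whitespace skip: leading whitespace now surfaces as a regular first token. A minimal check, again using the `tokenize` helper added later in this diff:

```nim
import packages/docutils/highlite

# Leading whitespace is no longer swallowed at initialization time;
# it arrives as an ordinary gtWhitespace token.
let toks = tokenize("  echo 1", langNim)
assert toks[0] == ("  ", gtWhitespace)
```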
@@ -165,9 +191,6 @@ const
 proc isKeyword(x: openArray[string], y: string): int =
   binarySearch(x, y)
 
-proc isKeywordIgnoreCase(x: openArray[string], y: string): int =
-  binarySearch(x, y, cmpIgnoreCase)
-
 proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) =
   const
     hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'}
@@ -177,31 +200,33 @@ proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) =
   var pos = g.pos
   g.start = g.pos
   if g.state == gtStringLit:
-    g.kind = gtStringLit
-    while true:
+    if g.buf[pos] == '\\':
+      g.kind = gtEscapeSequence
+      inc(pos)
       case g.buf[pos]
-      of '\\':
-        g.kind = gtEscapeSequence
+      of 'x', 'X':
         inc(pos)
+        if g.buf[pos] in hexChars: inc(pos)
+        if g.buf[pos] in hexChars: inc(pos)
+      of '0'..'9':
+        while g.buf[pos] in {'0'..'9'}: inc(pos)
+      of '\0':
+        g.state = gtNone
+      else: inc(pos)
+    else:
+      g.kind = gtStringLit
+      while true:
         case g.buf[pos]
-        of 'x', 'X':
+        of '\\':
+          break
+        of '\0', '\r', '\n':
+          g.state = gtNone
+          break
+        of '\"':
           inc(pos)
-          if g.buf[pos] in hexChars: inc(pos)
-          if g.buf[pos] in hexChars: inc(pos)
-        of '0'..'9':
-          while g.buf[pos] in {'0'..'9'}: inc(pos)
-        of '\0':
           g.state = gtNone
+          break
         else: inc(pos)
-        break
-      of '\0', '\r', '\n':
-        g.state = gtNone
-        break
-      of '\"':
-        inc(pos)
-        g.state = gtNone
-        break
-      else: inc(pos)
   else:
     case g.buf[pos]
     of ' ', '\t'..'\r':
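
The restructured state machine emits escape sequences as separate tokens between string-literal pieces. A sketch of the expected output, assuming the single-quoted string scanner pauses at a backslash just as the continuation code above does:

```nim
import packages/docutils/highlite

let src = "echo \"a\\nb\""   # the Nim source: echo "a\nb"
# Expected pieces: gtIdentifier "echo", then the string literal split
# into gtStringLit "\"a", gtEscapeSequence "\\n", gtStringLit "b\"".
for (s, kind) in tokenize(src, langNim):
  if kind != gtWhitespace:
    echo kind, ": ", s
```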
@@ -299,17 +324,18 @@ proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) =
       pos = nimNumber(g, pos)
     of '\'':
       inc(pos)
-      g.kind = gtCharLit
-      while true:
-        case g.buf[pos]
-        of '\0', '\r', '\n':
-          break
-        of '\'':
-          inc(pos)
-          break
-        of '\\':
-          inc(pos, 2)
-        else: inc(pos)
+      if g.kind != gtPunctuation:
+        g.kind = gtCharLit
+        while true:
+          case g.buf[pos]
+          of '\0', '\r', '\n':
+            break
+          of '\'':
+            inc(pos)
+            break
+          of '\\':
+            inc(pos, 2)
+          else: inc(pos)
     of '\"':
       inc(pos)
       if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'):
@@ -473,6 +499,9 @@ proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string],
           of '\0':
             break
           else: inc(pos)
+      else:
+        g.kind = gtOperator
+        while g.buf[pos] in OpChars: inc(pos)
     of '#':
       inc(pos)
       if hasPreprocessor in flags:
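
The new `else` arm appears to cover a `/` that opens neither `//` nor `/*`: it is now classified as an operator (consuming any following operator characters) instead of being left unlabeled. A small sketch under that assumption:

```nim
import packages/docutils/highlite

# "/=" should come out as a single gtOperator run in C-like code,
# while "//..." would still be scanned as a comment.
for (s, kind) in tokenize("a /= b;", langC):
  if kind != gtWhitespace:
    echo kind, ": ", s
```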
@@ -900,6 +929,74 @@ proc pythonNextToken(g: var GeneralTokenizer) =
       "with", "yield"]
   nimNextToken(g, keywords)
 
+proc cmdNextToken(g: var GeneralTokenizer, dollarPrompt = false) =
+  var pos = g.pos
+  g.start = g.pos
+  if g.state == low(TokenClass):
+    g.state = if dollarPrompt: gtPrompt else: gtProgram
+  case g.buf[pos]
+  of ' ', '\t'..'\r':
+    g.kind = gtWhitespace
+    while g.buf[pos] in {' ', '\t'..'\r'}:
+      if g.buf[pos] == '\n':
+        g.state = if dollarPrompt: gtPrompt else: gtProgram
+      inc(pos)
+  of '\'', '"':
+    g.kind = gtOption
+    let q = g.buf[pos]
+    inc(pos)
+    while g.buf[pos] notin {q, '\0'}:
+      inc(pos)
+    if g.buf[pos] == q: inc(pos)
+  of '#':
+    g.kind = gtComment
+    while g.buf[pos] notin {'\n', '\0'}:
+      inc(pos)
+  of '&', '|':
+    g.kind = gtOperator
+    inc(pos)
+    if g.buf[pos] == g.buf[pos-1]: inc(pos)
+    g.state = gtProgram
+  of '(':
+    g.kind = gtOperator
+    g.state = gtProgram
+    inc(pos)
+  of ')':
+    g.kind = gtOperator
+    inc(pos)
+  of ';':
+    g.state = gtProgram
+    g.kind = gtOperator
+    inc(pos)
+  of '\0': g.kind = gtEof
+  elif dollarPrompt and g.state == gtPrompt:
+    if g.buf[pos] == '$' and g.buf[pos+1] in {' ', '\t'}:
+      g.kind = gtPrompt
+      inc pos, 2
+      g.state = gtProgram
+    else:
+      g.kind = gtProgramOutput
+      while g.buf[pos] notin {'\n', '\0'}:
+        inc(pos)
+  else:
+    if g.state == gtProgram:
+      g.kind = gtProgram
+      g.state = gtOption
+    else:
+      g.kind = gtOption
+    while g.buf[pos] notin {' ', '\t'..'\r', '&', '|', '(', ')', '\'', '"', '\0'}:
+      if g.buf[pos] == ';' and g.buf[pos+1] == ' ':
+        # (check space because ';' can be used inside arguments in Win bat)
+        break
+      if g.kind == gtOption and g.buf[pos] in {'/', '\\', '.'}:
+        g.kind = gtIdentifier  # for file/dir name
+      elif g.kind == gtProgram and g.buf[pos] == '=':
+        g.kind = gtIdentifier  # for env variable setting at beginning of line
+        g.state = gtProgram
+      inc(pos)
+  g.length = pos - g.pos
+  g.pos = pos
+
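To see the classifier in action, here is a sketch tokenizing the command line from the module docs above (token classes inferred from the branches in `cmdNextToken`):

```nim
import packages/docutils/highlite

# "nimgrep" is the program; "--ext:" is an option whose quoted value
# 'nim|nims' is kept as one token; "file.name" contains '.' and is
# classified as a file name (gtIdentifier); "&&" is an operator that
# makes the following word a new program.
for (s, kind) in tokenize("nimgrep --ext:'nim|nims' file.name && echo ok",
                          langCmd):
  if kind != gtWhitespace:
    echo kind, ": ", s
```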
 proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
   g.lang = lang
   case lang
@@ -911,6 +1008,20 @@ proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
   of langJava: javaNextToken(g)
   of langYaml: yamlNextToken(g)
   of langPython: pythonNextToken(g)
+  of langCmd: cmdNextToken(g)
+  of langConsole: cmdNextToken(g, dollarPrompt=true)
+
+proc tokenize*(text: string, lang: SourceLanguage): seq[(string, TokenClass)] =
+  var g: GeneralTokenizer
+  initGeneralTokenizer(g, text)
+  var prevPos = 0
+  while true:
+    getNextToken(g, lang)
+    if g.kind == gtEof:
+      break
+    var s = text[prevPos ..< g.pos]
+    result.add (s, g.kind)
+    prevPos = g.pos
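
Since `initGeneralTokenizer` no longer skips leading whitespace, the slices returned by `tokenize` tile the input exactly, so concatenating them reproduces the source. A usage sketch:

```nim
import packages/docutils/highlite

# The (substring, class) pairs cover the input with no gaps, so they
# can be re-assembled into the original text.
let code = "for x in 0..9: echo x"
var rebuilt = ""
for (s, kind) in tokenize(code, langNim):
  rebuilt.add s
assert rebuilt == code
```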
 
 when isMainModule:
   var keywords: seq[string]