summary refs log tree commit diff stats
path: root/lib/packages/docutils/highlite.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/packages/docutils/highlite.nim')
-rw-r--r--lib/packages/docutils/highlite.nim345
1 files changed, 234 insertions, 111 deletions
diff --git a/lib/packages/docutils/highlite.nim b/lib/packages/docutils/highlite.nim
index 796c17d7d..f8376f46c 100644
--- a/lib/packages/docutils/highlite.nim
+++ b/lib/packages/docutils/highlite.nim
@@ -11,11 +11,9 @@
 ## Currently only few languages are supported, other languages may be added.
 ## The interface supports one language nested in another.
 ##
-## **Note:** Import ``packages/docutils/highlite`` to use this module
-##
 ## You can use this to build your own syntax highlighting, check this example:
 ##
-## .. code::nim
+##   ```Nim
 ##   let code = """for x in $int.high: echo x.ord mod 2 == 0"""
 ##   var toknizr: GeneralTokenizer
 ##   initGeneralTokenizer(toknizr, code)
@@ -33,18 +31,43 @@
 ##     else:
 ##       echo toknizr.kind # All the kinds of tokens can be processed here.
 ##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
+##   ```
 ##
-## The proc ``getSourceLanguage`` can get the language ``enum`` from a string:
-##
-## .. code::nim
+## The proc `getSourceLanguage` can get the language `enum` from a string:
+##   ```Nim
 ##   for l in ["C", "c++", "jAvA", "Nim", "c#"]: echo getSourceLanguage(l)
+##   ```
+##
+## There is also a `Cmd` pseudo-language supported, which is a simple generic
+## shell/cmdline tokenizer (UNIX shell/Powershell/Windows Command):
+## no escaping, no programming language constructs besides variable definition
+## at the beginning of line. It supports these operators:
+##   ```Cmd
+##   &  &&  |  ||  (  )  ''  ""  ;  # for comments
+##   ```
+##
+## Instead of escaping always use quotes like here
+## `nimgrep --ext:'nim|nims' file.name`:cmd: shows how to input ``|``.
+## Any argument that contains ``.`` or ``/`` or ``\`` will be treated
+## as a file or directory.
 ##
+## In addition to `Cmd` there is also `Console` language for
+## displaying interactive sessions.
+## Lines with a command should start with ``$``, other lines are considered
+## as program output.
 
 import
-  strutils
-from algorithm import binarySearch
+  std/strutils
+from std/algorithm import binarySearch
+
+when defined(nimPreviewSlimSystem):
+  import std/[assertions, syncio]
+
 
 type
+  SourceLanguage* = enum
+    langNone, langNim, langCpp, langCsharp, langC, langJava,
+    langYaml, langPython, langCmd, langConsole
   TokenClass* = enum
     gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber,
     gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit,
@@ -52,28 +75,31 @@ type
     gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression,
     gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler,
     gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel,
-    gtReference, gtOther
+    gtReference, gtPrompt, gtProgramOutput, gtProgram, gtOption, gtOther
   GeneralTokenizer* = object of RootObj
     kind*: TokenClass
     start*, length*: int
     buf: cstring
     pos: int
     state: TokenClass
-
-  SourceLanguage* = enum
-    langNone, langNim, langCpp, langCsharp, langC, langJava,
-    langYaml
+    lang: SourceLanguage
 
 const
   sourceLanguageToStr*: array[SourceLanguage, string] = ["none",
-    "Nim", "C++", "C#", "C", "Java", "Yaml"]
+    "Nim", "C++", "C#", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
+  sourceLanguageToAlpha*: array[SourceLanguage, string] = ["none",
+    "Nim", "cpp", "csharp", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
+    ## list of languages spelled with alpabetic characters
   tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace",
     "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber",
     "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit",
     "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment",
     "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData",
     "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink",
-    "Label", "Reference", "Other"]
+    "Label", "Reference", "Prompt", "ProgramOutput",
+    # start from lower-case if there is a corresponding RST role (see rst.nim)
+    "program", "option",
+    "Other"]
 
   # The following list comes from doc/keywords.txt, make sure it is
   # synchronized with this array by running the module itself as a test case.
@@ -90,9 +116,11 @@ const
     "xor", "yield"]
 
 proc getSourceLanguage*(name: string): SourceLanguage =
-  for i in countup(succ(low(SourceLanguage)), high(SourceLanguage)):
+  for i in succ(low(SourceLanguage)) .. high(SourceLanguage):
     if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0:
       return i
+    if cmpIgnoreStyle(name, sourceLanguageToAlpha[i]) == 0:
+      return i
   result = langNone
 
 proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) =
@@ -101,9 +129,8 @@ proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) =
   g.start = 0
   g.length = 0
   g.state = low(TokenClass)
-  var pos = 0                     # skip initial whitespace:
-  while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
-  g.pos = pos
+  g.lang = low(SourceLanguage)
+  g.pos = 0
 
 proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: string) =
   initGeneralTokenizer(g, cstring(buf))
@@ -161,7 +188,10 @@ const
   OpChars  = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
               '|', '=', '%', '&', '$', '@', '~', ':'}
 
-proc nimNextToken(g: var GeneralTokenizer) =
+proc isKeyword(x: openArray[string], y: string): int =
+  binarySearch(x, y)
+
+proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) =
   const
     hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'}
     octChars = {'0'..'7', '_'}
@@ -170,36 +200,38 @@ proc nimNextToken(g: var GeneralTokenizer) =
   var pos = g.pos
   g.start = g.pos
   if g.state == gtStringLit:
-    g.kind = gtStringLit
-    while true:
+    if g.buf[pos] == '\\':
+      g.kind = gtEscapeSequence
+      inc(pos)
       case g.buf[pos]
-      of '\\':
-        g.kind = gtEscapeSequence
+      of 'x', 'X':
         inc(pos)
+        if g.buf[pos] in hexChars: inc(pos)
+        if g.buf[pos] in hexChars: inc(pos)
+      of '0'..'9':
+        while g.buf[pos] in {'0'..'9'}: inc(pos)
+      of '\0':
+        g.state = gtNone
+      else: inc(pos)
+    else:
+      g.kind = gtStringLit
+      while true:
         case g.buf[pos]
-        of 'x', 'X':
+        of '\\':
+          break
+        of '\0', '\r', '\n':
+          g.state = gtNone
+          break
+        of '\"':
           inc(pos)
-          if g.buf[pos] in hexChars: inc(pos)
-          if g.buf[pos] in hexChars: inc(pos)
-        of '0'..'9':
-          while g.buf[pos] in {'0'..'9'}: inc(pos)
-        of '\0':
           g.state = gtNone
+          break
         else: inc(pos)
-        break
-      of '\0', '\x0D', '\x0A':
-        g.state = gtNone
-        break
-      of '\"':
-        inc(pos)
-        g.state = gtNone
-        break
-      else: inc(pos)
   else:
     case g.buf[pos]
-    of ' ', '\x09'..'\x0D':
+    of ' ', '\t'..'\r':
       g.kind = gtWhitespace
-      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
+      while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
     of '#':
       g.kind = gtComment
       inc(pos)
@@ -207,7 +239,7 @@ proc nimNextToken(g: var GeneralTokenizer) =
       if g.buf[pos] == '#':
         inc(pos)
         isDoc = true
-      if g.buf[pos] == '[':
+      if g.buf[pos] == '[' and g.lang == langNim:
         g.kind = gtLongComment
         var nesting = 0
         while true:
@@ -236,7 +268,7 @@ proc nimNextToken(g: var GeneralTokenizer) =
           else:
             inc pos
       else:
-        while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
+        while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
     of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
       var id = ""
       while g.buf[pos] in SymChars + {'_'}:
@@ -260,12 +292,15 @@ proc nimNextToken(g: var GeneralTokenizer) =
         else:
           g.kind = gtRawData
           inc(pos)
-          while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}):
+          while not (g.buf[pos] in {'\0', '\n', '\r'}):
             if g.buf[pos] == '"' and g.buf[pos+1] != '"': break
             inc(pos)
           if g.buf[pos] == '\"': inc(pos)
       else:
-        g.kind = nimGetKeyword(id)
+        if g.lang == langNim:
+          g.kind = nimGetKeyword(id)
+        elif isKeyword(keywords, id) >= 0:
+          g.kind = gtKeyword
     of '0':
       inc(pos)
       case g.buf[pos]
@@ -289,17 +324,18 @@ proc nimNextToken(g: var GeneralTokenizer) =
       pos = nimNumber(g, pos)
     of '\'':
       inc(pos)
-      g.kind = gtCharLit
-      while true:
-        case g.buf[pos]
-        of '\0', '\x0D', '\x0A':
-          break
-        of '\'':
-          inc(pos)
-          break
-        of '\\':
-          inc(pos, 2)
-        else: inc(pos)
+      if g.kind != gtPunctuation:
+        g.kind = gtCharLit
+        while true:
+          case g.buf[pos]
+          of '\0', '\r', '\n':
+            break
+          of '\'':
+            inc(pos)
+            break
+          of '\\':
+            inc(pos, 2)
+          else: inc(pos)
     of '\"':
       inc(pos)
       if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'):
@@ -320,7 +356,7 @@ proc nimNextToken(g: var GeneralTokenizer) =
         g.kind = gtStringLit
         while true:
           case g.buf[pos]
-          of '\0', '\x0D', '\x0A':
+          of '\0', '\r', '\n':
             break
           of '\"':
             inc(pos)
@@ -394,12 +430,6 @@ proc generalStrLit(g: var GeneralTokenizer, position: int): int =
         inc(pos)
   result = pos
 
-proc isKeyword(x: openArray[string], y: string): int =
-  binarySearch(x, y)
-
-proc isKeywordIgnoreCase(x: openArray[string], y: string): int =
-  binarySearch(x, y, cmpIgnoreCase)
-
 type
   TokenizerFlag = enum
     hasPreprocessor, hasNestedComments
@@ -432,7 +462,7 @@ proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string],
           g.state = gtNone
         else: inc(pos)
         break
-      of '\0', '\x0D', '\x0A':
+      of '\0', '\r', '\n':
         g.state = gtNone
         break
       of '\"':
@@ -442,14 +472,14 @@ proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string],
       else: inc(pos)
   else:
     case g.buf[pos]
-    of ' ', '\x09'..'\x0D':
+    of ' ', '\t'..'\r':
       g.kind = gtWhitespace
-      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
+      while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
     of '/':
       inc(pos)
       if g.buf[pos] == '/':
         g.kind = gtComment
-        while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): inc(pos)
+        while not (g.buf[pos] in {'\0', '\n', '\r'}): inc(pos)
       elif g.buf[pos] == '*':
         g.kind = gtLongComment
         var nested = 0
@@ -469,6 +499,9 @@ proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string],
           of '\0':
             break
           else: inc(pos)
+      else:
+        g.kind = gtOperator
+        while g.buf[pos] in OpChars: inc(pos)
     of '#':
       inc(pos)
       if hasPreprocessor in flags:
@@ -589,9 +622,9 @@ proc javaNextToken(g: var GeneralTokenizer) =
 
 proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) =
   g.kind = gtStringLit
-  while g.buf[pos] notin {'\0', '\x09'..'\x0D', ',', ']', '}'}:
+  while g.buf[pos] notin {'\0', '\t'..'\r', ',', ']', '}'}:
     if g.buf[pos] == ':' and
-        g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}:
+        g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}:
       break
     inc(pos)
 
@@ -604,14 +637,14 @@ proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) =
     while g.buf[pos] in {'0'..'9'}: inc(pos)
   else: yamlPlainStrLit(g, pos)
   if g.kind == gtNone:
-    if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}:
+    if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
       g.kind = gtDecNumber
     elif g.buf[pos] == '.':
       inc(pos)
       if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
       else:
         while g.buf[pos] in {'0'..'9'}: inc(pos)
-        if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}:
+        if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
           g.kind = gtFloatNumber
     if g.kind == gtNone:
       if g.buf[pos] in {'e', 'E'}:
@@ -620,13 +653,13 @@ proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) =
         if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
         else:
           while g.buf[pos] in {'0'..'9'}: inc(pos)
-          if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}:
+          if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
             g.kind = gtFloatNumber
           else: yamlPlainStrLit(g, pos)
       else: yamlPlainStrLit(g, pos)
-  while g.buf[pos] notin {'\0', ',', ']', '}', '\x0A', '\x0D'}:
+  while g.buf[pos] notin {'\0', ',', ']', '}', '\n', '\r'}:
     inc(pos)
-    if g.buf[pos] notin {'\x09'..'\x0D', ' ', ',', ']', '}'}:
+    if g.buf[pos] notin {'\t'..'\r', ' ', ',', ']', '}'}:
       yamlPlainStrLit(g, pos)
       break
   # theoretically, we would need to parse indentation (like with block scalars)
@@ -651,19 +684,16 @@ proc yamlNextToken(g: var GeneralTokenizer) =
         of 'x':
           inc(pos)
           for i in 1..2:
-            {.unroll.}
             if g.buf[pos] in hexChars: inc(pos)
           break
         of 'u':
           inc(pos)
           for i in 1..4:
-            {.unroll.}
             if g.buf[pos] in hexChars: inc(pos)
           break
         of 'U':
           inc(pos)
           for i in 1..8:
-            {.unroll.}
             if g.buf[pos] in hexChars: inc(pos)
           break
         else: inc(pos)
@@ -698,13 +728,13 @@ proc yamlNextToken(g: var GeneralTokenizer) =
       while g.buf[pos] in {' ', '\t'}: inc(pos)
     of '#':
       g.kind = gtComment
-      while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
-    of '\x0A', '\x0D': discard
+      while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
+    of '\n', '\r': discard
     else:
       # illegal here. just don't parse a block scalar
       g.kind = gtNone
       g.state = gtOther
-    if g.buf[pos] in {'\x0A', '\x0D'} and g.state == gtCommand:
+    if g.buf[pos] in {'\n', '\r'} and g.state == gtCommand:
       g.state = gtLongStringLit
   elif g.state == gtLongStringLit:
     # beware, this is the only token where we actually have to parse
@@ -713,10 +743,10 @@ proc yamlNextToken(g: var GeneralTokenizer) =
     g.kind = gtLongStringLit
     # first, we have to find the parent indentation of the block scalar, so that
     # we know when to stop
-    assert g.buf[pos] in {'\x0A', '\x0D'}
+    assert g.buf[pos] in {'\n', '\r'}
     var lookbehind = pos - 1
     var headerStart = -1
-    while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}:
+    while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}:
       if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}:
         headerStart = lookbehind
       dec(lookbehind)
@@ -727,12 +757,12 @@ proc yamlNextToken(g: var GeneralTokenizer) =
       # when the header is alone in a line, this line does not show the parent's
       # indentation, so we must go further. search the first previous line with
       # non-whitespace content.
-      while lookbehind >= 0 and g.buf[lookbehind] in {'\x0A', '\x0D'}:
+      while lookbehind >= 0 and g.buf[lookbehind] in {'\n', '\r'}:
         dec(lookbehind)
         while lookbehind >= 0 and
             g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind)
       # now, find the beginning of the line...
-      while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}:
+      while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}:
         dec(lookbehind)
       # ... and its indentation
       indentation = 1
@@ -740,7 +770,7 @@ proc yamlNextToken(g: var GeneralTokenizer) =
     if lookbehind == -1: indentation = 0 # top level
     elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and
         g.buf[lookbehind + 3] == '-' and
-        g.buf[lookbehind + 4] in {'\x09'..'\x0D', ' '}:
+        g.buf[lookbehind + 4] in {'\t'..'\r', ' '}:
       # this is a document start, therefore, we are at top level
       indentation = 0
     # because lookbehind was at newline char when calculating indentation, we're
@@ -748,7 +778,7 @@ proc yamlNextToken(g: var GeneralTokenizer) =
     let parentIndentation = indentation - 1
 
     # find first content
-    while g.buf[pos] in {' ', '\x0A', '\x0D'}:
+    while g.buf[pos] in {' ', '\n', '\r'}:
       if g.buf[pos] == ' ': inc(indentation)
       else: indentation = 0
       inc(pos)
@@ -765,12 +795,12 @@ proc yamlNextToken(g: var GeneralTokenizer) =
       if (indentation < minIndentation and g.buf[pos] == '#') or
           (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and
           g.buf[pos + 2] == '.' and
-          g.buf[pos + 3] in {'\0', '\x09'..'\x0D', ' '}):
+          g.buf[pos + 3] in {'\0', '\t'..'\r', ' '}):
         # comment after end of block scalar, or end of document
         break
       minIndentation = min(indentation, minIndentation)
-      while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
-      while g.buf[pos] in {' ', '\x0A', '\x0D'}:
+      while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
+      while g.buf[pos] in {' ', '\n', '\r'}:
         if g.buf[pos] == ' ': inc(indentation)
         else: indentation = 0
         inc(pos)
@@ -779,30 +809,29 @@ proc yamlNextToken(g: var GeneralTokenizer) =
   elif g.state == gtOther:
     # gtOther means 'inside YAML document'
     case g.buf[pos]
-    of ' ', '\x09'..'\x0D':
+    of ' ', '\t'..'\r':
       g.kind = gtWhitespace
-      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
+      while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
     of '#':
       g.kind = gtComment
       inc(pos)
-      while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
+      while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
     of '-':
       inc(pos)
-      if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}:
+      if g.buf[pos] in {'\0', ' ', '\t'..'\r'}:
         g.kind = gtPunctuation
       elif g.buf[pos] == '-' and
-          (pos == 1 or g.buf[pos - 2] in {'\x0A', '\x0D'}): # start of line
+          (pos == 1 or g.buf[pos - 2] in {'\n', '\r'}): # start of line
         inc(pos)
-        if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}:
+        if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}:
           inc(pos)
           g.kind = gtKeyword
         else: yamlPossibleNumber(g, pos)
       else: yamlPossibleNumber(g, pos)
     of '.':
-      if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}:
+      if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}:
         inc(pos)
         for i in 1..2:
-          {.unroll.}
           if g.buf[pos] != '.': break
           inc(pos)
         if pos == g.start + 3:
@@ -812,12 +841,12 @@ proc yamlNextToken(g: var GeneralTokenizer) =
       else: yamlPlainStrLit(g, pos)
     of '?':
       inc(pos)
-      if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}:
+      if g.buf[pos] in {'\0', ' ', '\t'..'\r'}:
         g.kind = gtPunctuation
       else: yamlPlainStrLit(g, pos)
     of ':':
       inc(pos)
-      if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', '\'', '\"'} or
+      if g.buf[pos] in {'\0', '\t'..'\r', ' ', '\'', '\"'} or
           (pos > 0 and g.buf[pos - 2] in {'}', ']', '\"', '\''}):
         g.kind = gtPunctuation
       else: yamlPlainStrLit(g, pos)
@@ -836,7 +865,7 @@ proc yamlNextToken(g: var GeneralTokenizer) =
       inc(pos)
       if g.buf[pos] == '<':
         # literal tag (e.g. `!<tag:yaml.org,2002:str>`)
-        while g.buf[pos] notin {'\0', '>', '\x09'..'\x0D', ' '}: inc(pos)
+        while g.buf[pos] notin {'\0', '>', '\t'..'\r', ' '}: inc(pos)
         if g.buf[pos] == '>': inc(pos)
       else:
         while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos)
@@ -845,17 +874,17 @@ proc yamlNextToken(g: var GeneralTokenizer) =
           # prefixed tag (e.g. `!!str`)
           inc(pos)
           while g.buf[pos] notin
-              {'\0', '\x09'..'\x0D', ' ', ',', '[', ']', '{', '}'}: inc(pos)
-        of '\0', '\x09'..'\x0D', ' ': discard
+              {'\0', '\t'..'\r', ' ', ',', '[', ']', '{', '}'}: inc(pos)
+        of '\0', '\t'..'\r', ' ': discard
         else:
           # local tag (e.g. `!nim:system:int`)
-          while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
+          while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
     of '&':
       g.kind = gtLabel
-      while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
+      while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
     of '*':
       g.kind = gtReference
-      while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
+      while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
     of '|', '>':
       # this can lead to incorrect tokenization when | or > appear inside flow
       # content. checking whether we're inside flow content is not
@@ -871,18 +900,18 @@ proc yamlNextToken(g: var GeneralTokenizer) =
     # outside document
     case g.buf[pos]
     of '%':
-      if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}:
+      if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}:
         g.kind = gtDirective
-        while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
+        while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
       else:
         g.state = gtOther
         yamlPlainStrLit(g, pos)
-    of ' ', '\x09'..'\x0D':
+    of ' ', '\t'..'\r':
       g.kind = gtWhitespace
-      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
+      while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
     of '#':
       g.kind = gtComment
-      while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
+      while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
     of '\0': g.kind = gtEof
     else:
       g.kind = gtNone
@@ -890,7 +919,86 @@ proc yamlNextToken(g: var GeneralTokenizer) =
   g.length = pos - g.pos
   g.pos = pos
 
+proc pythonNextToken(g: var GeneralTokenizer) =
+  const
+    keywords: array[0..34, string] = [
+      "False", "None", "True", "and", "as", "assert", "async", "await",
+      "break", "class", "continue", "def", "del", "elif", "else", "except",
+      "finally", "for", "from", "global", "if", "import", "in", "is", "lambda",
+      "nonlocal", "not", "or", "pass", "raise", "return", "try", "while",
+      "with", "yield"]
+  nimNextToken(g, keywords)
+
+proc cmdNextToken(g: var GeneralTokenizer, dollarPrompt = false) =
+  var pos = g.pos
+  g.start = g.pos
+  if g.state == low(TokenClass):
+    g.state = if dollarPrompt: gtPrompt else: gtProgram
+  case g.buf[pos]
+  of ' ', '\t'..'\r':
+    g.kind = gtWhitespace
+    while g.buf[pos] in {' ', '\t'..'\r'}:
+      if g.buf[pos] == '\n':
+        g.state = if dollarPrompt: gtPrompt else: gtProgram
+      inc(pos)
+  of '\'', '"':
+    g.kind = gtOption
+    let q = g.buf[pos]
+    inc(pos)
+    while g.buf[pos] notin {q, '\0'}:
+      inc(pos)
+    if g.buf[pos] == q: inc(pos)
+  of '#':
+    g.kind = gtComment
+    while g.buf[pos] notin {'\n', '\0'}:
+      inc(pos)
+  of '&', '|':
+    g.kind = gtOperator
+    inc(pos)
+    if g.buf[pos] == g.buf[pos-1]: inc(pos)
+    g.state = gtProgram
+  of '(':
+    g.kind = gtOperator
+    g.state = gtProgram
+    inc(pos)
+  of ')':
+    g.kind = gtOperator
+    inc(pos)
+  of ';':
+    g.state = gtProgram
+    g.kind = gtOperator
+    inc(pos)
+  of '\0': g.kind = gtEof
+  elif dollarPrompt and g.state == gtPrompt:
+    if g.buf[pos] == '$' and g.buf[pos+1] in {' ', '\t'}:
+      g.kind = gtPrompt
+      inc pos, 2
+      g.state = gtProgram
+    else:
+      g.kind = gtProgramOutput
+      while g.buf[pos] notin {'\n', '\0'}:
+        inc(pos)
+  else:
+    if g.state == gtProgram:
+      g.kind = gtProgram
+      g.state = gtOption
+    else:
+      g.kind = gtOption
+    while g.buf[pos] notin {' ', '\t'..'\r', '&', '|', '(', ')', '\'', '"', '\0'}:
+      if g.buf[pos] == ';' and g.buf[pos+1] == ' ':
+        # (check space because ';' can be used inside arguments in Win bat)
+        break
+      if g.kind == gtOption and g.buf[pos] in {'/', '\\', '.'}:
+        g.kind = gtIdentifier  # for file/dir name
+      elif g.kind == gtProgram and g.buf[pos] == '=':
+        g.kind = gtIdentifier  # for env variable setting at beginning of line
+        g.state = gtProgram
+      inc(pos)
+  g.length = pos - g.pos
+  g.pos = pos
+
 proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
+  g.lang = lang
   case lang
   of langNone: assert false
   of langNim: nimNextToken(g)
@@ -899,13 +1007,28 @@ proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
   of langC: cNextToken(g)
   of langJava: javaNextToken(g)
   of langYaml: yamlNextToken(g)
+  of langPython: pythonNextToken(g)
+  of langCmd: cmdNextToken(g)
+  of langConsole: cmdNextToken(g, dollarPrompt=true)
+
+proc tokenize*(text: string, lang: SourceLanguage): seq[(string, TokenClass)] =
+  var g: GeneralTokenizer
+  initGeneralTokenizer(g, text)
+  var prevPos = 0
+  while true:
+    getNextToken(g, lang)
+    if g.kind == gtEof:
+      break
+    var s = text[prevPos ..< g.pos]
+    result.add (s, g.kind)
+    prevPos = g.pos
 
 when isMainModule:
   var keywords: seq[string]
   # Try to work running in both the subdir or at the root.
   for filename in ["doc/keywords.txt", "../../../doc/keywords.txt"]:
     try:
-      let input = string(readFile(filename))
+      let input = readFile(filename)
       keywords = input.splitWhitespace()
       break
     except: