Diffstat (limited to 'lib/packages/docutils/highlite.nim')
-rw-r--r--[-rwxr-xr-x]  lib/packages/docutils/highlite.nim  1066
1 file changed, 784 insertions, 282 deletions
diff --git a/lib/packages/docutils/highlite.nim b/lib/packages/docutils/highlite.nim
index 21dd1543a..f8376f46c 100755..100644
--- a/lib/packages/docutils/highlite.nim
+++ b/lib/packages/docutils/highlite.nim
@@ -1,6 +1,6 @@
 #
 #
-#            Nimrod's Runtime Library
+#            Nim's Runtime Library
 #        (c) Copyright 2012 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
@@ -10,101 +10,174 @@
 ## Source highlighter for programming or markup languages.
 ## Currently only a few languages are supported; other languages may be added.
 ## The interface supports one language nested in another.
+##
+## You can use this to build your own syntax highlighting; see this example:
+##
+##   ```Nim
+##   let code = """for x in $int.high: echo x.ord mod 2 == 0"""
+##   var toknizr: GeneralTokenizer
+##   initGeneralTokenizer(toknizr, code)
+##   while true:
+##     getNextToken(toknizr, langNim)
+##     case toknizr.kind
+##     of gtEof: break  # End Of File (or string)
+##     of gtWhitespace:
+##       echo gtWhitespace # Maybe you want "visible" whitespace?
+##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
+##     of gtOperator:
+##       echo gtOperator # Maybe you want operators to use a specific color?
+##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
+##     # of gtSomeSymbol: syntaxHighlight("Comic Sans", "bold", "99px", "pink")
+##     else:
+##       echo toknizr.kind # All the kinds of tokens can be processed here.
+##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
+##   ```
+##
+## The proc `getSourceLanguage` can get the language `enum` from a string:
+##   ```Nim
+##   for l in ["C", "c++", "jAvA", "Nim", "c#"]: echo getSourceLanguage(l)
+##   ```
+##
+## A `Cmd` pseudo-language is also supported: a simple generic shell/cmdline
+## tokenizer (UNIX shell/PowerShell/Windows Command) with no escaping and no
+## programming-language constructs besides variable definitions at the
+## beginning of a line. It supports these operators:
+##   ```Cmd
+##   &  &&  |  ||  (  )  ''  ""  ;  # for comments
+##   ```
+##
+## Instead of escaping, always use quotes; for example,
+## `nimgrep --ext:'nim|nims' file.name`:cmd: shows how to input ``|``.
+## Any argument that contains ``.`` or ``/`` or ``\`` will be treated
+## as a file or directory.
+##
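+## As a minimal sketch, assuming you just want to inspect the token classes
+## (it uses the `tokenize` helper defined at the end of this module and an
+## arbitrary example command):
+##   ```Nim
+##   for (s, kind) in tokenize("nimgrep --ext:'nim|nims' file.name", langCmd):
+##     echo kind, ": ", s
+##   ```
+##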
+## In addition to `Cmd` there is also a `Console` language for
+## displaying interactive sessions.
+## Lines with a command should start with ``$``; other lines are considered
+## program output.
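+##
+## A minimal sketch (the session text is purely illustrative):
+##   ```Nim
+##   let session = "$ nim c hello.nim\ncompiler output here\n$ ./hello"
+##   for (s, kind) in tokenize(session, langConsole):
+##     echo kind, ": ", s  # e.g. gtPrompt, gtProgram, gtOption, gtProgramOutput
+##   ```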
 
 import
-  strutils
+  std/strutils
+from std/algorithm import binarySearch
 
-type 
-  TTokenClass* = enum 
-    gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber, 
-    gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit, 
+when defined(nimPreviewSlimSystem):
+  import std/[assertions, syncio]
+
+
+type
+  SourceLanguage* = enum
+    langNone, langNim, langCpp, langCsharp, langC, langJava,
+    langYaml, langPython, langCmd, langConsole
+  TokenClass* = enum
+    gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber,
+    gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit,
     gtLongStringLit, gtCharLit, gtEscapeSequence, # escape sequence like \xff
-    gtOperator, gtPunctation, gtComment, gtLongComment, gtRegularExpression, 
-    gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler, 
-    gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel, 
-    gtReference, gtOther
-  TGeneralTokenizer* = object of TObject
-    kind*: TTokenClass
+    gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression,
+    gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler,
+    gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel,
+    gtReference, gtPrompt, gtProgramOutput, gtProgram, gtOption, gtOther
+  GeneralTokenizer* = object of RootObj
+    kind*: TokenClass
     start*, length*: int
     buf: cstring
     pos: int
-    state: TTokenClass
-
-  TSourceLanguage* = enum 
-    langNone, langNimrod, langCpp, langCsharp, langC, langJava
-
-const 
-  sourceLanguageToStr*: array[TSourceLanguage, string] = ["none", "Nimrod", 
-    "C++", "C#", "C", "Java"]
-  tokenClassToStr*: array[TTokenClass, string] = ["Eof", "None", "Whitespace", 
-    "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber", 
-    "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit", 
-    "EscapeSequence", "Operator", "Punctation", "Comment", "LongComment", 
-    "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData", 
-    "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink", 
-    "Label", "Reference", "Other"]
-
-  nimrodKeywords = slurp("doc/keywords.txt").split
-
-proc getSourceLanguage*(name: string): TSourceLanguage = 
-  for i in countup(succ(low(TSourceLanguage)), high(TSourceLanguage)): 
-    if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0: 
+    state: TokenClass
+    lang: SourceLanguage
+
+const
+  sourceLanguageToStr*: array[SourceLanguage, string] = ["none",
+    "Nim", "C++", "C#", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
+  sourceLanguageToAlpha*: array[SourceLanguage, string] = ["none",
+    "Nim", "cpp", "csharp", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
+    ## list of languages spelled with alphabetic characters
+  tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace",
+    "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber",
+    "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit",
+    "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment",
+    "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData",
+    "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink",
+    "Label", "Reference", "Prompt", "ProgramOutput",
+    # start from lower-case if there is a corresponding RST role (see rst.nim)
+    "program", "option",
+    "Other"]
+
+  # The following list comes from doc/keywords.txt; make sure it stays
+  # synchronized with this array by running this module itself as a test case.
+  nimKeywords = ["addr", "and", "as", "asm", "bind", "block",
+    "break", "case", "cast", "concept", "const", "continue", "converter",
+    "defer", "discard", "distinct", "div", "do",
+    "elif", "else", "end", "enum", "except", "export",
+    "finally", "for", "from", "func",
+    "if", "import", "in", "include",
+    "interface", "is", "isnot", "iterator", "let", "macro", "method",
+    "mixin", "mod", "nil", "not", "notin", "object", "of", "or", "out", "proc",
+    "ptr", "raise", "ref", "return", "shl", "shr", "static",
+    "template", "try", "tuple", "type", "using", "var", "when", "while",
+    "xor", "yield"]
+
+proc getSourceLanguage*(name: string): SourceLanguage =
+  for i in succ(low(SourceLanguage)) .. high(SourceLanguage):
+    if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0:
+      return i
+    if cmpIgnoreStyle(name, sourceLanguageToAlpha[i]) == 0:
       return i
   result = langNone
 
-proc initGeneralTokenizer*(g: var TGeneralTokenizer, buf: string) = 
-  g.buf = cstring(buf)
-  g.kind = low(TTokenClass)
+proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) =
+  g.buf = buf
+  g.kind = low(TokenClass)
   g.start = 0
   g.length = 0
-  g.state = low(TTokenClass)
-  var pos = 0                     # skip initial whitespace:
-  while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
-  g.pos = pos
+  g.state = low(TokenClass)
+  g.lang = low(SourceLanguage)
+  g.pos = 0
+
+proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: string) =
+  initGeneralTokenizer(g, cstring(buf))
 
-proc deinitGeneralTokenizer*(g: var TGeneralTokenizer) = 
-  nil
+proc deinitGeneralTokenizer*(g: var GeneralTokenizer) =
+  discard
 
-proc nimGetKeyword(id: string): TTokenClass = 
-  for k in nimrodKeywords:
+proc nimGetKeyword(id: string): TokenClass =
+  for k in nimKeywords:
     if cmpIgnoreStyle(id, k) == 0: return gtKeyword
   result = gtIdentifier
   when false:
     var i = getIdent(id)
     if (i.id >= ord(tokKeywordLow) - ord(tkSymbol)) and
-        (i.id <= ord(tokKeywordHigh) - ord(tkSymbol)): 
+        (i.id <= ord(tokKeywordHigh) - ord(tkSymbol)):
       result = gtKeyword
-    else: 
+    else:
       result = gtIdentifier
-  
-proc nimNumberPostfix(g: var TGeneralTokenizer, position: int): int = 
+
+proc nimNumberPostfix(g: var GeneralTokenizer, position: int): int =
   var pos = position
-  if g.buf[pos] == '\'': 
+  if g.buf[pos] == '\'':
     inc(pos)
     case g.buf[pos]
-    of 'f', 'F': 
+    of 'f', 'F':
       g.kind = gtFloatNumber
       inc(pos)
       if g.buf[pos] in {'0'..'9'}: inc(pos)
       if g.buf[pos] in {'0'..'9'}: inc(pos)
-    of 'i', 'I': 
+    of 'i', 'I':
       inc(pos)
       if g.buf[pos] in {'0'..'9'}: inc(pos)
       if g.buf[pos] in {'0'..'9'}: inc(pos)
-    else: 
-      nil
+    else:
+      discard
   result = pos
 
-proc nimNumber(g: var TGeneralTokenizer, position: int): int = 
+proc nimNumber(g: var GeneralTokenizer, position: int): int =
   const decChars = {'0'..'9', '_'}
   var pos = position
   g.kind = gtDecNumber
   while g.buf[pos] in decChars: inc(pos)
-  if g.buf[pos] == '.': 
+  if g.buf[pos] == '.':
     g.kind = gtFloatNumber
     inc(pos)
     while g.buf[pos] in decChars: inc(pos)
-  if g.buf[pos] in {'e', 'E'}: 
+  if g.buf[pos] in {'e', 'E'}:
     g.kind = gtFloatNumber
     inc(pos)
     if g.buf[pos] in {'+', '-'}: inc(pos)
@@ -112,362 +185,383 @@ proc nimNumber(g: var TGeneralTokenizer, position: int): int =
   result = nimNumberPostfix(g, pos)
 
 const
-  OpChars  = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', 
-              '|', '=', '%', '&', '$', '@', '~', ':', '\x80'..'\xFF'}
+  OpChars  = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
+              '|', '=', '%', '&', '$', '@', '~', ':'}
+
+proc isKeyword(x: openArray[string], y: string): int =
+  binarySearch(x, y)
 
-proc nimNextToken(g: var TGeneralTokenizer) = 
-  const 
+proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) =
+  const
     hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'}
     octChars = {'0'..'7', '_'}
     binChars = {'0'..'1', '_'}
     SymChars = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
   var pos = g.pos
   g.start = g.pos
-  if g.state == gtStringLit: 
-    g.kind = gtStringLit
-    while true: 
+  if g.state == gtStringLit:
+    if g.buf[pos] == '\\':
+      g.kind = gtEscapeSequence
+      inc(pos)
       case g.buf[pos]
-      of '\\': 
-        g.kind = gtEscapeSequence
+      of 'x', 'X':
         inc(pos)
+        if g.buf[pos] in hexChars: inc(pos)
+        if g.buf[pos] in hexChars: inc(pos)
+      of '0'..'9':
+        while g.buf[pos] in {'0'..'9'}: inc(pos)
+      of '\0':
+        g.state = gtNone
+      else: inc(pos)
+    else:
+      g.kind = gtStringLit
+      while true:
         case g.buf[pos]
-        of 'x', 'X': 
+        of '\\':
+          break
+        of '\0', '\r', '\n':
+          g.state = gtNone
+          break
+        of '\"':
           inc(pos)
-          if g.buf[pos] in hexChars: inc(pos)
-          if g.buf[pos] in hexChars: inc(pos)
-        of '0'..'9': 
-          while g.buf[pos] in {'0'..'9'}: inc(pos)
-        of '\0': 
           g.state = gtNone
+          break
         else: inc(pos)
-        break 
-      of '\0', '\x0D', '\x0A': 
-        g.state = gtNone
-        break 
-      of '\"': 
-        inc(pos)
-        g.state = gtNone
-        break 
-      else: inc(pos)
-  else: 
+  else:
     case g.buf[pos]
-    of ' ', '\x09'..'\x0D': 
+    of ' ', '\t'..'\r':
       g.kind = gtWhitespace
-      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
-    of '#': 
+      while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
+    of '#':
       g.kind = gtComment
-      while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): inc(pos)
-    of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF': 
+      inc(pos)
+      var isDoc = false
+      if g.buf[pos] == '#':
+        inc(pos)
+        isDoc = true
+      if g.buf[pos] == '[' and g.lang == langNim:
+        g.kind = gtLongComment
+        var nesting = 0
+        while true:
+          case g.buf[pos]
+          of '\0': break
+          of '#':
+            if isDoc:
+              if g.buf[pos+1] == '#' and g.buf[pos+2] == '[':
+                inc nesting
+            elif g.buf[pos+1] == '[':
+              inc nesting
+            inc pos
+          of ']':
+            if isDoc:
+              if g.buf[pos+1] == '#' and g.buf[pos+2] == '#':
+                if nesting == 0:
+                  inc(pos, 3)
+                  break
+                dec nesting
+            elif g.buf[pos+1] == '#':
+              if nesting == 0:
+                inc(pos, 2)
+                break
+              dec nesting
+            inc pos
+          else:
+            inc pos
+      else:
+        while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
+    of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
       var id = ""
-      while g.buf[pos] in SymChars + {'_'}: 
+      while g.buf[pos] in SymChars + {'_'}:
         add(id, g.buf[pos])
         inc(pos)
-      if (g.buf[pos] == '\"'): 
-        if (g.buf[pos + 1] == '\"') and (g.buf[pos + 2] == '\"'): 
+      if (g.buf[pos] == '\"'):
+        if (g.buf[pos + 1] == '\"') and (g.buf[pos + 2] == '\"'):
           inc(pos, 3)
           g.kind = gtLongStringLit
-          while true: 
+          while true:
             case g.buf[pos]
-            of '\0': 
-              break 
-            of '\"': 
+            of '\0':
+              break
+            of '\"':
               inc(pos)
-              if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and 
-                  g.buf[pos+2] != '\"': 
+              if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
+                  g.buf[pos+2] != '\"':
                 inc(pos, 2)
-                break 
+                break
             else: inc(pos)
-        else: 
+        else:
           g.kind = gtRawData
           inc(pos)
-          while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): 
+          while not (g.buf[pos] in {'\0', '\n', '\r'}):
             if g.buf[pos] == '"' and g.buf[pos+1] != '"': break
             inc(pos)
           if g.buf[pos] == '\"': inc(pos)
-      else: 
-        g.kind = nimGetKeyword(id)
-    of '0': 
+      else:
+        if g.lang == langNim:
+          g.kind = nimGetKeyword(id)
+        elif isKeyword(keywords, id) >= 0:
+          g.kind = gtKeyword
+    of '0':
       inc(pos)
       case g.buf[pos]
-      of 'b', 'B': 
+      of 'b', 'B':
+        g.kind = gtBinNumber
         inc(pos)
         while g.buf[pos] in binChars: inc(pos)
         pos = nimNumberPostfix(g, pos)
-      of 'x', 'X': 
+      of 'x', 'X':
+        g.kind = gtHexNumber
         inc(pos)
         while g.buf[pos] in hexChars: inc(pos)
         pos = nimNumberPostfix(g, pos)
-      of 'o', 'O': 
+      of 'o', 'O':
+        g.kind = gtOctNumber
         inc(pos)
         while g.buf[pos] in octChars: inc(pos)
         pos = nimNumberPostfix(g, pos)
       else: pos = nimNumber(g, pos)
-    of '1'..'9': 
+    of '1'..'9':
       pos = nimNumber(g, pos)
-    of '\'': 
+    of '\'':
       inc(pos)
-      g.kind = gtCharLit
-      while true: 
-        case g.buf[pos]
-        of '\0', '\x0D', '\x0A': 
-          break 
-        of '\'': 
-          inc(pos)
-          break 
-        of '\\': 
-          inc(pos, 2)
-        else: inc(pos)
-    of '\"': 
+      if g.kind != gtPunctuation:
+        g.kind = gtCharLit
+        while true:
+          case g.buf[pos]
+          of '\0', '\r', '\n':
+            break
+          of '\'':
+            inc(pos)
+            break
+          of '\\':
+            inc(pos, 2)
+          else: inc(pos)
+    of '\"':
       inc(pos)
-      if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'): 
+      if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'):
         inc(pos, 2)
         g.kind = gtLongStringLit
-        while true: 
+        while true:
           case g.buf[pos]
-          of '\0': 
-            break 
-          of '\"': 
+          of '\0':
+            break
+          of '\"':
             inc(pos)
-            if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and 
-                g.buf[pos+2] != '\"': 
+            if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
+                g.buf[pos+2] != '\"':
               inc(pos, 2)
-              break 
+              break
           else: inc(pos)
-      else: 
+      else:
         g.kind = gtStringLit
-        while true: 
+        while true:
           case g.buf[pos]
-          of '\0', '\x0D', '\x0A': 
-            break 
-          of '\"': 
+          of '\0', '\r', '\n':
+            break
+          of '\"':
             inc(pos)
-            break 
-          of '\\': 
+            break
+          of '\\':
             g.state = g.kind
-            break 
+            break
           else: inc(pos)
-    of '(', ')', '[', ']', '{', '}', '`', ':', ',', ';': 
+    of '(', ')', '[', ']', '{', '}', '`', ':', ',', ';':
       inc(pos)
-      g.kind = gtPunctation
-    of '\0': 
+      g.kind = gtPunctuation
+    of '\0':
       g.kind = gtEof
-    else: 
-      if g.buf[pos] in OpChars: 
+    else:
+      if g.buf[pos] in OpChars:
         g.kind = gtOperator
         while g.buf[pos] in OpChars: inc(pos)
-      else: 
+      else:
         inc(pos)
         g.kind = gtNone
   g.length = pos - g.pos
-  if g.kind != gtEof and g.length <= 0:
+  if g.kind != gtEof and g.state != gtNone and g.length <= 0:
     assert false, "nimNextToken: produced an empty token"
   g.pos = pos
 
-proc generalNumber(g: var TGeneralTokenizer, position: int): int = 
+proc generalNumber(g: var GeneralTokenizer, position: int): int =
   const decChars = {'0'..'9'}
   var pos = position
   g.kind = gtDecNumber
   while g.buf[pos] in decChars: inc(pos)
-  if g.buf[pos] == '.': 
+  if g.buf[pos] == '.':
     g.kind = gtFloatNumber
     inc(pos)
     while g.buf[pos] in decChars: inc(pos)
-  if g.buf[pos] in {'e', 'E'}: 
+  if g.buf[pos] in {'e', 'E'}:
     g.kind = gtFloatNumber
     inc(pos)
     if g.buf[pos] in {'+', '-'}: inc(pos)
     while g.buf[pos] in decChars: inc(pos)
   result = pos
 
-proc generalStrLit(g: var TGeneralTokenizer, position: int): int = 
-  const 
+proc generalStrLit(g: var GeneralTokenizer, position: int): int =
+  const
     decChars = {'0'..'9'}
     hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
   var pos = position
   g.kind = gtStringLit
   var c = g.buf[pos]
   inc(pos)                    # skip " or '
-  while true: 
+  while true:
     case g.buf[pos]
-    of '\0': 
-      break 
-    of '\\': 
+    of '\0':
+      break
+    of '\\':
       inc(pos)
       case g.buf[pos]
-      of '\0': 
-        break 
-      of '0'..'9': 
+      of '\0':
+        break
+      of '0'..'9':
         while g.buf[pos] in decChars: inc(pos)
-      of 'x', 'X': 
+      of 'x', 'X':
         inc(pos)
         if g.buf[pos] in hexChars: inc(pos)
         if g.buf[pos] in hexChars: inc(pos)
       else: inc(pos, 2)
-    else: 
-      if g.buf[pos] == c: 
+    else:
+      if g.buf[pos] == c:
         inc(pos)
-        break 
-      else: 
+        break
+      else:
         inc(pos)
   result = pos
 
-proc isKeyword(x: openarray[string], y: string): int = 
-  var a = 0
-  var b = len(x) - 1
-  while a <= b: 
-    var mid = (a + b) div 2
-    var c = cmp(x[mid], y)
-    if c < 0: 
-      a = mid + 1
-    elif c > 0: 
-      b = mid - 1
-    else: 
-      return mid
-  result = - 1
-
-proc isKeywordIgnoreCase(x: openarray[string], y: string): int = 
-  var a = 0
-  var b = len(x) - 1
-  while a <= b: 
-    var mid = (a + b) div 2
-    var c = cmpIgnoreCase(x[mid], y)
-    if c < 0: 
-      a = mid + 1
-    elif c > 0: 
-      b = mid - 1
-    else: 
-      return mid
-  result = - 1
-
-type 
-  TTokenizerFlag = enum 
+type
+  TokenizerFlag = enum
     hasPreprocessor, hasNestedComments
-  TTokenizerFlags = set[TTokenizerFlag]
+  TokenizerFlags = set[TokenizerFlag]
 
-proc clikeNextToken(g: var TGeneralTokenizer, keywords: openarray[string], 
-                    flags: TTokenizerFlags) = 
-  const 
+proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string],
+                    flags: TokenizerFlags) =
+  const
     hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
     octChars = {'0'..'7'}
     binChars = {'0'..'1'}
     symChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '\x80'..'\xFF'}
   var pos = g.pos
   g.start = g.pos
-  if g.state == gtStringLit: 
+  if g.state == gtStringLit:
     g.kind = gtStringLit
-    while true: 
+    while true:
       case g.buf[pos]
-      of '\\': 
+      of '\\':
         g.kind = gtEscapeSequence
         inc(pos)
         case g.buf[pos]
-        of 'x', 'X': 
+        of 'x', 'X':
           inc(pos)
           if g.buf[pos] in hexChars: inc(pos)
           if g.buf[pos] in hexChars: inc(pos)
-        of '0'..'9': 
+        of '0'..'9':
           while g.buf[pos] in {'0'..'9'}: inc(pos)
-        of '\0': 
+        of '\0':
           g.state = gtNone
         else: inc(pos)
-        break 
-      of '\0', '\x0D', '\x0A': 
+        break
+      of '\0', '\r', '\n':
         g.state = gtNone
-        break 
-      of '\"': 
+        break
+      of '\"':
         inc(pos)
         g.state = gtNone
-        break 
+        break
       else: inc(pos)
-  else: 
+  else:
     case g.buf[pos]
-    of ' ', '\x09'..'\x0D': 
+    of ' ', '\t'..'\r':
       g.kind = gtWhitespace
-      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
-    of '/': 
+      while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
+    of '/':
       inc(pos)
-      if g.buf[pos] == '/': 
+      if g.buf[pos] == '/':
         g.kind = gtComment
-        while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): inc(pos)
-      elif g.buf[pos] == '*': 
+        while not (g.buf[pos] in {'\0', '\n', '\r'}): inc(pos)
+      elif g.buf[pos] == '*':
         g.kind = gtLongComment
         var nested = 0
         inc(pos)
-        while true: 
+        while true:
           case g.buf[pos]
-          of '*': 
+          of '*':
             inc(pos)
-            if g.buf[pos] == '/': 
+            if g.buf[pos] == '/':
               inc(pos)
-              if nested == 0: break 
-          of '/': 
+              if nested == 0: break
+          of '/':
             inc(pos)
-            if g.buf[pos] == '*': 
+            if g.buf[pos] == '*':
               inc(pos)
               if hasNestedComments in flags: inc(nested)
-          of '\0': 
-            break 
+          of '\0':
+            break
           else: inc(pos)
-    of '#': 
+      else:
+        g.kind = gtOperator
+        while g.buf[pos] in OpChars: inc(pos)
+    of '#':
       inc(pos)
-      if hasPreprocessor in flags: 
+      if hasPreprocessor in flags:
         g.kind = gtPreprocessor
         while g.buf[pos] in {' ', '\t'}: inc(pos)
         while g.buf[pos] in symChars: inc(pos)
-      else: 
+      else:
         g.kind = gtOperator
-    of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF': 
+    of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
       var id = ""
-      while g.buf[pos] in SymChars: 
+      while g.buf[pos] in symChars:
         add(id, g.buf[pos])
         inc(pos)
       if isKeyword(keywords, id) >= 0: g.kind = gtKeyword
       else: g.kind = gtIdentifier
-    of '0': 
+    of '0':
       inc(pos)
       case g.buf[pos]
-      of 'b', 'B': 
+      of 'b', 'B':
         inc(pos)
         while g.buf[pos] in binChars: inc(pos)
         if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
-      of 'x', 'X': 
+      of 'x', 'X':
         inc(pos)
         while g.buf[pos] in hexChars: inc(pos)
         if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
-      of '0'..'7': 
+      of '0'..'7':
         inc(pos)
         while g.buf[pos] in octChars: inc(pos)
         if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
-      else: 
+      else:
         pos = generalNumber(g, pos)
         if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
-    of '1'..'9': 
+    of '1'..'9':
       pos = generalNumber(g, pos)
       if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
-    of '\'': 
+    of '\'':
       pos = generalStrLit(g, pos)
       g.kind = gtCharLit
-    of '\"': 
+    of '\"':
       inc(pos)
       g.kind = gtStringLit
-      while true: 
+      while true:
         case g.buf[pos]
-        of '\0': 
-          break 
-        of '\"': 
+        of '\0':
+          break
+        of '\"':
           inc(pos)
-          break 
-        of '\\': 
+          break
+        of '\\':
           g.state = g.kind
-          break 
+          break
         else: inc(pos)
-    of '(', ')', '[', ']', '{', '}', ':', ',', ';', '.': 
+    of '(', ')', '[', ']', '{', '}', ':', ',', ';', '.':
       inc(pos)
-      g.kind = gtPunctation
-    of '\0': 
+      g.kind = gtPunctuation
+    of '\0':
       g.kind = gtEof
-    else: 
-      if g.buf[pos] in OpChars: 
+    else:
+      if g.buf[pos] in OpChars:
         g.kind = gtOperator
         while g.buf[pos] in OpChars: inc(pos)
       else:
@@ -478,60 +572,468 @@ proc clikeNextToken(g: var TGeneralTokenizer, keywords: openarray[string],
     assert false, "clikeNextToken: produced an empty token"
   g.pos = pos
 
-proc cNextToken(g: var TGeneralTokenizer) = 
-  const 
-    keywords: array[0..36, string] = ["_Bool", "_Complex", "_Imaginary", "auto", 
-      "break", "case", "char", "const", "continue", "default", "do", "double", 
-      "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int", 
-      "long", "register", "restrict", "return", "short", "signed", "sizeof", 
-      "static", "struct", "switch", "typedef", "union", "unsigned", "void", 
+proc cNextToken(g: var GeneralTokenizer) =
+  const
+    keywords: array[0..36, string] = ["_Bool", "_Complex", "_Imaginary", "auto",
+      "break", "case", "char", "const", "continue", "default", "do", "double",
+      "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int",
+      "long", "register", "restrict", "return", "short", "signed", "sizeof",
+      "static", "struct", "switch", "typedef", "union", "unsigned", "void",
       "volatile", "while"]
   clikeNextToken(g, keywords, {hasPreprocessor})
 
-proc cppNextToken(g: var TGeneralTokenizer) = 
-  const 
-    keywords: array[0..47, string] = ["asm", "auto", "break", "case", "catch", 
-      "char", "class", "const", "continue", "default", "delete", "do", "double", 
-      "else", "enum", "extern", "float", "for", "friend", "goto", "if", 
-      "inline", "int", "long", "new", "operator", "private", "protected", 
-      "public", "register", "return", "short", "signed", "sizeof", "static", 
-      "struct", "switch", "template", "this", "throw", "try", "typedef", 
+proc cppNextToken(g: var GeneralTokenizer) =
+  const
+    keywords: array[0..47, string] = ["asm", "auto", "break", "case", "catch",
+      "char", "class", "const", "continue", "default", "delete", "do", "double",
+      "else", "enum", "extern", "float", "for", "friend", "goto", "if",
+      "inline", "int", "long", "new", "operator", "private", "protected",
+      "public", "register", "return", "short", "signed", "sizeof", "static",
+      "struct", "switch", "template", "this", "throw", "try", "typedef",
       "union", "unsigned", "virtual", "void", "volatile", "while"]
   clikeNextToken(g, keywords, {hasPreprocessor})
 
-proc csharpNextToken(g: var TGeneralTokenizer) = 
-  const 
-    keywords: array[0..76, string] = ["abstract", "as", "base", "bool", "break", 
-      "byte", "case", "catch", "char", "checked", "class", "const", "continue", 
-      "decimal", "default", "delegate", "do", "double", "else", "enum", "event", 
-      "explicit", "extern", "false", "finally", "fixed", "float", "for", 
-      "foreach", "goto", "if", "implicit", "in", "int", "interface", "internal", 
-      "is", "lock", "long", "namespace", "new", "null", "object", "operator", 
-      "out", "override", "params", "private", "protected", "public", "readonly", 
-      "ref", "return", "sbyte", "sealed", "short", "sizeof", "stackalloc", 
-      "static", "string", "struct", "switch", "this", "throw", "true", "try", 
-      "typeof", "uint", "ulong", "unchecked", "unsafe", "ushort", "using", 
+proc csharpNextToken(g: var GeneralTokenizer) =
+  const
+    keywords: array[0..76, string] = ["abstract", "as", "base", "bool", "break",
+      "byte", "case", "catch", "char", "checked", "class", "const", "continue",
+      "decimal", "default", "delegate", "do", "double", "else", "enum", "event",
+      "explicit", "extern", "false", "finally", "fixed", "float", "for",
+      "foreach", "goto", "if", "implicit", "in", "int", "interface", "internal",
+      "is", "lock", "long", "namespace", "new", "null", "object", "operator",
+      "out", "override", "params", "private", "protected", "public", "readonly",
+      "ref", "return", "sbyte", "sealed", "short", "sizeof", "stackalloc",
+      "static", "string", "struct", "switch", "this", "throw", "true", "try",
+      "typeof", "uint", "ulong", "unchecked", "unsafe", "ushort", "using",
       "virtual", "void", "volatile", "while"]
   clikeNextToken(g, keywords, {hasPreprocessor})
 
-proc javaNextToken(g: var TGeneralTokenizer) = 
-  const 
-    keywords: array[0..52, string] = ["abstract", "assert", "boolean", "break", 
-      "byte", "case", "catch", "char", "class", "const", "continue", "default", 
-      "do", "double", "else", "enum", "extends", "false", "final", "finally", 
-      "float", "for", "goto", "if", "implements", "import", "instanceof", "int", 
-      "interface", "long", "native", "new", "null", "package", "private", 
-      "protected", "public", "return", "short", "static", "strictfp", "super", 
-      "switch", "synchronized", "this", "throw", "throws", "transient", "true", 
+proc javaNextToken(g: var GeneralTokenizer) =
+  const
+    keywords: array[0..52, string] = ["abstract", "assert", "boolean", "break",
+      "byte", "case", "catch", "char", "class", "const", "continue", "default",
+      "do", "double", "else", "enum", "extends", "false", "final", "finally",
+      "float", "for", "goto", "if", "implements", "import", "instanceof", "int",
+      "interface", "long", "native", "new", "null", "package", "private",
+      "protected", "public", "return", "short", "static", "strictfp", "super",
+      "switch", "synchronized", "this", "throw", "throws", "transient", "true",
       "try", "void", "volatile", "while"]
   clikeNextToken(g, keywords, {})
 
-proc getNextToken*(g: var TGeneralTokenizer, lang: TSourceLanguage) = 
+proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) =
+  g.kind = gtStringLit
+  while g.buf[pos] notin {'\0', '\t'..'\r', ',', ']', '}'}:
+    if g.buf[pos] == ':' and
+        g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}:
+      break
+    inc(pos)
+
+proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) =
+  g.kind = gtNone
+  if g.buf[pos] == '-': inc(pos)
+  if g.buf[pos] == '0': inc(pos)
+  elif g.buf[pos] in '1'..'9':
+    inc(pos)
+    while g.buf[pos] in {'0'..'9'}: inc(pos)
+  else: yamlPlainStrLit(g, pos)
+  if g.kind == gtNone:
+    if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
+      g.kind = gtDecNumber
+    elif g.buf[pos] == '.':
+      inc(pos)
+      if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
+      else:
+        while g.buf[pos] in {'0'..'9'}: inc(pos)
+        if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
+          g.kind = gtFloatNumber
+    if g.kind == gtNone:
+      if g.buf[pos] in {'e', 'E'}:
+        inc(pos)
+        if g.buf[pos] in {'-', '+'}: inc(pos)
+        if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
+        else:
+          while g.buf[pos] in {'0'..'9'}: inc(pos)
+          if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
+            g.kind = gtFloatNumber
+          else: yamlPlainStrLit(g, pos)
+      else: yamlPlainStrLit(g, pos)
+  while g.buf[pos] notin {'\0', ',', ']', '}', '\n', '\r'}:
+    inc(pos)
+    if g.buf[pos] notin {'\t'..'\r', ' ', ',', ']', '}'}:
+      yamlPlainStrLit(g, pos)
+      break
+  # theoretically, we would need to parse indentation (like with block scalars)
+  # because of possible multiline flow scalars that start with number-like
+  # content, but that is far too troublesome. I think it is fine that the
+  # highlighter is sloppy here.
+
+proc yamlNextToken(g: var GeneralTokenizer) =
+  const
+    hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
+  var pos = g.pos
+  g.start = g.pos
+  if g.state == gtStringLit:
+    g.kind = gtStringLit
+    while true:
+      case g.buf[pos]
+      of '\\':
+        if pos != g.pos: break
+        g.kind = gtEscapeSequence
+        inc(pos)
+        case g.buf[pos]
+        of 'x':
+          inc(pos)
+          for i in 1..2:
+            if g.buf[pos] in hexChars: inc(pos)
+          break
+        of 'u':
+          inc(pos)
+          for i in 1..4:
+            if g.buf[pos] in hexChars: inc(pos)
+          break
+        of 'U':
+          inc(pos)
+          for i in 1..8:
+            if g.buf[pos] in hexChars: inc(pos)
+          break
+        else: inc(pos)
+        break
+      of '\0':
+        g.state = gtOther
+        break
+      of '\"':
+        inc(pos)
+        g.state = gtOther
+        break
+      else: inc(pos)
+  elif g.state == gtCharLit:
+    # abusing gtCharLit as single-quoted string lit
+    g.kind = gtStringLit
+    inc(pos) # skip the starting '
+    while true:
+      case g.buf[pos]
+      of '\'':
+        inc(pos)
+        if g.buf[pos] == '\'':
+          inc(pos)
+          g.kind = gtEscapeSequence
+        else: g.state = gtOther
+        break
+      else: inc(pos)
+  elif g.state == gtCommand:
+    # gtCommand means 'block scalar header'
+    case g.buf[pos]
+    of ' ', '\t':
+      g.kind = gtWhitespace
+      while g.buf[pos] in {' ', '\t'}: inc(pos)
+    of '#':
+      g.kind = gtComment
+      while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
+    of '\n', '\r': discard
+    else:
+      # illegal here. just don't parse a block scalar
+      g.kind = gtNone
+      g.state = gtOther
+    if g.buf[pos] in {'\n', '\r'} and g.state == gtCommand:
+      g.state = gtLongStringLit
+  elif g.state == gtLongStringLit:
+    # beware, this is the only token where we actually have to parse
+    # indentation.
+
+    g.kind = gtLongStringLit
+    # first, we have to find the parent indentation of the block scalar, so that
+    # we know when to stop
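+    # (e.g. for a mapping entry "key: |" the scalar's lines keep going while
+    # they are indented deeper than the "key:" line)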
+    assert g.buf[pos] in {'\n', '\r'}
+    var lookbehind = pos - 1
+    var headerStart = -1
+    while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}:
+      if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}:
+        headerStart = lookbehind
+      dec(lookbehind)
+    assert headerStart != -1
+    var indentation = 1
+    while g.buf[lookbehind + indentation] == ' ': inc(indentation)
+    if g.buf[lookbehind + indentation] in {'|', '>'}:
+      # when the header is alone in a line, this line does not show the parent's
+      # indentation, so we must go further. search the first previous line with
+      # non-whitespace content.
+      while lookbehind >= 0 and g.buf[lookbehind] in {'\n', '\r'}:
+        dec(lookbehind)
+        while lookbehind >= 0 and
+            g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind)
+      # now, find the beginning of the line...
+      while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}:
+        dec(lookbehind)
+      # ... and its indentation
+      indentation = 1
+      while g.buf[lookbehind + indentation] == ' ': inc(indentation)
+    if lookbehind == -1: indentation = 0 # top level
+    elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and
+        g.buf[lookbehind + 3] == '-' and
+        g.buf[lookbehind + 4] in {'\t'..'\r', ' '}:
+      # this is a document start, therefore, we are at top level
+      indentation = 0
+    # because lookbehind was at newline char when calculating indentation, we're
+    # off by one. fix that. top level's parent will have indentation of -1.
+    let parentIndentation = indentation - 1
+
+    # find first content
+    while g.buf[pos] in {' ', '\n', '\r'}:
+      if g.buf[pos] == ' ': inc(indentation)
+      else: indentation = 0
+      inc(pos)
+    var minIndentation = indentation
+
+    # for stupid edge cases, we must check whether an explicit indentation depth
+    # is given at the header.
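+    # (for instance, a block scalar header like "|2" carries an explicit
+    # indentation indicator)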
+    while g.buf[headerStart] in {'>', '|', '+', '-'}: inc(headerStart)
+    if g.buf[headerStart] in {'0'..'9'}:
+      minIndentation = min(minIndentation, ord(g.buf[headerStart]) - ord('0'))
+
+    # process content lines
+    while indentation > parentIndentation and g.buf[pos] != '\0':
+      if (indentation < minIndentation and g.buf[pos] == '#') or
+          (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and
+          g.buf[pos + 2] == '.' and
+          g.buf[pos + 3] in {'\0', '\t'..'\r', ' '}):
+        # comment after end of block scalar, or end of document
+        break
+      minIndentation = min(indentation, minIndentation)
+      while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
+      while g.buf[pos] in {' ', '\n', '\r'}:
+        if g.buf[pos] == ' ': inc(indentation)
+        else: indentation = 0
+        inc(pos)
+
+    g.state = gtOther
+  elif g.state == gtOther:
+    # gtOther means 'inside YAML document'
+    case g.buf[pos]
+    of ' ', '\t'..'\r':
+      g.kind = gtWhitespace
+      while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
+    of '#':
+      g.kind = gtComment
+      inc(pos)
+      while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
+    of '-':
+      inc(pos)
+      if g.buf[pos] in {'\0', ' ', '\t'..'\r'}:
+        g.kind = gtPunctuation
+      elif g.buf[pos] == '-' and
+          (pos == 1 or g.buf[pos - 2] in {'\n', '\r'}): # start of line
+        inc(pos)
+        if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}:
+          inc(pos)
+          g.kind = gtKeyword
+        else: yamlPossibleNumber(g, pos)
+      else: yamlPossibleNumber(g, pos)
+    of '.':
+      if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}:
+        inc(pos)
+        for i in 1..2:
+          if g.buf[pos] != '.': break
+          inc(pos)
+        if pos == g.start + 3:
+          g.kind = gtKeyword
+          g.state = gtNone
+        else: yamlPlainStrLit(g, pos)
+      else: yamlPlainStrLit(g, pos)
+    of '?':
+      inc(pos)
+      if g.buf[pos] in {'\0', ' ', '\t'..'\r'}:
+        g.kind = gtPunctuation
+      else: yamlPlainStrLit(g, pos)
+    of ':':
+      inc(pos)
+      if g.buf[pos] in {'\0', '\t'..'\r', ' ', '\'', '\"'} or
+          (pos > 0 and g.buf[pos - 2] in {'}', ']', '\"', '\''}):
+        g.kind = gtPunctuation
+      else: yamlPlainStrLit(g, pos)
+    of '[', ']', '{', '}', ',':
+      inc(pos)
+      g.kind = gtPunctuation
+    of '\"':
+      inc(pos)
+      g.state = gtStringLit
+      g.kind = gtStringLit
+    of '\'':
+      g.state = gtCharLit
+      g.kind = gtNone
+    of '!':
+      g.kind = gtTagStart
+      inc(pos)
+      if g.buf[pos] == '<':
+        # literal tag (e.g. `!<tag:yaml.org,2002:str>`)
+        while g.buf[pos] notin {'\0', '>', '\t'..'\r', ' '}: inc(pos)
+        if g.buf[pos] == '>': inc(pos)
+      else:
+        while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos)
+        case g.buf[pos]
+        of '!':
+          # prefixed tag (e.g. `!!str`)
+          inc(pos)
+          while g.buf[pos] notin
+              {'\0', '\t'..'\r', ' ', ',', '[', ']', '{', '}'}: inc(pos)
+        of '\0', '\t'..'\r', ' ': discard
+        else:
+          # local tag (e.g. `!nim:system:int`)
+          while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
+    of '&':
+      g.kind = gtLabel
+      while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
+    of '*':
+      g.kind = gtReference
+      while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
+    of '|', '>':
+      # this can lead to incorrect tokenization when | or > appear inside flow
+      # content. checking whether we're inside flow content is not
+      # chomsky type-3, so we won't do that here.
+      g.kind = gtCommand
+      g.state = gtCommand
+      inc(pos)
+      while g.buf[pos] in {'0'..'9', '+', '-'}: inc(pos)
+    of '0'..'9': yamlPossibleNumber(g, pos)
+    of '\0': g.kind = gtEof
+    else: yamlPlainStrLit(g, pos)
+  else:
+    # outside document
+    case g.buf[pos]
+    of '%':
+      if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}:
+        g.kind = gtDirective
+        while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
+      else:
+        g.state = gtOther
+        yamlPlainStrLit(g, pos)
+    of ' ', '\t'..'\r':
+      g.kind = gtWhitespace
+      while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
+    of '#':
+      g.kind = gtComment
+      while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
+    of '\0': g.kind = gtEof
+    else:
+      g.kind = gtNone
+      g.state = gtOther
+  g.length = pos - g.pos
+  g.pos = pos
+
+proc pythonNextToken(g: var GeneralTokenizer) =
+  const
+    keywords: array[0..34, string] = [
+      "False", "None", "True", "and", "as", "assert", "async", "await",
+      "break", "class", "continue", "def", "del", "elif", "else", "except",
+      "finally", "for", "from", "global", "if", "import", "in", "is", "lambda",
+      "nonlocal", "not", "or", "pass", "raise", "return", "try", "while",
+      "with", "yield"]
+  nimNextToken(g, keywords)
+
+proc cmdNextToken(g: var GeneralTokenizer, dollarPrompt = false) =
+  var pos = g.pos
+  g.start = g.pos
+  if g.state == low(TokenClass):
+    g.state = if dollarPrompt: gtPrompt else: gtProgram
+  case g.buf[pos]
+  of ' ', '\t'..'\r':
+    g.kind = gtWhitespace
+    while g.buf[pos] in {' ', '\t'..'\r'}:
+      if g.buf[pos] == '\n':
+        g.state = if dollarPrompt: gtPrompt else: gtProgram
+      inc(pos)
+  of '\'', '"':
+    g.kind = gtOption
+    let q = g.buf[pos]
+    inc(pos)
+    while g.buf[pos] notin {q, '\0'}:
+      inc(pos)
+    if g.buf[pos] == q: inc(pos)
+  of '#':
+    g.kind = gtComment
+    while g.buf[pos] notin {'\n', '\0'}:
+      inc(pos)
+  of '&', '|':
+    g.kind = gtOperator
+    inc(pos)
+    if g.buf[pos] == g.buf[pos-1]: inc(pos)
+    g.state = gtProgram
+  of '(':
+    g.kind = gtOperator
+    g.state = gtProgram
+    inc(pos)
+  of ')':
+    g.kind = gtOperator
+    inc(pos)
+  of ';':
+    g.state = gtProgram
+    g.kind = gtOperator
+    inc(pos)
+  of '\0': g.kind = gtEof
+  elif dollarPrompt and g.state == gtPrompt:
+    if g.buf[pos] == '$' and g.buf[pos+1] in {' ', '\t'}:
+      g.kind = gtPrompt
+      inc pos, 2
+      g.state = gtProgram
+    else:
+      g.kind = gtProgramOutput
+      while g.buf[pos] notin {'\n', '\0'}:
+        inc(pos)
+  else:
+    if g.state == gtProgram:
+      g.kind = gtProgram
+      g.state = gtOption
+    else:
+      g.kind = gtOption
+    while g.buf[pos] notin {' ', '\t'..'\r', '&', '|', '(', ')', '\'', '"', '\0'}:
+      if g.buf[pos] == ';' and g.buf[pos+1] == ' ':
+        # (check space because ';' can be used inside arguments in Win bat)
+        break
+      if g.kind == gtOption and g.buf[pos] in {'/', '\\', '.'}:
+        g.kind = gtIdentifier  # for file/dir name
+      elif g.kind == gtProgram and g.buf[pos] == '=':
+        g.kind = gtIdentifier  # for env variable setting at beginning of line
+        g.state = gtProgram
+      inc(pos)
+  g.length = pos - g.pos
+  g.pos = pos
+
+proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
+  g.lang = lang
   case lang
   of langNone: assert false
-  of langNimrod: nimNextToken(g)
+  of langNim: nimNextToken(g)
   of langCpp: cppNextToken(g)
   of langCsharp: csharpNextToken(g)
   of langC: cNextToken(g)
   of langJava: javaNextToken(g)
-  
+  of langYaml: yamlNextToken(g)
+  of langPython: pythonNextToken(g)
+  of langCmd: cmdNextToken(g)
+  of langConsole: cmdNextToken(g, dollarPrompt=true)
+
+proc tokenize*(text: string, lang: SourceLanguage): seq[(string, TokenClass)] =
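+  ## Convenience helper: tokenizes `text` as `lang` and returns each token's
+  ## substring paired with its `TokenClass`.
+  ## A minimal usage sketch:
+  ##   ```Nim
+  ##   for (s, kind) in tokenize("echo 42", langNim):
+  ##     echo kind, ": ", s
+  ##   ```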
+  var g: GeneralTokenizer
+  initGeneralTokenizer(g, text)
+  var prevPos = 0
+  while true:
+    getNextToken(g, lang)
+    if g.kind == gtEof:
+      break
+    var s = text[prevPos ..< g.pos]
+    result.add (s, g.kind)
+    prevPos = g.pos
+
+when isMainModule:
+  var keywords: seq[string]
+  # Try to work when run either from this subdir or from the repo root.
+  for filename in ["doc/keywords.txt", "../../../doc/keywords.txt"]:
+    try:
+      let input = readFile(filename)
+      keywords = input.splitWhitespace()
+      break
+    except:
+      echo filename, " not found"
+  doAssert(keywords.len > 0, "Couldn't read any keywords.txt file!")
+  for i in 0..min(keywords.len, nimKeywords.len)-1:
+    doAssert keywords[i] == nimKeywords[i], "Unexpected keyword"
+  doAssert keywords.len == nimKeywords.len, "No matching lengths"