path: root/lib/packages/docutils/highlite.nim


 
#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Source highlighter for programming or markup languages.
## Currently only a few languages are supported; other languages may be added.
## The interface supports one language nested in another.

import
  strutils

type 
  # Classification assigned to every token the tokenizers produce.
  # `gtEof` is reported once the end of the input buffer has been reached.
  TTokenClass* = enum 
    gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber, 
    gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit, 
    gtLongStringLit, gtCharLit, gtEscapeSequence, # escape sequence like \xff
    gtOperator, gtPunctation, gtComment, gtLongComment, gtRegularExpression, 
    gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler, 
    gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel, 
    gtReference, gtOther
  # Tokenizer state shared by the tokenizers of all supported languages.
  TGeneralTokenizer* = object of TObject
    kind*: TTokenClass         # class of the most recently scanned token
    start*, length*: int       # offset and size of that token within `buf`
    buf: cstring               # input text, set by `initGeneralTokenizer`
    pos: int                   # offset at which the next scan begins
    state: TTokenClass         # `gtStringLit` while a string is still open

  # Languages `getNextToken` dispatches on; `getSourceLanguage` returns
  # `langNone` for a name it does not recognize.
  TSourceLanguage* = enum 
    langNone, langNimrod, langCpp, langCsharp, langC, langJava

const 
  # Display name of every source language; `getSourceLanguage` compares
  # user-supplied names against these entries.
  sourceLanguageToStr*: array[TSourceLanguage, string] = ["none", "Nimrod", 
    "C++", "C#", "C", "Java"]
  # Name of every token class: the enum identifier without its `gt` prefix.
  tokenClassToStr*: array[TTokenClass, string] = ["Eof", "None", "Whitespace", 
    "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber", 
    "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit", 
    "EscapeSequence", "Operator", "Punctation", "Comment", "LongComment", 
    "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData", 
    "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink", 
    "Label", "Reference", "Other"]

  # The following list comes from doc/keywords.txt. Keep it synchronized with
  # that file; running this module as a program (see the `isMainModule`
  # section) compares the two lists entry by entry.
  nimrodKeywords = ["addr", "and", "as", "asm", "atomic", "bind", "block",
    "break", "case", "cast", "const", "continue", "converter", "discard",
    "distinct", "div", "do", "elif", "else", "end", "enum", "except", "export",
    "finally", "for", "from", "generic", "if", "import", "in", "include",
    "interface", "is", "isnot", "iterator", "lambda", "let", "macro", "method",
    "mixin", "mod", "nil", "not", "notin", "object", "of", "or", "out", "proc",
    "ptr", "raise", "ref", "return", "shared", "shl", "shr", "static",
    "template", "try", "tuple", "type", "using", "var", "when", "while", "with",
    "without", "xor", "yield"]

proc getSourceLanguage*(name: string): TSourceLanguage = 
  ## Maps a language name to its `TSourceLanguage` value. The comparison
  ## against `sourceLanguageToStr` uses `cmpIgnoreStyle`. `langNone` itself
  ## is never matched; it is the result when no other language fits.
  result = langNone
  for lang in succ(low(TSourceLanguage)) .. high(TSourceLanguage): 
    if cmpIgnoreStyle(name, sourceLanguageToStr[lang]) == 0: 
      result = lang
      break

proc initGeneralTokenizer*(g: var TGeneralTokenizer, buf: string) = 
  ## Prepares `g` for tokenizing `buf`: the token fields are reset and the
  ## scan position is placed on the first character of `buf` that is not
  ## whitespace.
  ##
  ## NOTE(review): `g.buf` is a `cstring` view of `buf`, not a copy, so `buf`
  ## presumably has to stay alive while `g` is in use -- confirm with callers.
  const blanks = {' ', '\x09'..'\x0D'}
  g.buf = cstring(buf)
  g.kind = low(TTokenClass)
  g.state = low(TTokenClass)
  g.start = 0
  g.length = 0
  g.pos = 0
  while g.buf[g.pos] in blanks: inc(g.pos)   # skip initial whitespace

proc deinitGeneralTokenizer*(g: var TGeneralTokenizer) = 
  ## Counterpart of `initGeneralTokenizer`. Currently a no-op: the body
  ## does not release or reset anything.
  discard

proc nimGetKeyword(id: string): TTokenClass = 
  ## Classifies the identifier `id`: `gtKeyword` if it equals an entry of
  ## `nimrodKeywords` under `cmpIgnoreStyle`, `gtIdentifier` otherwise.
  result = gtIdentifier
  for kw in nimrodKeywords:
    if cmpIgnoreStyle(id, kw) == 0:
      result = gtKeyword
      break
  
proc nimNumberPostfix(g: var TGeneralTokenizer, position: int): int = 
  ## Skips an optional type suffix (`'f`, `'F`, `'i` or `'I` followed by up
  ## to two digits) located at `position` and returns the offset behind it.
  ## An `f`/`F` suffix turns the token into a `gtFloatNumber`. If an
  ## apostrophe is followed by any other character, only the apostrophe is
  ## skipped.
  result = position
  if g.buf[result] != '\'': return
  inc(result)
  let suffix = g.buf[result]
  if suffix in {'f', 'F', 'i', 'I'}: 
    if suffix in {'f', 'F'}: g.kind = gtFloatNumber
    inc(result)
    # at most two digits of bit width
    for attempt in 1..2: 
      if g.buf[result] in {'0'..'9'}: inc(result)

proc nimNumber(g: var TGeneralTokenizer, position: int): int = 
  ## Scans a decimal or floating point literal starting at `position`
  ## (digits may contain `_`), including an optional type suffix, and
  ## returns the offset behind it. `g.kind` becomes `gtDecNumber`, or
  ## `gtFloatNumber` when a `.`, an exponent or an `'f` suffix is present.
  const digitOrSep = {'0'..'9', '_'}
  var cur = position
  template skipDigits() = 
    while g.buf[cur] in digitOrSep: inc(cur)
  g.kind = gtDecNumber
  skipDigits()
  if g.buf[cur] == '.': 
    # fractional part
    g.kind = gtFloatNumber
    inc(cur)
    skipDigits()
  if g.buf[cur] in {'e', 'E'}: 
    # exponent with optional sign
    g.kind = gtFloatNumber
    inc(cur)
    if g.buf[cur] in {'+', '-'}: inc(cur)
    skipDigits()
  result = nimNumberPostfix(g, cur)

const
  # Characters that can be part of an operator token. Both the Nimrod and
  # the C-like tokenizer consume a maximal run of them as one `gtOperator`.
  OpChars  = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', 
              '|', '=', '%', '&', '$', '@', '~', ':', '\x80'..'\xFF'}

proc nimNextToken(g: var TGeneralTokenizer) = 
  ## Scans the next Nimrod token starting at `g.pos`; its class is stored in
  ## `g.kind` and its extent in `g.start` / `g.length`, then `g.pos` is
  ## advanced past it. At the end of the buffer `g.kind` is `gtEof`.
  ##
  ## String literals that contain escape sequences are delivered in pieces:
  ## when a backslash interrupts a literal, `g.state` is set to `gtStringLit`
  ## and the following calls return one `gtEscapeSequence` or one
  ## `gtStringLit` segment each, until the closing quote or the end of the
  ## line resets `g.state`.
  const 
    hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'}
    octChars = {'0'..'7', '_'}
    binChars = {'0'..'1', '_'}
    SymChars = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  var pos = g.pos
  g.start = g.pos
  if g.state == gtStringLit and g.buf[pos] in {'\0', '\x0D', '\x0A'}: 
    # The open literal is unterminated and nothing of it is left on this
    # line: resume normal scanning instead of producing an empty token.
    g.state = gtNone
  if g.state == gtStringLit: 
    if g.buf[pos] == '\\': 
      # exactly one escape sequence per token
      g.kind = gtEscapeSequence
      inc(pos)
      case g.buf[pos]
      of 'x', 'X': 
        inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
      of '0'..'9': 
        while g.buf[pos] in {'0'..'9'}: inc(pos)
      of '\0': 
        g.state = gtNone
      else: inc(pos)
    else: 
      # plain text of the literal up to the next escape, the closing quote
      # or the end of the line
      g.kind = gtStringLit
      while true: 
        case g.buf[pos]
        of '\\': 
          break               # the escape becomes the next token
        of '\0', '\x0D', '\x0A': 
          g.state = gtNone
          break 
        of '\"': 
          inc(pos)
          g.state = gtNone
          break 
        else: inc(pos)
  else: 
    case g.buf[pos]
    of ' ', '\x09'..'\x0D': 
      g.kind = gtWhitespace
      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
    of '#': 
      # comment up to the end of the line
      g.kind = gtComment
      while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): inc(pos)
    of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF': 
      var id = ""
      while g.buf[pos] in SymChars + {'_'}: 
        add(id, g.buf[pos])
        inc(pos)
      if (g.buf[pos] == '\"'): 
        # an identifier directly followed by a quote prefixes a string
        if (g.buf[pos + 1] == '\"') and (g.buf[pos + 2] == '\"'): 
          inc(pos, 3)
          g.kind = gtLongStringLit
          while true: 
            case g.buf[pos]
            of '\0': 
              break 
            of '\"': 
              inc(pos)
              # the literal ends at `"""` that is not followed by a quote
              if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and 
                  g.buf[pos+2] != '\"': 
                inc(pos, 2)
                break 
            else: inc(pos)
        else: 
          g.kind = gtRawData
          inc(pos)
          while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): 
            # a quote that is not followed by another quote ends the data
            if g.buf[pos] == '"' and g.buf[pos+1] != '"': break
            inc(pos)
          if g.buf[pos] == '\"': inc(pos)
      else: 
        g.kind = nimGetKeyword(id)
    of '0': 
      inc(pos)
      # The class is assigned before the suffix is scanned because
      # `nimNumberPostfix` may still change it to `gtFloatNumber`.
      case g.buf[pos]
      of 'b', 'B': 
        g.kind = gtBinNumber
        inc(pos)
        while g.buf[pos] in binChars: inc(pos)
        pos = nimNumberPostfix(g, pos)
      of 'x', 'X': 
        g.kind = gtHexNumber
        inc(pos)
        while g.buf[pos] in hexChars: inc(pos)
        pos = nimNumberPostfix(g, pos)
      of 'o', 'O': 
        g.kind = gtOctNumber
        inc(pos)
        while g.buf[pos] in octChars: inc(pos)
        pos = nimNumberPostfix(g, pos)
      else: pos = nimNumber(g, pos)
    of '1'..'9': 
      pos = nimNumber(g, pos)
    of '\'': 
      inc(pos)
      g.kind = gtCharLit
      while true: 
        case g.buf[pos]
        of '\0', '\x0D', '\x0A': 
          break 
        of '\'': 
          inc(pos)
          break 
        of '\\': 
          inc(pos)
          # skip the escaped character, but never step over the final NUL
          if g.buf[pos] != '\0': inc(pos)
        else: inc(pos)
    of '\"': 
      inc(pos)
      if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'): 
        inc(pos, 2)
        g.kind = gtLongStringLit
        while true: 
          case g.buf[pos]
          of '\0': 
            break 
          of '\"': 
            inc(pos)
            if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and 
                g.buf[pos+2] != '\"': 
              inc(pos, 2)
              break 
          else: inc(pos)
      else: 
        g.kind = gtStringLit
        while true: 
          case g.buf[pos]
          of '\0', '\x0D', '\x0A': 
            break 
          of '\"': 
            inc(pos)
            break 
          of '\\': 
            # stop in front of the escape; the next calls continue the
            # literal in string state
            g.state = g.kind
            break 
          else: inc(pos)
    of '(', ')', '[', ']', '{', '}', '`', ':', ',', ';': 
      inc(pos)
      g.kind = gtPunctation
    of '\0': 
      g.kind = gtEof
    else: 
      if g.buf[pos] in OpChars: 
        g.kind = gtOperator
        while g.buf[pos] in OpChars: inc(pos)
      else: 
        inc(pos)
        g.kind = gtNone
  g.length = pos - g.pos
  if g.kind != gtEof and g.length <= 0:
    assert false, "nimNextToken: produced an empty token"
  g.pos = pos

proc generalNumber(g: var TGeneralTokenizer, position: int): int = 
  ## Scans a decimal or floating point literal starting at `position` and
  ## returns the offset behind it. `g.kind` becomes `gtDecNumber`, or
  ## `gtFloatNumber` when a `.` or an exponent is present.
  const digits = {'0'..'9'}
  var cur = position
  template skipDigits() = 
    while g.buf[cur] in digits: inc(cur)
  g.kind = gtDecNumber
  skipDigits()
  if g.buf[cur] == '.': 
    # fractional part
    g.kind = gtFloatNumber
    inc(cur)
    skipDigits()
  if g.buf[cur] in {'e', 'E'}: 
    # exponent with optional sign
    g.kind = gtFloatNumber
    inc(cur)
    if g.buf[cur] in {'+', '-'}: inc(cur)
    skipDigits()
  result = cur

proc generalStrLit(g: var TGeneralTokenizer, position: int): int = 
  ## Scans a quoted literal whose opening quote (`"` or `'`) is located at
  ## `position` and sets `g.kind` to `gtStringLit`. Returns the offset just
  ## behind the matching closing quote, or the offset of the terminating NUL
  ## if the literal is not closed. Backslash escapes are skipped as a unit so
  ## that an escaped quote does not end the literal.
  const 
    decChars = {'0'..'9'}
    hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  var pos = position
  g.kind = gtStringLit
  let quote = g.buf[pos]
  inc(pos)                    # skip " or '
  while true: 
    case g.buf[pos]
    of '\0': 
      break 
    of '\\': 
      inc(pos)                # now on the character behind the backslash
      case g.buf[pos]
      of '\0': 
        break 
      of '0'..'9': 
        while g.buf[pos] in decChars: inc(pos)
      of 'x', 'X': 
        inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
      else: 
        # Skip only the escaped character itself. Skipping two characters
        # here would also swallow the closing quote of literals like '\n'
        # and could step over the terminating NUL.
        inc(pos)
    else: 
      if g.buf[pos] == quote: 
        inc(pos)
        break 
      else: 
        inc(pos)
  result = pos

proc isKeyword(x: openArray[string], y: string): int = 
  ## Binary search for `y` in `x`, comparing with `cmp` (case sensitive).
  ## `x` has to be sorted in `cmp` order. Returns the index of the entry
  ## that equals `y`, or -1 if there is none.
  result = -1
  var lo = low(x)
  var hi = high(x)
  while lo <= hi: 
    let probe = (lo + hi) div 2
    let order = cmp(x[probe], y)
    if order == 0: 
      return probe
    if order < 0: lo = probe + 1
    else: hi = probe - 1

proc isKeywordIgnoreCase(x: openArray[string], y: string): int = 
  ## Binary search for `y` in `x`, comparing with `cmpIgnoreCase`. `x` has
  ## to be sorted in that order. Returns the index of the matching entry,
  ## or -1 if there is none.
  var first = 0
  var last = len(x) - 1
  while first <= last: 
    let middle = (first + last) div 2
    let order = cmpIgnoreCase(x[middle], y)
    if order > 0: 
      last = middle - 1
    elif order < 0: 
      first = middle + 1
    else: 
      return middle
  result = -1

type 
  # Switches for `clikeNextToken`:
  #   hasPreprocessor   - `#` starts a `gtPreprocessor` token
  #   hasNestedComments - `/*` inside a long comment opens a nested level
  TTokenizerFlag = enum 
    hasPreprocessor, hasNestedComments
  TTokenizerFlags = set[TTokenizerFlag]

proc clikeNextToken(g: var TGeneralTokenizer, keywords: openArray[string], 
                    flags: TTokenizerFlags) = 
  ## Scans the next token of a C-like language starting at `g.pos`; its
  ## class is stored in `g.kind` and its extent in `g.start` / `g.length`,
  ## then `g.pos` is advanced past it. At the end of the buffer `g.kind` is
  ## `gtEof`.
  ##
  ## `keywords` is looked up with a binary search and therefore has to be
  ## sorted in `cmp` order. `flags` selects language specific behaviour, see
  ## `TTokenizerFlag`.
  ##
  ## String literals that contain escape sequences are delivered in pieces:
  ## when a backslash interrupts a literal, `g.state` is set to `gtStringLit`
  ## and the following calls return one `gtEscapeSequence` or one
  ## `gtStringLit` segment each, until the closing quote or the end of the
  ## line resets `g.state`.
  const 
    hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
    octChars = {'0'..'7'}
    binChars = {'0'..'1'}
    symChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '\x80'..'\xFF'}
  var pos = g.pos
  g.start = g.pos
  if g.state == gtStringLit and g.buf[pos] in {'\0', '\x0D', '\x0A'}: 
    # The open literal is unterminated and nothing of it is left on this
    # line: resume normal scanning instead of producing an empty token.
    g.state = gtNone
  if g.state == gtStringLit: 
    if g.buf[pos] == '\\': 
      # exactly one escape sequence per token
      g.kind = gtEscapeSequence
      inc(pos)
      case g.buf[pos]
      of 'x', 'X': 
        inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
      of '0'..'9': 
        while g.buf[pos] in {'0'..'9'}: inc(pos)
      of '\0': 
        g.state = gtNone
      else: inc(pos)
    else: 
      # plain text of the literal up to the next escape, the closing quote
      # or the end of the line
      g.kind = gtStringLit
      while true: 
        case g.buf[pos]
        of '\\': 
          break               # the escape becomes the next token
        of '\0', '\x0D', '\x0A': 
          g.state = gtNone
          break 
        of '\"': 
          inc(pos)
          g.state = gtNone
          break 
        else: inc(pos)
  else: 
    case g.buf[pos]
    of ' ', '\x09'..'\x0D': 
      g.kind = gtWhitespace
      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
    of '/': 
      inc(pos)
      if g.buf[pos] == '/': 
        # line comment
        g.kind = gtComment
        while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): inc(pos)
      elif g.buf[pos] == '*': 
        g.kind = gtLongComment
        var nested = 0        # number of inner comments still open
        inc(pos)
        while true: 
          case g.buf[pos]
          of '*': 
            inc(pos)
            if g.buf[pos] == '/': 
              inc(pos)
              if nested == 0: break 
              dec(nested)     # closes an inner comment only
          of '/': 
            inc(pos)
            if g.buf[pos] == '*': 
              inc(pos)
              if hasNestedComments in flags: inc(nested)
          of '\0': 
            break 
          else: inc(pos)
      else: 
        # a `/` that does not start a comment is an ordinary operator
        g.kind = gtOperator
        while g.buf[pos] in OpChars: inc(pos)
    of '#': 
      inc(pos)
      if hasPreprocessor in flags: 
        # `#`, optional blanks and the directive name form one token
        g.kind = gtPreprocessor
        while g.buf[pos] in {' ', '\t'}: inc(pos)
        while g.buf[pos] in symChars: inc(pos)
      else: 
        g.kind = gtOperator
    of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF': 
      var id = ""
      while g.buf[pos] in symChars: 
        add(id, g.buf[pos])
        inc(pos)
      if isKeyword(keywords, id) >= 0: g.kind = gtKeyword
      else: g.kind = gtIdentifier
    of '0': 
      inc(pos)
      # Every branch assigns the class itself; a single letter after the
      # digits is consumed as a suffix.
      case g.buf[pos]
      of 'b', 'B': 
        g.kind = gtBinNumber
        inc(pos)
        while g.buf[pos] in binChars: inc(pos)
        if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
      of 'x', 'X': 
        g.kind = gtHexNumber
        inc(pos)
        while g.buf[pos] in hexChars: inc(pos)
        if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
      of '0'..'7': 
        g.kind = gtOctNumber
        inc(pos)
        while g.buf[pos] in octChars: inc(pos)
        if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
      else: 
        pos = generalNumber(g, pos)
        if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
    of '1'..'9': 
      pos = generalNumber(g, pos)
      if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
    of '\'': 
      pos = generalStrLit(g, pos)
      g.kind = gtCharLit
    of '\"': 
      inc(pos)
      g.kind = gtStringLit
      while true: 
        case g.buf[pos]
        of '\0': 
          break 
        of '\"': 
          inc(pos)
          break 
        of '\\': 
          # stop in front of the escape; the next calls continue the
          # literal in string state
          g.state = g.kind
          break 
        else: inc(pos)
    of '(', ')', '[', ']', '{', '}', ':', ',', ';', '.': 
      inc(pos)
      g.kind = gtPunctation
    of '\0': 
      g.kind = gtEof
    else: 
      if g.buf[pos] in OpChars: 
        g.kind = gtOperator
        while g.buf[pos] in OpChars: inc(pos)
      else:
        inc(pos)
        g.kind = gtNone
  g.length = pos - g.pos
  if g.kind != gtEof and g.length <= 0:
    assert false, "clikeNextToken: produced an empty token"
  g.pos = pos

proc cNextToken(g: var TGeneralTokenizer) = 
  ## Scans the next C token: `clikeNextToken` with the C keyword table and
  ## preprocessor directives enabled.
  const 
    # Kept sorted; `clikeNextToken` looks keywords up via `isKeyword`.
    cKeywords: array[0..36, string] = [
      "_Bool", "_Complex", "_Imaginary", 
      "auto", "break", 
      "case", "char", "const", "continue", 
      "default", "do", "double", 
      "else", "enum", "extern", 
      "float", "for", "goto", 
      "if", "inline", "int", "long", 
      "register", "restrict", "return", 
      "short", "signed", "sizeof", "static", "struct", "switch", 
      "typedef", "union", "unsigned", 
      "void", "volatile", "while"]
  clikeNextToken(g, cKeywords, {hasPreprocessor})

proc cppNextToken(g: var TGeneralTokenizer) = 
  ## Scans the next C++ token: `clikeNextToken` with the C++ keyword table
  ## and preprocessor directives enabled.
  const 
    # Kept sorted; `clikeNextToken` looks keywords up via `isKeyword`.
    cppKeywords: array[0..47, string] = [
      "asm", "auto", "break", 
      "case", "catch", "char", "class", "const", "continue", 
      "default", "delete", "do", "double", 
      "else", "enum", "extern", 
      "float", "for", "friend", "goto", 
      "if", "inline", "int", "long", "new", "operator", 
      "private", "protected", "public", 
      "register", "return", 
      "short", "signed", "sizeof", "static", "struct", "switch", 
      "template", "this", "throw", "try", "typedef", 
      "union", "unsigned", 
      "virtual", "void", "volatile", "while"]
  clikeNextToken(g, cppKeywords, {hasPreprocessor})

proc csharpNextToken(g: var TGeneralTokenizer) = 
  ## Scans the next C# token: `clikeNextToken` with the C# keyword table and
  ## preprocessor directives enabled.
  const 
    # Kept sorted; `clikeNextToken` looks keywords up via `isKeyword`.
    csharpKeywords: array[0..76, string] = [
      "abstract", "as", 
      "base", "bool", "break", "byte", 
      "case", "catch", "char", "checked", "class", "const", "continue", 
      "decimal", "default", "delegate", "do", "double", 
      "else", "enum", "event", "explicit", "extern", 
      "false", "finally", "fixed", "float", "for", "foreach", 
      "goto", 
      "if", "implicit", "in", "int", "interface", "internal", "is", 
      "lock", "long", 
      "namespace", "new", "null", 
      "object", "operator", "out", "override", 
      "params", "private", "protected", "public", 
      "readonly", "ref", "return", 
      "sbyte", "sealed", "short", "sizeof", "stackalloc", "static", 
      "string", "struct", "switch", 
      "this", "throw", "true", "try", "typeof", 
      "uint", "ulong", "unchecked", "unsafe", "ushort", "using", 
      "virtual", "void", "volatile", "while"]
  clikeNextToken(g, csharpKeywords, {hasPreprocessor})

proc javaNextToken(g: var TGeneralTokenizer) = 
  ## Scans the next Java token: `clikeNextToken` with the Java keyword table
  ## and no tokenizer flags set.
  const 
    # Kept sorted; `clikeNextToken` looks keywords up via `isKeyword`.
    javaKeywords: array[0..52, string] = [
      "abstract", "assert", 
      "boolean", "break", "byte", 
      "case", "catch", "char", "class", "const", "continue", 
      "default", "do", "double", 
      "else", "enum", "extends", 
      "false", "final", "finally", "float", "for", 
      "goto", 
      "if", "implements", "import", "instanceof", "int", "interface", 
      "long", 
      "native", "new", "null", 
      "package", "private", "protected", "public", 
      "return", 
      "short", "static", "strictfp", "super", "switch", "synchronized", 
      "this", "throw", "throws", "transient", "true", "try", 
      "void", "volatile", "while"]
  clikeNextToken(g, javaKeywords, {})

proc getNextToken*(g: var TGeneralTokenizer, lang: TSourceLanguage) = 
  ## Advances `g` by one token using the tokenizer that belongs to `lang`.
  ## There is no tokenizer for `langNone`; passing it trips an assertion.
  case lang
  of langNimrod: 
    nimNextToken(g)
  of langC: 
    cNextToken(g)
  of langCpp: 
    cppNextToken(g)
  of langCsharp: 
    csharpNextToken(g)
  of langJava: 
    javaNextToken(g)
  of langNone: 
    assert false
  
when isMainModule:
  # Self test: `nimrodKeywords` has to match the whitespace separated keyword
  # list in doc/keywords.txt entry by entry.
  var keywords: seq[string]
  # Try to work running in both the subdir or at the root.
  for filename in ["doc/keywords.txt", "../../../doc/keywords.txt"]:
    try:
      let input = string(readFile(filename))
      keywords = input.split()
      break
    except:
      # best effort: a missing file just means the next location is tried
      echo filename, " not found"
  doAssert(not keywords.isNil, "Couldn't read any keywords.txt file!")
  doAssert keywords.len == nimrodKeywords.len, "No matching lengths"
  for i in 0..keywords.len-1:
    #echo keywords[i], " == ", nimrodKeywords[i]
    doAssert keywords[i] == nimrodKeywords[i], "Unexpected keyword"
#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Source highlighter for programming or markup languages.
## Currently only a few languages are supported; other languages may be added.
## The interface supports one language nested in another.

import
  strutils

type 
  # Classification assigned to every token the tokenizers produce.
  # `gtEof` is reported once the end of the input buffer has been reached.
  TTokenClass* = enum 
    gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber, 
    gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit, 
    gtLongStringLit, gtCharLit, gtEscapeSequence, # escape sequence like \xff
    gtOperator, gtPunctation, gtComment, gtLongComment, gtRegularExpression, 
    gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler, 
    gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel, 
    gtReference, gtOther
  # Tokenizer state shared by the tokenizers of all supported languages.
  TGeneralTokenizer* = object of TObject
    kind*: TTokenClass         # class of the most recently scanned token
    start*, length*: int       # offset and size of that token within `buf`
    buf: cstring               # input text, set by `initGeneralTokenizer`
    pos: int                   # offset at which the next scan begins
    state: TTokenClass         # `gtStringLit` while a string is still open

  # Languages `getNextToken` dispatches on; `getSourceLanguage` returns
  # `langNone` for a name it does not recognize.
  TSourceLanguage* = enum 
    langNone, langNimrod, langCpp, langCsharp, langC, langJava

const 
  # Display name of every source language; `getSourceLanguage` compares
  # user-supplied names against these entries.
  sourceLanguageToStr*: array[TSourceLanguage, string] = ["none", "Nimrod", 
    "C++", "C#", "C", "Java"]
  # Name of every token class: the enum identifier without its `gt` prefix.
  tokenClassToStr*: array[TTokenClass, string] = ["Eof", "None", "Whitespace", 
    "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber", 
    "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit", 
    "EscapeSequence", "Operator", "Punctation", "Comment", "LongComment", 
    "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData", 
    "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink", 
    "Label", "Reference", "Other"]

  # The following list comes from doc/keywords.txt. Keep it synchronized with
  # that file; running this module as a program (see the `isMainModule`
  # section) compares the two lists entry by entry.
  nimrodKeywords = ["addr", "and", "as", "asm", "atomic", "bind", "block",
    "break", "case", "cast", "const", "continue", "converter", "discard",
    "distinct", "div", "do", "elif", "else", "end", "enum", "except", "export",
    "finally", "for", "from", "generic", "if", "import", "in", "include",
    "interface", "is", "isnot", "iterator", "lambda", "let", "macro", "method",
    "mixin", "mod", "nil", "not", "notin", "object", "of", "or", "out", "proc",
    "ptr", "raise", "ref", "return", "shared", "shl", "shr", "static",
    "template", "try", "tuple", "type", "using", "var", "when", "while", "with",
    "without", "xor", "yield"]

proc getSourceLanguage*(name: string): TSourceLanguage = 
  ## Maps a language name to its `TSourceLanguage` value. The comparison
  ## against `sourceLanguageToStr` uses `cmpIgnoreStyle`. `langNone` itself
  ## is never matched; it is the result when no other language fits.
  result = langNone
  for lang in succ(low(TSourceLanguage)) .. high(TSourceLanguage): 
    if cmpIgnoreStyle(name, sourceLanguageToStr[lang]) == 0: 
      result = lang
      break

proc initGeneralTokenizer*(g: var TGeneralTokenizer, buf: string) = 
  ## Prepares `g` for tokenizing `buf`: the token fields are reset and the
  ## scan position is placed on the first character of `buf` that is not
  ## whitespace.
  ##
  ## NOTE(review): `g.buf` is a `cstring` view of `buf`, not a copy, so `buf`
  ## presumably has to stay alive while `g` is in use -- confirm with callers.
  const blanks = {' ', '\x09'..'\x0D'}
  g.buf = cstring(buf)
  g.kind = low(TTokenClass)
  g.state = low(TTokenClass)
  g.start = 0
  g.length = 0
  g.pos = 0
  while g.buf[g.pos] in blanks: inc(g.pos)   # skip initial whitespace

proc deinitGeneralTokenizer*(g: var TGeneralTokenizer) = 
  ## Counterpart of `initGeneralTokenizer`. Currently a no-op: the body
  ## does not release or reset anything.
  discard

proc nimGetKeyword(id: string): TTokenClass = 
  ## Classifies the identifier `id`: `gtKeyword` if it equals an entry of
  ## `nimrodKeywords` under `cmpIgnoreStyle`, `gtIdentifier` otherwise.
  result = gtIdentifier
  for kw in nimrodKeywords:
    if cmpIgnoreStyle(id, kw) == 0:
      result = gtKeyword
      break
  
proc nimNumberPostfix(g: var TGeneralTokenizer, position: int): int = 
  ## Skips an optional type suffix (`'f`, `'F`, `'i` or `'I` followed by up
  ## to two digits) located at `position` and returns the offset behind it.
  ## An `f`/`F` suffix turns the token into a `gtFloatNumber`. If an
  ## apostrophe is followed by any other character, only the apostrophe is
  ## skipped.
  result = position
  if g.buf[result] != '\'': return
  inc(result)
  let suffix = g.buf[result]
  if suffix in {'f', 'F', 'i', 'I'}: 
    if suffix in {'f', 'F'}: g.kind = gtFloatNumber
    inc(result)
    # at most two digits of bit width
    for attempt in 1..2: 
      if g.buf[result] in {'0'..'9'}: inc(result)

proc nimNumber(g: var TGeneralTokenizer, position: int): int = 
  ## Scans a decimal or floating point literal starting at `position`
  ## (digits may contain `_`), including an optional type suffix, and
  ## returns the offset behind it. `g.kind` becomes `gtDecNumber`, or
  ## `gtFloatNumber` when a `.`, an exponent or an `'f` suffix is present.
  const digitOrSep = {'0'..'9', '_'}
  var cur = position
  template skipDigits() = 
    while g.buf[cur] in digitOrSep: inc(cur)
  g.kind = gtDecNumber
  skipDigits()
  if g.buf[cur] == '.': 
    # fractional part
    g.kind = gtFloatNumber
    inc(cur)
    skipDigits()
  if g.buf[cur] in {'e', 'E'}: 
    # exponent with optional sign
    g.kind = gtFloatNumber
    inc(cur)
    if g.buf[cur] in {'+', '-'}: inc(cur)
    skipDigits()
  result = nimNumberPostfix(g, cur)

const
  # Characters that can be part of an operator token. Both the Nimrod and
  # the C-like tokenizer consume a maximal run of them as one `gtOperator`.
  OpChars  = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', 
              '|', '=', '%', '&', '$', '@', '~', ':', '\x80'..'\xFF'}

proc nimNextToken(g: var TGeneralTokenizer) = 
  ## Scans the next Nimrod token starting at `g.pos`; its class is stored in
  ## `g.kind` and its extent in `g.start` / `g.length`, then `g.pos` is
  ## advanced past it. At the end of the buffer `g.kind` is `gtEof`.
  ##
  ## String literals that contain escape sequences are delivered in pieces:
  ## when a backslash interrupts a literal, `g.state` is set to `gtStringLit`
  ## and the following calls return one `gtEscapeSequence` or one
  ## `gtStringLit` segment each, until the closing quote or the end of the
  ## line resets `g.state`.
  const 
    hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'}
    octChars = {'0'..'7', '_'}
    binChars = {'0'..'1', '_'}
    SymChars = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  var pos = g.pos
  g.start = g.pos
  if g.state == gtStringLit and g.buf[pos] in {'\0', '\x0D', '\x0A'}: 
    # The open literal is unterminated and nothing of it is left on this
    # line: resume normal scanning instead of producing an empty token.
    g.state = gtNone
  if g.state == gtStringLit: 
    if g.buf[pos] == '\\': 
      # exactly one escape sequence per token
      g.kind = gtEscapeSequence
      inc(pos)
      case g.buf[pos]
      of 'x', 'X': 
        inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
      of '0'..'9': 
        while g.buf[pos] in {'0'..'9'}: inc(pos)
      of '\0': 
        g.state = gtNone
      else: inc(pos)
    else: 
      # plain text of the literal up to the next escape, the closing quote
      # or the end of the line
      g.kind = gtStringLit
      while true: 
        case g.buf[pos]
        of '\\': 
          break               # the escape becomes the next token
        of '\0', '\x0D', '\x0A': 
          g.state = gtNone
          break 
        of '\"': 
          inc(pos)
          g.state = gtNone
          break 
        else: inc(pos)
  else: 
    case g.buf[pos]
    of ' ', '\x09'..'\x0D': 
      g.kind = gtWhitespace
      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
    of '#': 
      # comment up to the end of the line
      g.kind = gtComment
      while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): inc(pos)
    of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF': 
      var id = ""
      while g.buf[pos] in SymChars + {'_'}: 
        add(id, g.buf[pos])
        inc(pos)
      if (g.buf[pos] == '\"'): 
        # an identifier directly followed by a quote prefixes a string
        if (g.buf[pos + 1] == '\"') and (g.buf[pos + 2] == '\"'): 
          inc(pos, 3)
          g.kind = gtLongStringLit
          while true: 
            case g.buf[pos]
            of '\0': 
              break 
            of '\"': 
              inc(pos)
              # the literal ends at `"""` that is not followed by a quote
              if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and 
                  g.buf[pos+2] != '\"': 
                inc(pos, 2)
                break 
            else: inc(pos)
        else: 
          g.kind = gtRawData
          inc(pos)
          while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): 
            # a quote that is not followed by another quote ends the data
            if g.buf[pos] == '"' and g.buf[pos+1] != '"': break
            inc(pos)
          if g.buf[pos] == '\"': inc(pos)
      else: 
        g.kind = nimGetKeyword(id)
    of '0': 
      inc(pos)
      # The class is assigned before the suffix is scanned because
      # `nimNumberPostfix` may still change it to `gtFloatNumber`.
      case g.buf[pos]
      of 'b', 'B': 
        g.kind = gtBinNumber
        inc(pos)
        while g.buf[pos] in binChars: inc(pos)
        pos = nimNumberPostfix(g, pos)
      of 'x', 'X': 
        g.kind = gtHexNumber
        inc(pos)
        while g.buf[pos] in hexChars: inc(pos)
        pos = nimNumberPostfix(g, pos)
      of 'o', 'O': 
        g.kind = gtOctNumber
        inc(pos)
        while g.buf[pos] in octChars: inc(pos)
        pos = nimNumberPostfix(g, pos)
      else: pos = nimNumber(g, pos)
    of '1'..'9': 
      pos = nimNumber(g, pos)
    of '\'': 
      inc(pos)
      g.kind = gtCharLit
      while true: 
        case g.buf[pos]
        of '\0', '\x0D', '\x0A': 
          break 
        of '\'': 
          inc(pos)
          break 
        of '\\': 
          inc(pos)
          # skip the escaped character, but never step over the final NUL
          if g.buf[pos] != '\0': inc(pos)
        else: inc(pos)
    of '\"': 
      inc(pos)
      if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'): 
        inc(pos, 2)
        g.kind = gtLongStringLit
        while true: 
          case g.buf[pos]
          of '\0': 
            break 
          of '\"': 
            inc(pos)
            if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and 
                g.buf[pos+2] != '\"': 
              inc(pos, 2)
              break 
          else: inc(pos)
      else: 
        g.kind = gtStringLit
        while true: 
          case g.buf[pos]
          of '\0', '\x0D', '\x0A': 
            break 
          of '\"': 
            inc(pos)
            break 
          of '\\': 
            # stop in front of the escape; the next calls continue the
            # literal in string state
            g.state = g.kind
            break 
          else: inc(pos)
    of '(', ')', '[', ']', '{', '}', '`', ':', ',', ';': 
      inc(pos)
      g.kind = gtPunctation
    of '\0': 
      g.kind = gtEof
    else: 
      if g.buf[pos] in OpChars: 
        g.kind = gtOperator
        while g.buf[pos] in OpChars: inc(pos)
      else: 
        inc(pos)
        g.kind = gtNone
  g.length = pos - g.pos
  if g.kind != gtEof and g.length <= 0:
    assert false, "nimNextToken: produced an empty token"
  g.pos = pos

proc generalNumber(g: var TGeneralTokenizer, position: int): int = 
  ## Scans a decimal or floating point literal starting at `position` and
  ## returns the offset behind it. `g.kind` becomes `gtDecNumber`, or
  ## `gtFloatNumber` when a `.` or an exponent is present.
  const digits = {'0'..'9'}
  var cur = position
  template skipDigits() = 
    while g.buf[cur] in digits: inc(cur)
  g.kind = gtDecNumber
  skipDigits()
  if g.buf[cur] == '.': 
    # fractional part
    g.kind = gtFloatNumber
    inc(cur)
    skipDigits()
  if g.buf[cur] in {'e', 'E'}: 
    # exponent with optional sign
    g.kind = gtFloatNumber
    inc(cur)
    if g.buf[cur] in {'+', '-'}: inc(cur)
    skipDigits()
  result = cur

proc generalStrLit(g: var TGeneralTokenizer, position: int): int = 
  ## Scans a quoted literal whose opening quote (`"` or `'`) is located at
  ## `position` and sets `g.kind` to `gtStringLit`. Returns the offset just
  ## behind the matching closing quote, or the offset of the terminating NUL
  ## if the literal is not closed. Backslash escapes are skipped as a unit so
  ## that an escaped quote does not end the literal.
  const 
    decChars = {'0'..'9'}
    hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  var pos = position
  g.kind = gtStringLit
  let quote = g.buf[pos]
  inc(pos)                    # skip " or '
  while true: 
    case g.buf[pos]
    of '\0': 
      break 
    of '\\': 
      inc(pos)                # now on the character behind the backslash
      case g.buf[pos]
      of '\0': 
        break 
      of '0'..'9': 
        while g.buf[pos] in decChars: inc(pos)
      of 'x', 'X': 
        inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
      else: 
        # Skip only the escaped character itself. Skipping two characters
        # here would also swallow the closing quote of literals like '\n'
        # and could step over the terminating NUL.
        inc(pos)
    else: 
      if g.buf[pos] == quote: 
        inc(pos)
        break 
      else: 
        inc(pos)
  result = pos

proc isKeyword(x: openArray[string], y: string): int = 
  ## Binary search for `y` in `x`, comparing with `cmp` (case sensitive).
  ## `x` has to be sorted in `cmp` order. Returns the index of the entry
  ## that equals `y`, or -1 if there is none.
  result = -1
  var lo = low(x)
  var hi = high(x)
  while lo <= hi: 
    let probe = (lo + hi) div 2
    let order = cmp(x[probe], y)
    if order == 0: 
      return probe
    if order < 0: lo = probe + 1
    else: hi = probe - 1

proc isKeywordIgnoreCase(x: openArray[string], y: string): int = 
  ## Binary search for `y` in `x`, comparing with `cmpIgnoreCase`. `x` has
  ## to be sorted in that order. Returns the index of the matching entry,
  ## or -1 if there is none.
  var first = 0
  var last = len(x) - 1
  while first <= last: 
    let middle = (first + last) div 2
    let order = cmpIgnoreCase(x[middle], y)
    if order > 0: 
      last = middle - 1
    elif order < 0: 
      first = middle + 1
    else: 
      return middle
  result = -1

type 
  # Switches for `clikeNextToken`:
  #   hasPreprocessor   - `#` starts a `gtPreprocessor` token
  #   hasNestedComments - `/*` inside a long comment opens a nested level
  TTokenizerFlag = enum 
    hasPreprocessor, hasNestedComments
  TTokenizerFlags = set[TTokenizerFlag]

proc clikeNextToken(g: var TGeneralTokenizer, keywords: openArray[string], 
                    flags: TTokenizerFlags) = 
  ## Scans the next token of a C-like language starting at `g.pos`; its
  ## class is stored in `g.kind` and its extent in `g.start` / `g.length`,
  ## then `g.pos` is advanced past it. At the end of the buffer `g.kind` is
  ## `gtEof`.
  ##
  ## `keywords` is looked up with a binary search and therefore has to be
  ## sorted in `cmp` order. `flags` selects language specific behaviour, see
  ## `TTokenizerFlag`.
  ##
  ## String literals that contain escape sequences are delivered in pieces:
  ## when a backslash interrupts a literal, `g.state` is set to `gtStringLit`
  ## and the following calls return one `gtEscapeSequence` or one
  ## `gtStringLit` segment each, until the closing quote or the end of the
  ## line resets `g.state`.
  const 
    hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
    octChars = {'0'..'7'}
    binChars = {'0'..'1'}
    symChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '\x80'..'\xFF'}
  var pos = g.pos
  g.start = g.pos
  if g.state == gtStringLit and g.buf[pos] in {'\0', '\x0D', '\x0A'}: 
    # The open literal is unterminated and nothing of it is left on this
    # line: resume normal scanning instead of producing an empty token.
    g.state = gtNone
  if g.state == gtStringLit: 
    if g.buf[pos] == '\\': 
      # exactly one escape sequence per token
      g.kind = gtEscapeSequence
      inc(pos)
      case g.buf[pos]
      of 'x', 'X': 
        inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
        if g.buf[pos] in hexChars: inc(pos)
      of '0'..'9': 
        while g.buf[pos] in {'0'..'9'}: inc(pos)
      of '\0': 
        g.state = gtNone
      else: inc(pos)
    else: 
      # plain text of the literal up to the next escape, the closing quote
      # or the end of the line
      g.kind = gtStringLit
      while true: 
        case g.buf[pos]
        of '\\': 
          break               # the escape becomes the next token
        of '\0', '\x0D', '\x0A': 
          g.state = gtNone
          break 
        of '\"': 
          inc(pos)
          g.state = gtNone
          break 
        else: inc(pos)
  else: 
    case g.buf[pos]
    of ' ', '\x09'..'\x0D': 
      g.kind = gtWhitespace
      while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
    of '/': 
      inc(pos)
      if g.buf[pos] == '/': 
        # line comment
        g.kind = gtComment
        while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): inc(pos)
      elif g.buf[pos] == '*': 
        g.kind = gtLongComment
        var nested = 0        # number of inner comments still open
        inc(pos)
        while true: 
          case g.buf[pos]
          of '*': 
            inc(pos)
            if g.buf[pos] == '/': 
              inc(pos)
              if nested == 0: break 
              dec(nested)     # closes an inner comment only
          of '/': 
            inc(pos)
            if g.buf[pos] == '*': 
              inc(pos)
              if hasNestedComments in flags: inc(nested)
          of '\0': 
            break 
          else: inc(pos)
      else: 
        # a `/` that does not start a comment is an ordinary operator
        g.kind = gtOperator
        while g.buf[pos] in OpChars: inc(pos)
    of '#': 
      inc(pos)
      if hasPreprocessor in flags: 
        # `#`, optional blanks and the directive name form one token
        g.kind = gtPreprocessor
        while g.buf[pos] in {' ', '\t'}: inc(pos)
        while g.buf[pos] in symChars: inc(pos)
      else: 
        g.kind = gtOperator
    of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF': 
      var id = ""
      while g.buf[pos] in symChars: 
        add(id, g.buf[pos])
        inc(pos)
      if isKeyword(keywords, id) >= 0: g.kind = gtKeyword
      else: g.kind = gtIdentifier
    of '0': 
      inc(pos)
      # Every branch assigns the class itself; a single letter after the
      # digits is consumed as a suffix.
      case g.buf[pos]
      of 'b', 'B': 
        g.kind = gtBinNumber
        inc(pos)
        while g.buf[pos] in binChars: inc(pos)
        if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
      of 'x', 'X': 
        g.kind = gtHexNumber
        inc(pos)
        while g.buf[pos] in hexChars: inc(pos)
        if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
      of '0'..'7': 
        g.kind = gtOctNumber
        inc(pos)
        while g.buf[pos] in octChars: inc(pos)
        if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
      else: 
        pos = generalNumber(g, pos)
        if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
    of '1'..'9': 
      pos = generalNumber(g, pos)
      if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
    of '\'': 
      pos = generalStrLit(g, pos)
      g.kind = gtCharLit
    of '\"': 
      inc(pos)
      g.kind = gtStringLit
      while true: 
        case g.buf[pos]
        of '\0': 
          break 
        of '\"': 
          inc(pos)
          break 
        of '\\': 
          # stop in front of the escape; the next calls continue the
          # literal in string state
          g.state = g.kind
          break 
        else: inc(pos)
    of '(', ')', '[', ']', '{', '}', ':', ',', ';', '.': 
      inc(pos)
      g.kind = gtPunctation
    of '\0': 
      g.kind = gtEof
    else: 
      if g.buf[pos] in OpChars: 
        g.kind = gtOperator
        while g.buf[pos] in OpChars: inc(pos)
      else:
        inc(pos)
        g.kind = gtNone
  g.length = pos - g.pos
  if g.kind != gtEof and g.length <= 0:
    assert false, "clikeNextToken: produced an empty token"
  g.pos = pos

proc cNextToken(g: var TGeneralTokenizer) = 
  ## Scans the next C token: `clikeNextToken` with the C keyword table and
  ## preprocessor directives enabled.
  const 
    # Kept sorted; `clikeNextToken` looks keywords up via `isKeyword`.
    cKeywords: array[0..36, string] = [
      "_Bool", "_Complex", "_Imaginary", 
      "auto", "break", 
      "case", "char", "const", "continue", 
      "default", "do", "double", 
      "else", "enum", "extern", 
      "float", "for", "goto", 
      "if", "inline", "int", "long", 
      "register", "restrict", "return", 
      "short", "signed", "sizeof", "static", "struct", "switch", 
      "typedef", "union", "unsigned", 
      "void", "volatile", "while"]
  clikeNextToken(g, cKeywords, {hasPreprocessor})

proc cppNextToken(g: var TGeneralTokenizer) = 
  ## Scans the next C++ token: `clikeNextToken` with the C++ keyword table
  ## and preprocessor directives enabled.
  const 
    # Kept sorted; `clikeNextToken` looks keywords up via `isKeyword`.
    cppKeywords: array[0..47, string] = [
      "asm", "auto", "break", 
      "case", "catch", "char", "class", "const", "continue", 
      "default", "delete", "do", "double", 
      "else", "enum", "extern", 
      "float", "for", "friend", "goto", 
      "if", "inline", "int", "long", "new", "operator", 
      "private", "protected", "public", 
      "register", "return", 
      "short", "signed", "sizeof", "static", "struct", "switch", 
      "template", "this", "throw", "try", "typedef", 
      "union", "unsigned", 
      "virtual", "void", "volatile", "while"]
  clikeNextToken(g, cppKeywords, {hasPreprocessor})

proc csharpNextToken(g: var TGeneralTokenizer) = 
  ## Scans the next C# token: `clikeNextToken` with the C# keyword table and
  ## preprocessor directives enabled.
  const 
    # Kept sorted; `clikeNextToken` looks keywords up via `isKeyword`.
    csharpKeywords: array[0..76, string] = [
      "abstract", "as", 
      "base", "bool", "break", "byte", 
      "case", "catch", "char", "checked", "class", "const", "continue", 
      "decimal", "default", "delegate", "do", "double", 
      "else", "enum", "event", "explicit", "extern", 
      "false", "finally", "fixed", "float", "for", "foreach", 
      "goto", 
      "if", "implicit", "in", "int", "interface", "internal", "is", 
      "lock", "long", 
      "namespace", "new", "null", 
      "object", "operator", "out", "override", 
      "params", "private", "protected", "public", 
      "readonly", "ref", "return", 
      "sbyte", "sealed", "short", "sizeof", "stackalloc", "static", 
      "string", "struct", "switch", 
      "this", "throw", "true", "try", "typeof", 
      "uint", "ulong", "unchecked", "unsafe", "ushort", "using", 
      "virtual", "void", "volatile", "while"]
  clikeNextToken(g, csharpKeywords, {hasPreprocessor})

proc javaNextToken(g: var TGeneralTokenizer) = 
  ## Scans the next Java token: `clikeNextToken` with the Java keyword table
  ## and no tokenizer flags set.
  const 
    # Kept sorted; `clikeNextToken` looks keywords up via `isKeyword`.
    javaKeywords: array[0..52, string] = [
      "abstract", "assert", 
      "boolean", "break", "byte", 
      "case", "catch", "char", "class", "const", "continue", 
      "default", "do", "double", 
      "else", "enum", "extends", 
      "false", "final", "finally", "float", "for", 
      "goto", 
      "if", "implements", "import", "instanceof", "int", "interface", 
      "long", 
      "native", "new", "null", 
      "package", "private", "protected", "public", 
      "return", 
      "short", "static", "strictfp", "super", "switch", "synchronized", 
      "this", "throw", "throws", "transient", "true", "try", 
      "void", "volatile", "while"]
  clikeNextToken(g, javaKeywords, {})

proc getNextToken*(g: var TGeneralTokenizer, lang: TSourceLanguage) = 
  ## Advances `g` by one token using the tokenizer that belongs to `lang`.
  ## There is no tokenizer for `langNone`; passing it trips an assertion.
  case lang
  of langNimrod: 
    nimNextToken(g)
  of langC: 
    cNextToken(g)
  of langCpp: 
    cppNextToken(g)
  of langCsharp: 
    csharpNextToken(g)
  of langJava: 
    javaNextToken(g)
  of langNone: 
    assert false
  
when isMainModule:
  # Self test: `nimrodKeywords` has to match the whitespace separated keyword
  # list in doc/keywords.txt entry by entry.
  var keywords: seq[string]
  # Try to work running in both the subdir or at the root.
  for filename in ["doc/keywords.txt", "../../../doc/keywords.txt"]:
    try:
      let input = string(readFile(filename))
      keywords = input.split()
      break
    except:
      # best effort: a missing file just means the next location is tried
      echo filename, " not found"
  doAssert(not keywords.isNil, "Couldn't read any keywords.txt file!")
  doAssert keywords.len == nimrodKeywords.len, "No matching lengths"
  for i in 0..keywords.len-1:
    #echo keywords[i], " == ", nimrodKeywords[i]
    doAssert keywords[i] == nimrodKeywords[i], "Unexpected keyword"