//
//
//           The Nimrod Compiler
//        (c) Copyright 2008 Andreas Rumpf
//
//    See the file "copying.txt", included in this
//    distribution, for details about the copyright.
//
unit highlite;

// Source highlighter for programming or markup languages.
// Currently only few languages are supported, other languages may be added.
// The interface supports one language nested in another.

interface

{$include 'config.inc'}

uses
  charsets, nsystem, sysutils, nhashes, options, msgs, strutils, platform,
  idents, lexbase, wordrecg, scanner;

type
  TTokenClass = (
    gtEof,
    gtNone,
    gtWhitespace,
    gtDecNumber,
    gtBinNumber,
    gtHexNumber,
    gtOctNumber,
    gtFloatNumber,
    gtIdentifier,
    gtKeyword,
    gtStringLit,
    gtLongStringLit,
    gtCharLit,
    gtEscapeSequence,         // escape sequence like \xff
    gtOperator,
    gtPunctation,
    gtComment,
    gtLongComment,
    gtRegularExpression,
    gtTagStart,
    gtTagEnd,
    gtKey,
    gtValue,
    gtRawData,
    gtAssembler,
    gtPreprocessor,
    gtDirective,
    gtCommand,
    gtRule,
    gtHyperlink,
    gtLabel,
    gtReference,
    gtOther
  );
  TGeneralTokenizer = object(NObject)
    kind: TTokenClass;
    start, len: int;
    // private:
    buf: PChar;
    pos: int;
    state: TTokenClass;
  end;
  TSourceLanguage = (
    langNone,
    langNimrod,
    langCpp,
    langCsharp,
    langC,
    langJava
  );
const
  sourceLanguageToStr: array [TSourceLanguage] of string = (
    'none', 'Nimrod', 'C++', 'C#', 'C'+'', 'Java'
  );
  tokenClassToStr: array [TTokenClass] of string = (
    'Eof',
    'None',
    'Whitespace',
    'DecNumber',
    'BinNumber',
    'HexNumber',
    'OctNumber',
    'FloatNumber',
    'Identifier',
    'Keyword',
    'StringLit',
    'LongStringLit',
    'CharLit',
    'EscapeSequence',
    'Operator',
    'Punctation',
    'Comment',
    'LongComment',
    'RegularExpression',
    'TagStart',
    'TagEnd',
    'Key',
    'Value',
    'RawData',
    'Assembler',
    'Preprocessor',
    'Directive',
    'Command',
    'Rule',
    'Hyperlink',
    'Label',
    'Reference',
    'Other'
  );

function getSourceLanguage(const name: string): TSourceLanguage;

procedure initGeneralTokenizer(var g: TGeneralTokenizer;
                               const buf: string);
procedure deinitGeneralTokenizer(var g: TGeneralTokenizer);
procedure getNextToken(var g: TGeneralTokenizer; lang: TSourceLanguage);

implementation

function getSourceLanguage(const name: string): TSourceLanguage;
var
  i: TSourceLanguage;
begin
  for i := succ(low(TSourceLanguage)) to high(TSourceLanguage) do
    if cmpIgnoreStyle(name, sourceLanguageToStr[i]) = 0 then begin
      result := i; exit
    end;
  result := langNone
end;

procedure initGeneralTokenizer(var g: TGeneralTokenizer;
                               const buf: string);
var
  pos: int;
begin
{@ignore} fillChar(g, sizeof(g), 0); {@emit}
  g.buf := PChar(buf);
  g.kind := low(TTokenClass);
  g.start := 0;
  g.len := 0;
  g.state := low(TTokenClass);
  pos := 0;
  // skip initial whitespace:
  while g.buf[pos] in [' ', #9..#13] do inc(pos);
  g.pos := pos;
end;

procedure deinitGeneralTokenizer(var g: TGeneralTokenizer);
begin
end;

function nimGetKeyword(const id: string): TTokenClass;
var
  i: PIdent;
begin
  i := getIdent(id);
  if (i.id >= ord(tokKeywordLow)-ord(tkSymbol)) and
     (i.id <= ord(tokKeywordHigh)-ord(tkSymbol)) then
    result := gtKeyword
  else
    result := gtIdentifier
end;

function nimNumberPostfix(var g: TGeneralTokenizer; position: int): int;
var
  pos: int;
begin
  pos := position;
  if g.buf[pos] = '''' then begin
    inc(pos);
    case g.buf[pos] of
      'f', 'F': begin
        g.kind := gtFloatNumber;
        inc(pos);
        if g.buf[pos] in ['0'..'9'] then inc(pos);
        if g.buf[pos] in ['0'..'9'] then inc(pos);
      end;
      'i', 'I': begin
        inc(pos);
        if g.buf[pos] in ['0'..'9'] then inc(pos);
        if g.buf[pos] in ['0'..'9'] then inc(pos);
      end;
      else begin end
    end
  end;
  result := pos;
end;

function nimNumber(var g: TGeneralTokenizer; position: int): int;
const
  decChars = ['0'..'9', '_'];
var
  pos: int;
begin
  pos := position;
  g.kind := gtDecNumber;
  while g.buf[pos] in decChars do inc(pos);
  if g.buf[pos] = '.' then begin
    g.kind := gtFloatNumber;
    inc(pos);
    while g.buf[pos] in decChars do inc(pos);
  end;
  if g.buf[pos] in ['e', 'E'] then begin
    g.kind := gtFloatNumber;
    inc(pos);
    if g.buf[pos] in ['+', '-'] then inc(pos);
    while g.buf[pos] in decChars do inc(pos);
  end;
  result := nimNumberPostfix(g, pos);
end;

procedure nimNextToken(var g: TGeneralTokenizer);
const
  hexChars = ['0'..'9', 'A'..'F', 'a'..'f', '_'];
  octChars = ['0'..'7', '_'];
  binChars = ['0'..'1', '_'];
var
  pos: int;
  id: string;
begin
  pos := g.pos;
  g.start := g.pos;
  if g.state = gtStringLit then begin
    g.kind := gtStringLit;
    while true do begin
      case g.buf[pos] of
        '\': begin
          g.kind := gtEscapeSequence;
          inc(pos);
          case g.buf[pos] of
            'x', 'X': begin
              inc(pos);
              if g.buf[pos] in hexChars then inc(pos);
              if g.buf[pos] in hexChars then inc(pos);
            end;
            '0'..'9': while g.buf[pos] in ['0'..'9'] do inc(pos);
            #0: g.state := gtNone;
            else inc(pos);
          end;
          break
        end;
        #0, #13, #10: begin g.state := gtNone; break end;
        '"': begin
          inc(pos);
          g.state := gtNone;
          break
        end;
        else inc(pos)
      end
    end
  end
  else begin
    case g.buf[pos] of
      ' ', #9..#13: begin
        g.kind := gtWhitespace;
        while g.buf[pos] in [' ', #9..#13] do inc(pos);
      end;
      '#': begin
        g.kind := gtComment;
        while not (g.buf[pos] in [#0, #10, #13]) do inc(pos);
      end;
      'a'..'z', 'A'..'Z', '_', #128..#255: begin
        id := '';
        while g.buf[pos] in scanner.SymChars+['_'] do begin
          addChar(id, g.buf[pos]);
          inc(pos)
        end;
        if (g.buf[pos] = '"') then begin
          if (g.buf[pos+1] = '"') and (g.buf[pos+2] = '"') then begin
            inc(pos, 3);
            g.kind := gtLongStringLit;
            while true do begin
              case g.buf[pos] of
                #0: break;
                '"': begin
                  inc(pos);
                  if (g.buf[pos] = '"') and (g.buf[pos+1] = '"') then begin
                    inc(pos, 2);
                    break
                  end
                end;
                else inc(pos);
              end
            end
          end
          else begin
            g.kind := gtRawData;
            inc(pos);
            while not (g.buf[pos] in [#0, '"', #10, #13]) do inc(pos);
            if g.buf[pos] = '"' then inc(pos);
          end
        end
        else begin
          g.kind := nimGetKeyword(id);
        end
      end;
      '0': begin
        inc(pos);
        case g.buf[pos] of
          'b', 'B': begin
            inc(pos);
            while g.buf[pos] in binChars do inc(pos);
            pos := nimNumberPostfix(g, pos);
          end;
          'x', 'X': begin
            inc(pos);
            while g.buf[pos] in hexChars do inc(pos);
            pos := nimNumberPostfix(g, pos);
          end;
          'o', 'O': begin
            inc(pos);
            while g.buf[pos] in octChars do inc(pos);
            pos := nimNumberPostfix(g, pos);
          end;
          else
            pos := nimNumber(g, pos);
        end
      end;
      '1'..'9': begin
        pos := nimNumber(g, pos);
      end;
      '''': begin
        inc(pos);
        g.kind := gtCharLit;
        while true do begin
          case g.buf[pos] of
            #0, #13, #10: break;
            '''': begin inc(pos); break end;
            '\': begin inc(pos, 2); end;
            else inc(pos);
          end
        end
      end;
      '"': begin
        inc(pos);
        if (g.buf[pos] = '"') and (g.buf[pos+1] = '"') then begin
          inc(pos, 2);
          g.kind := gtLongStringLit;
          while true do begin
            case g.buf[pos] of
              #0: break;
              '"': begin
                inc(pos);
                if (g.buf[pos] = '"') and (g.buf[pos+1] = '"') then begin
                  inc(pos, 2);
                  break
                end
              end;
              else inc(pos);
            end
          end
        end
        else begin
          g.kind := gtStringLit;
          while true do begin
            case g.buf[pos] of
              #0, #13, #10: break;
              '"': begin inc(pos); break end;
              '\': begin g.state := g.kind; break end;
              else inc(pos);
            end
          end
        end
      end;
      '(', ')', '[', ']', '{', '}', '`', ':', ',', ';': begin
        inc(pos);
        g.kind := gtPunctation
      end;
      #0: g.kind := gtEof;
      else if g.buf[pos] in scanner.OpChars then begin
        g.kind := gtOperator;
        while g.buf[pos] in scanner.OpChars do inc(pos);
      end
      else begin
        inc(pos);
        g.kind := gtNone
      end;
    end
  end;
  g.len := pos - g.pos;
  if (g.kind <> gtEof) and (g.len <= 0) then 
    InternalError('nimNextToken: ' + toString(g.buf));
  g.pos := pos;
end;

// ------------------------------- helpers ------------------------------------

function generalNumber(var g: TGeneralTokenizer; position: int): int;
const
  decChars = ['0'..'9'];
var
  pos: int;
begin
  pos := position;
  g.kind := gtDecNumber;
  while g.buf[pos] in decChars do inc(pos);
  if g.buf[pos] = '.' then begin
    g.kind := gtFloatNumber;
    inc(pos);
    while g.buf[pos] in decChars do inc(pos);
  end;
  if g.buf[pos] in ['e', 'E'] then begin
    g.kind := gtFloatNumber;
    inc(pos);
    if g.buf[pos] in ['+', '-'] then inc(pos);
    while g.buf[pos] in decChars do inc(pos);
  end;
  result := pos;
end;

function generalStrLit(var g: TGeneralTokenizer; position: int): int;
const
  decChars = ['0'..'9'];
  hexChars = ['0'..'9', 'A'..'F', 'a'..'f'];
var
  pos: int;
  c: Char;
begin
  pos := position;
  g.kind := gtStringLit;
  c := g.buf[pos];
  inc(pos); // skip " or '
  while true do begin
    case g.buf[pos] of
      #0: break;
      '\': begin
        inc(pos);
        case g.buf[pos] of
          #0: break;
          '0'..'9': while g.buf[pos] in decChars do inc(pos);
          'x', 'X': begin
            inc(pos);
            if g.buf[pos] in hexChars then inc(pos);
            if g.buf[pos] in hexChars then inc(pos);
          end;
          else inc(pos, 2)
        end
      end;
      else if g.buf[pos] = c then begin
        inc(pos); break;
      end
      else
        inc(pos);
    end
  end;
  result := pos;
end;

function isKeyword(const x: array of string; const y: string): int;
var
  a, b, mid, c: int;
begin
  a := 0;
  b := length(x)-1;
  while a <= b do begin
    mid := (a + b) div 2;
    c := cmp(x[mid], y);
    if c < 0 then
      a := mid + 1
    else if c > 0 then
      b := mid - 1
    else begin
      result := mid;
      exit
    end
  end;
  result := -1
end;

function isKeywordIgnoreCase(const x: array of string; const y: string): int;
var
  a, b, mid, c: int;
begin
  a := 0;
  b := length(x)-1;
  while a <= b do begin
    mid := (a + b) div 2;
    c := cmpIgnoreCase(x[mid], y);
    if c < 0 then
      a := mid + 1
    else if c > 0 then
      b := mid - 1
    else begin
      result := mid;
      exit
    end
  end;
  result := -1
end;

// ---------------------------------------------------------------------------

type
  TTokenizerFlag = (hasPreprocessor, hasNestedComments);
  TTokenizerFlags = set of TTokenizerFlag;

procedure clikeNextToken(var g: TGeneralTokenizer;
                         const keywords: array of string;
                         flags: TTokenizerFlags);
const
  hexChars = ['0'..'9', 'A'..'F', 'a'..'f'];
  octChars = ['0'..'7'];
  binChars = ['0'..'1'];
  symChars = ['A'..'Z', 'a'..'z', '0'..'9', '_', #128..#255];
var
  pos, nested: int;
  id: string;
begin
  pos := g.pos;
  g.start := g.pos;
  if g.state = gtStringLit then begin
    g.kind := gtStringLit;
    while true do begin
      case g.buf[pos] of
        '\': begin
          g.kind := gtEscapeSequence;
          inc(pos);
          case g.buf[pos] of
            'x', 'X': begin
              inc(pos);
              if g.buf[pos] in hexChars then inc(pos);
              if g.buf[pos] in hexChars then inc(pos);
            end;
            '0'..'9': while g.buf[pos] in ['0'..'9'] do inc(pos);
            #0: g.state := gtNone;
            else inc(pos);
          end;
          break
        end;
        #0, #13, #10: begin g.state := gtNone; break end;
        '"': begin
          inc(pos);
          g.state := gtNone;
          break
        end;
        else inc(pos)
      end
    end
  end
  else begin
    case g.buf[pos] of
      ' ', #9..#13: begin
        g.kind := gtWhitespace;
        while g.buf[pos] in [' ', #9..#13] do inc(pos);
      end;
      '/': begin
        inc(pos);
        if g.buf[pos] = '/' then begin
          g.kind := gtComment;
          while not (g.buf[pos] in [#0, #10, #13]) do inc(pos);
        end
        else if g.buf[pos] = '*' then begin
          g.kind := gtLongComment;
          nested := 0;
          inc(pos);
          while true do begin
            case g.buf[pos] of
              '*': begin
                inc(pos);
                if g.buf[pos] = '/' then begin
                  inc(pos);
                  if nested = 0 then break
                end;
              end;
              '/': begin
                inc(pos);
                if g.buf[pos] = '*' then begin
                  inc(pos);
                  if hasNestedComments in flags then inc(nested);
                end
              end;
              #0: break;
              else inc(pos);
            end
          end
        end
      end;
      '#': begin
        inc(pos);
        if hasPreprocessor in flags then begin
          g.kind := gtPreprocessor;
          while g.buf[pos] in [' ', Tabulator] do inc(pos);
          while g.buf[pos] in symChars do inc(pos);
        end
        else
          g.kind := gtOperator
      end;
      'a'..'z', 'A'..'Z', '_', #128..#255: begin
        id := '';
        while g.buf[pos] in SymChars do begin
          addChar(id, g.buf[pos]);
          inc(pos)
        end;
        if isKeyword(keywords, id) >= 0 then g.kind := gtKeyword
        else g.kind := gtIdentifier;
      end;
      '0': begin
        inc(pos);
        case g.buf[pos] of
          'b', 'B': begin
            inc(pos);
            while g.buf[pos] in binChars do inc(pos);
            if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos);
          end;
          'x', 'X': begin
            inc(pos);
            while g.buf[pos] in hexChars do inc(pos);
            if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos);
          end;
          '0'..'7': begin
            inc(pos);
            while g.buf[pos] in octChars do inc(pos);
            if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos);
          end;
          else begin
            pos := generalNumber(g, pos);
            if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos);
          end
        end
      end;
      '1'..'9': begin
        pos := generalNumber(g, pos);
        if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos);
      end;
      '''': begin
        pos := generalStrLit(g, pos);
        g.kind := gtCharLit;
      end;
      '"': begin
        inc(pos);
        g.kind := gtStringLit;
        while true do begin
          case g.buf[pos] of
            #0: break;
            '"': begin inc(pos); break end;
            '\': begin g.state := g.kind; break end;
            else inc(pos);
          end
        end
      end;
      '(', ')', '[', ']', '{', '}', ':', ',', ';', '.': begin
        inc(pos);
        g.kind := gtPunctation
      end;
      #0: g.kind := gtEof;
      else if g.buf[pos] in scanner.OpChars then begin
        g.kind := gtOperator;
        while g.buf[pos] in scanner.OpChars do inc(pos);
      end
      else begin
        inc(pos);
        g.kind := gtNone
      end;
    end
  end;
  g.len := pos - g.pos;
  if (g.kind <> gtEof) and (g.len <= 0) then InternalError('clikeNextToken');
  g.pos := pos;
end;

// --------------------------------------------------------------------------

procedure cNextToken(var g: TGeneralTokenizer);
const
  keywords: array [0..36] of string = (
    '_Bool', '_Complex', '_Imaginary',
    'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do',
    'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if',
    'inline', 'int', 'long', 'register', 'restrict', 'return', 'short',
    'signed', 'sizeof', 'static', 'struct', 'switch', 'typedef', 'union',
    'unsigned', 'void', 'volatile', 'while'
  );
begin
  clikeNextToken(g, keywords, {@set}[hasPreprocessor]);
end;

procedure cppNextToken(var g: TGeneralTokenizer);
const
  keywords: array [0..47] of string = (
    'asm', 'auto', 'break', 'case', 'catch', 'char', 'class', 'const',
    'continue', 'default', 'delete', 'do', 'double', 'else', 'enum', 'extern',
    'float', 'for', 'friend', 'goto', 'if', 'inline', 'int', 'long', 'new',
    'operator', 'private', 'protected', 'public', 'register', 'return',
    'short', 'signed', 'sizeof', 'static', 'struct', 'switch', 'template',
    'this', 'throw', 'try', 'typedef', 'union', 'unsigned', 'virtual', 'void',
    'volatile', 'while'
  );
begin
  clikeNextToken(g, keywords, {@set}[hasPreprocessor]);
end;

procedure csharpNextToken(var g: TGeneralTokenizer);
const
  keywords: array [0..76] of string = (
    'abstract', 'as', 'base', 'bool', 'break', 'byte', 'case', 'catch',
    'char', 'checked', 'class', 'const', 'continue', 'decimal', 'default',
    'delegate', 'do', 'double', 'else', 'enum', 'event', 'explicit', 'extern',
    'false', 'finally', 'fixed', 'float', 'for', 'foreach', 'goto', 'if',
    'implicit', 'in', 'int', 'interface', 'internal', 'is', 'lock', 'long',
    'namespace', 'new', 'null', 'object', 'operator', 'out', 'override',
    'params', 'private', 'protected', 'public', 'readonly', 'ref', 'return',
    'sbyte', 'sealed', 'short', 'sizeof', 'stackalloc', 'static', 'string',
    'struct', 'switch', 'this', 'throw', 'true', 'try', 'typeof', 'uint',
    'ulong', 'unchecked', 'unsafe', 'ushort', 'using', 'virtual', 'void',
    'volatile', 'while'
  );
begin
  clikeNextToken(g, keywords, {@set}[hasPreprocessor]);
end;

procedure javaNextToken(var g: TGeneralTokenizer);
const
  keywords: array [0..52] of string = (
    'abstract', 'assert', 'boolean', 'break', 'byte', 'case', 'catch',
    'char', 'class', 'const', 'continue', 'default', 'do', 'double', 'else',
    'enum', 'extends', 'false', 'final', 'finally', 'float', 'for', 'goto',
    'if', 'implements', 'import', 'instanceof', 'int', 'interface', 'long',
    'native', 'new', 'null', 'package', 'private', 'protected', 'public',
    'return', 'short', 'static', 'strictfp', 'super', 'switch',
    'synchronized', 'this', 'throw', 'throws', 'transient', 'true', 'try',
    'void', 'volatile', 'while'
  );
begin
  clikeNextToken(g, keywords, {@set}[]);
end;

procedure getNextToken(var g: TGeneralTokenizer; lang: TSourceLanguage);
begin
  case lang of
    langNimrod: nimNextToken(g);
    langCpp: cppNextToken(g);
    langCsharp: csharpNextToken(g);
    langC: cNextToken(g);
    langJava: javaNextToken(g);
    else InternalError('getNextToken');
  end
end;

end.