//
//
// The Nimrod Compiler
// (c) Copyright 2008 Andreas Rumpf
//
// See the file "copying.txt", included in this
// distribution, for details about the copyright.
//
unit highlite;
// Source highlighter for programming or markup languages.
// Currently only few languages are supported, other languages may be added.
// The interface supports one language nested in another.
interface
{$include 'config.inc'}
uses
charsets, nsystem, sysutils, nhashes, options, msgs, strutils, platform,
idents, lexbase, wordrecg, scanner;
type
TTokenClass = (
gtEof,
gtNone,
gtWhitespace,
gtDecNumber,
gtBinNumber,
gtHexNumber,
gtOctNumber,
gtFloatNumber,
gtIdentifier,
gtKeyword,
gtStringLit,
gtLongStringLit,
gtCharLit,
gtEscapeSequence, // escape sequence like \xff
gtOperator,
gtPunctation,
gtComment,
gtLongComment,
gtRegularExpression,
gtTagStart,
gtTagEnd,
gtKey,
gtValue,
gtRawData,
gtAssembler,
gtPreprocessor,
gtDirective,
gtCommand,
gtRule,
gtHyperlink,
gtLabel,
gtReference,
gtOther
);
TGeneralTokenizer = object(NObject)
kind: TTokenClass;
start, len: int;
// private:
buf: PChar;
pos: int;
state: TTokenClass;
end;
TSourceLanguage = (
langNone,
langNimrod,
langCpp,
langCsharp,
langC,
langJava
);
const
sourceLanguageToStr: array [TSourceLanguage] of string = (
'none', 'Nimrod', 'C++', 'C#', 'C'+'', 'Java'
);
tokenClassToStr: array [TTokenClass] of string = (
'Eof',
'None',
'Whitespace',
'DecNumber',
'BinNumber',
'HexNumber',
'OctNumber',
'FloatNumber',
'Identifier',
'Keyword',
'StringLit',
'LongStringLit',
'CharLit',
'EscapeSequence',
'Operator',
'Punctation',
'Comment',
'LongComment',
'RegularExpression',
'TagStart',
'TagEnd',
'Key',
'Value',
'RawData',
'Assembler',
'Preprocessor',
'Directive',
'Command',
'Rule',
'Hyperlink',
'Label',
'Reference',
'Other'
);
function getSourceLanguage(const name: string): TSourceLanguage;
procedure initGeneralTokenizer(var g: TGeneralTokenizer;
const buf: string);
procedure deinitGeneralTokenizer(var g: TGeneralTokenizer);
procedure getNextToken(var g: TGeneralTokenizer; lang: TSourceLanguage);
implementation
function getSourceLanguage(const name: string): TSourceLanguage;
var
i: TSourceLanguage;
begin
for i := succ(low(TSourceLanguage)) to high(TSourceLanguage) do
if cmpIgnoreStyle(name, sourceLanguageToStr[i]) = 0 then begin
result := i; exit
end;
result := langNone
end;
procedure initGeneralTokenizer(var g: TGeneralTokenizer;
const buf: string);
var
pos: int;
begin
{@ignore} fillChar(g, sizeof(g), 0); {@emit}
g.buf := PChar(buf);
g.kind := low(TTokenClass);
g.start := 0;
g.len := 0;
g.state := low(TTokenClass);
pos := 0;
// skip initial whitespace:
while g.buf[pos] in [' ', #9..#13] do inc(pos);
g.pos := pos;
end;
procedure deinitGeneralTokenizer(var g: TGeneralTokenizer);
begin
end;
function nimGetKeyword(const id: string): TTokenClass;
var
i: PIdent;
begin
i := getIdent(id);
if (i.id >= ord(tokKeywordLow)-ord(tkSymbol)) and
(i.id <= ord(tokKeywordHigh)-ord(tkSymbol)) then
result := gtKeyword
else
result := gtIdentifier
end;
function nimNumberPostfix(var g: TGeneralTokenizer; position: int): int;
var
pos: int;
begin
pos := position;
if g.buf[pos] = '''' then begin
inc(pos);
case g.buf[pos] of
'f', 'F': begin
g.kind := gtFloatNumber;
inc(pos);
if g.buf[pos] in ['0'..'9'] then inc(pos);
if g.buf[pos] in ['0'..'9'] then inc(pos);
end;
'i', 'I': begin
inc(pos);
if g.buf[pos] in ['0'..'9'] then inc(pos);
if g.buf[pos] in ['0'..'9'] then inc(pos);
end;
else begin end
end
end;
result := pos;
end;
function nimNumber(var g: TGeneralTokenizer; position: int): int;
const
decChars = ['0'..'9', '_'];
var
pos: int;
begin
pos := position;
g.kind := gtDecNumber;
while g.buf[pos] in decChars do inc(pos);
if g.buf[pos] = '.' then begin
g.kind := gtFloatNumber;
inc(pos);
while g.buf[pos] in decChars do inc(pos);
end;
if g.buf[pos] in ['e', 'E'] then begin
g.kind := gtFloatNumber;
inc(pos);
if g.buf[pos] in ['+', '-'] then inc(pos);
while g.buf[pos] in decChars do inc(pos);
end;
result := nimNumberPostfix(g, pos);
end;
procedure nimNextToken(var g: TGeneralTokenizer);
const
hexChars = ['0'..'9', 'A'..'F', 'a'..'f', '_'];
octChars = ['0'..'7', '_'];
binChars = ['0'..'1', '_'];
var
pos: int;
id: string;
begin
pos := g.pos;
g.start := g.pos;
if g.state = gtStringLit then begin
g.kind := gtStringLit;
while true do begin
case g.buf[pos] of
'\': begin
g.kind := gtEscapeSequence;
inc(pos);
case g.buf[pos] of
'x', 'X': begin
inc(pos);
if g.buf[pos] in hexChars then inc(pos);
if g.buf[pos] in hexChars then inc(pos);
end;
'0'..'9': while g.buf[pos] in ['0'..'9'] do inc(pos);
#0: g.state := gtNone;
else inc(pos);
end;
break
end;
#0, #13, #10: begin g.state := gtNone; break end;
'"': begin
inc(pos);
g.state := gtNone;
break
end;
else inc(pos)
end
end
end
else begin
case g.buf[pos] of
' ', #9..#13: begin
g.kind := gtWhitespace;
while g.buf[pos] in [' ', #9..#13] do inc(pos);
end;
'#': begin
g.kind := gtComment;
while not (g.buf[pos] in [#0, #10, #13]) do inc(pos);
end;
'a'..'z', 'A'..'Z', '_', #128..#255: begin
id := '';
while g.buf[pos] in scanner.SymChars+['_'] do begin
addChar(id, g.buf[pos]);
inc(pos)
end;
if (g.buf[pos] = '"') then begin
if (g.buf[pos+1] = '"') and (g.buf[pos+2] = '"') then begin
inc(pos, 3);
g.kind := gtLongStringLit;
while true do begin
case g.buf[pos] of
#0: break;
'"': begin
inc(pos);
if (g.buf[pos] = '"') and (g.buf[pos+1] = '"') then begin
inc(pos, 2);
break
end
end;
else inc(pos);
end
end
end
else begin
g.kind := gtRawData;
inc(pos);
while not (g.buf[pos] in [#0, '"', #10, #13]) do inc(pos);
if g.buf[pos] = '"' then inc(pos);
end
end
else begin
g.kind := nimGetKeyword(id);
end
end;
'0': begin
inc(pos);
case g.buf[pos] of
'b', 'B': begin
inc(pos);
while g.buf[pos] in binChars do inc(pos);
pos := nimNumberPostfix(g, pos);
end;
'x', 'X': begin
inc(pos);
while g.buf[pos] in hexChars do inc(pos);
pos := nimNumberPostfix(g, pos);
end;
'o', 'O': begin
inc(pos);
while g.buf[pos] in octChars do inc(pos);
pos := nimNumberPostfix(g, pos);
end;
else
pos := nimNumber(g, pos);
end
end;
'1'..'9': begin
pos := nimNumber(g, pos);
end;
'''': begin
inc(pos);
g.kind := gtCharLit;
while true do begin
case g.buf[pos] of
#0, #13, #10: break;
'''': begin inc(pos); break end;
'\': begin inc(pos, 2); end;
else inc(pos);
end
end
end;
'"': begin
inc(pos);
if (g.buf[pos] = '"') and (g.buf[pos+1] = '"') then begin
inc(pos, 2);
g.kind := gtLongStringLit;
while true do begin
case g.buf[pos] of
#0: break;
'"': begin
inc(pos);
if (g.buf[pos] = '"') and (g.buf[pos+1] = '"') then begin
inc(pos, 2);
break
end
end;
else inc(pos);
end
end
end
else begin
g.kind := gtStringLit;
while true do begin
case g.buf[pos] of
#0, #13, #10: break;
'"': begin inc(pos); break end;
'\': begin g.state := g.kind; break end;
else inc(pos);
end
end
end
end;
'(', ')', '[', ']', '{', '}', '`', ':', ',', ';': begin
inc(pos);
g.kind := gtPunctation
end;
#0: g.kind := gtEof;
else if g.buf[pos] in scanner.OpChars then begin
g.kind := gtOperator;
while g.buf[pos] in scanner.OpChars do inc(pos);
end
else begin
inc(pos);
g.kind := gtNone
end;
end
end;
g.len := pos - g.pos;
if (g.kind <> gtEof) and (g.len <= 0) then
InternalError('nimNextToken: ' + toString(g.buf));
g.pos := pos;
end;
// ------------------------------- helpers ------------------------------------
function generalNumber(var g: TGeneralTokenizer; position: int): int;
const
decChars = ['0'..'9'];
var
pos: int;
begin
pos := position;
g.kind := gtDecNumber;
while g.buf[pos] in decChars do inc(pos);
if g.buf[pos] = '.' then begin
g.kind := gtFloatNumber;
inc(pos);
while g.buf[pos] in decChars do inc(pos);
end;
if g.buf[pos] in ['e', 'E'] then begin
g.kind := gtFloatNumber;
inc(pos);
if g.buf[pos] in ['+', '-'] then inc(pos);
while g.buf[pos] in decChars do inc(pos);
end;
result := pos;
end;
function generalStrLit(var g: TGeneralTokenizer; position: int): int;
const
decChars = ['0'..'9'];
hexChars = ['0'..'9', 'A'..'F', 'a'..'f'];
var
pos: int;
c: Char;
begin
pos := position;
g.kind := gtStringLit;
c := g.buf[pos];
inc(pos); // skip " or '
while true do begin
case g.buf[pos] of
#0: break;
'\': begin
inc(pos);
case g.buf[pos] of
#0: break;
'0'..'9': while g.buf[pos] in decChars do inc(pos);
'x', 'X': begin
inc(pos);
if g.buf[pos] in hexChars then inc(pos);
if g.buf[pos] in hexChars then inc(pos);
end;
else inc(pos, 2)
end
end;
else if g.buf[pos] = c then begin
inc(pos); break;
end
else
inc(pos);
end
end;
result := pos;
end;
function isKeyword(const x: array of string; const y: string): int;
var
a, b, mid, c: int;
begin
a := 0;
b := length(x)-1;
while a <= b do begin
mid := (a + b) div 2;
c := cmp(x[mid], y);
if c < 0 then
a := mid + 1
else if c > 0 then
b := mid - 1
else begin
result := mid;
exit
end
end;
result := -1
end;
function isKeywordIgnoreCase(const x: array of string; const y: string): int;
var
a, b, mid, c: int;
begin
a := 0;
b := length(x)-1;
while a <= b do begin
mid := (a + b) div 2;
c := cmpIgnoreCase(x[mid], y);
if c < 0 then
a := mid + 1
else if c > 0 then
b := mid - 1
else begin
result := mid;
exit
end
end;
result := -1
end;
// ---------------------------------------------------------------------------
type
TTokenizerFlag = (hasPreprocessor, hasNestedComments);
TTokenizerFlags = set of TTokenizerFlag;
procedure clikeNextToken(var g: TGeneralTokenizer;
const keywords: array of string;
flags: TTokenizerFlags);
const
hexChars = ['0'..'9', 'A'..'F', 'a'..'f'];
octChars = ['0'..'7'];
binChars = ['0'..'1'];
symChars = ['A'..'Z', 'a'..'z', '0'..'9', '_', #128..#255];
var
pos, nested: int;
id: string;
begin
pos := g.pos;
g.start := g.pos;
if g.state = gtStringLit then begin
g.kind := gtStringLit;
while true do begin
case g.buf[pos] of
'\': begin
g.kind := gtEscapeSequence;
inc(pos);
case g.buf[pos] of
'x', 'X': begin
inc(pos);
if g.buf[pos] in hexChars then inc(pos);
if g.buf[pos] in hexChars then inc(pos);
end;
'0'..'9': while g.buf[pos] in ['0'..'9'] do inc(pos);
#0: g.state := gtNone;
else inc(pos);
end;
break
end;
#0, #13, #10: begin g.state := gtNone; break end;
'"': begin
inc(pos);
g.state := gtNone;
break
end;
else inc(pos)
end
end
end
else begin
case g.buf[pos] of
' ', #9..#13: begin
g.kind := gtWhitespace;
while g.buf[pos] in [' ', #9..#13] do inc(pos);
end;
'/': begin
inc(pos);
if g.buf[pos] = '/' then begin
g.kind := gtComment;
while not (g.buf[pos] in [#0, #10, #13]) do inc(pos);
end
else if g.buf[pos] = '*' then begin
g.kind := gtLongComment;
nested := 0;
inc(pos);
while true do begin
case g.buf[pos] of
'*': begin
inc(pos);
if g.buf[pos] = '/' then begin
inc(pos);
if nested = 0 then break
end;
end;
'/': begin
inc(pos);
if g.buf[pos] = '*' then begin
inc(pos);
if hasNestedComments in flags then inc(nested);
end
end;
#0: break;
else inc(pos);
end
end
end
end;
'#': begin
inc(pos);
if hasPreprocessor in flags then begin
g.kind := gtPreprocessor;
while g.buf[pos] in [' ', Tabulator] do inc(pos);
while g.buf[pos] in symChars do inc(pos);
end
else
g.kind := gtOperator
end;
'a'..'z', 'A'..'Z', '_', #128..#255: begin
id := '';
while g.buf[pos] in SymChars do begin
addChar(id, g.buf[pos]);
inc(pos)
end;
if isKeyword(keywords, id) >= 0 then g.kind := gtKeyword
else g.kind := gtIdentifier;
end;
'0': begin
inc(pos);
case g.buf[pos] of
'b', 'B': begin
inc(pos);
while g.buf[pos] in binChars do inc(pos);
if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos);
end;
'x', 'X': begin
inc(pos);
while g.buf[pos] in hexChars do inc(pos);
if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos);
end;
'0'..'7': begin
inc(pos);
while g.buf[pos] in octChars do inc(pos);
if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos);
end;
else begin
pos := generalNumber(g, pos);
if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos);
end
end
end;
'1'..'9': begin
pos := generalNumber(g, pos);
if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos);
end;
'''': begin
pos := generalStrLit(g, pos);
g.kind := gtCharLit;
end;
'"': begin
inc(pos);
g.kind := gtStringLit;
while true do begin
case g.buf[pos] of
#0: break;
'"': begin inc(pos); break end;
'\': begin g.state := g.kind; break end;
else inc(pos);
end
end
end;
'(', ')', '[', ']', '{', '}', ':', ',', ';', '.': begin
inc(pos);
g.kind := gtPunctation
end;
#0: g.kind := gtEof;
else if g.buf[pos] in scanner.OpChars then begin
g.kind := gtOperator;
while g.buf[pos] in scanner.OpChars do inc(pos);
end
else begin
inc(pos);
g.kind := gtNone
end;
end
end;
g.len := pos - g.pos;
if (g.kind <> gtEof) and (g.len <= 0) then InternalError('clikeNextToken');
g.pos := pos;
end;
// --------------------------------------------------------------------------
procedure cNextToken(var g: TGeneralTokenizer);
const
keywords: array [0..36] of string = (
'_Bool', '_Complex', '_Imaginary',
'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do',
'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if',
'inline', 'int', 'long', 'register', 'restrict', 'return', 'short',
'signed', 'sizeof', 'static', 'struct', 'switch', 'typedef', 'union',
'unsigned', 'void', 'volatile', 'while'
);
begin
clikeNextToken(g, keywords, {@set}[hasPreprocessor]);
end;
procedure cppNextToken(var g: TGeneralTokenizer);
const
keywords: array [0..47] of string = (
'asm', 'auto', 'break', 'case', 'catch', 'char', 'class', 'const',
'continue', 'default', 'delete', 'do', 'double', 'else', 'enum', 'extern',
'float', 'for', 'friend', 'goto', 'if', 'inline', 'int', 'long', 'new',
'operator', 'private', 'protected', 'public', 'register', 'return',
'short', 'signed', 'sizeof', 'static', 'struct', 'switch', 'template',
'this', 'throw', 'try', 'typedef', 'union', 'unsigned', 'virtual', 'void',
'volatile', 'while'
);
begin
clikeNextToken(g, keywords, {@set}[hasPreprocessor]);
end;
procedure csharpNextToken(var g: TGeneralTokenizer);
const
keywords: array [0..76] of string = (
'abstract', 'as', 'base', 'bool', 'break', 'byte', 'case', 'catch',
'char', 'checked', 'class', 'const', 'continue', 'decimal', 'default',
'delegate', 'do', 'double', 'else', 'enum', 'event', 'explicit', 'extern',
'false', 'finally', 'fixed', 'float', 'for', 'foreach', 'goto', 'if',
'implicit', 'in', 'int', 'interface', 'internal', 'is', 'lock', 'long',
'namespace', 'new', 'null', 'object', 'operator', 'out', 'override',
'params', 'private', 'protected', 'public', 'readonly', 'ref', 'return',
'sbyte', 'sealed', 'short', 'sizeof', 'stackalloc', 'static', 'string',
'struct', 'switch', 'this', 'throw', 'true', 'try', 'typeof', 'uint',
'ulong', 'unchecked', 'unsafe', 'ushort', 'using', 'virtual', 'void',
'volatile', 'while'
);
begin
clikeNextToken(g, keywords, {@set}[hasPreprocessor]);
end;
procedure javaNextToken(var g: TGeneralTokenizer);
const
keywords: array [0..52] of string = (
'abstract', 'assert', 'boolean', 'break', 'byte', 'case', 'catch',
'char', 'class', 'const', 'continue', 'default', 'do', 'double', 'else',
'enum', 'extends', 'false', 'final', 'finally', 'float', 'for', 'goto',
'if', 'implements', 'import', 'instanceof', 'int', 'interface', 'long',
'native', 'new', 'null', 'package', 'private', 'protected', 'public',
'return', 'short', 'static', 'strictfp', 'super', 'switch',
'synchronized', 'this', 'throw', 'throws', 'transient', 'true', 'try',
'void', 'volatile', 'while'
);
begin
clikeNextToken(g, keywords, {@set}[]);
end;
procedure getNextToken(var g: TGeneralTokenizer; lang: TSourceLanguage);
begin
case lang of
langNimrod: nimNextToken(g);
langCpp: cppNextToken(g);
langCsharp: csharpNextToken(g);
langC: cNextToken(g);
langJava: javaNextToken(g);
else InternalError('getNextToken');
end
end;
end.