diff options
Diffstat (limited to 'nim/highlite.pas')
-rwxr-xr-x | nim/highlite.pas | 743 |
1 files changed, 0 insertions, 743 deletions
diff --git a/nim/highlite.pas b/nim/highlite.pas deleted file mode 100755 index fa760d2a2..000000000 --- a/nim/highlite.pas +++ /dev/null @@ -1,743 +0,0 @@ -// -// -// The Nimrod Compiler -// (c) Copyright 2008 Andreas Rumpf -// -// See the file "copying.txt", included in this -// distribution, for details about the copyright. -// -unit highlite; - -// Source highlighter for programming or markup languages. -// Currently only few languages are supported, other languages may be added. -// The interface supports one language nested in another. - -interface - -{$include 'config.inc'} - -uses - charsets, nsystem, sysutils, nhashes, options, msgs, strutils, platform, - idents, lexbase, wordrecg, scanner; - -type - TTokenClass = ( - gtEof, - gtNone, - gtWhitespace, - gtDecNumber, - gtBinNumber, - gtHexNumber, - gtOctNumber, - gtFloatNumber, - gtIdentifier, - gtKeyword, - gtStringLit, - gtLongStringLit, - gtCharLit, - gtEscapeSequence, // escape sequence like \xff - gtOperator, - gtPunctation, - gtComment, - gtLongComment, - gtRegularExpression, - gtTagStart, - gtTagEnd, - gtKey, - gtValue, - gtRawData, - gtAssembler, - gtPreprocessor, - gtDirective, - gtCommand, - gtRule, - gtHyperlink, - gtLabel, - gtReference, - gtOther - ); - TGeneralTokenizer = object(NObject) - kind: TTokenClass; - start, len: int; - // private: - buf: PChar; - pos: int; - state: TTokenClass; - end; - TSourceLanguage = ( - langNone, - langNimrod, - langCpp, - langCsharp, - langC, - langJava - ); -const - sourceLanguageToStr: array [TSourceLanguage] of string = ( - 'none', 'Nimrod', 'C++', 'C#', 'C'+'', 'Java' - ); - tokenClassToStr: array [TTokenClass] of string = ( - 'Eof', - 'None', - 'Whitespace', - 'DecNumber', - 'BinNumber', - 'HexNumber', - 'OctNumber', - 'FloatNumber', - 'Identifier', - 'Keyword', - 'StringLit', - 'LongStringLit', - 'CharLit', - 'EscapeSequence', - 'Operator', - 'Punctation', - 'Comment', - 'LongComment', - 'RegularExpression', - 'TagStart', - 'TagEnd', - 'Key', - 'Value', - 'RawData', - 'Assembler', - 'Preprocessor', - 'Directive', - 'Command', - 'Rule', - 'Hyperlink', - 'Label', - 'Reference', - 'Other' - ); - -function getSourceLanguage(const name: string): TSourceLanguage; - -procedure initGeneralTokenizer(var g: TGeneralTokenizer; - const buf: string); -procedure deinitGeneralTokenizer(var g: TGeneralTokenizer); -procedure getNextToken(var g: TGeneralTokenizer; lang: TSourceLanguage); - -implementation - -function getSourceLanguage(const name: string): TSourceLanguage; -var - i: TSourceLanguage; -begin - for i := succ(low(TSourceLanguage)) to high(TSourceLanguage) do - if cmpIgnoreStyle(name, sourceLanguageToStr[i]) = 0 then begin - result := i; exit - end; - result := langNone -end; - -procedure initGeneralTokenizer(var g: TGeneralTokenizer; - const buf: string); -var - pos: int; -begin -{@ignore} fillChar(g, sizeof(g), 0); {@emit} - g.buf := PChar(buf); - g.kind := low(TTokenClass); - g.start := 0; - g.len := 0; - g.state := low(TTokenClass); - pos := 0; - // skip initial whitespace: - while g.buf[pos] in [' ', #9..#13] do inc(pos); - g.pos := pos; -end; - -procedure deinitGeneralTokenizer(var g: TGeneralTokenizer); -begin -end; - -function nimGetKeyword(const id: string): TTokenClass; -var - i: PIdent; -begin - i := getIdent(id); - if (i.id >= ord(tokKeywordLow)-ord(tkSymbol)) and - (i.id <= ord(tokKeywordHigh)-ord(tkSymbol)) then - result := gtKeyword - else - result := gtIdentifier -end; - -function nimNumberPostfix(var g: TGeneralTokenizer; position: int): int; -var - pos: int; -begin - pos := position; - if g.buf[pos] = '''' then begin - inc(pos); - case g.buf[pos] of - 'f', 'F': begin - g.kind := gtFloatNumber; - inc(pos); - if g.buf[pos] in ['0'..'9'] then inc(pos); - if g.buf[pos] in ['0'..'9'] then inc(pos); - end; - 'i', 'I': begin - inc(pos); - if g.buf[pos] in ['0'..'9'] then inc(pos); - if g.buf[pos] in ['0'..'9'] then inc(pos); - end; - else begin end - end - end; - result := pos; -end; - -function nimNumber(var g: TGeneralTokenizer; position: int): int; -const - decChars = ['0'..'9', '_']; -var - pos: int; -begin - pos := position; - g.kind := gtDecNumber; - while g.buf[pos] in decChars do inc(pos); - if g.buf[pos] = '.' then begin - g.kind := gtFloatNumber; - inc(pos); - while g.buf[pos] in decChars do inc(pos); - end; - if g.buf[pos] in ['e', 'E'] then begin - g.kind := gtFloatNumber; - inc(pos); - if g.buf[pos] in ['+', '-'] then inc(pos); - while g.buf[pos] in decChars do inc(pos); - end; - result := nimNumberPostfix(g, pos); -end; - -procedure nimNextToken(var g: TGeneralTokenizer); -const - hexChars = ['0'..'9', 'A'..'F', 'a'..'f', '_']; - octChars = ['0'..'7', '_']; - binChars = ['0'..'1', '_']; -var - pos: int; - id: string; -begin - pos := g.pos; - g.start := g.pos; - if g.state = gtStringLit then begin - g.kind := gtStringLit; - while true do begin - case g.buf[pos] of - '\': begin - g.kind := gtEscapeSequence; - inc(pos); - case g.buf[pos] of - 'x', 'X': begin - inc(pos); - if g.buf[pos] in hexChars then inc(pos); - if g.buf[pos] in hexChars then inc(pos); - end; - '0'..'9': while g.buf[pos] in ['0'..'9'] do inc(pos); - #0: g.state := gtNone; - else inc(pos); - end; - break - end; - #0, #13, #10: begin g.state := gtNone; break end; - '"': begin - inc(pos); - g.state := gtNone; - break - end; - else inc(pos) - end - end - end - else begin - case g.buf[pos] of - ' ', #9..#13: begin - g.kind := gtWhitespace; - while g.buf[pos] in [' ', #9..#13] do inc(pos); - end; - '#': begin - g.kind := gtComment; - while not (g.buf[pos] in [#0, #10, #13]) do inc(pos); - end; - 'a'..'z', 'A'..'Z', '_', #128..#255: begin - id := ''; - while g.buf[pos] in scanner.SymChars+['_'] do begin - addChar(id, g.buf[pos]); - inc(pos) - end; - if (g.buf[pos] = '"') then begin - if (g.buf[pos+1] = '"') and (g.buf[pos+2] = '"') then begin - inc(pos, 3); - g.kind := gtLongStringLit; - while true do begin - case g.buf[pos] of - #0: break; - '"': begin - inc(pos); - if (g.buf[pos] = '"') and (g.buf[pos+1] = '"') then begin - inc(pos, 2); - break - end - end; - else inc(pos); - end - end - end - else begin - g.kind := gtRawData; - inc(pos); - while not (g.buf[pos] in [#0, '"', #10, #13]) do inc(pos); - if g.buf[pos] = '"' then inc(pos); - end - end - else begin - g.kind := nimGetKeyword(id); - end - end; - '0': begin - inc(pos); - case g.buf[pos] of - 'b', 'B': begin - inc(pos); - while g.buf[pos] in binChars do inc(pos); - pos := nimNumberPostfix(g, pos); - end; - 'x', 'X': begin - inc(pos); - while g.buf[pos] in hexChars do inc(pos); - pos := nimNumberPostfix(g, pos); - end; - 'o', 'O': begin - inc(pos); - while g.buf[pos] in octChars do inc(pos); - pos := nimNumberPostfix(g, pos); - end; - else - pos := nimNumber(g, pos); - end - end; - '1'..'9': begin - pos := nimNumber(g, pos); - end; - '''': begin - inc(pos); - g.kind := gtCharLit; - while true do begin - case g.buf[pos] of - #0, #13, #10: break; - '''': begin inc(pos); break end; - '\': begin inc(pos, 2); end; - else inc(pos); - end - end - end; - '"': begin - inc(pos); - if (g.buf[pos] = '"') and (g.buf[pos+1] = '"') then begin - inc(pos, 2); - g.kind := gtLongStringLit; - while true do begin - case g.buf[pos] of - #0: break; - '"': begin - inc(pos); - if (g.buf[pos] = '"') and (g.buf[pos+1] = '"') then begin - inc(pos, 2); - break - end - end; - else inc(pos); - end - end - end - else begin - g.kind := gtStringLit; - while true do begin - case g.buf[pos] of - #0, #13, #10: break; - '"': begin inc(pos); break end; - '\': begin g.state := g.kind; break end; - else inc(pos); - end - end - end - end; - '(', ')', '[', ']', '{', '}', '`', ':', ',', ';': begin - inc(pos); - g.kind := gtPunctation - end; - #0: g.kind := gtEof; - else if g.buf[pos] in scanner.OpChars then begin - g.kind := gtOperator; - while g.buf[pos] in scanner.OpChars do inc(pos); - end - else begin - inc(pos); - g.kind := gtNone - end; - end - end; - g.len := pos - g.pos; - if (g.kind <> gtEof) and (g.len <= 0) then - InternalError('nimNextToken: ' + toString(g.buf)); - g.pos := pos; -end; - -// ------------------------------- helpers ------------------------------------ - -function generalNumber(var g: TGeneralTokenizer; position: int): int; -const - decChars = ['0'..'9']; -var - pos: int; -begin - pos := position; - g.kind := gtDecNumber; - while g.buf[pos] in decChars do inc(pos); - if g.buf[pos] = '.' then begin - g.kind := gtFloatNumber; - inc(pos); - while g.buf[pos] in decChars do inc(pos); - end; - if g.buf[pos] in ['e', 'E'] then begin - g.kind := gtFloatNumber; - inc(pos); - if g.buf[pos] in ['+', '-'] then inc(pos); - while g.buf[pos] in decChars do inc(pos); - end; - result := pos; -end; - -function generalStrLit(var g: TGeneralTokenizer; position: int): int; -const - decChars = ['0'..'9']; - hexChars = ['0'..'9', 'A'..'F', 'a'..'f']; -var - pos: int; - c: Char; -begin - pos := position; - g.kind := gtStringLit; - c := g.buf[pos]; - inc(pos); // skip " or ' - while true do begin - case g.buf[pos] of - #0: break; - '\': begin - inc(pos); - case g.buf[pos] of - #0: break; - '0'..'9': while g.buf[pos] in decChars do inc(pos); - 'x', 'X': begin - inc(pos); - if g.buf[pos] in hexChars then inc(pos); - if g.buf[pos] in hexChars then inc(pos); - end; - else inc(pos, 2) - end - end; - else if g.buf[pos] = c then begin - inc(pos); break; - end - else - inc(pos); - end - end; - result := pos; -end; - -function isKeyword(const x: array of string; const y: string): int; -var - a, b, mid, c: int; -begin - a := 0; - b := length(x)-1; - while a <= b do begin - mid := (a + b) div 2; - c := cmp(x[mid], y); - if c < 0 then - a := mid + 1 - else if c > 0 then - b := mid - 1 - else begin - result := mid; - exit - end - end; - result := -1 -end; - -function isKeywordIgnoreCase(const x: array of string; const y: string): int; -var - a, b, mid, c: int; -begin - a := 0; - b := length(x)-1; - while a <= b do begin - mid := (a + b) div 2; - c := cmpIgnoreCase(x[mid], y); - if c < 0 then - a := mid + 1 - else if c > 0 then - b := mid - 1 - else begin - result := mid; - exit - end - end; - result := -1 -end; - -// --------------------------------------------------------------------------- - -type - TTokenizerFlag = (hasPreprocessor, hasNestedComments); - TTokenizerFlags = set of TTokenizerFlag; - -procedure clikeNextToken(var g: TGeneralTokenizer; - const keywords: array of string; - flags: TTokenizerFlags); -const - hexChars = ['0'..'9', 'A'..'F', 'a'..'f']; - octChars = ['0'..'7']; - binChars = ['0'..'1']; - symChars = ['A'..'Z', 'a'..'z', '0'..'9', '_', #128..#255]; -var - pos, nested: int; - id: string; -begin - pos := g.pos; - g.start := g.pos; - if g.state = gtStringLit then begin - g.kind := gtStringLit; - while true do begin - case g.buf[pos] of - '\': begin - g.kind := gtEscapeSequence; - inc(pos); - case g.buf[pos] of - 'x', 'X': begin - inc(pos); - if g.buf[pos] in hexChars then inc(pos); - if g.buf[pos] in hexChars then inc(pos); - end; - '0'..'9': while g.buf[pos] in ['0'..'9'] do inc(pos); - #0: g.state := gtNone; - else inc(pos); - end; - break - end; - #0, #13, #10: begin g.state := gtNone; break end; - '"': begin - inc(pos); - g.state := gtNone; - break - end; - else inc(pos) - end - end - end - else begin - case g.buf[pos] of - ' ', #9..#13: begin - g.kind := gtWhitespace; - while g.buf[pos] in [' ', #9..#13] do inc(pos); - end; - '/': begin - inc(pos); - if g.buf[pos] = '/' then begin - g.kind := gtComment; - while not (g.buf[pos] in [#0, #10, #13]) do inc(pos); - end - else if g.buf[pos] = '*' then begin - g.kind := gtLongComment; - nested := 0; - inc(pos); - while true do begin - case g.buf[pos] of - '*': begin - inc(pos); - if g.buf[pos] = '/' then begin - inc(pos); - if nested = 0 then break - end; - end; - '/': begin - inc(pos); - if g.buf[pos] = '*' then begin - inc(pos); - if hasNestedComments in flags then inc(nested); - end - end; - #0: break; - else inc(pos); - end - end - end - end; - '#': begin - inc(pos); - if hasPreprocessor in flags then begin - g.kind := gtPreprocessor; - while g.buf[pos] in [' ', Tabulator] do inc(pos); - while g.buf[pos] in symChars do inc(pos); - end - else - g.kind := gtOperator - end; - 'a'..'z', 'A'..'Z', '_', #128..#255: begin - id := ''; - while g.buf[pos] in SymChars do begin - addChar(id, g.buf[pos]); - inc(pos) - end; - if isKeyword(keywords, id) >= 0 then g.kind := gtKeyword - else g.kind := gtIdentifier; - end; - '0': begin - inc(pos); - case g.buf[pos] of - 'b', 'B': begin - inc(pos); - while g.buf[pos] in binChars do inc(pos); - if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos); - end; - 'x', 'X': begin - inc(pos); - while g.buf[pos] in hexChars do inc(pos); - if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos); - end; - '0'..'7': begin - inc(pos); - while g.buf[pos] in octChars do inc(pos); - if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos); - end; - else begin - pos := generalNumber(g, pos); - if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos); - end - end - end; - '1'..'9': begin - pos := generalNumber(g, pos); - if g.buf[pos] in ['A'..'Z', 'a'..'z'] then inc(pos); - end; - '''': begin - pos := generalStrLit(g, pos); - g.kind := gtCharLit; - end; - '"': begin - inc(pos); - g.kind := gtStringLit; - while true do begin - case g.buf[pos] of - #0: break; - '"': begin inc(pos); break end; - '\': begin g.state := g.kind; break end; - else inc(pos); - end - end - end; - '(', ')', '[', ']', '{', '}', ':', ',', ';', '.': begin - inc(pos); - g.kind := gtPunctation - end; - #0: g.kind := gtEof; - else if g.buf[pos] in scanner.OpChars then begin - g.kind := gtOperator; - while g.buf[pos] in scanner.OpChars do inc(pos); - end - else begin - inc(pos); - g.kind := gtNone - end; - end - end; - g.len := pos - g.pos; - if (g.kind <> gtEof) and (g.len <= 0) then InternalError('clikeNextToken'); - g.pos := pos; -end; - -// -------------------------------------------------------------------------- - -procedure cNextToken(var g: TGeneralTokenizer); -const - keywords: array [0..36] of string = ( - '_Bool', '_Complex', '_Imaginary', - 'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do', - 'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if', - 'inline', 'int', 'long', 'register', 'restrict', 'return', 'short', - 'signed', 'sizeof', 'static', 'struct', 'switch', 'typedef', 'union', - 'unsigned', 'void', 'volatile', 'while' - ); -begin - clikeNextToken(g, keywords, {@set}[hasPreprocessor]); -end; - -procedure cppNextToken(var g: TGeneralTokenizer); -const - keywords: array [0..47] of string = ( - 'asm', 'auto', 'break', 'case', 'catch', 'char', 'class', 'const', - 'continue', 'default', 'delete', 'do', 'double', 'else', 'enum', 'extern', - 'float', 'for', 'friend', 'goto', 'if', 'inline', 'int', 'long', 'new', - 'operator', 'private', 'protected', 'public', 'register', 'return', - 'short', 'signed', 'sizeof', 'static', 'struct', 'switch', 'template', - 'this', 'throw', 'try', 'typedef', 'union', 'unsigned', 'virtual', 'void', - 'volatile', 'while' - ); -begin - clikeNextToken(g, keywords, {@set}[hasPreprocessor]); -end; - -procedure csharpNextToken(var g: TGeneralTokenizer); -const - keywords: array [0..76] of string = ( - 'abstract', 'as', 'base', 'bool', 'break', 'byte', 'case', 'catch', - 'char', 'checked', 'class', 'const', 'continue', 'decimal', 'default', - 'delegate', 'do', 'double', 'else', 'enum', 'event', 'explicit', 'extern', - 'false', 'finally', 'fixed', 'float', 'for', 'foreach', 'goto', 'if', - 'implicit', 'in', 'int', 'interface', 'internal', 'is', 'lock', 'long', - 'namespace', 'new', 'null', 'object', 'operator', 'out', 'override', - 'params', 'private', 'protected', 'public', 'readonly', 'ref', 'return', - 'sbyte', 'sealed', 'short', 'sizeof', 'stackalloc', 'static', 'string', - 'struct', 'switch', 'this', 'throw', 'true', 'try', 'typeof', 'uint', - 'ulong', 'unchecked', 'unsafe', 'ushort', 'using', 'virtual', 'void', - 'volatile', 'while' - ); -begin - clikeNextToken(g, keywords, {@set}[hasPreprocessor]); -end; - -procedure javaNextToken(var g: TGeneralTokenizer); -const - keywords: array [0..52] of string = ( - 'abstract', 'assert', 'boolean', 'break', 'byte', 'case', 'catch', - 'char', 'class', 'const', 'continue', 'default', 'do', 'double', 'else', - 'enum', 'extends', 'false', 'final', 'finally', 'float', 'for', 'goto', - 'if', 'implements', 'import', 'instanceof', 'int', 'interface', 'long', - 'native', 'new', 'null', 'package', 'private', 'protected', 'public', - 'return', 'short', 'static', 'strictfp', 'super', 'switch', - 'synchronized', 'this', 'throw', 'throws', 'transient', 'true', 'try', - 'void', 'volatile', 'while' - ); -begin - clikeNextToken(g, keywords, {@set}[]); -end; - -procedure getNextToken(var g: TGeneralTokenizer; lang: TSourceLanguage); -begin - case lang of - langNimrod: nimNextToken(g); - langCpp: cppNextToken(g); - langCsharp: csharpNextToken(g); - langC: cNextToken(g); - langJava: javaNextToken(g); - else InternalError('getNextToken'); - end -end; - -end. |