diff options
Diffstat (limited to 'nim/scanner.pas')
-rwxr-xr-x | nim/scanner.pas | 1036 |
1 files changed, 0 insertions, 1036 deletions
diff --git a/nim/scanner.pas b/nim/scanner.pas deleted file mode 100755 index c03ae9224..000000000 --- a/nim/scanner.pas +++ /dev/null @@ -1,1036 +0,0 @@ -// -// -// The Nimrod Compiler -// (c) Copyright 2009 Andreas Rumpf -// -// See the file "copying.txt", included in this -// distribution, for details about the copyright. -// -unit scanner; - -// This scanner is handwritten for efficiency. I used an elegant buffering -// scheme which I have not seen anywhere else: -// We guarantee that a whole line is in the buffer. Thus only when scanning -// the \n or \r character we have to check wether we need to read in the next -// chunk. (\n or \r already need special handling for incrementing the line -// counter; choosing both \n and \r allows the scanner to properly read Unix, -// DOS or Macintosh text files, even when it is not the native format. - -interface - -{$include 'config.inc'} - -uses - charsets, nsystem, sysutils, nhashes, options, msgs, strutils, platform, - idents, lexbase, llstream, wordrecg; - -const - MaxLineLength = 80; // lines longer than this lead to a warning - - numChars: TCharSet = ['0'..'9','a'..'z','A'..'Z']; - SymChars: TCharSet = ['a'..'z', 'A'..'Z', '0'..'9', #128..#255]; - SymStartChars: TCharSet = ['a'..'z', 'A'..'Z', #128..#255]; - OpChars: TCharSet = ['+', '-', '*', '/', '\', '<', '>', '!', '?', '^', '.', - '|', '=', '%', '&', '$', '@', '~', #128..#255]; - -type - TTokType = (tkInvalid, tkEof, // order is important here! - tkSymbol, - // keywords: - //[[[cog - //from string import split, capitalize - //keywords = split(open("data/keywords.txt").read()) - //idents = "" - //strings = "" - //i = 1 - //for k in keywords: - // idents = idents + "tk" + capitalize(k) + ", " - // strings = strings + "'" + k + "', " - // if i % 4 == 0: - // idents = idents + "\n" - // strings = strings + "\n" - // i = i + 1 - //cog.out(idents) - //]]] - tkAddr, tkAnd, tkAs, tkAsm, - tkBind, tkBlock, tkBreak, tkCase, - tkCast, tkConst, tkContinue, tkConverter, - tkDiscard, tkDistinct, tkDiv, tkElif, - tkElse, tkEnd, tkEnum, tkExcept, - tkFinally, tkFor, tkFrom, tkGeneric, - tkIf, tkImplies, tkImport, tkIn, - tkInclude, tkIs, tkIsnot, tkIterator, - tkLambda, tkMacro, tkMethod, tkMod, - tkNil, tkNot, tkNotin, tkObject, - tkOf, tkOr, tkOut, tkProc, - tkPtr, tkRaise, tkRef, tkReturn, - tkShl, tkShr, tkTemplate, tkTry, - tkTuple, tkType, tkVar, tkWhen, - tkWhile, tkWith, tkWithout, tkXor, - tkYield, - //[[[end]]] - tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit, - tkFloatLit, tkFloat32Lit, tkFloat64Lit, - tkStrLit, tkRStrLit, tkTripleStrLit, tkCallRStrLit, tkCallTripleStrLit, - tkCharLit, tkParLe, tkParRi, tkBracketLe, tkBracketRi, tkCurlyLe, tkCurlyRi, - tkBracketDotLe, tkBracketDotRi, // [. and .] - tkCurlyDotLe, tkCurlyDotRi, // {. and .} - tkParDotLe, tkParDotRi, // (. and .) - tkComma, tkSemiColon, tkColon, - tkEquals, tkDot, tkDotDot, tkHat, tkOpr, - tkComment, tkAccent, tkInd, tkSad, tkDed, - // pseudo token types used by the source renderers: - tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr - ); - TTokTypes = set of TTokType; -const - tokKeywordLow = succ(tkSymbol); - tokKeywordHigh = pred(tkIntLit); - tokOperators: TTokTypes = {@set}[tkOpr, tkSymbol, tkBracketLe, tkBracketRi, - tkIn, tkIs, tkIsNot, tkEquals, tkDot, tkHat, tkNot, tkAnd, tkOr, tkXor, - tkShl, tkShr, tkDiv, tkMod, tkNotIn]; - - TokTypeToStr: array [TTokType] of string = ( - 'tkInvalid', '[EOF]', - 'tkSymbol', - //[[[cog - //cog.out(strings) - //]]] - 'addr', 'and', 'as', 'asm', - 'bind', 'block', 'break', 'case', - 'cast', 'const', 'continue', 'converter', - 'discard', 'distinct', 'div', 'elif', - 'else', 'end', 'enum', 'except', - 'finally', 'for', 'from', 'generic', - 'if', 'implies', 'import', 'in', - 'include', 'is', 'isnot', 'iterator', - 'lambda', 'macro', 'method', 'mod', - 'nil', 'not', 'notin', 'object', - 'of', 'or', 'out', 'proc', - 'ptr', 'raise', 'ref', 'return', - 'shl', 'shr', 'template', 'try', - 'tuple', 'type', 'var', 'when', - 'while', 'with', 'without', 'xor', - 'yield', - //[[[end]]] - 'tkIntLit', 'tkInt8Lit', 'tkInt16Lit', 'tkInt32Lit', 'tkInt64Lit', - 'tkFloatLit', 'tkFloat32Lit', 'tkFloat64Lit', - 'tkStrLit', 'tkRStrLit', 'tkTripleStrLit', - 'tkCallRStrLit', 'tkCallTripleStrLit', - 'tkCharLit', - '('+'', ')'+'', '['+'', ']'+'', '{'+'', '}'+'', - '[.', '.]', '{.', '.}', '(.', '.)', ','+'', ';'+'', ':'+'', - '='+'', '.'+'', '..', '^'+'', 'tkOpr', - 'tkComment', '`'+'', '[new indentation]', '[same indentation]', - '[dedentation]', - 'tkSpaces', 'tkInfixOpr', 'tkPrefixOpr', 'tkPostfixOpr' - ); - -type - TNumericalBase = (base10, // base10 is listed as the first element, - // so that it is the correct default value - base2, - base8, - base16); - PToken = ^TToken; - TToken = object // a Nimrod token - tokType: TTokType; // the type of the token - indent: int; // the indentation; only valid if tokType = tkIndent - ident: PIdent; // the parsed identifier - iNumber: BiggestInt; // the parsed integer literal - fNumber: BiggestFloat; // the parsed floating point literal - base: TNumericalBase; // the numerical base; only valid for int - // or float literals - literal: string; // the parsed (string) literal; and - // documentation comments are here too - next: PToken; // next token; can be used for arbitrary look-ahead - end; - - PLexer = ^TLexer; - TLexer = object(TBaseLexer) - filename: string; - indentStack: array of int; // the indentation stack - dedent: int; // counter for DED token generation - indentAhead: int; // if > 0 an indendation has already been read - // this is needed because scanning comments - // needs so much look-ahead - end; - -var - gLinesCompiled: int; // all lines that have been compiled - -procedure pushInd(var L: TLexer; indent: int); -procedure popInd(var L: TLexer); - -function isKeyword(kind: TTokType): boolean; - -procedure openLexer(out lex: TLexer; const filename: string; - inputstream: PLLStream); - -procedure rawGetTok(var L: TLexer; var tok: TToken); -// reads in the next token into tok and skips it - -function getColumn(const L: TLexer): int; - -function getLineInfo(const L: TLexer): TLineInfo; - -procedure closeLexer(var lex: TLexer); - -procedure PrintTok(tok: PToken); -function tokToStr(tok: PToken): string; - -// auxiliary functions: -procedure lexMessage(const L: TLexer; const msg: TMsgKind; - const arg: string = ''); - -// the Pascal scanner uses this too: -procedure fillToken(var L: TToken); - -implementation - -function isKeyword(kind: TTokType): boolean; -begin - result := (kind >= tokKeywordLow) and (kind <= tokKeywordHigh) -end; - -procedure pushInd(var L: TLexer; indent: int); -var - len: int; -begin - len := length(L.indentStack); - setLength(L.indentStack, len+1); - if (indent > L.indentStack[len-1]) then - L.indentstack[len] := indent - else - InternalError('pushInd'); - //writeln('push indent ', indent); -end; - -procedure popInd(var L: TLexer); -var - len: int; -begin - len := length(L.indentStack); - setLength(L.indentStack, len-1); -end; - -function findIdent(const L: TLexer; indent: int): boolean; -var - i: int; -begin - for i := length(L.indentStack)-1 downto 0 do - if L.indentStack[i] = indent then begin result := true; exit end; - result := false -end; - -function tokToStr(tok: PToken): string; -begin - case tok.tokType of - tkIntLit..tkInt64Lit: - result := toString(tok.iNumber); - tkFloatLit..tkFloat64Lit: - result := toStringF(tok.fNumber); - tkInvalid, tkStrLit..tkCharLit, tkComment: - result := tok.literal; - tkParLe..tkColon, tkEof, tkInd, tkSad, tkDed, tkAccent: - result := tokTypeToStr[tok.tokType]; - else if (tok.ident <> nil) then - result := tok.ident.s - else begin - InternalError('tokToStr'); - result := '' - end - end -end; - -procedure PrintTok(tok: PToken); -begin - write(output, TokTypeToStr[tok.tokType]); - write(output, ' '+''); - writeln(output, tokToStr(tok)) -end; - -// ---------------------------------------------------------------------------- - -var - dummyIdent: PIdent; - -procedure fillToken(var L: TToken); -begin - L.TokType := tkInvalid; - L.iNumber := 0; - L.Indent := 0; - L.literal := ''; - L.fNumber := 0.0; - L.base := base10; - L.ident := dummyIdent; // this prevents many bugs! -end; - -procedure openLexer(out lex: TLexer; const filename: string; - inputstream: PLLStream); -begin -{@ignore} - FillChar(lex, sizeof(lex), 0); -{@emit} - openBaseLexer(lex, inputstream); -{@ignore} - setLength(lex.indentStack, 1); - lex.indentStack[0] := 0; -{@emit lex.indentStack := @[0]; } - lex.filename := filename; - lex.indentAhead := -1; -end; - -procedure closeLexer(var lex: TLexer); -begin - inc(gLinesCompiled, lex.LineNumber); - closeBaseLexer(lex); -end; - -function getColumn(const L: TLexer): int; -begin - result := getColNumber(L, L.bufPos) -end; - -function getLineInfo(const L: TLexer): TLineInfo; -begin - result := newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos)) -end; - -procedure lexMessage(const L: TLexer; const msg: TMsgKind; - const arg: string = ''); -begin - msgs.liMessage(getLineInfo(L), msg, arg) -end; - -procedure lexMessagePos(var L: TLexer; const msg: TMsgKind; pos: int; - const arg: string = ''); -var - info: TLineInfo; -begin - info := newLineInfo(L.filename, L.linenumber, pos - L.lineStart); - msgs.liMessage(info, msg, arg); -end; - -// ---------------------------------------------------------------------------- - -procedure matchUnderscoreChars(var L: TLexer; var tok: TToken; - const chars: TCharSet); -// matches ([chars]_)* -var - pos: int; - buf: PChar; -begin - pos := L.bufpos; // use registers for pos, buf - buf := L.buf; - repeat - if buf[pos] in chars then begin - addChar(tok.literal, buf[pos]); - Inc(pos) - end - else break; - if buf[pos] = '_' then begin - addChar(tok.literal, '_'); - Inc(pos); - end; - until false; - L.bufPos := pos; -end; - -function matchTwoChars(const L: TLexer; first: Char; - const second: TCharSet): Boolean; -begin - result := (L.buf[L.bufpos] = first) and (L.buf[L.bufpos+1] in Second); -end; - -function isFloatLiteral(const s: string): boolean; -var - i: int; -begin - for i := strStart to length(s)+strStart-1 do - if s[i] in ['.','e','E'] then begin - result := true; exit - end; - result := false -end; - -function GetNumber(var L: TLexer): TToken; -var - pos, endpos: int; - xi: biggestInt; -begin - // get the base: - result.tokType := tkIntLit; // int literal until we know better - result.literal := ''; - result.base := base10; // BUGFIX - pos := L.bufpos; - // make sure the literal is correct for error messages: - matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9']); - if (L.buf[L.bufpos] = '.') and (L.buf[L.bufpos+1] in ['0'..'9']) then begin - addChar(result.literal, '.'); - inc(L.bufpos); - //matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9']) - matchUnderscoreChars(L, result, ['0'..'9']); - if L.buf[L.bufpos] in ['e', 'E'] then begin - addChar(result.literal, 'e'); - inc(L.bufpos); - if L.buf[L.bufpos] in ['+', '-'] then begin - addChar(result.literal, L.buf[L.bufpos]); - inc(L.bufpos); - end; - matchUnderscoreChars(L, result, ['0'..'9']); - end - end; - endpos := L.bufpos; - if L.buf[endpos] = '''' then begin - //matchUnderscoreChars(L, result, ['''', 'f', 'F', 'i', 'I', '0'..'9']); - inc(endpos); - L.bufpos := pos; // restore position - case L.buf[endpos] of - 'f', 'F': begin - inc(endpos); - if (L.buf[endpos] = '6') and (L.buf[endpos+1] = '4') then begin - result.tokType := tkFloat64Lit; - inc(endpos, 2); - end - else if (L.buf[endpos] = '3') and (L.buf[endpos+1] = '2') then begin - result.tokType := tkFloat32Lit; - inc(endpos, 2); - end - else lexMessage(L, errInvalidNumber, result.literal); - end; - 'i', 'I': begin - inc(endpos); - if (L.buf[endpos] = '6') and (L.buf[endpos+1] = '4') then begin - result.tokType := tkInt64Lit; - inc(endpos, 2); - end - else if (L.buf[endpos] = '3') and (L.buf[endpos+1] = '2') then begin - result.tokType := tkInt32Lit; - inc(endpos, 2); - end - else if (L.buf[endpos] = '1') and (L.buf[endpos+1] = '6') then begin - result.tokType := tkInt16Lit; - inc(endpos, 2); - end - else if (L.buf[endpos] = '8') then begin - result.tokType := tkInt8Lit; - inc(endpos); - end - else lexMessage(L, errInvalidNumber, result.literal); - end; - else lexMessage(L, errInvalidNumber, result.literal); - end - end - else - L.bufpos := pos; // restore position - - try - if (L.buf[pos] = '0') and (L.buf[pos+1] in ['x','X','b','B','o','O','c','C']) - then begin - inc(pos, 2); - xi := 0; - // it may be a base prefix - case L.buf[pos-1] of - 'b', 'B': begin - result.base := base2; - while true do begin - case L.buf[pos] of - 'A'..'Z', 'a'..'z', '2'..'9', '.': begin - lexMessage(L, errInvalidNumber, result.literal); - inc(pos) - end; - '_': inc(pos); - '0', '1': begin - xi := shlu(xi, 1) or (ord(L.buf[pos]) - ord('0')); - inc(pos); - end; - else break; - end - end - end; - 'o', 'c', 'C': begin - result.base := base8; - while true do begin - case L.buf[pos] of - 'A'..'Z', 'a'..'z', '8'..'9', '.': begin - lexMessage(L, errInvalidNumber, result.literal); - inc(pos) - end; - '_': inc(pos); - '0'..'7': begin - xi := shlu(xi, 3) or (ord(L.buf[pos]) - ord('0')); - inc(pos); - end; - else break; - end - end - end; - 'O': lexMessage(L, errInvalidNumber, result.literal); - 'x', 'X': begin - result.base := base16; - while true do begin - case L.buf[pos] of - 'G'..'Z', 'g'..'z', '.': begin - lexMessage(L, errInvalidNumber, result.literal); - inc(pos); - end; - '_': inc(pos); - '0'..'9': begin - xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('0')); - inc(pos); - end; - 'a'..'f': begin - xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10); - inc(pos); - end; - 'A'..'F': begin - xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10); - inc(pos); - end; - else break; - end - end - end; - else InternalError(getLineInfo(L), 'getNumber'); - end; - // now look at the optional type suffix: - case result.tokType of - tkIntLit, tkInt64Lit: - result.iNumber := xi; - tkInt8Lit: - result.iNumber := biggestInt(int8(toU8(int(xi)))); - tkInt16Lit: - result.iNumber := biggestInt(toU16(int(xi))); - tkInt32Lit: - result.iNumber := biggestInt(toU32(xi)); - tkFloat32Lit: - result.fNumber := ({@cast}PFloat32(addr(xi)))^; - // note: this code is endian neutral! - // XXX: Test this on big endian machine! - tkFloat64Lit: - result.fNumber := ({@cast}PFloat64(addr(xi)))^; - else InternalError(getLineInfo(L), 'getNumber'); - end - end - else if isFloatLiteral(result.literal) - or (result.tokType = tkFloat32Lit) - or (result.tokType = tkFloat64Lit) then begin - result.fnumber := parseFloat(result.literal); - if result.tokType = tkIntLit then result.tokType := tkFloatLit; - end - else begin - result.iNumber := ParseBiggestInt(result.literal); - if (result.iNumber < low(int32)) or (result.iNumber > high(int32)) then - begin - if result.tokType = tkIntLit then result.tokType := tkInt64Lit - else if result.tokType <> tkInt64Lit then - lexMessage(L, errInvalidNumber, result.literal); - end - end; - except - on EInvalidValue do - lexMessage(L, errInvalidNumber, result.literal); - {@ignore} - on sysutils.EIntOverflow do - lexMessage(L, errNumberOutOfRange, result.literal); - {@emit} - on EOverflow do - lexMessage(L, errNumberOutOfRange, result.literal); - on EOutOfRange do - lexMessage(L, errNumberOutOfRange, result.literal); - end; - L.bufpos := endpos; -end; - -procedure handleHexChar(var L: TLexer; var xi: int); -begin - case L.buf[L.bufpos] of - '0'..'9': begin - xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0')); - inc(L.bufpos); - end; - 'a'..'f': begin - xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10); - inc(L.bufpos); - end; - 'A'..'F': begin - xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10); - inc(L.bufpos); - end; - else begin end // do nothing - end -end; - -procedure handleDecChars(var L: TLexer; var xi: int); -begin - while L.buf[L.bufpos] in ['0'..'9'] do begin - xi := (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0')); - inc(L.bufpos); - end; -end; - -procedure getEscapedChar(var L: TLexer; var tok: TToken); -var - xi: int; -begin - inc(L.bufpos); // skip '\' - case L.buf[L.bufpos] of - 'n', 'N': begin - if tok.toktype = tkCharLit then - lexMessage(L, errNnotAllowedInCharacter); - tok.literal := tok.literal +{&} tnl; - Inc(L.bufpos); - end; - 'r', 'R', 'c', 'C': begin addChar(tok.literal, CR); Inc(L.bufpos); end; - 'l', 'L': begin addChar(tok.literal, LF); Inc(L.bufpos); end; - 'f', 'F': begin addChar(tok.literal, FF); inc(L.bufpos); end; - 'e', 'E': begin addChar(tok.literal, ESC); Inc(L.bufpos); end; - 'a', 'A': begin addChar(tok.literal, BEL); Inc(L.bufpos); end; - 'b', 'B': begin addChar(tok.literal, BACKSPACE); Inc(L.bufpos); end; - 'v', 'V': begin addChar(tok.literal, VT); Inc(L.bufpos); end; - 't', 'T': begin addChar(tok.literal, Tabulator); Inc(L.bufpos); end; - '''', '"': begin addChar(tok.literal, L.buf[L.bufpos]); Inc(L.bufpos); end; - '\': begin addChar(tok.literal, '\'); Inc(L.bufpos) end; - 'x', 'X': begin - inc(L.bufpos); - xi := 0; - handleHexChar(L, xi); - handleHexChar(L, xi); - addChar(tok.literal, Chr(xi)); - end; - '0'..'9': begin - if matchTwoChars(L, '0', ['0'..'9']) then - // this warning will make it easier for newcomers: - lexMessage(L, warnOctalEscape); - xi := 0; - handleDecChars(L, xi); - if (xi <= 255) then - addChar(tok.literal, Chr(xi)) - else - lexMessage(L, errInvalidCharacterConstant) - end - else lexMessage(L, errInvalidCharacterConstant) - end -end; - -function HandleCRLF(var L: TLexer; pos: int): int; -begin - case L.buf[pos] of - CR: begin - if getColNumber(L, pos) > MaxLineLength then - lexMessagePos(L, hintLineTooLong, pos); - result := lexbase.HandleCR(L, pos) - end; - LF: begin - if getColNumber(L, pos) > MaxLineLength then - lexMessagePos(L, hintLineTooLong, pos); - result := lexbase.HandleLF(L, pos) - end; - else result := pos - end -end; - -procedure getString(var L: TLexer; var tok: TToken; rawMode: Boolean); -var - line, line2, pos: int; - c: Char; - buf: PChar; -begin - pos := L.bufPos + 1; // skip " - buf := L.buf; // put `buf` in a register - line := L.linenumber; // save linenumber for better error message - if (buf[pos] = '"') and (buf[pos+1] = '"') then begin - tok.tokType := tkTripleStrLit; - // long string literal: - inc(pos, 2); // skip "" - // skip leading newline: - pos := HandleCRLF(L, pos); - buf := L.buf; - repeat - case buf[pos] of - '"': begin - if (buf[pos+1] = '"') and (buf[pos+2] = '"') then - break; - addChar(tok.literal, '"'); - Inc(pos) - end; - CR, LF: begin - pos := HandleCRLF(L, pos); - buf := L.buf; - tok.literal := tok.literal +{&} tnl; - end; - lexbase.EndOfFile: begin - line2 := L.linenumber; - L.LineNumber := line; - lexMessagePos(L, errClosingTripleQuoteExpected, L.lineStart); - L.LineNumber := line2; - break - end - else begin - addChar(tok.literal, buf[pos]); - Inc(pos) - end - end - until false; - L.bufpos := pos + 3 // skip the three """ - end - else begin // ordinary string literal - if rawMode then tok.tokType := tkRStrLit - else tok.tokType := tkStrLit; - repeat - c := buf[pos]; - if c = '"' then begin - inc(pos); // skip '"' - break - end; - if c in [CR, LF, lexbase.EndOfFile] then begin - lexMessage(L, errClosingQuoteExpected); - break - end; - if (c = '\') and not rawMode then begin - L.bufPos := pos; - getEscapedChar(L, tok); - pos := L.bufPos; - end - else begin - addChar(tok.literal, c); - Inc(pos) - end - until false; - L.bufpos := pos; - end -end; - -procedure getCharacter(var L: TLexer; var tok: TToken); -var - c: Char; -begin - Inc(L.bufpos); // skip ' - c := L.buf[L.bufpos]; - case c of - #0..Pred(' '), '''': lexMessage(L, errInvalidCharacterConstant); - '\': getEscapedChar(L, tok); - else begin - tok.literal := c + ''; - Inc(L.bufpos); - end - end; - if L.buf[L.bufpos] <> '''' then lexMessage(L, errMissingFinalQuote); - inc(L.bufpos); // skip ' -end; - -{@ignore} -{$ifopt Q+} {$define Q_on} {$Q-} {$endif} -{$ifopt R+} {$define R_on} {$R-} {$endif} -{@emit} -procedure getSymbol(var L: TLexer; var tok: TToken); -var - pos: int; - c: Char; - buf: pchar; - h: THash; // hashing algorithm inlined -begin - h := 0; - pos := L.bufpos; - buf := L.buf; - while true do begin - c := buf[pos]; - case c of - 'a'..'z', '0'..'9', #128..#255: begin - h := h +{%} Ord(c); - h := h +{%} h shl 10; - h := h xor (h shr 6) - end; - 'A'..'Z': begin - c := chr(ord(c) + (ord('a')-ord('A'))); // toLower() - h := h +{%} Ord(c); - h := h +{%} h shl 10; - h := h xor (h shr 6) - end; - '_': begin end; - else break - end; - Inc(pos) - end; - h := h +{%} h shl 3; - h := h xor (h shr 11); - h := h +{%} h shl 15; - tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h); - L.bufpos := pos; - if (tok.ident.id < ord(tokKeywordLow)-ord(tkSymbol)) or - (tok.ident.id > ord(tokKeywordHigh)-ord(tkSymbol)) then - tok.tokType := tkSymbol - else - tok.tokType := TTokType(tok.ident.id+ord(tkSymbol)); - if buf[pos] = '"' then begin - getString(L, tok, true); - if tok.tokType = tkRStrLit then tok.tokType := tkCallRStrLit - else tok.tokType := tkCallTripleStrLit - end -end; - -procedure getOperator(var L: TLexer; var tok: TToken); -var - pos: int; - c: Char; - buf: pchar; - h: THash; // hashing algorithm inlined -begin - pos := L.bufpos; - buf := L.buf; - h := 0; - while true do begin - c := buf[pos]; - if c in OpChars then begin - h := h +{%} Ord(c); - h := h +{%} h shl 10; - h := h xor (h shr 6) - end - else break; - Inc(pos) - end; - h := h +{%} h shl 3; - h := h xor (h shr 11); - h := h +{%} h shl 15; - tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h); - if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh) then - tok.tokType := tkOpr - else - tok.tokType := TTokType(tok.ident.id - oprLow + ord(tkColon)); - L.bufpos := pos -end; -{@ignore} -{$ifdef Q_on} {$undef Q_on} {$Q+} {$endif} -{$ifdef R_on} {$undef R_on} {$R+} {$endif} -{@emit} - -procedure handleIndentation(var L: TLexer; var tok: TToken; indent: int); -var - i: int; -begin - tok.indent := indent; - i := high(L.indentStack); - if indent > L.indentStack[i] then - tok.tokType := tkInd - else if indent = L.indentStack[i] then - tok.tokType := tkSad - else begin - // check we have the indentation somewhere in the stack: - while (i >= 0) and (indent <> L.indentStack[i]) do begin - dec(i); - inc(L.dedent); - end; - dec(L.dedent); - tok.tokType := tkDed; - if i < 0 then begin - tok.tokType := tkSad; // for the parser it is better as SAD - lexMessage(L, errInvalidIndentation); - end - end -end; - -procedure scanComment(var L: TLexer; var tok: TToken); -var - buf: PChar; - pos, col: int; - indent: int; -begin - pos := L.bufpos; - buf := L.buf; - // a comment ends if the next line does not start with the # on the same - // column after only whitespace - tok.tokType := tkComment; - col := getColNumber(L, pos); - while true do begin - while not (buf[pos] in [CR, LF, lexbase.EndOfFile]) do begin - addChar(tok.literal, buf[pos]); inc(pos); - end; - pos := handleCRLF(L, pos); - buf := L.buf; - indent := 0; - while buf[pos] = ' ' do begin inc(pos); inc(indent) end; - if (buf[pos] = '#') and (col = indent) then begin - tok.literal := tok.literal +{&} nl; - end - else begin - if buf[pos] > ' ' then begin - L.indentAhead := indent; - inc(L.dedent) - end; - break - end - end; - L.bufpos := pos; -end; - -procedure skip(var L: TLexer; var tok: TToken); -var - buf: PChar; - indent, pos: int; -begin - pos := L.bufpos; - buf := L.buf; - repeat - case buf[pos] of - ' ': Inc(pos); - Tabulator: begin - lexMessagePos(L, errTabulatorsAreNotAllowed, pos); - inc(pos); // BUGFIX - end; - // newline is special: - CR, LF: begin - pos := HandleCRLF(L, pos); - buf := L.buf; - indent := 0; - while buf[pos] = ' ' do begin - Inc(pos); Inc(indent) - end; - if (buf[pos] > ' ') then begin - handleIndentation(L, tok, indent); - break; - end - end; - else break // EndOfFile also leaves the loop - end - until false; - L.bufpos := pos; -end; - -procedure rawGetTok(var L: TLexer; var tok: TToken); -var - c: Char; -begin - fillToken(tok); - if L.dedent > 0 then begin - dec(L.dedent); - if L.indentAhead >= 0 then begin - handleIndentation(L, tok, L.indentAhead); - L.indentAhead := -1; - end - else - tok.tokType := tkDed; - exit; - end; - // Skip whitespace, comments: - skip(L, tok); // skip - // got an documentation comment or tkIndent, return that: - if tok.toktype <> tkInvalid then exit; - - c := L.buf[L.bufpos]; - if c in SymStartChars - ['r', 'R', 'l'] then // common case first - getSymbol(L, tok) - else if c in ['0'..'9'] then - tok := getNumber(L) - else begin - case c of - '#': scanComment(L, tok); - ':': begin - tok.tokType := tkColon; - inc(L.bufpos); - end; - ',': begin - tok.toktype := tkComma; - Inc(L.bufpos) - end; - 'l': begin - // if we parsed exactly one character and its a small L (l), this - // is treated as a warning because it may be confused with the number 1 - if not (L.buf[L.bufpos+1] in (SymChars+['_'])) then - lexMessage(L, warnSmallLshouldNotBeUsed); - getSymbol(L, tok); - end; - 'r', 'R': begin - if L.buf[L.bufPos+1] = '"' then begin - Inc(L.bufPos); - getString(L, tok, true); - end - else getSymbol(L, tok); - end; - '(': begin - Inc(L.bufpos); - if (L.buf[L.bufPos] = '.') - and (L.buf[L.bufPos+1] <> '.') then begin - tok.toktype := tkParDotLe; - Inc(L.bufpos); - end - else - tok.toktype := tkParLe; - end; - ')': begin - tok.toktype := tkParRi; - Inc(L.bufpos) - end; - '[': begin - Inc(L.bufpos); - if (L.buf[L.bufPos] = '.') - and (L.buf[L.bufPos+1] <> '.') then begin - tok.toktype := tkBracketDotLe; - Inc(L.bufpos); - end - else - tok.toktype := tkBracketLe; - end; - ']': begin - tok.toktype := tkBracketRi; - Inc(L.bufpos) - end; - '.': begin - if L.buf[L.bufPos+1] = ']' then begin - tok.tokType := tkBracketDotRi; - Inc(L.bufpos, 2); - end - else if L.buf[L.bufPos+1] = '}' then begin - tok.tokType := tkCurlyDotRi; - Inc(L.bufpos, 2); - end - else if L.buf[L.bufPos+1] = ')' then begin - tok.tokType := tkParDotRi; - Inc(L.bufpos, 2); - end - else - getOperator(L, tok) - end; - '{': begin - Inc(L.bufpos); - if (L.buf[L.bufPos] = '.') - and (L.buf[L.bufPos+1] <> '.') then begin - tok.toktype := tkCurlyDotLe; - Inc(L.bufpos); - end - else - tok.toktype := tkCurlyLe; - end; - '}': begin - tok.toktype := tkCurlyRi; - Inc(L.bufpos) - end; - ';': begin - tok.toktype := tkSemiColon; - Inc(L.bufpos) - end; - '`': begin - tok.tokType := tkAccent; - Inc(L.bufpos); - end; - '"': getString(L, tok, false); - '''': begin - getCharacter(L, tok); - tok.tokType := tkCharLit; - end; - lexbase.EndOfFile: tok.toktype := tkEof; - else if c in OpChars then - getOperator(L, tok) - else begin - tok.literal := c + ''; - tok.tokType := tkInvalid; - lexMessage(L, errInvalidToken, c +{&} ' (\' +{&} toString(ord(c)) + ')'); - Inc(L.bufpos); - end - end - end -end; - -initialization - dummyIdent := getIdent(''); -end. |