// // // The Nimrod Compiler // (c) Copyright 2009 Andreas Rumpf // // See the file "copying.txt", included in this // distribution, for details about the copyright. // unit scanner; // This scanner is handwritten for efficiency. I used an elegant buffering // scheme which I have not seen anywhere else: // We guarantee that a whole line is in the buffer. Thus only when scanning // the \n or \r character we have to check wether we need to read in the next // chunk. (\n or \r already need special handling for incrementing the line // counter; choosing both \n and \r allows the scanner to properly read Unix, // DOS or Macintosh text files, even when it is not the native format. interface {$include 'config.inc'} uses charsets, nsystem, sysutils, nhashes, options, msgs, strutils, platform, idents, lexbase, llstream, wordrecg; const MaxLineLength = 80; // lines longer than this lead to a warning numChars: TCharSet = ['0'..'9','a'..'z','A'..'Z']; SymChars: TCharSet = ['a'..'z', 'A'..'Z', '0'..'9', #128..#255]; SymStartChars: TCharSet = ['a'..'z', 'A'..'Z', #128..#255]; OpChars: TCharSet = ['+', '-', '*', '/', '\', '<', '>', '!', '?', '^', '.', '|', '=', '%', '&', '$', '@', '~', #128..#255]; type TTokType = (tkInvalid, tkEof, // order is important here! tkSymbol, // keywords: //[[[cog //from string import split, capitalize //keywords = split(open("data/keywords.txt").read()) //idents = "" //strings = "" //i = 1 //for k in keywords: // idents = idents + "tk" + capitalize(k) + ", " // strings = strings + "'" + k + "', " // if i % 4 == 0: // idents = idents + "\n" // strings = strings + "\n" // i = i + 1 //cog.out(idents) //]]] tkAddr, tkAnd, tkAs, tkAsm, tkBind, tkBlock, tkBreak, tkCase, tkCast, tkConst, tkContinue, tkConverter, tkDiscard, tkDistinct, tkDiv, tkElif, tkElse, tkEnd, tkEnum, tkExcept, tkFinally, tkFor, tkFrom, tkGeneric, tkIf, tkImplies, tkImport, tkIn, tkInclude, tkIs, tkIsnot, tkIterator, tkLambda, tkMacro, tkMethod, tkMod, tkNil, tkNot, tkNotin, tkObject, tkOf, tkOr, tkOut, tkProc, tkPtr, tkRaise, tkRef, tkReturn, tkShl, tkShr, tkTemplate, tkTry, tkTuple, tkType, tkVar, tkWhen, tkWhile, tkWith, tkWithout, tkXor, tkYield, //[[[end]]] tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit, tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkStrLit, tkRStrLit, tkTripleStrLit, tkCallRStrLit, tkCallTripleStrLit, tkCharLit, tkParLe, tkParRi, tkBracketLe, tkBracketRi, tkCurlyLe, tkCurlyRi, tkBracketDotLe, tkBracketDotRi, // [. and .] tkCurlyDotLe, tkCurlyDotRi, // {. and .} tkParDotLe, tkParDotRi, // (. and .) tkComma, tkSemiColon, tkColon, tkEquals, tkDot, tkDotDot, tkHat, tkOpr, tkComment, tkAccent, tkInd, tkSad, tkDed, // pseudo token types used by the source renderers: tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr ); TTokTypes = set of TTokType; const tokKeywordLow = succ(tkSymbol); tokKeywordHigh = pred(tkIntLit); tokOperators: TTokTypes = {@set}[tkOpr, tkSymbol, tkBracketLe, tkBracketRi, tkIn, tkIs, tkIsNot, tkEquals, tkDot, tkHat, tkNot, tkAnd, tkOr, tkXor, tkShl, tkShr, tkDiv, tkMod, tkNotIn]; TokTypeToStr: array [TTokType] of string = ( 'tkInvalid', '[EOF]', 'tkSymbol', //[[[cog //cog.out(strings) //]]] 'addr', 'and', 'as', 'asm', 'bind', 'block', 'break', 'case', 'cast', 'const', 'continue', 'converter', 'discard', 'distinct', 'div', 'elif', 'else', 'end', 'enum', 'except', 'finally', 'for', 'from', 'generic', 'if', 'implies', 'import', 'in', 'include', 'is', 'isnot', 'iterator', 'lambda', 'macro', 'method', 'mod', 'nil', 'not', 'notin', 'object', 'of', 'or', 'out', 'proc', 'ptr', 'raise', 'ref', 'return', 'shl', 'shr', 'template', 'try', 'tuple', 'type', 'var', 'when', 'while', 'with', 'without', 'xor', 'yield', //[[[end]]] 'tkIntLit', 'tkInt8Lit', 'tkInt16Lit', 'tkInt32Lit', 'tkInt64Lit', 'tkFloatLit', 'tkFloat32Lit', 'tkFloat64Lit', 'tkStrLit', 'tkRStrLit', 'tkTripleStrLit', 'tkCallRStrLit', 'tkCallTripleStrLit', 'tkCharLit', '('+'', ')'+'', '['+'', ']'+'', '{'+'', '}'+'', '[.', '.]', '{.', '.}', '(.', '.)', ','+'', ';'+'', ':'+'', '='+'', '.'+'', '..', '^'+'', 'tkOpr', 'tkComment', '`'+'', '[new indentation]', '[same indentation]', '[dedentation]', 'tkSpaces', 'tkInfixOpr', 'tkPrefixOpr', 'tkPostfixOpr' ); type TNumericalBase = (base10, // base10 is listed as the first element, // so that it is the correct default value base2, base8, base16); PToken = ^TToken; TToken = object // a Nimrod token tokType: TTokType; // the type of the token indent: int; // the indentation; only valid if tokType = tkIndent ident: PIdent; // the parsed identifier iNumber: BiggestInt; // the parsed integer literal fNumber: BiggestFloat; // the parsed floating point literal base: TNumericalBase; // the numerical base; only valid for int // or float literals literal: string; // the parsed (string) literal; and // documentation comments are here too next: PToken; // next token; can be used for arbitrary look-ahead end; PLexer = ^TLexer; TLexer = object(TBaseLexer) filename: string; indentStack: array of int; // the indentation stack dedent: int; // counter for DED token generation indentAhead: int; // if > 0 an indendation has already been read // this is needed because scanning comments // needs so much look-ahead end; var gLinesCompiled: int; // all lines that have been compiled procedure pushInd(var L: TLexer; indent: int); procedure popInd(var L: TLexer); function isKeyword(kind: TTokType): boolean; procedure openLexer(out lex: TLexer; const filename: string; inputstream: PLLStream); procedure rawGetTok(var L: TLexer; var tok: TToken); // reads in the next token into tok and skips it function getColumn(const L: TLexer): int; function getLineInfo(const L: TLexer): TLineInfo; procedure closeLexer(var lex: TLexer); procedure PrintTok(tok: PToken); function tokToStr(tok: PToken): string; // auxiliary functions: procedure lexMessage(const L: TLexer; const msg: TMsgKind; const arg: string = ''); // the Pascal scanner uses this too: procedure fillToken(var L: TToken); implementation function isKeyword(kind: TTokType): boolean; begin result := (kind >= tokKeywordLow) and (kind <= tokKeywordHigh) end; procedure pushInd(var L: TLexer; indent: int); var len: int; begin len := length(L.indentStack); setLength(L.indentStack, len+1); if (indent > L.indentStack[len-1]) then L.indentstack[len] := indent else InternalError('pushInd'); //writeln('push indent ', indent); end; procedure popInd(var L: TLexer); var len: int; begin len := length(L.indentStack); setLength(L.indentStack, len-1); end; function findIdent(const L: TLexer; indent: int): boolean; var i: int; begin for i := length(L.indentStack)-1 downto 0 do if L.indentStack[i] = indent then begin result := true; exit end; result := false end; function tokToStr(tok: PToken): string; begin case tok.tokType of tkIntLit..tkInt64Lit: result := toString(tok.iNumber); tkFloatLit..tkFloat64Lit: result := toStringF(tok.fNumber); tkInvalid, tkStrLit..tkCharLit, tkComment: result := tok.literal; tkParLe..tkColon, tkEof, tkInd, tkSad, tkDed, tkAccent: result := tokTypeToStr[tok.tokType]; else if (tok.ident <> nil) then result := tok.ident.s else begin InternalError('tokToStr'); result := '' end end end; procedure PrintTok(tok: PToken); begin write(output, TokTypeToStr[tok.tokType]); write(output, ' '+''); writeln(output, tokToStr(tok)) end; // ---------------------------------------------------------------------------- var dummyIdent: PIdent; procedure fillToken(var L: TToken); begin L.TokType := tkInvalid; L.iNumber := 0; L.Indent := 0; L.literal := ''; L.fNumber := 0.0; L.base := base10; L.ident := dummyIdent; // this prevents many bugs! end; procedure openLexer(out lex: TLexer; const filename: string; inputstream: PLLStream); begin {@ignore} FillChar(lex, sizeof(lex), 0); {@emit} openBaseLexer(lex, inputstream); {@ignore} setLength(lex.indentStack, 1); lex.indentStack[0] := 0; {@emit lex.indentStack := @[0]; } lex.filename := filename; lex.indentAhead := -1; end; procedure closeLexer(var lex: TLexer); begin inc(gLinesCompiled, lex.LineNumber); closeBaseLexer(lex); end; function getColumn(const L: TLexer): int; begin result := getColNumber(L, L.bufPos) end; function getLineInfo(const L: TLexer): TLineInfo; begin result := newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos)) end; procedure lexMessage(const L: TLexer; const msg: TMsgKind; const arg: string = ''); begin msgs.liMessage(getLineInfo(L), msg, arg) end; procedure lexMessagePos(var L: TLexer; const msg: TMsgKind; pos: int; const arg: string = ''); var info: TLineInfo; begin info := newLineInfo(L.filename, L.linenumber, pos - L.lineStart); msgs.liMessage(info, msg, arg); end; // ---------------------------------------------------------------------------- procedure matchUnderscoreChars(var L: TLexer; var tok: TToken; const chars: TCharSet); // matches ([chars]_)* var pos: int; buf: PChar; begin pos := L.bufpos; // use registers for pos, buf buf := L.buf; repeat if buf[pos] in chars then begin addChar(tok.literal, buf[pos]); Inc(pos) end else break; if buf[pos] = '_' then begin addChar(tok.literal, '_'); Inc(pos); end; until false; L.bufPos := pos; end; function matchTwoChars(const L: TLexer; first: Char; const second: TCharSet): Boolean; begin result := (L.buf[L.bufpos] = first) and (L.buf[L.bufpos+1] in Second); end; function isFloatLiteral(const s: string): boolean; var i: int; begin for i := strStart to length(s)+strStart-1 do if s[i] in ['.','e','E'] then begin result := true; exit end; result := false end; function GetNumber(var L: TLexer): TToken; var pos, endpos: int; xi: biggestInt; begin // get the base: result.tokType := tkIntLit; // int literal until we know better result.literal := ''; result.base := base10; // BUGFIX pos := L.bufpos; // make sure the literal is correct for error messages: matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9']); if (L.buf[L.bufpos] = '.') and (L.buf[L.bufpos+1] in ['0'..'9']) then begin addChar(result.literal, '.'); inc(L.bufpos); //matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9']) matchUnderscoreChars(L, result, ['0'..'9']); if L.buf[L.bufpos] in ['e', 'E'] then begin addChar(result.literal, 'e'); inc(L.bufpos); if L.buf[L.bufpos] in ['+', '-'] then begin addChar(result.literal, L.buf[L.bufpos]); inc(L.bufpos); end; matchUnderscoreChars(L, result, ['0'..'9']); end end; endpos := L.bufpos; if L.buf[endpos] = '''' then begin //matchUnderscoreChars(L, result, ['''', 'f', 'F', 'i', 'I', '0'..'9']); inc(endpos); L.bufpos := pos; // restore position case L.buf[endpos] of 'f', 'F': begin inc(endpos); if (L.buf[endpos] = '6') and (L.buf[endpos+1] = '4') then begin result.tokType := tkFloat64Lit; inc(endpos, 2); end else if (L.buf[endpos] = '3') and (L.buf[endpos+1] = '2') then begin result.tokType := tkFloat32Lit; inc(endpos, 2); end else lexMessage(L, errInvalidNumber, result.literal); end; 'i', 'I': begin inc(endpos); if (L.buf[endpos] = '6') and (L.buf[endpos+1] = '4') then begin result.tokType := tkInt64Lit; inc(endpos, 2); end else if (L.buf[endpos] = '3') and (L.buf[endpos+1] = '2') then begin result.tokType := tkInt32Lit; inc(endpos, 2); end else if (L.buf[endpos] = '1') and (L.buf[endpos+1] = '6') then begin result.tokType := tkInt16Lit; inc(endpos, 2); end else if (L.buf[endpos] = '8') then begin result.tokType := tkInt8Lit; inc(endpos); end else lexMessage(L, errInvalidNumber, result.literal); end; else lexMessage(L, errInvalidNumber, result.literal); end end else L.bufpos := pos; // restore position try if (L.buf[pos] = '0') and (L.buf[pos+1] in ['x','X','b','B','o','O','c','C']) then begin inc(pos, 2); xi := 0; // it may be a base prefix case L.buf[pos-1] of 'b', 'B': begin result.base := base2; while true do begin case L.buf[pos] of 'A'..'Z', 'a'..'z', '2'..'9', '.': begin lexMessage(L, errInvalidNumber, result.literal); inc(pos) end; '_': inc(pos); '0', '1': begin xi := shlu(xi, 1) or (ord(L.buf[pos]) - ord('0')); inc(pos); end; else break; end end end; 'o', 'c', 'C': begin result.base := base8; while true do begin case L.buf[pos] of 'A'..'Z', 'a'..'z', '8'..'9', '.': begin lexMessage(L, errInvalidNumber, result.literal); inc(pos) end; '_': inc(pos); '0'..'7': begin xi := shlu(xi, 3) or (ord(L.buf[pos]) - ord('0')); inc(pos); end; else break; end end end; 'O': lexMessage(L, errInvalidNumber, result.literal); 'x', 'X': begin result.base := base16; while true do begin case L.buf[pos] of 'G'..'Z', 'g'..'z', '.': begin lexMessage(L, errInvalidNumber, result.literal); inc(pos); end; '_': inc(pos); '0'..'9': begin xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('0')); inc(pos); end; 'a'..'f': begin xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10); inc(pos); end; 'A'..'F': begin xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10); inc(pos); end; else break; end end end; else InternalError(getLineInfo(L), 'getNumber'); end; // now look at the optional type suffix: case result.tokType of tkIntLit, tkInt64Lit: result.iNumber := xi; tkInt8Lit: result.iNumber := biggestInt(int8(toU8(int(xi)))); tkInt16Lit: result.iNumber := biggestInt(toU16(int(xi))); tkInt32Lit: result.iNumber := biggestInt(toU32(xi)); tkFloat32Lit: result.fNumber := ({@cast}PFloat32(addr(xi)))^; // note: this code is endian neutral! // XXX: Test this on big endian machine! tkFloat64Lit: result.fNumber := ({@cast}PFloat64(addr(xi)))^; else InternalError(getLineInfo(L), 'getNumber'); end end else if isFloatLiteral(result.literal) or (result.tokType = tkFloat32Lit) or (result.tokType = tkFloat64Lit) then begin result.fnumber := parseFloat(result.literal); if result.tokType = tkIntLit then result.tokType := tkFloatLit; end else begin result.iNumber := ParseBiggestInt(result.literal); if (result.iNumber < low(int32)) or (result.iNumber > high(int32)) then begin if result.tokType = tkIntLit then result.tokType := tkInt64Lit else if result.tokType <> tkInt64Lit then lexMessage(L, errInvalidNumber, result.literal); end end; except on EInvalidValue do lexMessage(L, errInvalidNumber, result.literal); {@ignore} on sysutils.EIntOverflow do lexMessage(L, errNumberOutOfRange, result.literal); {@emit} on EOverflow do lexMessage(L, errNumberOutOfRange, result.literal); on EOutOfRange do lexMessage(L, errNumberOutOfRange, result.literal); end; L.bufpos := endpos; end; procedure handleHexChar(var L: TLexer; var xi: int); begin case L.buf[L.bufpos] of '0'..'9': begin xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0')); inc(L.bufpos); end; 'a'..'f': begin xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10); inc(L.bufpos); end; 'A'..'F': begin xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10); inc(L.bufpos); end; else begin end // do nothing end end; procedure handleDecChars(var L: TLexer; var xi: int); begin while L.buf[L.bufpos] in ['0'..'9'] do begin xi := (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0')); inc(L.bufpos); end; end; procedure getEscapedChar(var L: TLexer; var tok: TToken); var xi: int; begin inc(L.bufpos); // skip '\' case L.buf[L.bufpos] of 'n', 'N': begin if tok.toktype = tkCharLit then lexMessage(L, errNnotAllowedInCharacter); tok.literal := tok.literal +{&} tnl; Inc(L.bufpos); end; 'r', 'R', 'c', 'C': begin addChar(tok.literal, CR); Inc(L.bufpos); end; 'l', 'L': begin addChar(tok.literal, LF); Inc(L.bufpos); end; 'f', 'F': begin addChar(tok.literal, FF); inc(L.bufpos); end; 'e', 'E': begin addChar(tok.literal, ESC); Inc(L.bufpos); end; 'a', 'A': begin addChar(tok.literal, BEL); Inc(L.bufpos); end; 'b', 'B': begin addChar(tok.literal, BACKSPACE); Inc(L.bufpos); end; 'v', 'V': begin addChar(tok.literal, VT); Inc(L.bufpos); end; 't', 'T': begin addChar(tok.literal, Tabulator); Inc(L.bufpos); end; '''', '"': begin addChar(tok.literal, L.buf[L.bufpos]); Inc(L.bufpos); end; '\': begin addChar(tok.literal, '\'); Inc(L.bufpos) end; 'x', 'X': begin inc(L.bufpos); xi := 0; handleHexChar(L, xi); handleHexChar(L, xi); addChar(tok.literal, Chr(xi)); end; '0'..'9': begin if matchTwoChars(L, '0', ['0'..'9']) then // this warning will make it easier for newcomers: lexMessage(L, warnOctalEscape); xi := 0; handleDecChars(L, xi); if (xi <= 255) then addChar(tok.literal, Chr(xi)) else lexMessage(L, errInvalidCharacterConstant) end else lexMessage(L, errInvalidCharacterConstant) end end; function HandleCRLF(var L: TLexer; pos: int): int; begin case L.buf[pos] of CR: begin if getColNumber(L, pos) > MaxLineLength then lexMessagePos(L, hintLineTooLong, pos); result := lexbase.HandleCR(L, pos) end; LF: begin if getColNumber(L, pos) > MaxLineLength then lexMessagePos(L, hintLineTooLong, pos); result := lexbase.HandleLF(L, pos) end; else result := pos end end; procedure getString(var L: TLexer; var tok: TToken; rawMode: Boolean); var line, line2, pos: int; c: Char; buf: PChar; begin pos := L.bufPos + 1; // skip " buf := L.buf; // put `buf` in a register line := L.linenumber; // save linenumber for better error message if (buf[pos] = '"') and (buf[pos+1] = '"') then begin tok.tokType := tkTripleStrLit; // long string literal: inc(pos, 2); // skip "" // skip leading newline: pos := HandleCRLF(L, pos); buf := L.buf; repeat case buf[pos] of '"': begin if (buf[pos+1] = '"') and (buf[pos+2] = '"') then break; addChar(tok.literal, '"'); Inc(pos) end; CR, LF: begin pos := HandleCRLF(L, pos); buf := L.buf; tok.literal := tok.literal +{&} tnl; end; lexbase.EndOfFile: begin line2 := L.linenumber; L.LineNumber := line; lexMessagePos(L, errClosingTripleQuoteExpected, L.lineStart); L.LineNumber := line2; break end else begin addChar(tok.literal, buf[pos]); Inc(pos) end end until false; L.bufpos := pos + 3 // skip the three """ end else begin // ordinary string literal if rawMode then tok.tokType := tkRStrLit else tok.tokType := tkStrLit; repeat c := buf[pos]; if c = '"' then begin inc(pos); // skip '"' break end; if c in [CR, LF, lexbase.EndOfFile] then begin lexMessage(L, errClosingQuoteExpected); break end; if (c = '\') and not rawMode then begin L.bufPos := pos; getEscapedChar(L, tok); pos := L.bufPos; end else begin addChar(tok.literal, c); Inc(pos) end until false; L.bufpos := pos; end end; procedure getCharacter(var L: TLexer; var tok: TToken); var c: Char; begin Inc(L.bufpos); // skip ' c := L.buf[L.bufpos]; case c of #0..Pred(' '), '''': lexMessage(L, errInvalidCharacterConstant); '\': getEscapedChar(L, tok); else begin tok.literal := c + ''; Inc(L.bufpos); end end; if L.buf[L.bufpos] <> '''' then lexMessage(L, errMissingFinalQuote); inc(L.bufpos); // skip ' end; {@ignore} {$ifopt Q+} {$define Q_on} {$Q-} {$endif} {$ifopt R+} {$define R_on} {$R-} {$endif} {@emit} procedure getSymbol(var L: TLexer; var tok: TToken); var pos: int; c: Char; buf: pchar; h: THash; // hashing algorithm inlined begin h := 0; pos := L.bufpos; buf := L.buf; while true do begin c := buf[pos]; case c of 'a'..'z', '0'..'9', #128..#255: begin h := h +{%} Ord(c); h := h +{%} h shl 10; h := h xor (h shr 6) end; 'A'..'Z': begin c := chr(ord(c) + (ord('a')-ord('A'))); // toLower() h := h +{%} Ord(c); h := h +{%} h shl 10; h := h xor (h shr 6) end; '_': begin end; else break end; Inc(pos) end; h := h +{%} h shl 3; h := h xor (h shr 11); h := h +{%} h shl 15; tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h); L.bufpos := pos; if (tok.ident.id < ord(tokKeywordLow)-ord(tkSymbol)) or (tok.ident.id > ord(tokKeywordHigh)-ord(tkSymbol)) then tok.tokType := tkSymbol else tok.tokType := TTokType(tok.ident.id+ord(tkSymbol)); if buf[pos] = '"' then begin getString(L, tok, true); if tok.tokType = tkRStrLit then tok.tokType := tkCallRStrLit else tok.tokType := tkCallTripleStrLit end end; procedure getOperator(var L: TLexer; var tok: TToken); var pos: int; c: Char; buf: pchar; h: THash; // hashing algorithm inlined begin pos := L.bufpos; buf := L.buf; h := 0; while true do begin c := buf[pos]; if c in OpChars then begin h := h +{%} Ord(c); h := h +{%} h shl 10; h := h xor (h shr 6) end else break; Inc(pos) end; h := h +{%} h shl 3; h := h xor (h shr 11); h := h +{%} h shl 15; tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h); if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh) then tok.tokType := tkOpr else tok.tokType := TTokType(tok.ident.id - oprLow + ord(tkColon)); L.bufpos := pos end; {@ignore} {$ifdef Q_on} {$undef Q_on} {$Q+} {$endif} {$ifdef R_on} {$undef R_on} {$R+} {$endif} {@emit} procedure handleIndentation(var L: TLexer; var tok: TToken; indent: int); var i: int; begin tok.indent := indent; i := high(L.indentStack); if indent > L.indentStack[i] then tok.tokType := tkInd else if indent = L.indentStack[i] then tok.tokType := tkSad else begin // check we have the indentation somewhere in the stack: while (i >= 0) and (indent <> L.indentStack[i]) do begin dec(i); inc(L.dedent); end; dec(L.dedent); tok.tokType := tkDed; if i < 0 then begin tok.tokType := tkSad; // for the parser it is better as SAD lexMessage(L, errInvalidIndentation); end end end; procedure scanComment(var L: TLexer; var tok: TToken); var buf: PChar; pos, col: int; indent: int; begin pos := L.bufpos; buf := L.buf; // a comment ends if the next line does not start with the # on the same // column after only whitespace tok.tokType := tkComment; col := getColNumber(L, pos); while true do begin while not (buf[pos] in [CR, LF, lexbase.EndOfFile]) do begin addChar(tok.literal, buf[pos]); inc(pos); end; pos := handleCRLF(L, pos); buf := L.buf; indent := 0; while buf[pos] = ' ' do begin inc(pos); inc(indent) end; if (buf[pos] = '#') and (col = indent) then begin tok.literal := tok.literal +{&} nl; end else begin if buf[pos] > ' ' then begin L.indentAhead := indent; inc(L.dedent) end; break end end; L.bufpos := pos; end; procedure skip(var L: TLexer; var tok: TToken); var buf: PChar; indent, pos: int; begin pos := L.bufpos; buf := L.buf; repeat case buf[pos] of ' ': Inc(pos); Tabulator: begin lexMessagePos(L, errTabulatorsAreNotAllowed, pos); inc(pos); // BUGFIX end; // newline is special: CR, LF: begin pos := HandleCRLF(L, pos); buf := L.buf; indent := 0; while buf[pos] = ' ' do begin Inc(pos); Inc(indent) end; if (buf[pos] > ' ') then begin handleIndentation(L, tok, indent); break; end end; else break // EndOfFile also leaves the loop end until false; L.bufpos := pos; end; procedure rawGetTok(var L: TLexer; var tok: TToken); var c: Char; begin fillToken(tok); if L.dedent > 0 then begin dec(L.dedent); if L.indentAhead >= 0 then begin handleIndentation(L, tok, L.indentAhead); L.indentAhead := -1; end else tok.tokType := tkDed; exit; end; // Skip whitespace, comments: skip(L, tok); // skip // got an documentation comment or tkIndent, return that: if tok.toktype <> tkInvalid then exit; c := L.buf[L.bufpos]; if c in SymStartChars - ['r', 'R', 'l'] then // common case first getSymbol(L, tok) else if c in ['0'..'9'] then tok := getNumber(L) else begin case c of '#': scanComment(L, tok); ':': begin tok.tokType := tkColon; inc(L.bufpos); end; ',': begin tok.toktype := tkComma; Inc(L.bufpos) end; 'l': begin // if we parsed exactly one character and its a small L (l), this // is treated as a warning because it may be confused with the number 1 if not (L.buf[L.bufpos+1] in (SymChars+['_'])) then lexMessage(L, warnSmallLshouldNotBeUsed); getSymbol(L, tok); end; 'r', 'R': begin if L.buf[L.bufPos+1] = '"' then begin Inc(L.bufPos); getString(L, tok, true); end else getSymbol(L, tok); end; '(': begin Inc(L.bufpos); if (L.buf[L.bufPos] = '.') and (L.buf[L.bufPos+1] <> '.') then begin tok.toktype := tkParDotLe; Inc(L.bufpos); end else tok.toktype := tkParLe; end; ')': begin tok.toktype := tkParRi; Inc(L.bufpos) end; '[': begin Inc(L.bufpos); if (L.buf[L.bufPos] = '.') and (L.buf[L.bufPos+1] <> '.') then begin tok.toktype := tkBracketDotLe; Inc(L.bufpos); end else tok.toktype := tkBracketLe; end; ']': begin tok.toktype := tkBracketRi; Inc(L.bufpos) end; '.': begin if L.buf[L.bufPos+1] = ']' then begin tok.tokType := tkBracketDotRi; Inc(L.bufpos, 2); end else if L.buf[L.bufPos+1] = '}' then begin tok.tokType := tkCurlyDotRi; Inc(L.bufpos, 2); end else if L.buf[L.bufPos+1] = ')' then begin tok.tokType := tkParDotRi; Inc(L.bufpos, 2); end else getOperator(L, tok) end; '{': begin Inc(L.bufpos); if (L.buf[L.bufPos] = '.') and (L.buf[L.bufPos+1] <> '.') then begin tok.toktype := tkCurlyDotLe; Inc(L.bufpos); end else tok.toktype := tkCurlyLe; end; '}': begin tok.toktype := tkCurlyRi; Inc(L.bufpos) end; ';': begin tok.toktype := tkSemiColon; Inc(L.bufpos) end; '`': begin tok.tokType := tkAccent; Inc(L.bufpos); end; '"': getString(L, tok, false); '''': begin getCharacter(L, tok); tok.tokType := tkCharLit; end; lexbase.EndOfFile: tok.toktype := tkEof; else if c in OpChars then getOperator(L, tok) else begin tok.literal := c + ''; tok.tokType := tkInvalid; lexMessage(L, errInvalidToken, c +{&} ' (\' +{&} toString(ord(c)) + ')'); Inc(L.bufpos); end end end end; initialization dummyIdent := getIdent(''); end.