Initial import

author: Andreas Rumpf <andreas@andi> 2008-06-22 16:14:11 +0200
committer: Andreas Rumpf <andreas@andi> 2008-06-22 16:14:11 +0200
commit: 405b86068e6a3d39970b9129ceec0a9108464b28 (patch)
tree: c0449946f54baae6ea88baf453157ddd7faa8f86 /nim/scanner.pas
download: Nim-405b86068e6a3d39970b9129ceec0a9108464b28.tar.gz
1 files changed, 1025 insertions, 0 deletions
diff --git a/nim/scanner.pas b/nim/scanner.pas
new file mode 100755
index 000000000..8e5bc3fc5
--- /dev/null
+++ b/nim/scanner.pas
@@ -0,0 +1,1025 @@
+//
+//
+//           The Nimrod Compiler
+//        (c) Copyright 2008 Andreas Rumpf
+//
+//    See the file "copying.txt", included in this
+//    distribution, for details about the copyright.
+//
+unit scanner;
+
+// This scanner is handwritten for efficiency. I used an elegant buffering
+// scheme which I have not seen anywhere else:
+// We guarantee that a hole line is in the buffer (too long lines are reported
+// as an error). Thus only when scanning the \n or \r character we have
+// to check wether we need to read in the next chunk. (\n or \r already need
+// special handling for incrementing the line counter; choosing both \n and \r
+// allows the scanner to properly read Unix, DOS or Macintosh text files, even
+// when it is not the native format.
+
+interface
+
+{$include 'config.inc'}
+
+uses
+  charsets, nsystem, sysutils,
+  hashes, options, msgs, strutils, platform, idents,
+  lexbase, wordrecg;
+
+const
+  MaxLineLength = 80; // lines longer than this lead to a warning
+
+  numChars: TCharSet = ['0'..'9','a'..'z','A'..'Z']; // we support up to base 36
+  SymChars: TCharSet = ['a'..'z', 'A'..'Z', '0'..'9', #128..#255];
+  SymStartChars: TCharSet = ['a'..'z', 'A'..'Z', #128..#255];
+  OpChars: TCharSet = ['+', '-', '*', '/', '<', '>', '!', '?', '^', '.',
+    '|', '=', '%', '&', '$', '@', '~', #128..#255];
+
+type
+  TTokType = (tkInvalid, tkEof, // order is important here!
+    tkSymbol,
+    // keywords:
+    //[[[cog
+    //keywords = (file("data/keywords.txt").read()).split()
+    //idents = ""
+    //strings = ""
+    //i = 1
+    //for k in keywords:
+    //  idents += "tk" + k.capitalize() + ", "
+    //  strings += "'" + k + "', "
+    //  if i % 4 == 0: idents += "\n"; strings += "\n"
+    //  i += 1
+    //cog.out(idents)
+    //]]]
+    tkAddr, tkAnd, tkAs, tkAsm, 
+    tkBlock, tkBreak, tkCase, tkCast, 
+    tkConst, tkContinue, tkConverter, tkDiscard, 
+    tkDiv, tkElif, tkElse, tkEnd, 
+    tkEnum, tkExcept, tkException, tkFinally, 
+    tkFor, tkFrom, tkGeneric, tkIf, 
+    tkImplies, tkImport, tkIn, tkInclude, 
+    tkIs, tkIsnot, tkIterator, tkLambda, 
+    tkMacro, tkMethod, tkMod, tkNil, 
+    tkNot, tkNotin, tkObject, tkOf, 
+    tkOr, tkOut, tkProc, tkPtr, 
+    tkRaise, tkRecord, tkRef, tkReturn, 
+    tkShl, tkShr, tkTemplate, tkTry, 
+    tkType, tkVar, tkWhen, tkWhere, 
+    tkWhile, tkWith, tkWithout, tkXor, 
+    tkYield, 
+    //[[[end]]]
+    tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit,
+    tkFloatLit, tkFloat32Lit, tkFloat64Lit,
+    tkStrLit, tkRStrLit, tkTripleStrLit, tkCharLit, tkRCharLit,
+    tkParLe, tkParRi, tkBracketLe, tkBracketRi, tkCurlyLe, tkCurlyRi,
+    tkBracketDotLe, tkBracketDotRi, // [. and  .]
+    tkCurlyDotLe, tkCurlyDotRi, // {.  and  .}
+    tkParDotLe, tkParDotRi, // (. and .)
+    tkComma, tkSemiColon, tkColon,
+    tkEquals, tkDot, tkDotDot, tkHat, tkOpr,
+    tkComment, tkAccent, tkInd, tkSad, tkDed,
+    // pseudo token types used by the source renderers:
+    tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr
+  );
+  TTokTypes = set of TTokType;
+const
+  tokKeywordLow = succ(tkSymbol);
+  tokKeywordHigh = pred(tkIntLit);
+  tokOperators: TTokTypes = {@set}[tkOpr, tkSymbol, tkBracketLe, tkBracketRi,
+    tkIn, tkIs, tkIsNot, tkEquals, tkDot, tkHat, tkNot, tkAnd, tkOr, tkXor,
+    tkShl, tkShr, tkDiv, tkMod, tkNotIn];
+
+  TokTypeToStr: array [TTokType] of string = (
+    'tkInvalid', '[EOF]',
+    'tkSymbol',
+    //[[[cog
+    //cog.out(strings)
+    //]]]
+    'addr', 'and', 'as', 'asm', 
+    'block', 'break', 'case', 'cast', 
+    'const', 'continue', 'converter', 'discard', 
+    'div', 'elif', 'else', 'end', 
+    'enum', 'except', 'exception', 'finally', 
+    'for', 'from', 'generic', 'if', 
+    'implies', 'import', 'in', 'include', 
+    'is', 'isnot', 'iterator', 'lambda', 
+    'macro', 'method', 'mod', 'nil', 
+    'not', 'notin', 'object', 'of', 
+    'or', 'out', 'proc', 'ptr', 
+    'raise', 'record', 'ref', 'return', 
+    'shl', 'shr', 'template', 'try', 
+    'type', 'var', 'when', 'where', 
+    'while', 'with', 'without', 'xor', 
+    'yield', 
+    //[[[end]]]
+    'tkIntLit', 'tkInt8Lit', 'tkInt16Lit', 'tkInt32Lit', 'tkInt64Lit',
+    'tkFloatLit', 'tkFloat32Lit', 'tkFloat64Lit',
+    'tkStrLit', 'tkRStrLit', 'tkTripleStrLit', 'tkCharLit', 'tkRCharLit',
+    '('+'', ')'+'', '['+'', ']'+'', '{'+'', '}'+'',
+    '[.', '.]', '{.', '.}', '(.', '.)', ','+'', ';'+'', ':'+'',
+    '='+'', '.'+'', '..', '^'+'', 'tkOpr',
+    'tkComment', '`'+'', '[new indentation]', '[same indentation]',
+    '[dedentation]',
+    'tkSpaces', 'tkInfixOpr', 'tkPrefixOpr', 'tkPostfixOpr'
+  );
+
+type
+  PToken = ^TToken;
+  TToken = object          // a Nimrod token
+    tokType: TTokType;     // the type of the token
+    indent: int;           // the indentation; only valid if tokType = tkIndent
+    ident: PIdent;         // the parsed identifier
+    iNumber: BiggestInt;   // the parsed integer literal
+    fNumber: BiggestFloat; // the parsed floating point literal
+    base: TNumericalBase;  // the numerical base; only valid for int
+                           // or float literals
+    literal: string;       // the parsed (string) literal; and
+                           // documentation comments are here too
+    next: PToken;          // next token; used for arbitrary look-ahead
+  end;
+
+  PLexer = ^TLexer;
+  TLexer = object(TBaseLexer)
+    // lexers can be put into a stack through the next pointer;
+    // this feature is currently unused, however
+    filename: string;
+    next: PLexer;
+    indentStack: array of int; // the indentation stack
+    dedent: int;             // counter for DED token generation
+    indentAhead: int;        // if > 0 an indendation has already been read
+                             // this is needed because scanning # comments
+                             // needs so much look-ahead
+  end;
+
+procedure pushInd(var L: TLexer; indent: int);
+function isKeyword(kind: TTokType): boolean;
+
+function openLexer(out lex: TLexer; const filename: string): TResult;
+procedure bufferLexer(out lex: TLexer; const buf: string);
+
+procedure rawGetTok(var L: TLexer; var tok: TToken);
+// reads in the next token into tok and skips it
+
+function getColumn(const L: TLexer): int;
+
+function getLineInfo(const L: TLexer): TLineInfo;
+
+procedure closeLexer(var lex: TLexer);
+
+procedure PrintTok(tok: PToken);
+function tokToStr(tok: PToken): string;
+
+// auxiliary functions:
+procedure lexMessage(const L: TLexer; const msg: TMsgKind;
+                     const arg: string = '');
+
+// the Pascal scanner uses this too:
+procedure fillToken(var L: TToken);
+
+implementation
+
+function isKeyword(kind: TTokType): boolean;
+begin
+  result := (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)
+end;
+
+procedure pushInd(var L: TLexer; indent: int);
+var
+  len: int;
+begin
+  len := length(L.indentStack);
+  setLength(L.indentStack, len+1);
+  assert(indent > L.indentStack[len-1]);
+  L.indentstack[len] := indent;
+  //writeln('push indent ', indent);
+end;
+
+function findIdent(const L: TLexer; indent: int): boolean;
+var
+  i: int;
+begin
+  for i := length(L.indentStack)-1 downto 0 do
+    if L.indentStack[i] = indent then begin result := true; exit end;
+  result := false
+end;
+
+function tokToStr(tok: PToken): string;
+begin
+  case tok.tokType of
+    tkIntLit..tkInt64Lit:
+      result := toString(tok.iNumber);
+    tkFloatLit..tkFloat64Lit:
+      result := toStringF(tok.fNumber);
+    tkInvalid, tkStrLit..tkRCharLit, tkComment:
+      result := tok.literal;
+    tkParLe..tkColon, tkEof, tkInd, tkSad, tkDed, tkAccent:
+      result := tokTypeToStr[tok.tokType];
+    else if (tok.ident <> nil) then
+      result := tok.ident.s
+    else begin
+      assert(false);
+      result := ''
+    end
+  end
+end;
+
+procedure PrintTok(tok: PToken);
+begin
+  write(output, TokTypeToStr[tok.tokType]);
+  write(output, ' '+'');
+  writeln(output, tokToStr(tok))
+end;
+
+// ----------------------------------------------------------------------------
+
+var
+  dummyIdent: PIdent;
+
+procedure fillToken(var L: TToken);
+begin
+  L.TokType := tkInvalid;
+  L.iNumber := 0;
+  L.Indent := 0;
+  L.literal := '';
+  L.fNumber := 0.0;
+  L.base := base10;
+  L.ident := dummyIdent; // this prevents many bugs!
+end;
+
+function openLexer(out lex: TLexer; const filename: string): TResult;
+begin
+{@ignore}
+  FillChar(lex, sizeof(lex), 0); // work around Delphi/fpc bug
+{@emit}
+  if initBaseLexer(lex, filename) then
+    result := Success
+  else
+    result := Failure;
+{@ignore}
+  setLength(lex.indentStack, 1);
+  lex.indentStack[0] := 0;
+{@emit lex.indentStack := [0]; }
+  lex.filename := filename;
+  lex.indentAhead := -1;
+end;
+
+procedure bufferLexer(out lex: TLexer; const buf: string);
+begin
+{@ignore}
+  FillChar(lex, sizeof(lex), 0); // work around Delphi/fpc bug
+{@emit}
+  initBaseLexerFromBuffer(lex, buf);
+{@ignore}
+  setLength(lex.indentStack, 1);
+  lex.indentStack[0] := 0;
+{@emit lex.indentStack := [0]; }
+  lex.filename := 'buffer';
+  lex.indentAhead := -1;
+end;
+
+procedure closeLexer(var lex: TLexer);
+begin
+  deinitBaseLexer(lex);
+end;
+
+function getColumn(const L: TLexer): int;
+begin
+  result := getColNumber(L, L.bufPos)
+end;
+
+function getLineInfo(const L: TLexer): TLineInfo;
+begin
+  result := newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos))
+end;
+
+procedure lexMessage(const L: TLexer; const msg: TMsgKind;
+                     const arg: string = '');
+begin
+  msgs.liMessage(getLineInfo(L), msg, arg)
+end;
+
+procedure lexMessagePos(var L: TLexer; const msg: TMsgKind; pos: int;
+                        const arg: string = '');
+var
+  info: TLineInfo;
+begin
+  info := newLineInfo(L.filename, L.linenumber, pos - L.lineStart);
+  msgs.liMessage(info, msg, arg);
+end;
+
+// ----------------------------------------------------------------------------
+
+procedure matchUnderscoreChars(var L: TLexer; var tok: TToken;
+                               const chars: TCharSet);
+// matches ([chars]_)*
+var
+  pos: int;
+  buf: PChar;
+begin
+  pos := L.bufpos; // use registers for pos, buf
+  buf := L.buf;
+  repeat
+    if buf[pos] in chars then begin
+      addChar(tok.literal, buf[pos]);
+      Inc(pos)
+    end
+    else break;
+    if buf[pos] = '_' then begin
+      addChar(tok.literal, '_');
+      Inc(pos);
+    end;
+  until false;
+  L.bufPos := pos;
+end;
+
+function matchTwoChars(const L: TLexer; first: Char;
+                       const second: TCharSet): Boolean;
+begin
+  result := (L.buf[L.bufpos] = first) and (L.buf[L.bufpos+1] in Second);
+end;
+
+function isFloatLiteral(const s: string): boolean;
+var
+  i: int;
+begin
+  for i := strStart to length(s)+strStart-1 do
+    if s[i] in ['.','e','E'] then begin
+      result := true; exit
+    end;
+  result := false
+end;
+
+function GetNumber(var L: TLexer): TToken;
+// extremely hard work above us!
+var
+  pos, endpos: int;
+  xi: biggestInt;
+begin
+  // get the base:
+  result.tokType := tkIntLit; // int literal until we know better
+  result.literal := '';
+  result.base := base10; // BUGFIX
+  pos := L.bufpos;
+  // make sure the literal is correct for error messages:
+  matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9']);
+  if (L.buf[L.bufpos] = '.') and (L.buf[L.bufpos+1] in ['0'..'9']) then begin
+    addChar(result.literal, '.');
+    inc(L.bufpos);
+    //matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9'])
+    matchUnderscoreChars(L, result, ['0'..'9']);
+    if L.buf[L.bufpos] in ['e', 'E'] then begin
+      addChar(result.literal, 'e');
+      inc(L.bufpos);
+      if L.buf[L.bufpos] in ['+', '-'] then begin
+        addChar(result.literal, L.buf[L.bufpos]);
+        inc(L.bufpos);
+      end;
+      matchUnderscoreChars(L, result, ['0'..'9']);
+    end
+  end;
+  endpos := L.bufpos;
+  if L.buf[endpos] = '''' then begin
+    //matchUnderscoreChars(L, result, ['''', 'f', 'F', 'i', 'I', '0'..'9']);
+    inc(endpos);
+    L.bufpos := pos; // restore position
+    case L.buf[endpos] of
+      'f', 'F': begin
+        inc(endpos);
+        if (L.buf[endpos] = '6') and (L.buf[endpos+1] = '4') then begin
+          result.tokType := tkFloat64Lit;
+          inc(endpos, 2);
+        end
+        else if (L.buf[endpos] = '3') and (L.buf[endpos+1] = '2') then begin
+          result.tokType := tkFloat32Lit;
+          inc(endpos, 2);
+        end
+        else lexMessage(L, errInvalidNumber, result.literal);
+      end;
+      'i', 'I': begin
+        inc(endpos);
+        if (L.buf[endpos] = '6') and (L.buf[endpos+1] = '4') then begin
+          result.tokType := tkInt64Lit;
+          inc(endpos, 2);
+        end
+        else if (L.buf[endpos] = '3') and (L.buf[endpos+1] = '2') then begin
+          result.tokType := tkInt32Lit;
+          inc(endpos, 2);
+        end
+        else if (L.buf[endpos] = '1') and (L.buf[endpos+1] = '6') then begin
+          result.tokType := tkInt16Lit;
+          inc(endpos, 2);
+        end
+        else if (L.buf[endpos] = '8') then begin
+          result.tokType := tkInt8Lit;
+          inc(endpos);
+        end
+        else lexMessage(L, errInvalidNumber, result.literal);
+      end;
+      else lexMessage(L, errInvalidNumber, result.literal);
+    end
+  end
+  else
+    L.bufpos := pos; // restore position
+
+  try
+    if (L.buf[pos] = '0') and (L.buf[pos+1] in ['x','X','b','B','o','O'])
+    then begin
+      inc(pos, 2);
+      xi := 0;
+      // it may be a base prefix
+      case L.buf[pos-1] of
+        'b', 'B': begin
+          result.base := base2;
+          while true do begin
+            case L.buf[pos] of
+              'A'..'Z', 'a'..'z', '2'..'9', '.': begin
+                lexMessage(L, errInvalidNumber, result.literal);
+                inc(pos)
+              end;
+              '_': inc(pos);
+              '0', '1': begin
+                xi := (xi shl 1) or (ord(L.buf[pos]) - ord('0'));
+                inc(pos);
+              end;
+              else break;
+            end
+          end
+        end;
+        'o': begin
+          result.base := base8;
+          while true do begin
+            case L.buf[pos] of
+              'A'..'Z', 'a'..'z', '8'..'9', '.': begin
+                lexMessage(L, errInvalidNumber, result.literal);
+                inc(pos)
+              end;
+              '_': inc(pos);
+              '0'..'7': begin
+                xi := (xi shl 3) or (ord(L.buf[pos]) - ord('0'));
+                inc(pos);
+              end;
+              else break;
+            end
+          end
+        end;
+        'O': lexMessage(L, errInvalidNumber, result.literal);
+        'x', 'X': begin
+          result.base := base16;
+          while true do begin
+            case L.buf[pos] of
+              'G'..'Z', 'g'..'z', '.': begin
+                lexMessage(L, errInvalidNumber, result.literal);
+                inc(pos);
+              end;
+              '_': inc(pos);
+              '0'..'9': begin
+                xi := (xi shl 4) or (ord(L.buf[pos]) - ord('0'));
+                inc(pos);
+              end;
+              'a'..'f': begin
+                xi := (xi shl 4) or (ord(L.buf[pos]) - ord('a') + 10);
+                inc(pos);
+              end;
+              'A'..'F': begin
+                xi := (xi shl 4) or (ord(L.buf[pos]) - ord('A') + 10);
+                inc(pos);
+              end;
+              else break;
+            end
+          end
+        end;
+        else assert(false);
+      end;
+      // now look at the optional type suffix:
+      case result.tokType of
+        tkIntLit..tkInt64Lit:
+          result.iNumber := xi;
+        tkFloat32Lit:
+          result.fNumber := ({@cast}PFloat32(addr(xi)))^;
+          // note: this code is endian neutral!
+          // XXX: Test this on big endian machine!
+        tkFloat64Lit:
+          result.fNumber := ({@cast}PFloat64(addr(xi)))^;
+        else assert(false);
+      end
+    end
+    else if isFloatLiteral(result.literal)
+         or (result.tokType = tkFloat32Lit)
+         or (result.tokType = tkFloat64Lit) then begin
+      result.fnumber := parseFloat(result.literal);
+      if result.tokType = tkIntLit then result.tokType := tkFloatLit;
+    end
+    else begin
+      result.iNumber := ParseInt(result.literal)
+    end;
+  except
+    on EInvalidValue do
+      lexMessage(L, errInvalidNumber, result.literal);
+  {@ignore}
+    on sysutils.EIntOverflow do
+      lexMessage(L, errNumberOutOfRange, result.literal);
+  {@emit}
+    on EOverflow do
+      lexMessage(L, errNumberOutOfRange, result.literal);
+  end;
+  L.bufpos := endpos;
+end;
+
+procedure handleHexChar(var L: TLexer; var xi: int);
+begin
+  case L.buf[L.bufpos] of
+    '0'..'9': begin
+      xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0'));
+      inc(L.bufpos);
+    end;
+    'a'..'f': begin
+      xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10);
+      inc(L.bufpos);
+    end;
+    'A'..'F': begin
+      xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10);
+      inc(L.bufpos);
+    end;
+    else begin end // do nothing
+  end
+end;
+
+procedure handleDecChars(var L: TLexer; var xi: int);
+begin
+  while L.buf[L.bufpos] in ['0'..'9'] do begin
+    xi := (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0'));
+    inc(L.bufpos);
+  end;
+end;
+
+procedure getEscapedChar(var L: TLexer; var tok: TToken);
+var
+  xi: int;
+begin
+  inc(L.bufpos); // skip '\'
+  case L.buf[L.bufpos] of
+    'n', 'N': begin
+      if tok.toktype = tkCharLit then
+        lexMessage(L, errNnotAllowedInCharacter);
+      tok.literal := tok.literal +{&} tnl;
+      Inc(L.bufpos);
+    end;
+    'r', 'R', 'c', 'C': begin addChar(tok.literal, CR); Inc(L.bufpos); end;
+    'l', 'L': begin addChar(tok.literal, LF); Inc(L.bufpos); end;
+    'f', 'F': begin addChar(tok.literal, FF); inc(L.bufpos); end;
+    'e', 'E': begin addChar(tok.literal, ESC); Inc(L.bufpos); end;
+    'a', 'A': begin addChar(tok.literal, BEL); Inc(L.bufpos); end;
+    'b', 'B': begin addChar(tok.literal, BACKSPACE); Inc(L.bufpos); end;
+    'v', 'V': begin addChar(tok.literal, VT); Inc(L.bufpos); end;
+    't', 'T': begin addChar(tok.literal, Tabulator); Inc(L.bufpos); end;
+    '''', '"': begin addChar(tok.literal, L.buf[L.bufpos]); Inc(L.bufpos); end;
+    '\': begin addChar(tok.literal, '\'); Inc(L.bufpos) end;
+    'x', 'X': begin
+      inc(L.bufpos);
+      xi := 0;
+      handleHexChar(L, xi);
+      handleHexChar(L, xi);
+      addChar(tok.literal, Chr(xi));
+    end;
+    '0'..'9': begin
+      if matchTwoChars(L, '0', ['0'..'9']) then
+      // this warning will make it easier for newcomers:
+        lexMessage(L, warnOctalEscape);
+      xi := 0;
+      handleDecChars(L, xi);
+      if (xi <= 255) then
+        addChar(tok.literal, Chr(xi))
+      else
+        lexMessage(L, errInvalidCharacterConstant)
+    end
+    else lexMessage(L, errInvalidCharacterConstant)
+  end
+end;
+
+function HandleCRLF(var L: TLexer; pos: int): int;
+begin
+  case L.buf[pos] of
+    CR: begin
+      if getColNumber(L, pos) > MaxLineLength then
+        lexMessagePos(L, hintLineTooLong, pos);
+      result := lexbase.HandleCR(L, pos)
+    end;
+    LF: begin
+      if getColNumber(L, pos) > MaxLineLength then
+        lexMessagePos(L, hintLineTooLong, pos);
+      result := lexbase.HandleLF(L, pos)
+    end;
+    else result := pos
+  end
+end;
+
+procedure getString(var L: TLexer; var tok: TToken; rawMode: Boolean);
+var
+  line, line2, pos: int;
+  c: Char;
+  buf: PChar;
+begin
+  pos := L.bufPos + 1; // skip "
+  buf := L.buf; // put `buf` in a register
+  line := L.linenumber; // save linenumber for better error message
+  if (buf[pos] = '"') and (buf[pos+1] = '"') then begin
+    tok.tokType := tkTripleStrLit;
+    // long string literal:
+    inc(pos, 2); // skip ""
+    // skip leading newline:
+    pos := HandleCRLF(L, pos);
+    repeat
+      case buf[pos] of
+        '"': begin
+          if (buf[pos+1] = '"') and (buf[pos+2] = '"') then
+            break;
+          addChar(tok.literal, '"');
+          Inc(pos)
+        end;
+        CR, LF: begin
+          pos := HandleCRLF(L, pos);
+          tok.literal := tok.literal +{&} tnl;
+        end;
+        lexbase.EndOfFile: begin
+          line2 := L.linenumber;
+          L.LineNumber := line;
+          lexMessagePos(L, errClosingTripleQuoteExpected, L.lineStart);
+          L.LineNumber := line2;
+          break
+        end
+        else begin
+          addChar(tok.literal, buf[pos]);
+          Inc(pos)
+        end
+      end
+    until false;
+    L.bufpos := pos + 3 // skip the three """
+  end
+  else begin // ordinary string literal
+    if rawMode then tok.tokType := tkRStrLit
+    else tok.tokType := tkStrLit;
+    repeat
+      c := buf[pos];
+      if c = '"' then begin
+        inc(pos); // skip '"'
+        break
+      end;
+      if c in [CR, LF, lexbase.EndOfFile] then begin
+        lexMessage(L, errClosingQuoteExpected);
+        break
+      end;
+      if (c = '\') and not rawMode then begin
+        L.bufPos := pos;
+        getEscapedChar(L, tok);
+        pos := L.bufPos;
+      end
+      else begin
+        addChar(tok.literal, c);
+        Inc(pos)
+      end
+    until false;
+    L.bufpos := pos;
+  end
+end;
+
+procedure getCharacter(var L: TLexer; var tok: TToken; rawMode: Boolean);
+var
+  c: Char;
+begin
+  Inc(L.bufpos); // skip '
+  c := L.buf[L.bufpos];
+  case c of
+    #0..Pred(' '), '''': lexMessage(L, errInvalidCharacterConstant);
+    '\': begin
+      if not rawMode then
+        getEscapedChar(L, tok)
+      else begin
+        tok.literal := '\'+''; Inc(L.bufpos);
+      end
+    end
+    else begin
+      tok.literal := c + '';
+      Inc(L.bufpos);
+    end
+  end;
+  if L.buf[L.bufpos] <> '''' then lexMessage(L, errMissingFinalQuote);
+  inc(L.bufpos); // skip '
+end;
+
+{@ignore}
+{$ifopt Q+} {$define Q_on} {$Q-} {$endif}
+{$ifopt R+} {$define R_on} {$R-} {$endif}
+{@emit}
+procedure getSymbol(var L: TLexer; var tok: TToken);
+var
+  pos: int;
+  c: Char;
+  buf: pchar;
+  h: THash; // hashing algorithm inlined
+begin
+  h := 0;
+  pos := L.bufpos;
+  buf := L.buf;
+  while true do begin
+    c := buf[pos];
+    case c of
+      'a'..'z', '0'..'9', #128..#255: begin
+        h := h +{%} Ord(c);
+        h := h +{%} h shl 10;
+        h := h xor (h shr 6)
+      end;
+      'A'..'Z': begin
+        c := chr(ord(c) + (ord('a')-ord('A'))); // toLower()
+        h := h +{%} Ord(c);
+        h := h +{%} h shl 10;
+        h := h xor (h shr 6)
+      end;
+      '_': begin end;
+      else break
+    end;
+    Inc(pos)
+  end;
+  h := h +{%} h shl 3;
+  h := h xor (h shr 11);
+  h := h +{%} h shl 15;
+  tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h);
+  L.bufpos := pos;
+  if (tok.ident.id < ord(tokKeywordLow)-ord(tkSymbol)) or
+     (tok.ident.id > ord(tokKeywordHigh)-ord(tkSymbol)) then
+    tok.tokType := tkSymbol
+  else
+    tok.tokType := TTokType(tok.ident.id+ord(tkSymbol))
+end;
+
+procedure getOperator(var L: TLexer; var tok: TToken);
+var
+  pos: int;
+  c: Char;
+  buf: pchar;
+  h: THash; // hashing algorithm inlined
+begin
+  pos := L.bufpos;
+  buf := L.buf;
+  h := 0;
+  while true do begin
+    c := buf[pos];
+    if c in OpChars then begin
+      h := h +{%} Ord(c);
+      h := h +{%} h shl 10;
+      h := h xor (h shr 6)
+    end
+    else break;
+    Inc(pos)
+  end;
+  h := h +{%} h shl 3;
+  h := h xor (h shr 11);
+  h := h +{%} h shl 15;
+  tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h);
+  if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh) then
+    tok.tokType := tkOpr
+  else
+    tok.tokType := TTokType(tok.ident.id - oprLow + ord(tkColon));
+  L.bufpos := pos
+end;
+{@ignore}
+{$ifdef Q_on} {$undef Q_on} {$Q+} {$endif}
+{$ifdef R_on} {$undef R_on} {$R+} {$endif}
+{@emit}
+
+procedure handleIndentation(var L: TLexer; var tok: TToken; indent: int);
+var
+  i: int;
+begin
+  tok.indent := indent;
+  i := high(L.indentStack);
+  if indent > L.indentStack[i] then
+    tok.tokType := tkInd
+  else if indent = L.indentStack[i] then
+    tok.tokType := tkSad
+  else begin
+    // check we have the indentation somewhere in the stack:
+    while (i >= 0) and (indent <> L.indentStack[i]) do begin
+      dec(i);
+      inc(L.dedent);
+    end;
+    dec(L.dedent);
+    tok.tokType := tkDed;
+    if i >= 0 then
+      setLength(L.indentStack, i+1) // pop indentations
+    else begin
+      tok.tokType := tkSad; // for the parser it is better as SAD
+      lexMessage(L, errInvalidIndentation);
+    end
+  end;
+end;
+
+procedure scanComment(var L: TLexer; var tok: TToken);
+var
+  buf: PChar;
+  pos, col: int;
+  indent: int;
+begin
+  pos := L.bufpos;
+  buf := L.buf;
+  // a comment ends if the next line does not start with the # on the same
+  // column after only whitespace
+  tok.tokType := tkComment;
+  col := getColNumber(L, pos);
+  while true do begin
+    while not (buf[pos] in [CR, LF, lexbase.EndOfFile]) do begin
+      addChar(tok.literal, buf[pos]); inc(pos);
+    end;
+    pos := handleCRLF(L, pos);
+    indent := 0;
+    while buf[pos] = ' ' do begin inc(pos); inc(indent) end;
+    if (buf[pos] = '#') and (col = indent) then begin
+      tok.literal := tok.literal +{&} nl;
+    end
+    else begin
+      if buf[pos] > ' ' then begin
+        L.indentAhead := indent;
+        inc(L.dedent)
+      end;
+      break
+    end
+  end;
+  L.bufpos := pos;
+end;
+
+procedure skip(var L: TLexer; var tok: TToken);
+var
+  buf: PChar;
+  indent, pos: int;
+begin
+  pos := L.bufpos;
+  buf := L.buf;
+  repeat
+    case buf[pos] of
+      ' ': Inc(pos);
+      Tabulator: begin
+        lexMessagePos(L, errTabulatorsAreNotAllowed, pos);
+        inc(pos); // BUGFIX
+      end;
+      // newline is special:
+      CR, LF: begin
+        pos := HandleCRLF(L, pos);
+        indent := 0;
+        while buf[pos] = ' ' do begin
+          Inc(pos); Inc(indent)
+        end;
+        if (buf[pos] > ' ') then begin
+          handleIndentation(L, tok, indent);
+          break;
+        end
+      end;
+      else break // EndOfFile also leaves the loop
+    end
+  until false;
+  L.bufpos := pos;
+end;
+
+procedure rawGetTok(var L: TLexer; var tok: TToken);
+var
+  c: Char;
+begin
+  fillToken(tok);
+  if L.dedent > 0 then begin
+    dec(L.dedent);
+    if L.indentAhead >= 0 then begin
+      handleIndentation(L, tok, L.indentAhead);
+      L.indentAhead := -1;
+    end
+    else
+      tok.tokType := tkDed;
+    exit;
+  end;
+  // Skip whitespace, comments:
+  skip(L, tok); // skip
+  // got an documentation comment or tkIndent, return that:
+  if tok.toktype <> tkInvalid then exit;
+
+  // to the parser
+  c := L.buf[L.bufpos];
+  if c in SymStartChars - ['r', 'R', 'l'] then // common case first
+    getSymbol(L, tok)
+  else if c in ['0'..'9'] then
+    tok := getNumber(L)
+  else begin
+    case c of
+      '#': scanComment(L, tok);
+      ':': begin
+        tok.tokType := tkColon;
+        inc(L.bufpos);
+      end;
+      ',': begin
+        tok.toktype := tkComma;
+        Inc(L.bufpos)
+      end;
+      'l': begin
+        // if we parsed exactly one character and its a small L (l), this
+        // is treated as a warning because it may be confused with the number 1
+        if not (L.buf[L.bufpos+1] in (SymChars+['_'])) then
+          lexMessage(L, warnSmallLshouldNotBeUsed);
+        getSymbol(L, tok);
+      end;
+      'r', 'R': begin
+        case L.buf[L.bufPos+1] of
+          '''': begin
+            Inc(L.bufPos);
+            getCharacter(L, tok, true);
+            tok.tokType := tkRCharLit;
+          end;
+          '"': begin
+            Inc(L.bufPos);
+            getString(L, tok, true);
+          end;
+          else getSymbol(L, tok);
+        end
+      end;
+      '(': begin
+        Inc(L.bufpos);
+        if (L.buf[L.bufPos] = '.')
+        and (L.buf[L.bufPos+1] <> '.') then begin
+          tok.toktype := tkParDotLe;
+          Inc(L.bufpos);
+        end
+        else
+          tok.toktype := tkParLe;
+      end;
+      ')': begin
+        tok.toktype := tkParRi;
+        Inc(L.bufpos)
+      end;
+      '[': begin
+        Inc(L.bufpos);
+        if (L.buf[L.bufPos] = '.')
+        and (L.buf[L.bufPos+1] <> '.') then begin
+          tok.toktype := tkBracketDotLe;
+          Inc(L.bufpos);
+        end
+        else
+          tok.toktype := tkBracketLe;
+      end;
+      ']': begin
+        tok.toktype := tkBracketRi;
+        Inc(L.bufpos)
+      end;
+      '.': begin
+        if L.buf[L.bufPos+1] = ']' then begin
+          tok.tokType := tkBracketDotRi;
+          Inc(L.bufpos, 2);
+        end
+        else if L.buf[L.bufPos+1] = '}' then begin
+          tok.tokType := tkCurlyDotRi;
+          Inc(L.bufpos, 2);
+        end
+        else if L.buf[L.bufPos+1] = ')' then begin
+          tok.tokType := tkParDotRi;
+          Inc(L.bufpos, 2);
+        end
+        else
+          getOperator(L, tok)
+      end;
+      '{': begin
+        Inc(L.bufpos);
+        if (L.buf[L.bufPos] = '.')
+        and (L.buf[L.bufPos+1] <> '.') then begin
+          tok.toktype := tkCurlyDotLe;
+          Inc(L.bufpos);
+        end
+        else
+          tok.toktype := tkCurlyLe;
+      end;
+      '}': begin
+        tok.toktype := tkCurlyRi;
+        Inc(L.bufpos)
+      end;
+      ';': begin
+        tok.toktype := tkSemiColon;
+        Inc(L.bufpos)
+      end;
+      '`': begin
+        tok.tokType := tkAccent;
+        Inc(L.bufpos);
+      end;
+      '"': getString(L, tok, false);
+      '''': begin
+        getCharacter(L, tok, false);
+        tok.tokType := tkCharLit;
+      end;
+      lexbase.EndOfFile: tok.toktype := tkEof;
+      else if c in OpChars then
+        getOperator(L, tok)
+      else begin
+        tok.literal := c + '';
+        tok.tokType := tkInvalid;
+        lexMessage(L, errInvalidToken, c +{&} ' (\' +{&} toString(ord(c)) + ')');
+        Inc(L.bufpos);
+      end
+    end
+  end
+end;
+
+initialization
+  dummyIdent := getIdent('');
+end.
author	Andreas Rumpf <andreas@andi>	2008-06-22 16:14:11 +0200
committer	Andreas Rumpf <andreas@andi>	2008-06-22 16:14:11 +0200
commit	405b86068e6a3d39970b9129ceec0a9108464b28 (patch)
tree	c0449946f54baae6ea88baf453157ddd7faa8f86 /nim/scanner.pas
download	Nim-405b86068e6a3d39970b9129ceec0a9108464b28.tar.gz