# # # The Nimrod Compiler # (c) Copyright 2009 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. # # This module implements a FreePascal scanner. This is a adaption from # the scanner module. import nhashes, options, msgs, strutils, platform, idents, lexbase, wordrecg, scanner const MaxLineLength* = 80 # lines longer than this lead to a warning numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'} # we support up to base 36 SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'} SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'} OpChars*: TCharSet = {'+', '-', '*', '/', '<', '>', '!', '?', '^', '.', '|', '=', ':', '%', '&', '$', '@', '~', '\x80'..'\xFF'} type # order is important for TPasTokKind TPasTokKind* = enum pxInvalid, pxEof, # keywords: #[[[cog #from string import capitalize #keywords = eval(open("data/pas_keyw.yml").read()) #idents = "" #strings = "" #i = 1 #for k in keywords: # idents = idents + "px" + capitalize(k) + ", " # strings = strings + "'" + k + "', " # if i % 4 == 0: # idents = idents + "\n" # strings = strings + "\n" # i = i + 1 #cog.out(idents) #]]] pxAnd, pxArray, pxAs, pxAsm, pxBegin, pxCase, pxClass, pxConst, pxConstructor, pxDestructor, pxDiv, pxDo, pxDownto, pxElse, pxEnd, pxExcept, pxExports, pxFinalization, pxFinally, pxFor, pxFunction, pxGoto, pxIf, pxImplementation, pxIn, pxInherited, pxInitialization, pxInline, pxInterface, pxIs, pxLabel, pxLibrary, pxMod, pxNil, pxNot, pxObject, pxOf, pxOr, pxOut, pxPacked, pxProcedure, pxProgram, pxProperty, pxRaise, pxRecord, pxRepeat, pxResourcestring, pxSet, pxShl, pxShr, pxThen, pxThreadvar, pxTo, pxTry, pxType, pxUnit, pxUntil, pxUses, pxVar, pxWhile, pxWith, pxXor, #[[[end]]] pxComment, # ordinary comment pxCommand, # {@} pxAmp, # {&} pxPer, # {%} pxStrLit, pxSymbol, # a symbol pxIntLit, pxInt64Lit, # long constant like 0x00000070fffffff or out of int range pxFloatLit, pxParLe, pxParRi, pxBracketLe, pxBracketRi, pxComma, pxSemiColon, pxColon, # operators pxAsgn, pxEquals, pxDot, pxDotDot, pxHat, pxPlus, pxMinus, pxStar, pxSlash, pxLe, pxLt, pxGe, pxGt, pxNeq, pxAt, pxStarDirLe, pxStarDirRi, pxCurlyDirLe, pxCurlyDirRi TPasTokKinds* = set[TPasTokKind] const PasTokKindToStr*: array[TPasTokKind, string] = ["pxInvalid", "[EOF]", #[[[cog #cog.out(strings) #]]] "and", "array", "as", "asm", "begin", "case", "class", "const", "constructor", "destructor", "div", "do", "downto", "else", "end", "except", "exports", "finalization", "finally", "for", "function", "goto", "if", "implementation", "in", "inherited", "initialization", "inline", "interface", "is", "label", "library", "mod", "nil", "not", "object", "of", "or", "out", "packed", "procedure", "program", "property", "raise", "record", "repeat", "resourcestring", "set", "shl", "shr", "then", "threadvar", "to", "try", "type", "unit", "until", "uses", "var", "while", "with", "xor", #[[[end]]] "pxComment", "pxCommand", "{&}", "{%}", "pxStrLit", "[IDENTIFIER]", "pxIntLit", "pxInt64Lit", "pxFloatLit", "(", ")", "[", "]", ",", ";", ":", ":=", "=", ".", "..", "^", "+", "-", "*", "/", "<=", "<", ">=", ">", "<>", "@", "(*$", "*)", "{$", "}"] type TPasTok* = object of TToken # a Pascal token xkind*: TPasTokKind # the type of the token TPasLex* = object of TLexer proc getPasTok*(L: var TPasLex, tok: var TPasTok) proc PrintPasTok*(tok: TPasTok) proc pasTokToStr*(tok: TPasTok): string # implementation proc pastokToStr(tok: TPasTok): string = case tok.xkind of pxIntLit, pxInt64Lit: result = $(tok.iNumber) of pxFloatLit: result = $(tok.fNumber) of pxInvalid, pxComment..pxStrLit: result = tok.literal else: if (tok.ident.s != ""): result = tok.ident.s else: result = pasTokKindToStr[tok.xkind] proc PrintPasTok(tok: TPasTok) = write(stdout, pasTokKindToStr[tok.xkind]) write(stdout, ' ') writeln(stdout, pastokToStr(tok)) proc setKeyword(L: var TPasLex, tok: var TPasTok) = case tok.ident.id #[[[cog #for k in keywords: # m = capitalize(k) # cog.outl("ord(w%s):%s tok.xkind := px%s;" % (m, ' '*(18-len(m)), m)) #]]] of ord(wAnd): tok.xkind = pxAnd of ord(wArray): tok.xkind = pxArray of ord(wAs): tok.xkind = pxAs of ord(wAsm): tok.xkind = pxAsm of ord(wBegin): tok.xkind = pxBegin of ord(wCase): tok.xkind = pxCase of ord(wClass): tok.xkind = pxClass of ord(wConst): tok.xkind = pxConst of ord(wConstructor): tok.xkind = pxConstructor of ord(wDestructor): tok.xkind = pxDestructor of ord(wDiv): tok.xkind = pxDiv of ord(wDo): tok.xkind = pxDo of ord(wDownto): tok.xkind = pxDownto of ord(wElse): tok.xkind = pxElse of ord(wEnd): tok.xkind = pxEnd of ord(wExcept): tok.xkind = pxExcept of ord(wExports): tok.xkind = pxExports of ord(wFinalization): tok.xkind = pxFinalization of ord(wFinally): tok.xkind = pxFinally of ord(wFor): tok.xkind = pxFor of ord(wFunction): tok.xkind = pxFunction of ord(wGoto): tok.xkind = pxGoto of ord(wIf): tok.xkind = pxIf of ord(wImplementation): tok.xkind = pxImplementation of ord(wIn): tok.xkind = pxIn of ord(wInherited): tok.xkind = pxInherited of ord(wInitialization): tok.xkind = pxInitialization of ord(wInline): tok.xkind = pxInline of ord(wInterface): tok.xkind = pxInterface of ord(wIs): tok.xkind = pxIs of ord(wLabel): tok.xkind = pxLabel of ord(wLibrary): tok.xkind = pxLibrary of ord(wMod): tok.xkind = pxMod of ord(wNil): tok.xkind = pxNil of ord(wNot): tok.xkind = pxNot of ord(wObject): tok.xkind = pxObject of ord(wOf): tok.xkind = pxOf of ord(wOr): tok.xkind = pxOr of ord(wOut): tok.xkind = pxOut of ord(wPacked): tok.xkind = pxPacked of ord(wProcedure): tok.xkind = pxProcedure of ord(wProgram): tok.xkind = pxProgram of ord(wProperty): tok.xkind = pxProperty of ord(wRaise): tok.xkind = pxRaise of ord(wRecord): tok.xkind = pxRecord of ord(wRepeat): tok.xkind = pxRepeat of ord(wResourcestring): tok.xkind = pxResourcestring of ord(wSet): tok.xkind = pxSet of ord(wShl): tok.xkind = pxShl of ord(wShr): tok.xkind = pxShr of ord(wThen): tok.xkind = pxThen of ord(wThreadvar): tok.xkind = pxThreadvar of ord(wTo): tok.xkind = pxTo of ord(wTry): tok.xkind = pxTry of ord(wType): tok.xkind = pxType of ord(wUnit): tok.xkind = pxUnit of ord(wUntil): tok.xkind = pxUntil of ord(wUses): tok.xkind = pxUses of ord(wVar): tok.xkind = pxVar of ord(wWhile): tok.xkind = pxWhile of ord(wWith): tok.xkind = pxWith of ord(wXor): tok.xkind = pxXor #[[[end]]] else: tok.xkind = pxSymbol proc matchUnderscoreChars(L: var TPasLex, tok: var TPasTok, chars: TCharSet) = # matches ([chars]_)* var pos: int buf: cstring pos = L.bufpos # use registers for pos, buf buf = L.buf while true: if buf[pos] in chars: add(tok.literal, buf[pos]) Inc(pos) else: break if buf[pos] == '_': add(tok.literal, '_') Inc(pos) L.bufPos = pos proc isFloatLiteral(s: string): bool = for i in countup(0, len(s) + 0 - 1): if s[i] in {'.', 'e', 'E'}: return true result = false proc getNumber2(L: var TPasLex, tok: var TPasTok) = var pos, bits: int xi: biggestInt pos = L.bufpos + 1 # skip % if not (L.buf[pos] in {'0'..'1'}): # BUGFIX for %date% tok.xkind = pxInvalid add(tok.literal, '%') inc(L.bufpos) return tok.base = base2 xi = 0 bits = 0 while true: case L.buf[pos] of 'A'..'Z', 'a'..'z', '2'..'9', '.': lexMessage(L, errInvalidNumber) inc(pos) of '_': inc(pos) of '0', '1': xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) inc(pos) inc(bits) else: break tok.iNumber = xi if (bits > 32): tok.xkind = pxInt64Lit else: tok.xkind = pxIntLit L.bufpos = pos proc getNumber16(L: var TPasLex, tok: var TPasTok) = var pos, bits: int xi: biggestInt pos = L.bufpos + 1 # skip $ tok.base = base16 xi = 0 bits = 0 while true: case L.buf[pos] of 'G'..'Z', 'g'..'z', '.': lexMessage(L, errInvalidNumber) inc(pos) of '_': inc(pos) of '0'..'9': xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) inc(pos) inc(bits, 4) of 'a'..'f': xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10) inc(pos) inc(bits, 4) of 'A'..'F': xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) inc(pos) inc(bits, 4) else: break tok.iNumber = xi if (bits > 32): tok.xkind = pxInt64Lit else: tok.xkind = pxIntLit L.bufpos = pos proc getNumber10(L: var TPasLex, tok: var TPasTok) = tok.base = base10 matchUnderscoreChars(L, tok, {'0'..'9'}) if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): add(tok.literal, '.') inc(L.bufpos) matchUnderscoreChars(L, tok, {'e', 'E', '+', '-', '0'..'9'}) try: if isFloatLiteral(tok.literal): tok.fnumber = parseFloat(tok.literal) tok.xkind = pxFloatLit else: tok.iNumber = ParseInt(tok.literal) if (tok.iNumber < low(int32)) or (tok.iNumber > high(int32)): tok.xkind = pxInt64Lit else: tok.xkind = pxIntLit except EInvalidValue: lexMessage(L, errInvalidNumber, tok.literal) except EOverflow: lexMessage(L, errNumberOutOfRange, tok.literal) proc HandleCRLF(L: var TLexer, pos: int): int = case L.buf[pos] of CR: result = lexbase.HandleCR(L, pos) of LF: result = lexbase.HandleLF(L, pos) else: result = pos proc getString(L: var TPasLex, tok: var TPasTok) = var pos, xi: int buf: cstring pos = L.bufPos buf = L.buf while true: if buf[pos] == '\'': inc(pos) while true: case buf[pos] of CR, LF, lexbase.EndOfFile: lexMessage(L, errClosingQuoteExpected) break of '\'': inc(pos) if buf[pos] == '\'': inc(pos) add(tok.literal, '\'') else: break else: add(tok.literal, buf[pos]) inc(pos) elif buf[pos] == '#': inc(pos) xi = 0 case buf[pos] of '$': inc(pos) xi = 0 while true: case buf[pos] of '0'..'9': xi = (xi shl 4) or (ord(buf[pos]) - ord('0')) of 'a'..'f': xi = (xi shl 4) or (ord(buf[pos]) - ord('a') + 10) of 'A'..'F': xi = (xi shl 4) or (ord(buf[pos]) - ord('A') + 10) else: break inc(pos) of '0'..'9': xi = 0 while buf[pos] in {'0'..'9'}: xi = (xi * 10) + (ord(buf[pos]) - ord('0')) inc(pos) else: lexMessage(L, errInvalidCharacterConstant) if (xi <= 255): add(tok.literal, Chr(xi)) else: lexMessage(L, errInvalidCharacterConstant) else: break tok.xkind = pxStrLit L.bufpos = pos proc getSymbol(L: var TPasLex, tok: var TPasTok) = var pos: int c: Char buf: cstring h: THash # hashing algorithm inlined h = 0 pos = L.bufpos buf = L.buf while true: c = buf[pos] case c of 'a'..'z', '0'..'9', '\x80'..'\xFF': h = h +% Ord(c) h = h +% h shl 10 h = h xor (h shr 6) of 'A'..'Z': c = chr(ord(c) + (ord('a') - ord('A'))) # toLower() h = h +% Ord(c) h = h +% h shl 10 h = h xor (h shr 6) of '_': nil else: break Inc(pos) h = h +% h shl 3 h = h xor (h shr 11) h = h +% h shl 15 tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h) L.bufpos = pos setKeyword(L, tok) proc scanLineComment(L: var TPasLex, tok: var TPasTok) = var buf: cstring pos, col: int indent: int pos = L.bufpos buf = L.buf # a comment ends if the next line does not start with the // on the same # column after only whitespace tok.xkind = pxComment col = getColNumber(L, pos) while true: inc(pos, 2) # skip // add(tok.literal, '#') while not (buf[pos] in {CR, LF, lexbase.EndOfFile}): add(tok.literal, buf[pos]) inc(pos) pos = handleCRLF(L, pos) buf = L.buf indent = 0 while buf[pos] == ' ': inc(pos) inc(indent) if (col == indent) and (buf[pos] == '/') and (buf[pos + 1] == '/'): tok.literal = tok.literal & "\n" else: break L.bufpos = pos proc scanCurlyComment(L: var TPasLex, tok: var TPasTok) = var buf: cstring pos: int pos = L.bufpos buf = L.buf tok.literal = "#" tok.xkind = pxComment while true: case buf[pos] of CR, LF: pos = HandleCRLF(L, pos) buf = L.buf tok.literal = tok.literal & "\n" & '#' of '}': inc(pos) break of lexbase.EndOfFile: lexMessage(L, errTokenExpected, "}") else: add(tok.literal, buf[pos]) inc(pos) L.bufpos = pos proc scanStarComment(L: var TPasLex, tok: var TPasTok) = var buf: cstring pos: int pos = L.bufpos buf = L.buf tok.literal = "#" tok.xkind = pxComment while true: case buf[pos] of CR, LF: pos = HandleCRLF(L, pos) buf = L.buf tok.literal = tok.literal & "\n" & '#' of '*': inc(pos) if buf[pos] == ')': inc(pos) break else: add(tok.literal, '*') of lexbase.EndOfFile: lexMessage(L, errTokenExpected, "*)") else: add(tok.literal, buf[pos]) inc(pos) L.bufpos = pos proc skip(L: var TPasLex, tok: var TPasTok) = var buf: cstring pos: int pos = L.bufpos buf = L.buf while true: case buf[pos] of ' ', Tabulator: Inc(pos) # newline is special: of CR, LF: pos = HandleCRLF(L, pos) buf = L.buf else: break # EndOfFile also leaves the loop L.bufpos = pos proc getPasTok(L: var TPasLex, tok: var TPasTok) = var c: Char tok.xkind = pxInvalid fillToken(tok) skip(L, tok) c = L.buf[L.bufpos] if c in SymStartChars: getSymbol(L, tok) elif c in {'0'..'9'}: getNumber10(L, tok) else: case c of ';': tok.xkind = pxSemicolon Inc(L.bufpos) of '/': if L.buf[L.bufpos + 1] == '/': scanLineComment(L, tok) else: tok.xkind = pxSlash inc(L.bufpos) of ',': tok.xkind = pxComma Inc(L.bufpos) of '(': Inc(L.bufpos) if (L.buf[L.bufPos] == '*'): if (L.buf[L.bufPos + 1] == '$'): Inc(L.bufpos, 2) skip(L, tok) getSymbol(L, tok) tok.xkind = pxStarDirLe else: inc(L.bufpos) scanStarComment(L, tok) else: tok.xkind = pxParLe of '*': inc(L.bufpos) if L.buf[L.bufpos] == ')': inc(L.bufpos) tok.xkind = pxStarDirRi else: tok.xkind = pxStar of ')': tok.xkind = pxParRi Inc(L.bufpos) of '[': Inc(L.bufpos) tok.xkind = pxBracketLe of ']': Inc(L.bufpos) tok.xkind = pxBracketRi of '.': inc(L.bufpos) if L.buf[L.bufpos] == '.': tok.xkind = pxDotDot inc(L.bufpos) else: tok.xkind = pxDot of '{': Inc(L.bufpos) case L.buf[L.bufpos] of '$': Inc(L.bufpos) skip(L, tok) getSymbol(L, tok) tok.xkind = pxCurlyDirLe of '&': Inc(L.bufpos) tok.xkind = pxAmp of '%': Inc(L.bufpos) tok.xkind = pxPer of '@': Inc(L.bufpos) tok.xkind = pxCommand else: scanCurlyComment(L, tok) of '+': tok.xkind = pxPlus inc(L.bufpos) of '-': tok.xkind = pxMinus inc(L.bufpos) of ':': inc(L.bufpos) if L.buf[L.bufpos] == '=': inc(L.bufpos) tok.xkind = pxAsgn else: tok.xkind = pxColon of '<': inc(L.bufpos) if L.buf[L.bufpos] == '>': inc(L.bufpos) tok.xkind = pxNeq elif L.buf[L.bufpos] == '=': inc(L.bufpos) tok.xkind = pxLe else: tok.xkind = pxLt of '>': inc(L.bufpos) if L.buf[L.bufpos] == '=': inc(L.bufpos) tok.xkind = pxGe else: tok.xkind = pxGt of '=': tok.xkind = pxEquals inc(L.bufpos) of '@': tok.xkind = pxAt inc(L.bufpos) of '^': tok.xkind = pxHat inc(L.bufpos) of '}': tok.xkind = pxCurlyDirRi Inc(L.bufpos) of '\'', '#': getString(L, tok) of '$': getNumber16(L, tok) of '%': getNumber2(L, tok) of lexbase.EndOfFile: tok.xkind = pxEof else: tok.literal = c & "" tok.xkind = pxInvalid lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')') Inc(L.bufpos)