diff options
Diffstat (limited to 'rod/paslex.nim')
-rwxr-xr-x | rod/paslex.nim | 671 |
1 files changed, 671 insertions, 0 deletions
diff --git a/rod/paslex.nim b/rod/paslex.nim new file mode 100755 index 000000000..e29e549ba --- /dev/null +++ b/rod/paslex.nim @@ -0,0 +1,671 @@ +# +# +# The Nimrod Compiler +# (c) Copyright 2009 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +# This module implements a FreePascal scanner. This is a adaption from +# the scanner module. + +import + nhashes, options, msgs, strutils, platform, idents, lexbase, wordrecg, scanner + +const + MaxLineLength* = 80 # lines longer than this lead to a warning + numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'} # we support up to base 36 + SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'} + SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'} + OpChars*: TCharSet = {'+', '-', '*', '/', '<', '>', '!', '?', '^', '.', '|', + '=', ':', '%', '&', '$', '@', '~', '\x80'..'\xFF'} + +type # order is important for TPasTokKind + TPasTokKind* = enum + pxInvalid, pxEof, # keywords: + #[[[cog + #from string import capitalize + #keywords = eval(open("data/pas_keyw.yml").read()) + #idents = "" + #strings = "" + #i = 1 + #for k in keywords: + # idents = idents + "px" + capitalize(k) + ", " + # strings = strings + "'" + k + "', " + # if i % 4 == 0: + # idents = idents + "\n" + # strings = strings + "\n" + # i = i + 1 + #cog.out(idents) + #]]] + pxAnd, pxArray, pxAs, pxAsm, pxBegin, pxCase, pxClass, pxConst, + pxConstructor, pxDestructor, pxDiv, pxDo, pxDownto, pxElse, pxEnd, pxExcept, + pxExports, pxFinalization, pxFinally, pxFor, pxFunction, pxGoto, pxIf, + pxImplementation, pxIn, pxInherited, pxInitialization, pxInline, + pxInterface, pxIs, pxLabel, pxLibrary, pxMod, pxNil, pxNot, pxObject, pxOf, + pxOr, pxOut, pxPacked, pxProcedure, pxProgram, pxProperty, pxRaise, + pxRecord, pxRepeat, pxResourcestring, pxSet, pxShl, pxShr, pxThen, + pxThreadvar, pxTo, pxTry, pxType, pxUnit, pxUntil, pxUses, pxVar, pxWhile, + pxWith, pxXor, #[[[end]]] + pxComment, # ordinary comment + pxCommand, # {@} + pxAmp, # {&} + pxPer, # {%} + pxStrLit, pxSymbol, # a symbol + pxIntLit, pxInt64Lit, # long constant like 0x00000070fffffff or out of int range + pxFloatLit, pxParLe, pxParRi, pxBracketLe, pxBracketRi, pxComma, + pxSemiColon, pxColon, # operators + pxAsgn, pxEquals, pxDot, pxDotDot, pxHat, pxPlus, pxMinus, pxStar, pxSlash, + pxLe, pxLt, pxGe, pxGt, pxNeq, pxAt, pxStarDirLe, pxStarDirRi, pxCurlyDirLe, + pxCurlyDirRi + TPasTokKinds* = set[TPasTokKind] + +const + PasTokKindToStr*: array[TPasTokKind, string] = ["pxInvalid", "[EOF]", #[[[cog + #cog.out(strings) + #]]] + "and", "array", "as", "asm", "begin", "case", "class", "const", + "constructor", "destructor", "div", "do", "downto", "else", "end", "except", + "exports", "finalization", "finally", "for", "function", "goto", "if", + "implementation", "in", "inherited", "initialization", "inline", + "interface", "is", "label", "library", "mod", "nil", "not", "object", "of", + "or", "out", "packed", "procedure", "program", "property", "raise", + "record", "repeat", "resourcestring", "set", "shl", "shr", "then", + "threadvar", "to", "try", "type", "unit", "until", "uses", "var", "while", + "with", "xor", #[[[end]]] + "pxComment", "pxCommand", "{&}", "{%}", "pxStrLit", "[IDENTIFIER]", + "pxIntLit", "pxInt64Lit", "pxFloatLit", "(", ")", "[", "]", ",", ";", ":", + ":=", "=", ".", "..", "^", "+", "-", "*", "/", "<=", "<", ">=", ">", "<>", + "@", "(*$", "*)", "{$", "}"] + +type + TPasTok* = object of TToken # a Pascal token + xkind*: TPasTokKind # the type of the token + + TPasLex* = object of TLexer + +proc getPasTok*(L: var TPasLex, tok: var TPasTok) +proc PrintPasTok*(tok: TPasTok) +proc pasTokToStr*(tok: TPasTok): string +# implementation + +proc pastokToStr(tok: TPasTok): string = + case tok.xkind + of pxIntLit, pxInt64Lit: result = $(tok.iNumber) + of pxFloatLit: result = $(tok.fNumber) + of pxInvalid, pxComment..pxStrLit: result = tok.literal + else: + if (tok.ident.s != ""): result = tok.ident.s + else: result = pasTokKindToStr[tok.xkind] + +proc PrintPasTok(tok: TPasTok) = + write(stdout, pasTokKindToStr[tok.xkind]) + write(stdout, ' ') + writeln(stdout, pastokToStr(tok)) + +proc setKeyword(L: var TPasLex, tok: var TPasTok) = + case tok.ident.id #[[[cog + #for k in keywords: + # m = capitalize(k) + # cog.outl("ord(w%s):%s tok.xkind := px%s;" % (m, ' '*(18-len(m)), m)) + #]]] + of ord(wAnd): + tok.xkind = pxAnd + of ord(wArray): + tok.xkind = pxArray + of ord(wAs): + tok.xkind = pxAs + of ord(wAsm): + tok.xkind = pxAsm + of ord(wBegin): + tok.xkind = pxBegin + of ord(wCase): + tok.xkind = pxCase + of ord(wClass): + tok.xkind = pxClass + of ord(wConst): + tok.xkind = pxConst + of ord(wConstructor): + tok.xkind = pxConstructor + of ord(wDestructor): + tok.xkind = pxDestructor + of ord(wDiv): + tok.xkind = pxDiv + of ord(wDo): + tok.xkind = pxDo + of ord(wDownto): + tok.xkind = pxDownto + of ord(wElse): + tok.xkind = pxElse + of ord(wEnd): + tok.xkind = pxEnd + of ord(wExcept): + tok.xkind = pxExcept + of ord(wExports): + tok.xkind = pxExports + of ord(wFinalization): + tok.xkind = pxFinalization + of ord(wFinally): + tok.xkind = pxFinally + of ord(wFor): + tok.xkind = pxFor + of ord(wFunction): + tok.xkind = pxFunction + of ord(wGoto): + tok.xkind = pxGoto + of ord(wIf): + tok.xkind = pxIf + of ord(wImplementation): + tok.xkind = pxImplementation + of ord(wIn): + tok.xkind = pxIn + of ord(wInherited): + tok.xkind = pxInherited + of ord(wInitialization): + tok.xkind = pxInitialization + of ord(wInline): + tok.xkind = pxInline + of ord(wInterface): + tok.xkind = pxInterface + of ord(wIs): + tok.xkind = pxIs + of ord(wLabel): + tok.xkind = pxLabel + of ord(wLibrary): + tok.xkind = pxLibrary + of ord(wMod): + tok.xkind = pxMod + of ord(wNil): + tok.xkind = pxNil + of ord(wNot): + tok.xkind = pxNot + of ord(wObject): + tok.xkind = pxObject + of ord(wOf): + tok.xkind = pxOf + of ord(wOr): + tok.xkind = pxOr + of ord(wOut): + tok.xkind = pxOut + of ord(wPacked): + tok.xkind = pxPacked + of ord(wProcedure): + tok.xkind = pxProcedure + of ord(wProgram): + tok.xkind = pxProgram + of ord(wProperty): + tok.xkind = pxProperty + of ord(wRaise): + tok.xkind = pxRaise + of ord(wRecord): + tok.xkind = pxRecord + of ord(wRepeat): + tok.xkind = pxRepeat + of ord(wResourcestring): + tok.xkind = pxResourcestring + of ord(wSet): + tok.xkind = pxSet + of ord(wShl): + tok.xkind = pxShl + of ord(wShr): + tok.xkind = pxShr + of ord(wThen): + tok.xkind = pxThen + of ord(wThreadvar): + tok.xkind = pxThreadvar + of ord(wTo): + tok.xkind = pxTo + of ord(wTry): + tok.xkind = pxTry + of ord(wType): + tok.xkind = pxType + of ord(wUnit): + tok.xkind = pxUnit + of ord(wUntil): + tok.xkind = pxUntil + of ord(wUses): + tok.xkind = pxUses + of ord(wVar): + tok.xkind = pxVar + of ord(wWhile): + tok.xkind = pxWhile + of ord(wWith): + tok.xkind = pxWith + of ord(wXor): + tok.xkind = pxXor #[[[end]]] + else: tok.xkind = pxSymbol + +proc matchUnderscoreChars(L: var TPasLex, tok: var TPasTok, chars: TCharSet) = + # matches ([chars]_)* + var + pos: int + buf: cstring + pos = L.bufpos # use registers for pos, buf + buf = L.buf + while true: + if buf[pos] in chars: + add(tok.literal, buf[pos]) + Inc(pos) + else: + break + if buf[pos] == '_': + add(tok.literal, '_') + Inc(pos) + L.bufPos = pos + +proc isFloatLiteral(s: string): bool = + for i in countup(0, len(s) + 0 - 1): + if s[i] in {'.', 'e', 'E'}: + return true + result = false + +proc getNumber2(L: var TPasLex, tok: var TPasTok) = + var + pos, bits: int + xi: biggestInt + pos = L.bufpos + 1 # skip % + if not (L.buf[pos] in {'0'..'1'}): + # BUGFIX for %date% + tok.xkind = pxInvalid + add(tok.literal, '%') + inc(L.bufpos) + return + tok.base = base2 + xi = 0 + bits = 0 + while true: + case L.buf[pos] + of 'A'..'Z', 'a'..'z', '2'..'9', '.': + lexMessage(L, errInvalidNumber) + inc(pos) + of '_': + inc(pos) + of '0', '1': + xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + inc(bits) + else: break + tok.iNumber = xi + if (bits > 32): + tok.xkind = pxInt64Lit + else: + tok.xkind = pxIntLit + L.bufpos = pos + +proc getNumber16(L: var TPasLex, tok: var TPasTok) = + var + pos, bits: int + xi: biggestInt + pos = L.bufpos + 1 # skip $ + tok.base = base16 + xi = 0 + bits = 0 + while true: + case L.buf[pos] + of 'G'..'Z', 'g'..'z', '.': + lexMessage(L, errInvalidNumber) + inc(pos) + of '_': + inc(pos) + of '0'..'9': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + inc(bits, 4) + of 'a'..'f': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10) + inc(pos) + inc(bits, 4) + of 'A'..'F': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) + inc(pos) + inc(bits, 4) + else: break + tok.iNumber = xi + if (bits > 32): + tok.xkind = pxInt64Lit + else: + tok.xkind = pxIntLit + L.bufpos = pos + +proc getNumber10(L: var TPasLex, tok: var TPasTok) = + tok.base = base10 + matchUnderscoreChars(L, tok, {'0'..'9'}) + if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): + add(tok.literal, '.') + inc(L.bufpos) + matchUnderscoreChars(L, tok, {'e', 'E', '+', '-', '0'..'9'}) + try: + if isFloatLiteral(tok.literal): + tok.fnumber = parseFloat(tok.literal) + tok.xkind = pxFloatLit + else: + tok.iNumber = ParseInt(tok.literal) + if (tok.iNumber < low(int32)) or (tok.iNumber > high(int32)): + tok.xkind = pxInt64Lit + else: + tok.xkind = pxIntLit + except EInvalidValue: + lexMessage(L, errInvalidNumber, tok.literal) + except EOverflow: + lexMessage(L, errNumberOutOfRange, tok.literal) + +proc HandleCRLF(L: var TLexer, pos: int): int = + case L.buf[pos] + of CR: result = lexbase.HandleCR(L, pos) + of LF: result = lexbase.HandleLF(L, pos) + else: result = pos + +proc getString(L: var TPasLex, tok: var TPasTok) = + var + pos, xi: int + buf: cstring + pos = L.bufPos + buf = L.buf + while true: + if buf[pos] == '\'': + inc(pos) + while true: + case buf[pos] + of CR, LF, lexbase.EndOfFile: + lexMessage(L, errClosingQuoteExpected) + break + of '\'': + inc(pos) + if buf[pos] == '\'': + inc(pos) + add(tok.literal, '\'') + else: + break + else: + add(tok.literal, buf[pos]) + inc(pos) + elif buf[pos] == '#': + inc(pos) + xi = 0 + case buf[pos] + of '$': + inc(pos) + xi = 0 + while true: + case buf[pos] + of '0'..'9': xi = (xi shl 4) or (ord(buf[pos]) - ord('0')) + of 'a'..'f': xi = (xi shl 4) or (ord(buf[pos]) - ord('a') + 10) + of 'A'..'F': xi = (xi shl 4) or (ord(buf[pos]) - ord('A') + 10) + else: break + inc(pos) + of '0'..'9': + xi = 0 + while buf[pos] in {'0'..'9'}: + xi = (xi * 10) + (ord(buf[pos]) - ord('0')) + inc(pos) + else: lexMessage(L, errInvalidCharacterConstant) + if (xi <= 255): add(tok.literal, Chr(xi)) + else: lexMessage(L, errInvalidCharacterConstant) + else: + break + tok.xkind = pxStrLit + L.bufpos = pos + +proc getSymbol(L: var TPasLex, tok: var TPasTok) = + var + pos: int + c: Char + buf: cstring + h: THash # hashing algorithm inlined + h = 0 + pos = L.bufpos + buf = L.buf + while true: + c = buf[pos] + case c + of 'a'..'z', '0'..'9', '\x80'..'\xFF': + h = h +% Ord(c) + h = h +% h shl 10 + h = h xor (h shr 6) + of 'A'..'Z': + c = chr(ord(c) + (ord('a') - ord('A'))) # toLower() + h = h +% Ord(c) + h = h +% h shl 10 + h = h xor (h shr 6) + of '_': + nil + else: break + Inc(pos) + h = h +% h shl 3 + h = h xor (h shr 11) + h = h +% h shl 15 + tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h) + L.bufpos = pos + setKeyword(L, tok) + +proc scanLineComment(L: var TPasLex, tok: var TPasTok) = + var + buf: cstring + pos, col: int + indent: int + pos = L.bufpos + buf = L.buf # a comment ends if the next line does not start with the // on the same + # column after only whitespace + tok.xkind = pxComment + col = getColNumber(L, pos) + while true: + inc(pos, 2) # skip // + add(tok.literal, '#') + while not (buf[pos] in {CR, LF, lexbase.EndOfFile}): + add(tok.literal, buf[pos]) + inc(pos) + pos = handleCRLF(L, pos) + buf = L.buf + indent = 0 + while buf[pos] == ' ': + inc(pos) + inc(indent) + if (col == indent) and (buf[pos] == '/') and (buf[pos + 1] == '/'): + tok.literal = tok.literal & "\n" + else: + break + L.bufpos = pos + +proc scanCurlyComment(L: var TPasLex, tok: var TPasTok) = + var + buf: cstring + pos: int + pos = L.bufpos + buf = L.buf + tok.literal = "#" + tok.xkind = pxComment + while true: + case buf[pos] + of CR, LF: + pos = HandleCRLF(L, pos) + buf = L.buf + tok.literal = tok.literal & "\n" & '#' + of '}': + inc(pos) + break + of lexbase.EndOfFile: + lexMessage(L, errTokenExpected, "}") + else: + add(tok.literal, buf[pos]) + inc(pos) + L.bufpos = pos + +proc scanStarComment(L: var TPasLex, tok: var TPasTok) = + var + buf: cstring + pos: int + pos = L.bufpos + buf = L.buf + tok.literal = "#" + tok.xkind = pxComment + while true: + case buf[pos] + of CR, LF: + pos = HandleCRLF(L, pos) + buf = L.buf + tok.literal = tok.literal & "\n" & '#' + of '*': + inc(pos) + if buf[pos] == ')': + inc(pos) + break + else: + add(tok.literal, '*') + of lexbase.EndOfFile: + lexMessage(L, errTokenExpected, "*)") + else: + add(tok.literal, buf[pos]) + inc(pos) + L.bufpos = pos + +proc skip(L: var TPasLex, tok: var TPasTok) = + var + buf: cstring + pos: int + pos = L.bufpos + buf = L.buf + while true: + case buf[pos] + of ' ', Tabulator: + Inc(pos) # newline is special: + of CR, LF: + pos = HandleCRLF(L, pos) + buf = L.buf + else: + break # EndOfFile also leaves the loop + L.bufpos = pos + +proc getPasTok(L: var TPasLex, tok: var TPasTok) = + var c: Char + tok.xkind = pxInvalid + fillToken(tok) + skip(L, tok) + c = L.buf[L.bufpos] + if c in SymStartChars: + getSymbol(L, tok) + elif c in {'0'..'9'}: + getNumber10(L, tok) + else: + case c + of ';': + tok.xkind = pxSemicolon + Inc(L.bufpos) + of '/': + if L.buf[L.bufpos + 1] == '/': + scanLineComment(L, tok) + else: + tok.xkind = pxSlash + inc(L.bufpos) + of ',': + tok.xkind = pxComma + Inc(L.bufpos) + of '(': + Inc(L.bufpos) + if (L.buf[L.bufPos] == '*'): + if (L.buf[L.bufPos + 1] == '$'): + Inc(L.bufpos, 2) + skip(L, tok) + getSymbol(L, tok) + tok.xkind = pxStarDirLe + else: + inc(L.bufpos) + scanStarComment(L, tok) + else: + tok.xkind = pxParLe + of '*': + inc(L.bufpos) + if L.buf[L.bufpos] == ')': + inc(L.bufpos) + tok.xkind = pxStarDirRi + else: + tok.xkind = pxStar + of ')': + tok.xkind = pxParRi + Inc(L.bufpos) + of '[': + Inc(L.bufpos) + tok.xkind = pxBracketLe + of ']': + Inc(L.bufpos) + tok.xkind = pxBracketRi + of '.': + inc(L.bufpos) + if L.buf[L.bufpos] == '.': + tok.xkind = pxDotDot + inc(L.bufpos) + else: + tok.xkind = pxDot + of '{': + Inc(L.bufpos) + case L.buf[L.bufpos] + of '$': + Inc(L.bufpos) + skip(L, tok) + getSymbol(L, tok) + tok.xkind = pxCurlyDirLe + of '&': + Inc(L.bufpos) + tok.xkind = pxAmp + of '%': + Inc(L.bufpos) + tok.xkind = pxPer + of '@': + Inc(L.bufpos) + tok.xkind = pxCommand + else: scanCurlyComment(L, tok) + of '+': + tok.xkind = pxPlus + inc(L.bufpos) + of '-': + tok.xkind = pxMinus + inc(L.bufpos) + of ':': + inc(L.bufpos) + if L.buf[L.bufpos] == '=': + inc(L.bufpos) + tok.xkind = pxAsgn + else: + tok.xkind = pxColon + of '<': + inc(L.bufpos) + if L.buf[L.bufpos] == '>': + inc(L.bufpos) + tok.xkind = pxNeq + elif L.buf[L.bufpos] == '=': + inc(L.bufpos) + tok.xkind = pxLe + else: + tok.xkind = pxLt + of '>': + inc(L.bufpos) + if L.buf[L.bufpos] == '=': + inc(L.bufpos) + tok.xkind = pxGe + else: + tok.xkind = pxGt + of '=': + tok.xkind = pxEquals + inc(L.bufpos) + of '@': + tok.xkind = pxAt + inc(L.bufpos) + of '^': + tok.xkind = pxHat + inc(L.bufpos) + of '}': + tok.xkind = pxCurlyDirRi + Inc(L.bufpos) + of '\'', '#': + getString(L, tok) + of '$': + getNumber16(L, tok) + of '%': + getNumber2(L, tok) + of lexbase.EndOfFile: + tok.xkind = pxEof + else: + tok.literal = c & "" + tok.xkind = pxInvalid + lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')') + Inc(L.bufpos) |