diff options
Diffstat (limited to 'rod/pas2nim/paslex.nim')
-rwxr-xr-x | rod/pas2nim/paslex.nim | 544 |
1 files changed, 544 insertions, 0 deletions
diff --git a/rod/pas2nim/paslex.nim b/rod/pas2nim/paslex.nim new file mode 100755 index 000000000..9159c5de7 --- /dev/null +++ b/rod/pas2nim/paslex.nim @@ -0,0 +1,544 @@ +# +# +# Pas2nim - Pascal to Nimrod source converter +# (c) Copyright 2010 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +# This module implements a FreePascal scanner. This is an adaption from +# the scanner module. + +import + nhashes, options, msgs, strutils, platform, idents, lexbase + +const + MaxLineLength* = 80 # lines longer than this lead to a warning + numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'} + SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'} + SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'} + OpChars*: TCharSet = {'+', '-', '*', '/', '<', '>', '!', '?', '^', '.', '|', + '=', ':', '%', '&', '$', '@', '~', '\x80'..'\xFF'} + +# keywords are sorted! + +type + TPasTokKind* = enum + pxInvalid, pxEof, + pxAnd, pxArray, pxAs, pxAsm, pxBegin, pxCase, pxClass, pxConst, + pxConstructor, pxDestructor, pxDiv, pxDo, pxDownto, pxElse, pxEnd, pxExcept, + pxExports, pxFinalization, pxFinally, pxFor, pxFunction, pxGoto, pxIf, + pxImplementation, pxIn, pxInherited, pxInitialization, pxInline, + pxInterface, pxIs, pxLabel, pxLibrary, pxMod, pxNil, pxNot, pxObject, pxOf, + pxOr, pxOut, pxPacked, pxProcedure, pxProgram, pxProperty, pxRaise, + pxRecord, pxRepeat, pxResourcestring, pxSet, pxShl, pxShr, pxThen, + pxThreadvar, pxTo, pxTry, pxType, pxUnit, pxUntil, pxUses, pxVar, pxWhile, + pxWith, pxXor, + pxComment, # ordinary comment + pxCommand, # {@} + pxAmp, # {&} + pxPer, # {%} + pxStrLit, pxSymbol, # a symbol + pxIntLit, pxInt64Lit, # long constant like 0x70fffffff or out of int range + pxFloatLit, pxParLe, pxParRi, pxBracketLe, pxBracketRi, pxComma, + pxSemiColon, pxColon, # operators + pxAsgn, pxEquals, pxDot, pxDotDot, pxHat, pxPlus, pxMinus, pxStar, pxSlash, + pxLe, pxLt, pxGe, pxGt, pxNeq, pxAt, pxStarDirLe, pxStarDirRi, pxCurlyDirLe, + pxCurlyDirRi + TPasTokKinds* = set[TPasTokKind] + +const + Keywords = ["and", "array", "as", "asm", "begin", "case", "class", "const", + "constructor", "destructor", "div", "do", "downto", "else", "end", "except", + "exports", "finalization", "finally", "for", "function", "goto", "if", + "implementation", "in", "inherited", "initialization", "inline", + "interface", "is", "label", "library", "mod", "nil", "not", "object", "of", + "or", "out", "packed", "procedure", "program", "property", "raise", + "record", "repeat", "resourcestring", "set", "shl", "shr", "then", + "threadvar", "to", "try", "type", "unit", "until", "uses", "var", "while", + "with", "xor"] + + firstKeyword = pxAnd + lastKeyword = pxXor + +type + TPasTok* = object + xkind*: TPasTokKind # the type of the token + ident*: PIdent # the parsed identifier + iNumber*: BiggestInt # the parsed integer literal + fNumber*: BiggestFloat # the parsed floating point literal + base*: TNumericalBase # the numerical base; only valid for int + # or float literals + literal*: string # the parsed (string) literal + + TPasLex* = object + filename*: string + + +proc getPasTok*(L: var TPasLex, tok: var TPasTok) +proc PrintPasTok*(tok: TPasTok) +proc pasTokToStr*(tok: TPasTok): string +# implementation + +var dummyIdent: PIdent + +proc fillToken(L: var TToken) = + L.TokType = tkInvalid + L.iNumber = 0 + L.Indent = 0 + L.literal = "" + L.fNumber = 0.0 + L.base = base10 + L.ident = dummyIdent # this prevents many bugs! + +proc openLexer(lex: var TLexer, filename: string, inputstream: PLLStream) = + openBaseLexer(lex, inputstream) + lex.indentStack = @[0] + lex.filename = filename + lex.indentAhead = - 1 + +proc closeLexer(lex: var TLexer) = + inc(gLinesCompiled, lex.LineNumber) + closeBaseLexer(lex) + +proc getColumn(L: TLexer): int = + result = getColNumber(L, L.bufPos) + +proc getLineInfo(L: TLexer): TLineInfo = + result = newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos)) + +proc lexMessage(L: TLexer, msg: TMsgKind, arg = "") = + msgs.liMessage(getLineInfo(L), msg, arg) + +proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") = + var info = newLineInfo(L.filename, L.linenumber, pos - L.lineStart) + msgs.liMessage(info, msg, arg) + +proc binaryStrSearch(x: openarray[string], y: string): int = + var a = 0 + var b = len(x) + while a < b: + var mid = (a + b) div 2 + if x[mid] < y: a = mid + 1 + else: b = mid + if a < len(x) and x[a] == y: result = a + else: result = -1 + +proc pastokToStr(tok: TPasTok): string = + case tok.xkind + of pxIntLit, pxInt64Lit: result = $(tok.iNumber) + of pxFloatLit: result = $(tok.fNumber) + of pxInvalid, pxComment..pxStrLit: result = tok.literal + else: + if (tok.ident.s != ""): result = tok.ident.s + else: result = pasTokKindToStr[tok.xkind] + +proc PrintPasTok(tok: TPasTok) = + write(stdout, pasTokKindToStr[tok.xkind]) + write(stdout, ' ') + writeln(stdout, pastokToStr(tok)) + +proc setKeyword(L: var TPasLex, tok: var TPasTok) = + var x = binaryStrSearch(keywords, toLower(tok.ident.s)) + if x < 0: tok.xkind = pxSymbol + else: tok.xKind = TPasTokKind(x + ord(firstKeyword)) + +proc matchUnderscoreChars(L: var TPasLex, tok: var TPasTok, chars: TCharSet) = + # matches ([chars]_)* + var pos = L.bufpos # use registers for pos, buf + var buf = L.buf + while true: + if buf[pos] in chars: + add(tok.literal, buf[pos]) + Inc(pos) + else: + break + if buf[pos] == '_': + add(tok.literal, '_') + Inc(pos) + L.bufPos = pos + +proc isFloatLiteral(s: string): bool = + for i in countup(0, len(s)-1): + if s[i] in {'.', 'e', 'E'}: + return true + +proc getNumber2(L: var TPasLex, tok: var TPasTok) = + var pos = L.bufpos + 1 # skip % + if not (L.buf[pos] in {'0'..'1'}): + # BUGFIX for %date% + tok.xkind = pxInvalid + add(tok.literal, '%') + inc(L.bufpos) + return + tok.base = base2 + var xi: biggestInt = 0 + var bits = 0 + while true: + case L.buf[pos] + of 'A'..'Z', 'a'..'z', '2'..'9', '.': + lexMessage(L, errInvalidNumber) + inc(pos) + of '_': + inc(pos) + of '0', '1': + xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + inc(bits) + else: break + tok.iNumber = xi + if (bits > 32): tok.xkind = pxInt64Lit + else: tok.xkind = pxIntLit + L.bufpos = pos + +proc getNumber16(L: var TPasLex, tok: var TPasTok) = + var pos = L.bufpos + 1 # skip $ + tok.base = base16 + var xi: biggestInt = 0 + var bits = 0 + while true: + case L.buf[pos] + of 'G'..'Z', 'g'..'z', '.': + lexMessage(L, errInvalidNumber) + inc(pos) + of '_': inc(pos) + of '0'..'9': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + inc(bits, 4) + of 'a'..'f': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10) + inc(pos) + inc(bits, 4) + of 'A'..'F': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) + inc(pos) + inc(bits, 4) + else: break + tok.iNumber = xi + if (bits > 32): + tok.xkind = pxInt64Lit + else: + tok.xkind = pxIntLit + L.bufpos = pos + +proc getNumber10(L: var TPasLex, tok: var TPasTok) = + tok.base = base10 + matchUnderscoreChars(L, tok, {'0'..'9'}) + if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): + add(tok.literal, '.') + inc(L.bufpos) + matchUnderscoreChars(L, tok, {'e', 'E', '+', '-', '0'..'9'}) + try: + if isFloatLiteral(tok.literal): + tok.fnumber = parseFloat(tok.literal) + tok.xkind = pxFloatLit + else: + tok.iNumber = ParseInt(tok.literal) + if (tok.iNumber < low(int32)) or (tok.iNumber > high(int32)): + tok.xkind = pxInt64Lit + else: + tok.xkind = pxIntLit + except EInvalidValue: + lexMessage(L, errInvalidNumber, tok.literal) + except EOverflow: + lexMessage(L, errNumberOutOfRange, tok.literal) + +proc HandleCRLF(L: var TLexer, pos: int): int = + case L.buf[pos] + of CR: result = lexbase.HandleCR(L, pos) + of LF: result = lexbase.HandleLF(L, pos) + else: result = pos + +proc getString(L: var TPasLex, tok: var TPasTok) = + var xi: int + var pos = L.bufPos + var buf = L.buf + while true: + if buf[pos] == '\'': + inc(pos) + while true: + case buf[pos] + of CR, LF, lexbase.EndOfFile: + lexMessage(L, errClosingQuoteExpected) + break + of '\'': + inc(pos) + if buf[pos] == '\'': + inc(pos) + add(tok.literal, '\'') + else: + break + else: + add(tok.literal, buf[pos]) + inc(pos) + elif buf[pos] == '#': + inc(pos) + xi = 0 + case buf[pos] + of '$': + inc(pos) + xi = 0 + while true: + case buf[pos] + of '0'..'9': xi = (xi shl 4) or (ord(buf[pos]) - ord('0')) + of 'a'..'f': xi = (xi shl 4) or (ord(buf[pos]) - ord('a') + 10) + of 'A'..'F': xi = (xi shl 4) or (ord(buf[pos]) - ord('A') + 10) + else: break + inc(pos) + of '0'..'9': + xi = 0 + while buf[pos] in {'0'..'9'}: + xi = (xi * 10) + (ord(buf[pos]) - ord('0')) + inc(pos) + else: lexMessage(L, errInvalidCharacterConstant) + if (xi <= 255): add(tok.literal, Chr(xi)) + else: lexMessage(L, errInvalidCharacterConstant) + else: + break + tok.xkind = pxStrLit + L.bufpos = pos + +proc getSymbol(L: var TPasLex, tok: var TPasTok) = + var h: THash = 0 + var pos = L.bufpos + var buf = L.buf + while true: + var c = buf[pos] + case c + of 'a'..'z', '0'..'9', '\x80'..'\xFF': + h = h +% Ord(c) + h = h +% h shl 10 + h = h xor (h shr 6) + of 'A'..'Z': + c = chr(ord(c) + (ord('a') - ord('A'))) # toLower() + h = h +% Ord(c) + h = h +% h shl 10 + h = h xor (h shr 6) + of '_': nil + else: break + Inc(pos) + h = h +% h shl 3 + h = h xor (h shr 11) + h = h +% h shl 15 + tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h) + L.bufpos = pos + setKeyword(L, tok) + +proc scanLineComment(L: var TPasLex, tok: var TPasTok) = + var pos = L.bufpos + var buf = L.buf + # a comment ends if the next line does not start with the // on the same + # column after only whitespace + tok.xkind = pxComment + var col = getColNumber(L, pos) + while true: + inc(pos, 2) # skip // + add(tok.literal, '#') + while not (buf[pos] in {CR, LF, lexbase.EndOfFile}): + add(tok.literal, buf[pos]) + inc(pos) + pos = handleCRLF(L, pos) + buf = L.buf + var indent = 0 + while buf[pos] == ' ': + inc(pos) + inc(indent) + if (col == indent) and (buf[pos] == '/') and (buf[pos + 1] == '/'): + tok.literal = tok.literal & "\n" + else: + break + L.bufpos = pos + +proc scanCurlyComment(L: var TPasLex, tok: var TPasTok) = + var pos = L.bufpos + var buf = L.buf + tok.literal = "#" + tok.xkind = pxComment + while true: + case buf[pos] + of CR, LF: + pos = HandleCRLF(L, pos) + buf = L.buf + add(tok.literal, "\n#") + of '}': + inc(pos) + break + of lexbase.EndOfFile: lexMessage(L, errTokenExpected, "}") + else: + add(tok.literal, buf[pos]) + inc(pos) + L.bufpos = pos + +proc scanStarComment(L: var TPasLex, tok: var TPasTok) = + var pos = L.bufpos + var buf = L.buf + tok.literal = "#" + tok.xkind = pxComment + while true: + case buf[pos] + of CR, LF: + pos = HandleCRLF(L, pos) + buf = L.buf + add(tok.literal, "\n#") + of '*': + inc(pos) + if buf[pos] == ')': + inc(pos) + break + else: + add(tok.literal, '*') + of lexbase.EndOfFile: + lexMessage(L, errTokenExpected, "*)") + else: + add(tok.literal, buf[pos]) + inc(pos) + L.bufpos = pos + +proc skip(L: var TPasLex, tok: var TPasTok) = + var pos = L.bufpos + var buf = L.buf + while true: + case buf[pos] + of ' ', Tabulator: + Inc(pos) # newline is special: + of CR, LF: + pos = HandleCRLF(L, pos) + buf = L.buf + else: + break # EndOfFile also leaves the loop + L.bufpos = pos + +proc getPasTok(L: var TPasLex, tok: var TPasTok) = + tok.xkind = pxInvalid + fillToken(tok) + skip(L, tok) + var c = L.buf[L.bufpos] + if c in SymStartChars: + getSymbol(L, tok) + elif c in {'0'..'9'}: + getNumber10(L, tok) + else: + case c + of ';': + tok.xkind = pxSemicolon + Inc(L.bufpos) + of '/': + if L.buf[L.bufpos + 1] == '/': + scanLineComment(L, tok) + else: + tok.xkind = pxSlash + inc(L.bufpos) + of ',': + tok.xkind = pxComma + Inc(L.bufpos) + of '(': + Inc(L.bufpos) + if (L.buf[L.bufPos] == '*'): + if (L.buf[L.bufPos + 1] == '$'): + Inc(L.bufpos, 2) + skip(L, tok) + getSymbol(L, tok) + tok.xkind = pxStarDirLe + else: + inc(L.bufpos) + scanStarComment(L, tok) + else: + tok.xkind = pxParLe + of '*': + inc(L.bufpos) + if L.buf[L.bufpos] == ')': + inc(L.bufpos) + tok.xkind = pxStarDirRi + else: + tok.xkind = pxStar + of ')': + tok.xkind = pxParRi + Inc(L.bufpos) + of '[': + Inc(L.bufpos) + tok.xkind = pxBracketLe + of ']': + Inc(L.bufpos) + tok.xkind = pxBracketRi + of '.': + inc(L.bufpos) + if L.buf[L.bufpos] == '.': + tok.xkind = pxDotDot + inc(L.bufpos) + else: + tok.xkind = pxDot + of '{': + Inc(L.bufpos) + case L.buf[L.bufpos] + of '$': + Inc(L.bufpos) + skip(L, tok) + getSymbol(L, tok) + tok.xkind = pxCurlyDirLe + of '&': + Inc(L.bufpos) + tok.xkind = pxAmp + of '%': + Inc(L.bufpos) + tok.xkind = pxPer + of '@': + Inc(L.bufpos) + tok.xkind = pxCommand + else: scanCurlyComment(L, tok) + of '+': + tok.xkind = pxPlus + inc(L.bufpos) + of '-': + tok.xkind = pxMinus + inc(L.bufpos) + of ':': + inc(L.bufpos) + if L.buf[L.bufpos] == '=': + inc(L.bufpos) + tok.xkind = pxAsgn + else: + tok.xkind = pxColon + of '<': + inc(L.bufpos) + if L.buf[L.bufpos] == '>': + inc(L.bufpos) + tok.xkind = pxNeq + elif L.buf[L.bufpos] == '=': + inc(L.bufpos) + tok.xkind = pxLe + else: + tok.xkind = pxLt + of '>': + inc(L.bufpos) + if L.buf[L.bufpos] == '=': + inc(L.bufpos) + tok.xkind = pxGe + else: + tok.xkind = pxGt + of '=': + tok.xkind = pxEquals + inc(L.bufpos) + of '@': + tok.xkind = pxAt + inc(L.bufpos) + of '^': + tok.xkind = pxHat + inc(L.bufpos) + of '}': + tok.xkind = pxCurlyDirRi + Inc(L.bufpos) + of '\'', '#': + getString(L, tok) + of '$': + getNumber16(L, tok) + of '%': + getNumber2(L, tok) + of lexbase.EndOfFile: + tok.xkind = pxEof + else: + tok.literal = c & "" + tok.xkind = pxInvalid + lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')') + Inc(L.bufpos) |