summary refs log tree commit diff stats
path: root/rod/pas2nim/paslex.nim
diff options
context:
space:
mode:
Diffstat (limited to 'rod/pas2nim/paslex.nim')
-rwxr-xr-xrod/pas2nim/paslex.nim544
1 files changed, 544 insertions, 0 deletions
diff --git a/rod/pas2nim/paslex.nim b/rod/pas2nim/paslex.nim
new file mode 100755
index 000000000..9159c5de7
--- /dev/null
+++ b/rod/pas2nim/paslex.nim
@@ -0,0 +1,544 @@
+#
+#
+#      Pas2nim - Pascal to Nimrod source converter
+#        (c) Copyright 2010 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+# This module implements a FreePascal scanner. This is an adaption from
+# the scanner module.
+
+import 
+  nhashes, options, msgs, strutils, platform, idents, lexbase
+
+const 
+  MaxLineLength* = 80         # lines longer than this lead to a warning
+  numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'} 
+  SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
+  SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
+  OpChars*: TCharSet = {'+', '-', '*', '/', '<', '>', '!', '?', '^', '.', '|', 
+    '=', ':', '%', '&', '$', '@', '~', '\x80'..'\xFF'}
+
+# keywords are sorted!
+
+type
+  TPasTokKind* = enum 
+    pxInvalid, pxEof,         
+    pxAnd, pxArray, pxAs, pxAsm, pxBegin, pxCase, pxClass, pxConst, 
+    pxConstructor, pxDestructor, pxDiv, pxDo, pxDownto, pxElse, pxEnd, pxExcept, 
+    pxExports, pxFinalization, pxFinally, pxFor, pxFunction, pxGoto, pxIf, 
+    pxImplementation, pxIn, pxInherited, pxInitialization, pxInline, 
+    pxInterface, pxIs, pxLabel, pxLibrary, pxMod, pxNil, pxNot, pxObject, pxOf, 
+    pxOr, pxOut, pxPacked, pxProcedure, pxProgram, pxProperty, pxRaise, 
+    pxRecord, pxRepeat, pxResourcestring, pxSet, pxShl, pxShr, pxThen, 
+    pxThreadvar, pxTo, pxTry, pxType, pxUnit, pxUntil, pxUses, pxVar, pxWhile, 
+    pxWith, pxXor,
+    pxComment,                # ordinary comment
+    pxCommand,                # {@}
+    pxAmp,                    # {&}
+    pxPer,                    # {%}
+    pxStrLit, pxSymbol,       # a symbol
+    pxIntLit, pxInt64Lit, # long constant like 0x70fffffff or out of int range
+    pxFloatLit, pxParLe, pxParRi, pxBracketLe, pxBracketRi, pxComma, 
+    pxSemiColon, pxColon,     # operators
+    pxAsgn, pxEquals, pxDot, pxDotDot, pxHat, pxPlus, pxMinus, pxStar, pxSlash, 
+    pxLe, pxLt, pxGe, pxGt, pxNeq, pxAt, pxStarDirLe, pxStarDirRi, pxCurlyDirLe, 
+    pxCurlyDirRi
+  TPasTokKinds* = set[TPasTokKind]
+
+const 
+  Keywords = ["and", "array", "as", "asm", "begin", "case", "class", "const", 
+    "constructor", "destructor", "div", "do", "downto", "else", "end", "except", 
+    "exports", "finalization", "finally", "for", "function", "goto", "if", 
+    "implementation", "in", "inherited", "initialization", "inline", 
+    "interface", "is", "label", "library", "mod", "nil", "not", "object", "of", 
+    "or", "out", "packed", "procedure", "program", "property", "raise", 
+    "record", "repeat", "resourcestring", "set", "shl", "shr", "then", 
+    "threadvar", "to", "try", "type", "unit", "until", "uses", "var", "while", 
+    "with", "xor"]
+
+  firstKeyword = pxAnd
+  lastKeyword = pxXor
+
+type 
+  TPasTok* = object
+    xkind*: TPasTokKind       # the type of the token
+    ident*: PIdent            # the parsed identifier
+    iNumber*: BiggestInt      # the parsed integer literal
+    fNumber*: BiggestFloat    # the parsed floating point literal
+    base*: TNumericalBase     # the numerical base; only valid for int
+                              # or float literals
+    literal*: string          # the parsed (string) literal
+  
+  TPasLex* = object 
+    filename*: string
+  
+
+proc getPasTok*(L: var TPasLex, tok: var TPasTok)
+proc PrintPasTok*(tok: TPasTok)
+proc pasTokToStr*(tok: TPasTok): string
+# implementation
+
+var dummyIdent: PIdent
+
+proc fillToken(L: var TToken) = 
+  L.TokType = tkInvalid
+  L.iNumber = 0
+  L.Indent = 0
+  L.literal = ""
+  L.fNumber = 0.0
+  L.base = base10
+  L.ident = dummyIdent        # this prevents many bugs!
+  
+proc openLexer(lex: var TLexer, filename: string, inputstream: PLLStream) = 
+  openBaseLexer(lex, inputstream)
+  lex.indentStack = @[0]
+  lex.filename = filename
+  lex.indentAhead = - 1
+
+proc closeLexer(lex: var TLexer) = 
+  inc(gLinesCompiled, lex.LineNumber)
+  closeBaseLexer(lex)
+
+proc getColumn(L: TLexer): int = 
+  result = getColNumber(L, L.bufPos)
+
+proc getLineInfo(L: TLexer): TLineInfo = 
+  result = newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos))
+
+proc lexMessage(L: TLexer, msg: TMsgKind, arg = "") = 
+  msgs.liMessage(getLineInfo(L), msg, arg)
+
+proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") = 
+  var info = newLineInfo(L.filename, L.linenumber, pos - L.lineStart)
+  msgs.liMessage(info, msg, arg)
+
+proc binaryStrSearch(x: openarray[string], y: string): int =
+  var a = 0
+  var b = len(x)
+  while a < b:
+     var mid = (a + b) div 2
+     if x[mid] < y: a = mid + 1
+     else: b = mid
+  if a < len(x) and x[a] == y: result = a
+  else: result = -1
+
+proc pastokToStr(tok: TPasTok): string = 
+  case tok.xkind
+  of pxIntLit, pxInt64Lit: result = $(tok.iNumber)
+  of pxFloatLit: result = $(tok.fNumber)
+  of pxInvalid, pxComment..pxStrLit: result = tok.literal
+  else: 
+    if (tok.ident.s != ""): result = tok.ident.s
+    else: result = pasTokKindToStr[tok.xkind]
+  
+proc PrintPasTok(tok: TPasTok) = 
+  write(stdout, pasTokKindToStr[tok.xkind])
+  write(stdout, ' ')
+  writeln(stdout, pastokToStr(tok))
+
+proc setKeyword(L: var TPasLex, tok: var TPasTok) = 
+  var x = binaryStrSearch(keywords, toLower(tok.ident.s))
+  if x < 0: tok.xkind = pxSymbol
+  else: tok.xKind = TPasTokKind(x + ord(firstKeyword))
+  
+proc matchUnderscoreChars(L: var TPasLex, tok: var TPasTok, chars: TCharSet) = 
+  # matches ([chars]_)*
+  var pos = L.bufpos              # use registers for pos, buf
+  var buf = L.buf
+  while true: 
+    if buf[pos] in chars: 
+      add(tok.literal, buf[pos])
+      Inc(pos)
+    else: 
+      break 
+    if buf[pos] == '_': 
+      add(tok.literal, '_')
+      Inc(pos)
+  L.bufPos = pos
+
+proc isFloatLiteral(s: string): bool = 
+  for i in countup(0, len(s)-1): 
+    if s[i] in {'.', 'e', 'E'}: 
+      return true
+
+proc getNumber2(L: var TPasLex, tok: var TPasTok) = 
+  var pos = L.bufpos + 1 # skip %
+  if not (L.buf[pos] in {'0'..'1'}): 
+    # BUGFIX for %date%
+    tok.xkind = pxInvalid
+    add(tok.literal, '%')
+    inc(L.bufpos)
+    return 
+  tok.base = base2
+  var xi: biggestInt = 0
+  var bits = 0
+  while true: 
+    case L.buf[pos]
+    of 'A'..'Z', 'a'..'z', '2'..'9', '.': 
+      lexMessage(L, errInvalidNumber)
+      inc(pos)
+    of '_': 
+      inc(pos)
+    of '0', '1': 
+      xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
+      inc(pos)
+      inc(bits)
+    else: break 
+  tok.iNumber = xi
+  if (bits > 32): tok.xkind = pxInt64Lit
+  else: tok.xkind = pxIntLit
+  L.bufpos = pos
+
+proc getNumber16(L: var TPasLex, tok: var TPasTok) = 
+  var pos = L.bufpos + 1          # skip $
+  tok.base = base16
+  var xi: biggestInt = 0
+  var bits = 0
+  while true: 
+    case L.buf[pos]
+    of 'G'..'Z', 'g'..'z', '.': 
+      lexMessage(L, errInvalidNumber)
+      inc(pos)
+    of '_': inc(pos)
+    of '0'..'9': 
+      xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
+      inc(pos)
+      inc(bits, 4)
+    of 'a'..'f': 
+      xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
+      inc(pos)
+      inc(bits, 4)
+    of 'A'..'F': 
+      xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
+      inc(pos)
+      inc(bits, 4)
+    else: break 
+  tok.iNumber = xi
+  if (bits > 32): 
+    tok.xkind = pxInt64Lit
+  else: 
+    tok.xkind = pxIntLit
+  L.bufpos = pos
+
+proc getNumber10(L: var TPasLex, tok: var TPasTok) = 
+  tok.base = base10
+  matchUnderscoreChars(L, tok, {'0'..'9'})
+  if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): 
+    add(tok.literal, '.')
+    inc(L.bufpos)
+    matchUnderscoreChars(L, tok, {'e', 'E', '+', '-', '0'..'9'})
+  try: 
+    if isFloatLiteral(tok.literal): 
+      tok.fnumber = parseFloat(tok.literal)
+      tok.xkind = pxFloatLit
+    else: 
+      tok.iNumber = ParseInt(tok.literal)
+      if (tok.iNumber < low(int32)) or (tok.iNumber > high(int32)): 
+        tok.xkind = pxInt64Lit
+      else: 
+        tok.xkind = pxIntLit
+  except EInvalidValue: 
+    lexMessage(L, errInvalidNumber, tok.literal)
+  except EOverflow: 
+    lexMessage(L, errNumberOutOfRange, tok.literal)
+  
+proc HandleCRLF(L: var TLexer, pos: int): int = 
+  case L.buf[pos]
+  of CR: result = lexbase.HandleCR(L, pos)
+  of LF: result = lexbase.HandleLF(L, pos)
+  else: result = pos
+  
+proc getString(L: var TPasLex, tok: var TPasTok) = 
+  var xi: int
+  var pos = L.bufPos
+  var buf = L.buf
+  while true: 
+    if buf[pos] == '\'': 
+      inc(pos)
+      while true: 
+        case buf[pos]
+        of CR, LF, lexbase.EndOfFile: 
+          lexMessage(L, errClosingQuoteExpected)
+          break 
+        of '\'': 
+          inc(pos)
+          if buf[pos] == '\'': 
+            inc(pos)
+            add(tok.literal, '\'')
+          else: 
+            break 
+        else: 
+          add(tok.literal, buf[pos])
+          inc(pos)
+    elif buf[pos] == '#': 
+      inc(pos)
+      xi = 0
+      case buf[pos]
+      of '$': 
+        inc(pos)
+        xi = 0
+        while true: 
+          case buf[pos]
+          of '0'..'9': xi = (xi shl 4) or (ord(buf[pos]) - ord('0'))
+          of 'a'..'f': xi = (xi shl 4) or (ord(buf[pos]) - ord('a') + 10)
+          of 'A'..'F': xi = (xi shl 4) or (ord(buf[pos]) - ord('A') + 10)
+          else: break 
+          inc(pos)
+      of '0'..'9': 
+        xi = 0
+        while buf[pos] in {'0'..'9'}: 
+          xi = (xi * 10) + (ord(buf[pos]) - ord('0'))
+          inc(pos)
+      else: lexMessage(L, errInvalidCharacterConstant)
+      if (xi <= 255): add(tok.literal, Chr(xi))
+      else: lexMessage(L, errInvalidCharacterConstant)
+    else: 
+      break 
+  tok.xkind = pxStrLit
+  L.bufpos = pos
+
+proc getSymbol(L: var TPasLex, tok: var TPasTok) = 
+  var h: THash = 0
+  var pos = L.bufpos
+  var buf = L.buf
+  while true: 
+    var c = buf[pos]
+    case c
+    of 'a'..'z', '0'..'9', '\x80'..'\xFF': 
+      h = h +% Ord(c)
+      h = h +% h shl 10
+      h = h xor (h shr 6)
+    of 'A'..'Z': 
+      c = chr(ord(c) + (ord('a') - ord('A'))) # toLower()
+      h = h +% Ord(c)
+      h = h +% h shl 10
+      h = h xor (h shr 6)
+    of '_': nil
+    else: break 
+    Inc(pos)
+  h = h +% h shl 3
+  h = h xor (h shr 11)
+  h = h +% h shl 15
+  tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
+  L.bufpos = pos
+  setKeyword(L, tok)
+
+proc scanLineComment(L: var TPasLex, tok: var TPasTok) = 
+  var pos = L.bufpos
+  var buf = L.buf 
+  # a comment ends if the next line does not start with the // on the same
+  # column after only whitespace
+  tok.xkind = pxComment
+  var col = getColNumber(L, pos)
+  while true: 
+    inc(pos, 2)               # skip //
+    add(tok.literal, '#')
+    while not (buf[pos] in {CR, LF, lexbase.EndOfFile}): 
+      add(tok.literal, buf[pos])
+      inc(pos)
+    pos = handleCRLF(L, pos)
+    buf = L.buf
+    var indent = 0
+    while buf[pos] == ' ': 
+      inc(pos)
+      inc(indent)
+    if (col == indent) and (buf[pos] == '/') and (buf[pos + 1] == '/'): 
+      tok.literal = tok.literal & "\n"
+    else: 
+      break 
+  L.bufpos = pos
+
+proc scanCurlyComment(L: var TPasLex, tok: var TPasTok) = 
+  var pos = L.bufpos
+  var buf = L.buf
+  tok.literal = "#"
+  tok.xkind = pxComment
+  while true: 
+    case buf[pos]
+    of CR, LF: 
+      pos = HandleCRLF(L, pos)
+      buf = L.buf
+      add(tok.literal, "\n#")
+    of '}': 
+      inc(pos)
+      break 
+    of lexbase.EndOfFile: lexMessage(L, errTokenExpected, "}")
+    else: 
+      add(tok.literal, buf[pos])
+      inc(pos)
+  L.bufpos = pos
+
+proc scanStarComment(L: var TPasLex, tok: var TPasTok) = 
+  var pos = L.bufpos
+  var buf = L.buf
+  tok.literal = "#"
+  tok.xkind = pxComment
+  while true: 
+    case buf[pos]
+    of CR, LF: 
+      pos = HandleCRLF(L, pos)
+      buf = L.buf
+      add(tok.literal, "\n#")
+    of '*': 
+      inc(pos)
+      if buf[pos] == ')': 
+        inc(pos)
+        break 
+      else: 
+        add(tok.literal, '*')
+    of lexbase.EndOfFile: 
+      lexMessage(L, errTokenExpected, "*)")
+    else: 
+      add(tok.literal, buf[pos])
+      inc(pos)
+  L.bufpos = pos
+
+proc skip(L: var TPasLex, tok: var TPasTok) = 
+  var pos = L.bufpos
+  var buf = L.buf
+  while true: 
+    case buf[pos]
+    of ' ', Tabulator: 
+      Inc(pos)                # newline is special:
+    of CR, LF: 
+      pos = HandleCRLF(L, pos)
+      buf = L.buf
+    else: 
+      break                   # EndOfFile also leaves the loop
+  L.bufpos = pos
+
+proc getPasTok(L: var TPasLex, tok: var TPasTok) = 
+  tok.xkind = pxInvalid
+  fillToken(tok)
+  skip(L, tok)
+  var c = L.buf[L.bufpos]
+  if c in SymStartChars: 
+    getSymbol(L, tok)
+  elif c in {'0'..'9'}: 
+    getNumber10(L, tok)
+  else: 
+    case c
+    of ';': 
+      tok.xkind = pxSemicolon
+      Inc(L.bufpos)
+    of '/': 
+      if L.buf[L.bufpos + 1] == '/': 
+        scanLineComment(L, tok)
+      else: 
+        tok.xkind = pxSlash
+        inc(L.bufpos)
+    of ',': 
+      tok.xkind = pxComma
+      Inc(L.bufpos)
+    of '(': 
+      Inc(L.bufpos)
+      if (L.buf[L.bufPos] == '*'): 
+        if (L.buf[L.bufPos + 1] == '$'): 
+          Inc(L.bufpos, 2)
+          skip(L, tok)
+          getSymbol(L, tok)
+          tok.xkind = pxStarDirLe
+        else: 
+          inc(L.bufpos)
+          scanStarComment(L, tok)
+      else: 
+        tok.xkind = pxParLe
+    of '*': 
+      inc(L.bufpos)
+      if L.buf[L.bufpos] == ')': 
+        inc(L.bufpos)
+        tok.xkind = pxStarDirRi
+      else: 
+        tok.xkind = pxStar
+    of ')': 
+      tok.xkind = pxParRi
+      Inc(L.bufpos)
+    of '[': 
+      Inc(L.bufpos)
+      tok.xkind = pxBracketLe
+    of ']': 
+      Inc(L.bufpos)
+      tok.xkind = pxBracketRi
+    of '.': 
+      inc(L.bufpos)
+      if L.buf[L.bufpos] == '.': 
+        tok.xkind = pxDotDot
+        inc(L.bufpos)
+      else: 
+        tok.xkind = pxDot
+    of '{': 
+      Inc(L.bufpos)
+      case L.buf[L.bufpos]
+      of '$': 
+        Inc(L.bufpos)
+        skip(L, tok)
+        getSymbol(L, tok)
+        tok.xkind = pxCurlyDirLe
+      of '&': 
+        Inc(L.bufpos)
+        tok.xkind = pxAmp
+      of '%': 
+        Inc(L.bufpos)
+        tok.xkind = pxPer
+      of '@': 
+        Inc(L.bufpos)
+        tok.xkind = pxCommand
+      else: scanCurlyComment(L, tok)
+    of '+': 
+      tok.xkind = pxPlus
+      inc(L.bufpos)
+    of '-': 
+      tok.xkind = pxMinus
+      inc(L.bufpos)
+    of ':': 
+      inc(L.bufpos)
+      if L.buf[L.bufpos] == '=': 
+        inc(L.bufpos)
+        tok.xkind = pxAsgn
+      else: 
+        tok.xkind = pxColon
+    of '<': 
+      inc(L.bufpos)
+      if L.buf[L.bufpos] == '>': 
+        inc(L.bufpos)
+        tok.xkind = pxNeq
+      elif L.buf[L.bufpos] == '=': 
+        inc(L.bufpos)
+        tok.xkind = pxLe
+      else: 
+        tok.xkind = pxLt
+    of '>': 
+      inc(L.bufpos)
+      if L.buf[L.bufpos] == '=': 
+        inc(L.bufpos)
+        tok.xkind = pxGe
+      else: 
+        tok.xkind = pxGt
+    of '=': 
+      tok.xkind = pxEquals
+      inc(L.bufpos)
+    of '@': 
+      tok.xkind = pxAt
+      inc(L.bufpos)
+    of '^': 
+      tok.xkind = pxHat
+      inc(L.bufpos)
+    of '}': 
+      tok.xkind = pxCurlyDirRi
+      Inc(L.bufpos)
+    of '\'', '#': 
+      getString(L, tok)
+    of '$': 
+      getNumber16(L, tok)
+    of '%': 
+      getNumber2(L, tok)
+    of lexbase.EndOfFile: 
+      tok.xkind = pxEof
+    else: 
+      tok.literal = c & ""
+      tok.xkind = pxInvalid
+      lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')')
+      Inc(L.bufpos)