diff options
author | Oscar Campbell <oscar@campbell.nu> | 2015-05-31 02:55:06 +0200 |
---|---|---|
committer | Oscar Campbell <oscar@campbell.nu> | 2015-05-31 02:55:06 +0200 |
commit | 6a43b0e81721919d864ce053e0c0c624ba8c0127 (patch) | |
tree | 2cf049fad17c65735705edeed85585133c78b17d /compiler/lexer.nim | |
parent | 49b953533a05c984524daec359dbfb692a692eab (diff) | |
download | Nim-6a43b0e81721919d864ce053e0c0c624ba8c0127.tar.gz |
Clean up to compiler style. Refine error-msg for illegal octal 'O'
Diffstat (limited to 'compiler/lexer.nim')
-rw-r--r-- | compiler/lexer.nim | 188 |
1 file changed, 64 insertions, 124 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim index bdbaa79b1..e1615ac36 100644 --- a/compiler/lexer.nim +++ b/compiler/lexer.nim @@ -229,54 +229,6 @@ proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") = var info = newLineInfo(L.fileIdx, L.lineNumber, pos - L.lineStart) L.dispMessage(info, msg, arg) -proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: set[char]) = - var pos = L.bufpos # use registers for pos, buf - var buf = L.buf - while true: - if buf[pos] in chars: - add(tok.literal, buf[pos]) - inc(pos) - else: - break - if buf[pos] == '_': - if buf[pos+1] notin chars: - lexMessage(L, errInvalidToken, "_") # TODO/ozra - lift out for better msg - break - add(tok.literal, '_') - inc(pos) - L.bufpos = pos - -# Used for getting human friendlier err messages. Also only used in getNumber -proc matchAllLiteralishForMessage(L: var TLexer, tok: var TToken, startpos: int) = - # Note: <the erroneous 'O' in the set is intentional - const literalishChars = { 'A'..'F', 'a'..'f', '0'..'9', - 'X', 'x', 'o', 'O', 'c', 'C', 'b', 'B', - '_', '.', '\'', 'd', 'i', 'u' - } - - tok.literal = "" # Start over fresh for the "message literal" - var msgPos = L.bufpos - L.bufpos = startpos # Use L.bufpos as positioner because of matchUnderscore... 
- - # We use matchUnderscoreChars - but abuse it a little, passing '_' - matchUnderscoreChars(L, tok, literalishChars) - - # We must verify +/- specifically so that we're not past the literal - if L.buf[L.bufpos] in {'+', '-'} and - L.buf[L.bufpos - 1] in {'e', 'E'}: - add(tok.literal, L.buf[L.bufpos]) - inc(L.bufpos) - matchUnderscoreChars(L, tok, literalishChars) - - if L.buf[L.bufpos] in { - '\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U' - }: - inc(L.bufpos) - add(tok.literal, L.buf[L.bufpos]) - matchUnderscoreChars(L, tok, {'0'..'9'}) - - L.bufpos = msgPos - proc matchTwoChars(L: TLexer, first: char, second: set[char]): bool = result = (L.buf[L.bufpos] == first) and (L.buf[L.bufpos + 1] in second) @@ -308,99 +260,104 @@ template eatChar(L: var TLexer, t: var TToken) = add(t.literal, L.buf[L.bufpos]) inc(L.bufpos) -# ###### ######## ######## ## ## ## ## ## ## -# ## ## ## ## ### ## ## ## ### ### -# ## ## ## #### ## ## ## #### #### -# ## #### ###### ## ## ## ## ## ## ## ### ## -# ## ## ## ## ## #### ## ## ## ## -# ## ## ## ## ## ### ## ## ## ## -# ###### ######## ## ## ## ####### ## ## - proc getNumber(L: var TLexer): TToken = var startpos, endpos: int xi: BiggestInt + const literalishChars = { 'A'..'F', 'a'..'f', '0'..'9', 'X', 'x', 'o', 'c', + 'C', 'b', 'B', '_', '.', '\''} + const literalishCharsNoDot = { 'A'..'F', 'a'..'f', '0'..'9', 'X', 'x', 'o', + 'c', 'C', 'b', 'B', '_', '\''} + + proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: set[char]) = + var pos = L.bufpos # use registers for pos, buf + var buf = L.buf + while true: + if buf[pos] in chars: + add(tok.literal, buf[pos]) + inc(pos) + else: + break + if buf[pos] == '_': + if buf[pos+1] notin chars: + lexMessage(L, errInvalidToken, "_") + break + add(tok.literal, '_') + inc(pos) + L.bufpos = pos - const literalishChars = { 'A'..'F', 'a'..'f', '0'..'9', - 'X', 'x', 'o', 'c', 'C', 'b', 'B', - '_', '.', '\'' - } - const literalishCharsNoDot = { 'A'..'F', 'a'..'f', '0'..'9', - 'X', 'x', 'o', 
'c', 'C', 'b', 'B', - '_', '\'' - } + proc matchChars(L: var TLexer, tok: var TToken, chars: set[char]) = + var pos = L.bufpos # use registers for pos, buf + var buf = L.buf + while buf[pos] in chars: + add(tok.literal, buf[pos]) + inc(pos) + L.bufpos = pos - proc lexLiteralNumberMessage(L: var TLexer, msg: TMsgKind, startpos: int) = + proc lexMessageLitNum(L: var TLexer, msg: TMsgKind, startpos: int) = + # Used to get slightly human friendlier err messages. + # Note: the erroneous 'O' char in the character set is intentional + const literalishChars = {'A'..'F', 'a'..'f', '0'..'9', 'X', 'x', 'o', 'O', + 'c', 'C', 'b', 'B', '_', '.', '\'', 'd', 'i', 'u'} + var msgPos = L.bufpos var t: TToken - matchAllLiteralishForMessage(L, t, startpos) + t.literal = "" + L.bufpos = startpos # Use L.bufpos as pos because of matchChars + matchChars(L, t, literalishChars) + # We must verify +/- specifically so that we're not past the literal + if L.buf[L.bufpos] in {'+', '-'} and + L.buf[L.bufpos - 1] in {'e', 'E'}: + add(t.literal, L.buf[L.bufpos]) + inc(L.bufpos) + matchChars(L, t, literalishChars) + if L.buf[L.bufpos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}: + inc(L.bufpos) + add(t.literal, L.buf[L.bufpos]) + matchChars(L, t, {'0'..'9'}) + L.bufpos = msgPos lexMessage(L, msg, t.literal) - # get the base: result.tokType = tkIntLit # int literal until we know better result.literal = "" - result.base = base10 # BUGFIX - - startpos = L.bufpos # make sure the literal is correct for error messages: - + result.base = base10 + startpos = L.bufpos var isAFloatLiteral = false - - - # This first pass makes verifications and builds the literal string for - # the token, to keep the value extraction phase less error prone.. 
- + # First stage: find out base, make verifications, build token literal string if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in {'X', 'x', 'o', 'O', 'c', 'C', 'b', 'B'}: eatChar(L, result, '0') - case L.buf[L.bufpos] of 'O': - # TODO/ozra - add nicer message: " (did you mean octal? Then use one of '0o', '0c' or '0C'.)" - lexLiteralNumberMessage(L, errInvalidNumber, startpos) - + lexMessageLitNum(L, errInvalidNumberOctalCode, startpos) of 'x', 'X': eatChar(L, result, 'x') - #echo "is hex: ", result.literal matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'}) - #echo "got hex: ", result.literal of 'o', 'c', 'C': eatChar(L, result, 'c') - #echo "is octal: ", result.literal matchUnderscoreChars(L, result, {'0'..'7'}) of 'b', 'B': eatChar(L, result, 'b') - #echo "is binary: ", result.literal matchUnderscoreChars(L, result, {'0'..'1'}) else: internalError(getLineInfo(L), "getNumber") - else: - #echo "is decimal" matchUnderscoreChars(L, result, {'0'..'9'}) - if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): isAFloatLiteral = true eatChar(L, result, '.') matchUnderscoreChars(L, result, {'0'..'9'}) - if L.buf[L.bufpos] in {'e', 'E'}: isAFloatLiteral = true eatChar(L, result, 'e') - if L.buf[L.bufpos] in {'+', '-'}: eatChar(L, result) matchUnderscoreChars(L, result, {'0'..'9'}) - endpos = L.bufpos + # Second stage, find out if there's a datatype postfix and handle it var postPos = endpos - - #echo "At endpos is: ", L.buf[postPos] - if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}: if L.buf[postPos] == '\'': inc(postPos) - - #echo "is postfix: ", L.buf[postPos] - case L.buf[postPos] of 'f', 'F': inc(postPos) @@ -417,13 +374,9 @@ proc getNumber(L: var TLexer): TToken = inc(postPos, 3) else: # "f" alone defaults to float32 result.tokType = tkFloat32Lit - #lexMessage(L, errInvalidNumber, result.literal & "'f" & L.buf[postPos]) - - # 'd' is an ad hoc convenience shortcut for f64 - of 'd', 'D': + of 'd', 'D': # ad hoc 
convenience shortcut for f64 inc(postPos) result.tokType = tkFloat64Lit - of 'i', 'I': inc(postPos) if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): @@ -439,11 +392,7 @@ proc getNumber(L: var TLexer): TToken = result.tokType = tkInt8Lit inc(postPos) else: - lexLiteralNumberMessage(L, errInvalidNumber, startpos) - #lexMessage(L, errInvalidNumber, result.literal & "'i" & L.buf[postPos]) - - #echo "is intish: ", result.literal, ":", $ result.tokType - + lexMessageLitNum(L, errInvalidNumber, startpos) of 'u', 'U': inc(postPos) if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): @@ -461,18 +410,15 @@ proc getNumber(L: var TLexer): TToken = else: result.tokType = tkUIntLit else: - lexLiteralNumberMessage(L, errInvalidNumber, startpos) - #lexMessage(L, errInvalidNumber, result.literal & "'" & L.buf[postPos]) - - # Still a literalish char? Error! + lexMessageLitNum(L, errInvalidNumber, startpos) + # Is there still a literalish char awaiting? Then it's an error! if L.buf[postPos] in literalishCharsNoDot or (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}): echo "likely literal error, additionally: '", L.buf[postPos], L.buf[postPos+1], L.buf[postPos+2], L.buf[postPos+3], "', ", postPos - lexLiteralNumberMessage(L, errInvalidNumber, startpos) - + lexMessageLitNum(L, errInvalidNumber, startpos) + # Third stage, extract actual number L.bufpos = startpos # restore position var pos: int = startpos - try: if (L.buf[pos] == '0') and (L.buf[pos + 1] in {'x', 'X', 'b', 'B', 'o', 'O', 'c', 'C'}): @@ -510,7 +456,6 @@ proc getNumber(L: var TLexer): TToken = break else: internalError(getLineInfo(L), "getNumber") - case result.tokType of tkIntLit, tkInt64Lit: result.iNumber = xi of tkInt8Lit: result.iNumber = BiggestInt(int8(toU8(int(xi)))) @@ -526,38 +471,33 @@ proc getNumber(L: var TLexer): TToken = # XXX: Test this on big endian machine! 
of tkFloat64Lit: result.fNumber = (cast[PFloat64](addr(xi)))[] else: internalError(getLineInfo(L), "getNumber") - elif isAFloatLiteral or (result.tokType == tkFloat32Lit) or (result.tokType == tkFloat64Lit): result.fNumber = parseFloat(result.literal) if result.tokType == tkIntLit: result.tokType = tkFloatLit - elif result.tokType == tkUint64Lit: xi = 0 let len = unsafeParseUInt(result.literal, xi) if len != result.literal.len or len == 0: raise newException(ValueError, "invalid integer: " & $xi) result.iNumber = xi - else: result.iNumber = parseBiggestInt(result.literal) if (result.iNumber < low(int32)) or (result.iNumber > high(int32)): if result.tokType == tkIntLit: result.tokType = tkInt64Lit elif result.tokType in {tkInt8Lit, tkInt16Lit, tkInt32Lit}: - lexLiteralNumberMessage(L, errNumberOutOfRange, startpos) + lexMessageLitNum(L, errNumberOutOfRange, startpos) elif result.tokType == tkInt8Lit and (result.iNumber < int8.low or result.iNumber > int8.high): - lexLiteralNumberMessage(L, errNumberOutOfRange, startpos) + lexMessageLitNum(L, errNumberOutOfRange, startpos) elif result.tokType == tkInt16Lit and (result.iNumber < int16.low or result.iNumber > int16.high): - lexLiteralNumberMessage(L, errNumberOutOfRange, startpos) - + lexMessageLitNum(L, errNumberOutOfRange, startpos) except ValueError: - lexLiteralNumberMessage(L, errInvalidNumber, startpos) + lexMessageLitNum(L, errInvalidNumber, startpos) except OverflowError, RangeError: - lexLiteralNumberMessage(L, errNumberOutOfRange, startpos) - + lexMessageLitNum(L, errNumberOutOfRange, startpos) L.bufpos = postPos proc handleHexChar(L: var TLexer, xi: var int) = |