diff options
author | Oscar Campbell <oscar@campbell.nu> | 2015-05-31 00:56:27 +0200 |
---|---|---|
committer | Oscar Campbell <oscar@campbell.nu> | 2015-05-31 00:56:27 +0200 |
commit | 49b953533a05c984524daec359dbfb692a692eab (patch) | |
tree | 030e49df51895bdd5d3fc7e6a55aa846d30f2aa9 /compiler/lexer.nim | |
parent | 6820b2fea919c033405e7e204343fddd947c2ef3 (diff) | |
download | Nim-49b953533a05c984524daec359dbfb692a692eab.tar.gz |
Fix #2523 - first commit. Needs some cleanup.
Diffstat (limited to 'compiler/lexer.nim')
-rw-r--r-- | compiler/lexer.nim | 302 |
1 files changed, 204 insertions, 98 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim index 8080e0e8c..bdbaa79b1 100644 --- a/compiler/lexer.nim +++ b/compiler/lexer.nim @@ -240,12 +240,43 @@ proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: set[char]) = break if buf[pos] == '_': if buf[pos+1] notin chars: - lexMessage(L, errInvalidToken, "_") + lexMessage(L, errInvalidToken, "_") # TODO/ozra - lift out for better msg break add(tok.literal, '_') inc(pos) L.bufpos = pos +# Used for getting human friendlier err messages. Also only used in getNumber +proc matchAllLiteralishForMessage(L: var TLexer, tok: var TToken, startpos: int) = + # Note: <the erroneous 'O' in the set is intentional + const literalishChars = { 'A'..'F', 'a'..'f', '0'..'9', + 'X', 'x', 'o', 'O', 'c', 'C', 'b', 'B', + '_', '.', '\'', 'd', 'i', 'u' + } + + tok.literal = "" # Start over fresh for the "message literal" + var msgPos = L.bufpos + L.bufpos = startpos # Use L.bufpos as positioner because of matchUnderscore... + + # We use matchUnderscoreChars - but abuse it a little, passing '_' + matchUnderscoreChars(L, tok, literalishChars) + + # We must verify +/- specifically so that we're not past the literal + if L.buf[L.bufpos] in {'+', '-'} and + L.buf[L.bufpos - 1] in {'e', 'E'}: + add(tok.literal, L.buf[L.bufpos]) + inc(L.bufpos) + matchUnderscoreChars(L, tok, literalishChars) + + if L.buf[L.bufpos] in { + '\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U' + }: + inc(L.bufpos) + add(tok.literal, L.buf[L.bufpos]) + matchUnderscoreChars(L, tok, {'0'..'9'}) + + L.bufpos = msgPos + proc matchTwoChars(L: TLexer, first: char, second: set[char]): bool = result = (L.buf[L.bufpos] == first) and (L.buf[L.bufpos + 1] in second) @@ -268,136 +299,203 @@ proc unsafeParseUInt(s: string, b: var BiggestInt, start = 0): int = result = i - start {.pop.} # overflowChecks + +template eatChar(L: var TLexer, t: var TToken, replacementChar: char) = + add(t.literal, replacementChar) + inc(L.bufpos) + +template eatChar(L: var TLexer, t: var TToken) = + add(t.literal, L.buf[L.bufpos]) + inc(L.bufpos) + +# ###### ######## ######## ## ## ## ## ## ## +# ## ## ## ## ### ## ## ## ### ### +# ## ## ## #### ## ## ## #### #### +# ## #### ###### ## ## ## ## ## ## ## ### ## +# ## ## ## ## ## #### ## ## ## ## +# ## ## ## ## ## ### ## ## ## ## +# ###### ######## ## ## ## ####### ## ## + proc getNumber(L: var TLexer): TToken = var - pos, endpos: int + startpos, endpos: int xi: BiggestInt + + const literalishChars = { 'A'..'F', 'a'..'f', '0'..'9', + 'X', 'x', 'o', 'c', 'C', 'b', 'B', + '_', '.', '\'' + } + const literalishCharsNoDot = { 'A'..'F', 'a'..'f', '0'..'9', + 'X', 'x', 'o', 'c', 'C', 'b', 'B', + '_', '\'' + } + + proc lexLiteralNumberMessage(L: var TLexer, msg: TMsgKind, startpos: int) = + var t: TToken + matchAllLiteralishForMessage(L, t, startpos) + lexMessage(L, msg, t.literal) + # get the base: result.tokType = tkIntLit # int literal until we know better result.literal = "" result.base = base10 # BUGFIX - pos = L.bufpos # make sure the literal is correct for error messages: - var eallowed = false - if L.buf[pos] == '0' and L.buf[pos+1] in {'X', 'x'}: - matchUnderscoreChars(L, result, {'A'..'F', 'a'..'f', '0'..'9', 'X', 'x'}) + + startpos = L.bufpos # make sure the literal is correct for error messages: + + var isAFloatLiteral = false + + + # This first pass makes verifications and builds the literal string for + # the token, to keep the value extraction phase less error prone.. + + if L.buf[L.bufpos] == '0' and + L.buf[L.bufpos + 1] in {'X', 'x', 'o', 'O', 'c', 'C', 'b', 'B'}: + eatChar(L, result, '0') + + case L.buf[L.bufpos] + of 'O': + # TODO/ozra - add nicer message: " (did you mean octal? Then use one of '0o', '0c' or '0C'.)" + lexLiteralNumberMessage(L, errInvalidNumber, startpos) + + of 'x', 'X': + eatChar(L, result, 'x') + #echo "is hex: ", result.literal + matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'}) + #echo "got hex: ", result.literal + of 'o', 'c', 'C': + eatChar(L, result, 'c') + #echo "is octal: ", result.literal + matchUnderscoreChars(L, result, {'0'..'7'}) + of 'b', 'B': + eatChar(L, result, 'b') + #echo "is binary: ", result.literal + matchUnderscoreChars(L, result, {'0'..'1'}) + else: + internalError(getLineInfo(L), "getNumber") + else: - matchUnderscoreChars(L, result, {'0'..'9', 'b', 'B', 'o', 'c', 'C'}) - eallowed = true - if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): - add(result.literal, '.') - inc(L.bufpos) - matchUnderscoreChars(L, result, {'0'..'9'}) - eallowed = true - if eallowed and L.buf[L.bufpos] in {'e', 'E'}: - add(result.literal, 'e') - inc(L.bufpos) - if L.buf[L.bufpos] in {'+', '-'}: - add(result.literal, L.buf[L.bufpos]) - inc(L.bufpos) + #echo "is decimal" matchUnderscoreChars(L, result, {'0'..'9'}) + + if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): + isAFloatLiteral = true + eatChar(L, result, '.') + matchUnderscoreChars(L, result, {'0'..'9'}) + + if L.buf[L.bufpos] in {'e', 'E'}: + isAFloatLiteral = true + eatChar(L, result, 'e') + + if L.buf[L.bufpos] in {'+', '-'}: + eatChar(L, result) + matchUnderscoreChars(L, result, {'0'..'9'}) + endpos = L.bufpos - if L.buf[endpos] in {'\'', 'f', 'F', 'i', 'I', 'u', 'U'}: - if L.buf[endpos] == '\'': inc(endpos) - L.bufpos = pos # restore position - case L.buf[endpos] + var postPos = endpos + + #echo "At endpos is: ", L.buf[postPos] + + if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}: + if L.buf[postPos] == '\'': + inc(postPos) + + #echo "is postfix: ", L.buf[postPos] + + case L.buf[postPos] of 'f', 'F': - inc(endpos) - if (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'): + inc(postPos) + if (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'): result.tokType = tkFloat32Lit - inc(endpos, 2) - elif (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'): + inc(postPos, 2) + elif (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): result.tokType = tkFloat64Lit - inc(endpos, 2) - elif (L.buf[endpos] == '1') and - (L.buf[endpos + 1] == '2') and - (L.buf[endpos + 2] == '8'): + inc(postPos, 2) + elif (L.buf[postPos] == '1') and + (L.buf[postPos + 1] == '2') and + (L.buf[postPos + 2] == '8'): result.tokType = tkFloat128Lit - inc(endpos, 3) - else: - lexMessage(L, errInvalidNumber, result.literal & "'f" & L.buf[endpos]) + inc(postPos, 3) + else: # "f" alone defaults to float32 + result.tokType = tkFloat32Lit + #lexMessage(L, errInvalidNumber, result.literal & "'f" & L.buf[postPos]) + + # 'd' is an ad hoc convenience shortcut for f64 + of 'd', 'D': + inc(postPos) + result.tokType = tkFloat64Lit + of 'i', 'I': - inc(endpos) - if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'): + inc(postPos) + if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): result.tokType = tkInt64Lit - inc(endpos, 2) - elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'): + inc(postPos, 2) + elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'): result.tokType = tkInt32Lit - inc(endpos, 2) - elif (L.buf[endpos] == '1') and (L.buf[endpos + 1] == '6'): + inc(postPos, 2) + elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'): result.tokType = tkInt16Lit - inc(endpos, 2) - elif (L.buf[endpos] == '8'): + inc(postPos, 2) + elif (L.buf[postPos] == '8'): result.tokType = tkInt8Lit - inc(endpos) + inc(postPos) else: - lexMessage(L, errInvalidNumber, result.literal & "'i" & L.buf[endpos]) + lexLiteralNumberMessage(L, errInvalidNumber, startpos) + #lexMessage(L, errInvalidNumber, result.literal & "'i" & L.buf[postPos]) + + #echo "is intish: ", result.literal, ":", $ result.tokType + of 'u', 'U': - inc(endpos) - if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'): + inc(postPos) + if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): result.tokType = tkUInt64Lit - inc(endpos, 2) - elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'): + inc(postPos, 2) + elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'): result.tokType = tkUInt32Lit - inc(endpos, 2) - elif (L.buf[endpos] == '1') and (L.buf[endpos + 1] == '6'): + inc(postPos, 2) + elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'): result.tokType = tkUInt16Lit - inc(endpos, 2) - elif (L.buf[endpos] == '8'): + inc(postPos, 2) + elif (L.buf[postPos] == '8'): result.tokType = tkUInt8Lit - inc(endpos) + inc(postPos) else: result.tokType = tkUIntLit - else: lexMessage(L, errInvalidNumber, result.literal & "'" & L.buf[endpos]) - else: - L.bufpos = pos # restore position + else: + lexLiteralNumberMessage(L, errInvalidNumber, startpos) + #lexMessage(L, errInvalidNumber, result.literal & "'" & L.buf[postPos]) + + # Still a literalish char? Error! + if L.buf[postPos] in literalishCharsNoDot or + (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}): + echo "likely literal error, additionally: '", L.buf[postPos], L.buf[postPos+1], L.buf[postPos+2], L.buf[postPos+3], "', ", postPos + lexLiteralNumberMessage(L, errInvalidNumber, startpos) + + L.bufpos = startpos # restore position + var pos: int = startpos + try: if (L.buf[pos] == '0') and (L.buf[pos + 1] in {'x', 'X', 'b', 'B', 'o', 'O', 'c', 'C'}): inc(pos, 2) - xi = 0 # it may be a base prefix + xi = 0 # it is a base prefix case L.buf[pos - 1] # now look at the optional type suffix: of 'b', 'B': result.base = base2 - while true: - case L.buf[pos] - of '2'..'9', '.': - lexMessage(L, errInvalidNumber, result.literal) - inc(pos) - of '_': - if L.buf[pos+1] notin {'0'..'1'}: - lexMessage(L, errInvalidToken, "_") - break - inc(pos) - of '0', '1': + while pos < endpos: + if L.buf[pos] != '_': xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) - inc(pos) - else: break + inc(pos) of 'o', 'c', 'C': result.base = base8 - while true: - case L.buf[pos] - of '8'..'9', '.': - lexMessage(L, errInvalidNumber, result.literal) - inc(pos) - of '_': - if L.buf[pos+1] notin {'0'..'7'}: - lexMessage(L, errInvalidToken, "_") - break - inc(pos) - of '0'..'7': + while pos < endpos: + if L.buf[pos] != '_': xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0')) - inc(pos) - else: break - of 'O': - lexMessage(L, errInvalidNumber, result.literal) + inc(pos) of 'x', 'X': result.base = base16 - while true: + while pos < endpos: case L.buf[pos] of '_': - if L.buf[pos+1] notin {'0'..'9', 'a'..'f', 'A'..'F'}: - lexMessage(L, errInvalidToken, "_") - break inc(pos) of '0'..'9': xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) @@ -408,8 +506,11 @@ proc getNumber(L: var TLexer): TToken = of 'A'..'F': xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) inc(pos) - else: break - else: internalError(getLineInfo(L), "getNumber") + else: + break + else: + internalError(getLineInfo(L), "getNumber") + case result.tokType of tkIntLit, tkInt64Lit: result.iNumber = xi of tkInt8Lit: result.iNumber = BiggestInt(int8(toU8(int(xi)))) @@ -425,34 +526,39 @@ proc getNumber(L: var TLexer): TToken = # XXX: Test this on big endian machine! of tkFloat64Lit: result.fNumber = (cast[PFloat64](addr(xi)))[] else: internalError(getLineInfo(L), "getNumber") - elif isFloatLiteral(result.literal) or (result.tokType == tkFloat32Lit) or + + elif isAFloatLiteral or (result.tokType == tkFloat32Lit) or (result.tokType == tkFloat64Lit): result.fNumber = parseFloat(result.literal) if result.tokType == tkIntLit: result.tokType = tkFloatLit + elif result.tokType == tkUint64Lit: xi = 0 let len = unsafeParseUInt(result.literal, xi) if len != result.literal.len or len == 0: raise newException(ValueError, "invalid integer: " & $xi) result.iNumber = xi + else: result.iNumber = parseBiggestInt(result.literal) if (result.iNumber < low(int32)) or (result.iNumber > high(int32)): if result.tokType == tkIntLit: result.tokType = tkInt64Lit elif result.tokType in {tkInt8Lit, tkInt16Lit, tkInt32Lit}: - lexMessage(L, errNumberOutOfRange, result.literal) + lexLiteralNumberMessage(L, errNumberOutOfRange, startpos) elif result.tokType == tkInt8Lit and (result.iNumber < int8.low or result.iNumber > int8.high): - lexMessage(L, errNumberOutOfRange, result.literal) + lexLiteralNumberMessage(L, errNumberOutOfRange, startpos) elif result.tokType == tkInt16Lit and (result.iNumber < int16.low or result.iNumber > int16.high): - lexMessage(L, errNumberOutOfRange, result.literal) + lexLiteralNumberMessage(L, errNumberOutOfRange, startpos) + except ValueError: - lexMessage(L, errInvalidNumber, result.literal) + lexLiteralNumberMessage(L, errInvalidNumber, startpos) except OverflowError, RangeError: - lexMessage(L, errNumberOutOfRange, result.literal) - L.bufpos = endpos + lexLiteralNumberMessage(L, errNumberOutOfRange, startpos) + + L.bufpos = postPos proc handleHexChar(L: var TLexer, xi: var int) = case L.buf[L.bufpos] |