diff options
Diffstat (limited to 'js/baba-yaga/src/legacy/lexer.js')
-rw-r--r-- | js/baba-yaga/src/legacy/lexer.js | 425 |
1 files changed, 425 insertions, 0 deletions
// lexer.js

import { LexError, ErrorHelpers } from '../core/error.js';

/** Token kinds emitted by the lexer; each token's `type` is one of these strings. */
const tokenTypes = {
  IDENTIFIER: 'IDENTIFIER',
  TYPE: 'TYPE',
  NUMBER: 'NUMBER',
  STRING: 'STRING',
  ARROW: 'ARROW',
  COLON: 'COLON',
  SEMICOLON: 'SEMICOLON',
  COMMA: 'COMMA',
  KEYWORD: 'KEYWORD',
  OPERATOR: 'OPERATOR',
  LPAREN: 'LPAREN',
  RPAREN: 'RPAREN',
  DOT: 'DOT',
  LBRACKET: 'LBRACKET',
  RBRACKET: 'RBRACKET',
  LBRACE: 'LBRACE',
  RBRACE: 'RBRACE',
  EOF: 'EOF',
};

/** Words lexed as KEYWORD tokens instead of IDENTIFIER. */
const keywords = ['when', 'is', 'then', 'if', 'Ok', 'Err', 'true', 'false', 'PI', 'INFINITY', 'and', 'or', 'xor'];

/** Words lexed as TYPE tokens (checked before `keywords`). */
const typeNames = ['Int', 'String', 'Result', 'Float', 'Number', 'List', 'Table', 'Bool'];

/** Single-character punctuation and the token type each one produces. */
const punctuationTypes = new Map([
  ['(', tokenTypes.LPAREN],
  [')', tokenTypes.RPAREN],
  ['[', tokenTypes.LBRACKET],
  [']', tokenTypes.RBRACKET],
  ['{', tokenTypes.LBRACE],
  ['}', tokenTypes.RBRACE],
  [':', tokenTypes.COLON],
  [';', tokenTypes.SEMICOLON],
  [',', tokenTypes.COMMA],
]);

/** Two-character tokens; these must be tried before their one-character prefixes. */
const twoCharTokens = [
  ['..', tokenTypes.OPERATOR], // string concatenation; must win over a single DOT
  ['->', tokenTypes.ARROW],
  ['>=', tokenTypes.OPERATOR],
  ['<=', tokenTypes.OPERATOR],
  ['!=', tokenTypes.OPERATOR],
];

/** Characters that form a one-character OPERATOR token on their own. */
const singleCharOperators = ['+', '-', '*', '/', '=', '>', '<', '%'];

/** Recognised string escapes: the character after the backslash -> the character produced. */
const stringEscapes = new Map([
  ['n', '\n'],
  ['t', '\t'],
  ['r', '\r'],
  ['\\', '\\'],
  ['"', '"'],
]);

/**
 * Creates a lexer over `input`.
 *
 * Position convention (unchanged from the original implementation): STRING
 * tokens carry the line/column of their opening quote, while every other
 * token carries the line/column reached *after* the token was consumed.
 *
 * @param {string} input - Source text to tokenize.
 * @returns {{ allTokens: function(): Array<object> }} Object whose
 *   `allTokens()` returns every token in order, ending with an EOF token.
 * @throws {LexError} From `allTokens()` on an unterminated string literal or
 *   an unexpected character.
 */
function createLexer(input) {
  let position = 0;
  let line = 1;
  let column = 1;

  function isWhitespace(char) {
    return /\s/.test(char);
  }

  function isDigit(char) {
    return /\d/.test(char);
  }

  // Matches letters, underscore AND digits. Callers test isDigit first, so an
  // identifier can never start with a digit.
  function isIdentifierChar(char) {
    return /[a-zA-Z_0-9]/.test(char);
  }

  function peekChar() {
    return input[position];
  }

  /** Consumes characters while `predicate` holds and returns them as a string. */
  function readWhile(predicate) {
    let text = '';
    while (position < input.length && predicate(input[position])) {
      text += input[position];
      position++;
      column++;
    }
    return text;
  }

  /** Consumes the fixed text `value` and returns a token of `type` for it. */
  function consume(type, value) {
    position += value.length;
    column += value.length;
    return { type, value, line, column };
  }

  /**
   * Skips whitespace and `//` line comments.
   *
   * This is a loop on purpose: the previous implementation re-entered
   * nextToken() once per whitespace character and per comment, so a long run
   * of blank space could exhaust the call stack.
   */
  function skipWhitespaceAndComments() {
    while (position < input.length) {
      const char = input[position];
      if (isWhitespace(char)) {
        if (char === '\n') {
          line++;
          column = 1;
        } else {
          column++;
        }
        position++;
      } else if (char === '/' && input[position + 1] === '/') {
        // The terminating newline is left for the whitespace branch above so
        // that line/column bookkeeping stays in one place.
        while (position < input.length && input[position] !== '\n') {
          position++;
          column++;
        }
      } else {
        return;
      }
    }
  }

  /**
   * Reads a double-quoted string literal starting at the opening quote.
   *
   * @returns {object} STRING token positioned at the opening quote.
   * @throws {LexError} If the input ends before a closing quote.
   */
  function readString() {
    let str = '';
    const startLine = line;
    const startColumn = column;

    position++; // Skip the opening quote
    column++;

    while (position < input.length && input[position] !== '"') {
      const char = input[position];

      // Strings may span lines; keep line/column in sync.
      if (char === '\n') {
        line++;
        column = 1;
      } else {
        column++;
      }

      const isEscape =
        char === '\\' &&
        position + 1 < input.length &&
        stringEscapes.has(input[position + 1]);

      if (isEscape) {
        str += stringEscapes.get(input[position + 1]);
        position += 2;
        column++; // second character of the escape pair
      } else {
        // Ordinary character, or a backslash that does not start a known
        // escape (kept literally; the following character is read next).
        str += char;
        position++;
      }
    }

    // Check for unterminated string
    if (position >= input.length) {
      throw new LexError(
        'Unterminated string literal',
        { line: startLine, column: startColumn, length: str.length + 1 },
        input,
        [
          'Add closing quote " at the end of the string',
          'Check for unescaped quotes inside the string',
          'Use \\" to include quotes in strings'
        ]
      );
    }

    position++; // Skip the closing quote
    column++;
    return { type: tokenTypes.STRING, value: str, line: startLine, column: startColumn };
  }

  /**
   * Reads an integer or decimal literal at the current position.
   *
   * NOTE: a '.' directly after the digits is always consumed into the number,
   * even when no digits follow it (so "1." lexes as the float 1).
   *
   * @param {string} [sign=''] - Pass '-' to consume a leading minus sign and
   *   produce a negative literal.
   * @returns {object} NUMBER token with `value`, `isFloat` and `originalString`.
   */
  function readNumber(sign = '') {
    position += sign.length;
    column += sign.length;

    let text = sign + readWhile(isDigit);
    let isFloat = false;

    if (peekChar() === '.') {
      position++;
      column++;
      text += '.' + readWhile(isDigit);
      isFloat = true;
    }

    const numericValue = isFloat ? parseFloat(text) : parseInt(text, 10);
    return {
      type: tokenTypes.NUMBER,
      value: numericValue,
      isFloat: isFloat,
      originalString: text,
      line,
      column
    };
  }

  /**
   * Decides whether a '-' that is immediately followed by a digit starts a
   * negative literal (true) or is a binary minus (false), based on the
   * previous non-whitespace character.
   *
   * Only called when input[position] is '-' and input[position + 1] is a digit.
   */
  function shouldBeNegativeLiteral() {
    let prevPos = position - 1;
    while (prevPos >= 0 && isWhitespace(input[prevPos])) {
      prevPos--;
    }

    // At start of input - negative literal
    if (prevPos < 0) {
      return true;
    }

    const prevChar = input[prevPos];

    // After a closing parenthesis - binary minus
    if (prevChar === ')') {
      return false;
    }

    // After a digit - binary minus.
    // NOTE(review): the previous code also looked ahead for a second '-' to
    // detect "consecutive negative arguments", but that scan started on the
    // digit right after this minus and therefore could never match. The
    // effective behaviour (always binary minus) is kept here; enabling the
    // look-ahead would change how expressions like `10 -3 -2` are lexed.
    if (isDigit(prevChar)) {
      return false;
    }

    // After '(', ',', an operator, ':' or ';', after an identifier character,
    // and after anything else - negative literal.
    return true;
  }

  /** Builds the LexError for a character that cannot start any token. */
  function unexpectedCharacterError(char) {
    const suggestions = [];

    // Common character mistakes. Written as escapes so the checks survive
    // re-encoding of this file: U+201C/U+201D are curly double quotes,
    // U+2013/U+2014 are en/em dashes.
    if (char === '\u201C' || char === '\u201D') {
      suggestions.push('Use straight quotes " instead of curly quotes');
    } else if (char === '\u2013' || char === '\u2014') {
      suggestions.push('Use regular minus - or arrow -> instead of em/en dash');
    } else if (/[^\x00-\x7F]/.test(char)) {
      suggestions.push('Use only ASCII characters in Baba Yaga code');
    } else {
      suggestions.push(`Character "${char}" is not valid in Baba Yaga syntax`);
    }

    return new LexError(
      `Unexpected character: ${JSON.stringify(char)}`,
      { line, column, length: 1 },
      input,
      suggestions
    );
  }

  /**
   * Returns the next token, or an EOF token once the input is exhausted.
   *
   * @returns {object} The next token.
   * @throws {LexError} On an unterminated string or an unexpected character.
   */
  function nextToken() {
    skipWhitespaceAndComments();

    if (position >= input.length) {
      return { type: tokenTypes.EOF, line, column };
    }

    const char = input[position];

    const punctuationType = punctuationTypes.get(char);
    if (punctuationType !== undefined) {
      return consume(punctuationType, char);
    }

    for (const [text, type] of twoCharTokens) {
      if (char === text[0] && input[position + 1] === text[1]) {
        return consume(type, text);
      }
    }

    if (char === '.') {
      return consume(tokenTypes.DOT, '.');
    }

    // Handle negative numbers based on context; otherwise the '-' falls
    // through to the single-character operators below.
    if (
      char === '-' &&
      position + 1 < input.length &&
      isDigit(input[position + 1]) &&
      shouldBeNegativeLiteral()
    ) {
      return readNumber('-');
    }

    if (isDigit(char)) {
      return readNumber();
    }

    if (isIdentifierChar(char)) {
      const value = readWhile(isIdentifierChar);
      if (typeNames.includes(value)) {
        return { type: tokenTypes.TYPE, value, line, column };
      }
      if (keywords.includes(value)) {
        return { type: tokenTypes.KEYWORD, value, line, column };
      }
      return { type: tokenTypes.IDENTIFIER, value, line, column };
    }

    if (char === '"') {
      return readString();
    }

    if (singleCharOperators.includes(char)) {
      return consume(tokenTypes.OPERATOR, char);
    }

    throw unexpectedCharacterError(char);
  }

  /**
   * Tokenizes the remaining input.
   *
   * @returns {Array<object>} All tokens in order; the last one is always EOF.
   * @throws {LexError} On an unterminated string or an unexpected character.
   */
  function allTokens() {
    const tokens = [];
    let token;
    do {
      token = nextToken();
      tokens.push(token);
    } while (token.type !== tokenTypes.EOF);
    return tokens;
  }

  return {
    allTokens,
  };
}

export { createLexer, tokenTypes };