// lexer.js

// NOTE: `ErrorHelpers` is not referenced in this module; the import is kept untouched.
import { LexError, ErrorHelpers } from '../core/error.js';

/** Token kinds produced by the lexer. Each key maps to its own name. */
const tokenTypes = {
  IDENTIFIER: 'IDENTIFIER',
  TYPE: 'TYPE',
  NUMBER: 'NUMBER',
  STRING: 'STRING',
  ARROW: 'ARROW',
  COLON: 'COLON',
  SEMICOLON: 'SEMICOLON',
  COMMA: 'COMMA',
  KEYWORD: 'KEYWORD',
  OPERATOR: 'OPERATOR',
  LPAREN: 'LPAREN',
  RPAREN: 'RPAREN',
  DOT: 'DOT',
  LBRACKET: 'LBRACKET',
  RBRACKET: 'RBRACKET',
  LBRACE: 'LBRACE',
  RBRACE: 'RBRACE',
  EOF: 'EOF',
};

// Words emitted as KEYWORD tokens instead of IDENTIFIER.
const keywords = ['when', 'is', 'then', 'if', 'Ok', 'Err', 'true', 'false', 'PI', 'INFINITY', 'and', 'or', 'xor'];

// Words emitted as TYPE tokens; checked before `keywords`.
const typeNames = ['Int', 'String', 'Result', 'Float', 'Number', 'List', 'Table', 'Bool'];

// Recognised escape sequences inside string literals: character after the backslash -> decoded character.
const escapeSequences = new Map([
  ['n', '\n'],
  ['t', '\t'],
  ['r', '\r'],
  ['\\', '\\'],
  ['"', '"'],
]);

// Two-character tokens. These are matched before anything else so that `..` wins over `.`,
// `->` over `-`, and `>=` / `<=` over `>` / `<`.
const twoCharTokens = new Map([
  ['..', tokenTypes.OPERATOR],
  ['->', tokenTypes.ARROW],
  ['>=', tokenTypes.OPERATOR],
  ['<=', tokenTypes.OPERATOR],
  ['!=', tokenTypes.OPERATOR],
]);

// Single-character punctuation tokens.
const singleCharTokens = new Map([
  ['(', tokenTypes.LPAREN],
  [')', tokenTypes.RPAREN],
  ['[', tokenTypes.LBRACKET],
  [']', tokenTypes.RBRACKET],
  ['{', tokenTypes.LBRACE],
  ['}', tokenTypes.RBRACE],
  ['.', tokenTypes.DOT],
  [':', tokenTypes.COLON],
  [';', tokenTypes.SEMICOLON],
  [',', tokenTypes.COMMA],
]);

// Single-character operators.
const singleCharOperators = ['+', '-', '*', '/', '=', '>', '<', '%'];

// A `-<digit>` that directly follows one of these characters is lexed as a negative literal.
const negativeLiteralPrefixes = new Set(['(', ',', '+', '*', '/', '%', '=', '>', '<', ':', ';']);

/**
 * Creates a lexer over `input`.
 *
 * Position reporting: STRING tokens carry the line/column of their opening quote.
 * Every other token carries the line/column as they stand *after* the token has been
 * consumed (i.e. just past its last character). This mirrors the long-standing
 * behaviour of this module and is kept for compatibility with existing consumers.
 *
 * @param {string} input - Source text to tokenize.
 * @returns {{ allTokens: () => Array<object> }} Object exposing `allTokens()`.
 */
function createLexer(input) {
  let position = 0;
  let line = 1;
  let column = 1;

  function isWhitespace(char) {
    return /\s/.test(char);
  }

  function isDigit(char) {
    return /\d/.test(char);
  }

  // Despite the name this also accepts `_` and digits: it is the "identifier character"
  // predicate. Identifiers cannot *start* with a digit because nextToken() tests
  // isDigit() first.
  function isLetter(char) {
    return /[a-zA-Z_0-9]/.test(char);
  }

  function peekChar() {
    return input[position];
  }

  /** Consumes characters while `predicate` holds and returns them as a string. */
  function readWhile(predicate) {
    let str = '';
    while (position < input.length && predicate(input[position])) {
      str += input[position];
      position++;
      column++;
    }
    return str;
  }

  /** Consumes `value.length` characters and builds a token of the given type. */
  function consume(type, value) {
    position += value.length;
    column += value.length;
    return { type, value, line, column };
  }

  /**
   * Reads a double-quoted string literal starting at the opening quote.
   * Newlines are allowed inside the literal. Unknown escape sequences keep their backslash.
   *
   * @returns {object} STRING token positioned at the opening quote.
   * @throws {LexError} If the input ends before the closing quote.
   */
  function readString() {
    let str = '';
    const startLine = line;
    const startColumn = column;

    // Skip the opening quote.
    position++;
    column++;

    while (position < input.length && input[position] !== '"') {
      const char = input[position];

      // Track line/column for the current character (strings may span lines).
      if (char === '\n') {
        line++;
        column = 1;
      } else {
        column++;
      }

      const escaped =
        char === '\\' && position + 1 < input.length
          ? escapeSequences.get(input[position + 1])
          : undefined;

      if (escaped !== undefined) {
        // A recognised escape consumes two characters; the backslash was counted above.
        str += escaped;
        position += 2;
        column++;
      } else {
        str += char;
        position++;
      }
    }

    if (position >= input.length) {
      throw new LexError(
        'Unterminated string literal',
        { line: startLine, column: startColumn, length: str.length + 1 },
        input,
        [
          'Add closing quote " at the end of the string',
          'Check for unescaped quotes inside the string',
          'Use \\" to include quotes in strings',
        ]
      );
    }

    // Skip the closing quote.
    position++;
    column++;

    return { type: tokenTypes.STRING, value: str, line: startLine, column: startColumn };
  }

  /**
   * True when the current character is a `.` that should be read as a decimal point.
   * A `.` that begins exactly a two-dot `..` operator is NOT a decimal point, so `1..2`
   * lexes as `1`, `..`, `2`. A trailing dot (`1.`) and three or more dots (`1...x`)
   * keep their previous meaning: the first dot belongs to the number.
   */
  function atDecimalPoint() {
    if (peekChar() !== '.') {
      return false;
    }
    const startsConcat = input[position + 1] === '.' && input[position + 2] !== '.';
    return !startsConcat;
  }

  /**
   * Reads an integer or float literal at the current position.
   *
   * @param {boolean} [isNegative=false] - When true, the current character is a leading
   *   `-` that is consumed as part of the literal.
   * @returns {object} NUMBER token with `value`, `isFloat` and `originalString`.
   */
  function readNumber(isNegative = false) {
    let value = '';
    if (isNegative) {
      position++;
      column++;
      value = '-';
    }
    value += readWhile(isDigit);

    let isFloat = false;
    if (atDecimalPoint()) {
      position++;
      column++;
      value += '.' + readWhile(isDigit);
      isFloat = true;
    }

    const numericValue = isFloat ? parseFloat(value) : parseInt(value, 10);
    return {
      type: tokenTypes.NUMBER,
      value: numericValue,
      isFloat: isFloat,
      originalString: value,
      line,
      column,
    };
  }

  /**
   * Decides whether a `-` immediately followed by a digit is the sign of a negative
   * literal (true) or a binary minus (false), based on the previous non-whitespace
   * character.
   */
  function shouldBeNegativeLiteral() {
    let prevPos = position - 1;
    while (prevPos >= 0 && isWhitespace(input[prevPos])) {
      prevPos--;
    }

    // At the start of the input there is nothing to subtract from.
    if (prevPos < 0) {
      return true;
    }

    const prevChar = input[prevPos];

    // After an opening parenthesis, comma, operator, colon or semicolon.
    if (negativeLiteralPrefixes.has(prevChar)) {
      return true;
    }

    // After a closing parenthesis: binary minus.
    if (prevChar === ')') {
      return false;
    }

    if (isDigit(prevChar)) {
      // Look past the minus for another minus, which would suggest consecutive
      // negative arguments in a call.
      // NOTE(review): the scan starts at position + 1, which the only caller guarantees
      // is a digit, so this check cannot currently succeed and the result is always
      // binary minus. Left as-is; confirm the intended heuristic before changing it.
      let lookAheadPos = position + 1;
      while (lookAheadPos < input.length && isWhitespace(input[lookAheadPos])) {
        lookAheadPos++;
      }
      if (lookAheadPos < input.length && input[lookAheadPos] === '-') {
        return true;
      }
      return false;
    }

    // After identifiers (function calls, declarations, ...) and anything else:
    // negative literal.
    return true;
  }

  /**
   * Advances past whitespace and `//` line comments, keeping line/column in sync.
   * Implemented as a loop so that long whitespace runs cannot exhaust the call stack.
   */
  function skipWhitespaceAndComments() {
    while (position < input.length) {
      const char = input[position];
      if (isWhitespace(char)) {
        if (char === '\n') {
          line++;
          column = 1;
        } else {
          column++;
        }
        position++;
      } else if (char === '/' && input[position + 1] === '/') {
        // The terminating newline is left for the whitespace branch above.
        while (position < input.length && input[position] !== '\n') {
          position++;
          column++;
        }
      } else {
        return;
      }
    }
  }

  /** Builds the hint list attached to an "unexpected character" error. */
  function suggestionsFor(char) {
    if (char === '\u201C' || char === '\u201D') {
      // Curly double quotes. The straight quote never reaches this point because it
      // always starts a string literal.
      return ['Use straight quotes " instead of curly quotes'];
    }
    if (char === '\u2013' || char === '\u2014') {
      // En dash / em dash.
      return ['Use regular minus - or arrow -> instead of em/en dash'];
    }
    if (/[^\x00-\x7F]/.test(char)) {
      return ['Use only ASCII characters in Baba Yaga code'];
    }
    return [`Character "${char}" is not valid in Baba Yaga syntax`];
  }

  /**
   * Produces the next token, or an EOF token once the input is exhausted.
   *
   * @returns {object} The next token.
   * @throws {LexError} On an unterminated string or an unexpected character.
   */
  function nextToken() {
    skipWhitespaceAndComments();

    if (position >= input.length) {
      return { type: tokenTypes.EOF, line, column };
    }

    const char = input[position];

    const pair = input.slice(position, position + 2);
    const pairType = twoCharTokens.get(pair);
    if (pairType !== undefined) {
      return consume(pairType, pair);
    }

    // `-` directly followed by a digit: negative literal or binary minus, by context.
    if (
      char === '-' &&
      position + 1 < input.length &&
      isDigit(input[position + 1]) &&
      shouldBeNegativeLiteral()
    ) {
      return readNumber(true);
    }

    if (isDigit(char)) {
      return readNumber();
    }

    if (isLetter(char)) {
      const value = readWhile(isLetter);
      if (typeNames.includes(value)) {
        return { type: tokenTypes.TYPE, value, line, column };
      }
      if (keywords.includes(value)) {
        return { type: tokenTypes.KEYWORD, value, line, column };
      }
      return { type: tokenTypes.IDENTIFIER, value, line, column };
    }

    if (char === '"') {
      return readString();
    }

    const punctuationType = singleCharTokens.get(char);
    if (punctuationType !== undefined) {
      return consume(punctuationType, char);
    }

    if (singleCharOperators.includes(char)) {
      return consume(tokenTypes.OPERATOR, char);
    }

    throw new LexError(
      `Unexpected character: ${JSON.stringify(char)}`,
      { line, column, length: 1 },
      input,
      suggestionsFor(char)
    );
  }

  /**
   * Tokenizes the remaining input.
   *
   * @returns {Array<object>} All tokens, ending with a single EOF token.
   * @throws {LexError} Propagated from nextToken().
   */
  function allTokens() {
    const tokens = [];
    let token;
    do {
      token = nextToken();
      tokens.push(token);
    } while (token.type !== tokenTypes.EOF);
    return tokens;
  }

  return {
    allTokens,
  };
}

export { createLexer, tokenTypes };