// lexer-optimized.js - High-performance regex-based lexer

import { LexError, ErrorHelpers } from './error.js';

const tokenTypes = {
  IDENTIFIER: 'IDENTIFIER',
  TYPE: 'TYPE',
  NUMBER: 'NUMBER',
  STRING: 'STRING',
  ARROW: 'ARROW',
  COLON: 'COLON',
  SEMICOLON: 'SEMICOLON',
  COMMA: 'COMMA',
  KEYWORD: 'KEYWORD',
  OPERATOR: 'OPERATOR',
  LPAREN: 'LPAREN',
  RPAREN: 'RPAREN',
  DOT: 'DOT',
  LBRACKET: 'LBRACKET',
  RBRACKET: 'RBRACKET',
  LBRACE: 'LBRACE',
  RBRACE: 'RBRACE',
  EOF: 'EOF',
};

const keywords = new Set([
  'when', 'is', 'then', 'if', 'Ok', 'Err', 'true', 'false',
  'PI', 'INFINITY', 'and', 'or', 'xor',
]);

const types = new Set(['Int', 'String', 'Result', 'Float', 'Number', 'List', 'Table', 'Bool']);

/**
 * Token pattern definitions with regex and processing functions.
 * Order matters: multi-character operators must precede their
 * single-character prefixes. Regex literals are compiled once when the
 * module loads, so no per-lexer "pre-compilation" step is needed.
 */
const TOKEN_PATTERNS = [
  // Whitespace (skip)
  { name: 'WHITESPACE', regex: /^[ \t\r]+/, skip: true },

  // Newlines (skip; line numbers are tracked by advance())
  { name: 'NEWLINE', regex: /^\n/, skip: true },

  // Comments (skip)
  { name: 'COMMENT', regex: /^\/\/.*$/m, skip: true },

  // Multi-character operators (longest first)
  { name: 'ARROW', regex: /^->/, type: tokenTypes.ARROW },
  { name: 'STRING_CONCAT', regex: /^\.\./, type: tokenTypes.OPERATOR, value: '..' },
  { name: 'COMPARISON_OPS', regex: /^(>=|<=|!=)/, type: tokenTypes.OPERATOR },

  // Numbers. The leading `-?` means `1-2` lexes as NUMBER(1), NUMBER(-2):
  // the lexer has no context here, so the parser must account for that case.
  {
    name: 'NUMBER',
    regex: /^-?\d+(\.\d+)?/,
    type: tokenTypes.NUMBER,
    process: (match) => {
      const value = parseFloat(match[0]);
      const isFloat = match[0].includes('.');
      return { type: tokenTypes.NUMBER, value, isFloat, originalString: match[0] };
    },
  },

  // Strings with escape-sequence handling. Escapes are decoded in a single
  // pass: chained .replace() calls would corrupt input such as `\\n`
  // (backslash, backslash, n), producing a backslash plus a newline instead
  // of a literal backslash followed by `n`.
  {
    name: 'STRING',
    regex: /^"((?:[^"\\]|\\.)*)"/,
    type: tokenTypes.STRING,
    process: (match) => {
      const escapes = { n: '\n', t: '\t', r: '\r', '\\': '\\', '"': '"' };
      // Unknown escapes are preserved verbatim.
      const value = match[1].replace(/\\(.)/g, (_, c) => escapes[c] ?? '\\' + c);
      return { type: tokenTypes.STRING, value };
    },
  },

  // Identifiers, keywords, and types
  {
    name: 'IDENTIFIER',
    regex: /^[a-zA-Z_][a-zA-Z0-9_]*/,
    process: (match) => {
      const value = match[0];
      if (keywords.has(value)) return { type: tokenTypes.KEYWORD, value };
      if (types.has(value)) return { type: tokenTypes.TYPE, value };
      return { type: tokenTypes.IDENTIFIER, value };
    },
  },

  // Single-character operators
  { name: 'SINGLE_CHAR_OPS', regex: /^[+\-*/%=><]/, type: tokenTypes.OPERATOR },

  // Punctuation
  {
    name: 'PUNCTUATION',
    regex: /^[()[\]{}:;,.]/,
    process: (match) => {
      const typeMap = {
        '(': tokenTypes.LPAREN, ')': tokenTypes.RPAREN,
        '[': tokenTypes.LBRACKET, ']': tokenTypes.RBRACKET,
        '{': tokenTypes.LBRACE, '}': tokenTypes.RBRACE,
        ':': tokenTypes.COLON, ';': tokenTypes.SEMICOLON,
        ',': tokenTypes.COMMA, '.': tokenTypes.DOT,
      };
      return { type: typeMap[match[0]], value: match[0] };
    },
  },
];

/**
 * High-performance regex-based lexer
 */
function createOptimizedLexer(input) {
  let position = 0;
  let line = 1;
  let column = 1;

  function getCurrentLocation() {
    return { line, column };
  }

  function advance(length) {
    for (let i = 0; i < length; i++) {
      if (input[position + i] === '\n') {
        line++;
        column = 1;
      } else {
        column++;
      }
    }
    position += length;
  }

  function nextToken() {
    // Iterate rather than recurse so long runs of skipped tokens
    // (whitespace, comments) cannot overflow the call stack.
    scan: while (true) {
      if (position >= input.length) {
        return { type: tokenTypes.EOF, value: '', line, column };
      }

      const remaining = input.slice(position);
      const startLocation = getCurrentLocation();

      // Try each pattern in order
      for (const pattern of TOKEN_PATTERNS) {
        const match = remaining.match(pattern.regex);
        if (!match) continue;

        // Optional hook for patterns that need to observe lexer state
        if (pattern.onMatch) {
          pattern.onMatch({ line, column });
        }

        advance(match[0].length);

        // Skipped tokens: rescan from the new position
        if (pattern.skip) continue scan;

        const token = pattern.process
          ? pattern.process(match)
          : { type: pattern.type, value: pattern.value ?? match[0] };

        // Add location information
        token.line = startLocation.line;
        token.column = startLocation.column;
        return token;
      }

      // No pattern matched - report the offending character with a hint
      const char = remaining[0];
      const suggestions = [];
      if (char === '\u201C' || char === '\u201D') {
        suggestions.push('Use straight quotes " instead of curly quotes');
      } else if (char === '\u2013' || char === '\u2014') {
        suggestions.push('Use regular minus - or arrow -> instead of em/en dash');
      } else if (/[^\x00-\x7F]/.test(char)) {
        suggestions.push('Use only ASCII characters in Baba Yaga code');
      } else {
        suggestions.push(`Character "${char}" is not valid in Baba Yaga syntax`);
      }

      throw new LexError(
        `Unexpected character: ${JSON.stringify(char)}`,
        { line, column, length: 1 },
        input,
        suggestions
      );
    }
  }

  function allTokens() {
    const tokens = [];
    let token;
    do {
      token = nextToken();
      tokens.push(token);
    } while (token.type !== tokenTypes.EOF);
    return tokens;
  }

  return { allTokens, nextToken };
}
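// Usage sketch. The snippet below is a hypothetical Baba Yaga fragment,
// assumed only to exercise the patterns above; the expected token shapes
// follow from the pattern table and the location tracking in advance():
//
//   const lexer = createOptimizedLexer('greet : String -> String;');
//   lexer.allTokens();
//   // => [
//   //   { type: 'IDENTIFIER', value: 'greet',  line: 1, column: 1 },
//   //   { type: 'COLON',      value: ':',      line: 1, column: 7 },
//   //   { type: 'TYPE',       value: 'String', line: 1, column: 9 },
//   //   { type: 'ARROW',      value: '->',     line: 1, column: 16 },
//   //   { type: 'TYPE',       value: 'String', line: 1, column: 19 },
//   //   { type: 'SEMICOLON',  value: ';',      line: 1, column: 25 },
//   //   { type: 'EOF',        value: '',       line: 1, column: 26 },
//   // ]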
/**
 * Lexer factory with fallback to the original implementation. Tokenization
 * happens eagerly so that lex errors surface inside the try block; wrapping
 * only lexer construction (which never throws) would leave the fallback
 * path unreachable.
 */
async function createLexerWithFallback(input, useOptimized = true) {
  if (useOptimized) {
    try {
      const tokens = createOptimizedLexer(input).allTokens();
      let index = 0;
      return {
        allTokens: () => tokens,
        // Keep returning the final EOF token once the stream is exhausted
        nextToken: () => tokens[Math.min(index++, tokens.length - 1)],
      };
    } catch (error) {
      // If the optimized lexer fails, fall back to the original
      console.warn('Falling back to original lexer:', error.message);
      const { createLexer } = await import('./lexer.js');
      return createLexer(input);
    }
  }
  const { createLexer } = await import('./lexer.js');
  return createLexer(input);
}

/**
 * Benchmark function to compare lexer performance
 */
async function benchmarkLexers(input, iterations = 1000) {
  console.log(`Benchmarking lexers with ${iterations} iterations...`);

  const { createLexer } = await import('./lexer.js');

  // Warm up both lexers so JIT compilation does not skew either timing
  for (let i = 0; i < 10; i++) {
    createOptimizedLexer(input).allTokens();
    createLexer(input).allTokens();
  }

  // Benchmark optimized lexer
  const optimizedStart = performance.now();
  for (let i = 0; i < iterations; i++) {
    createOptimizedLexer(input).allTokens();
  }
  const optimizedTime = performance.now() - optimizedStart;

  // Benchmark original lexer
  const originalStart = performance.now();
  for (let i = 0; i < iterations; i++) {
    createLexer(input).allTokens();
  }
  const originalTime = performance.now() - originalStart;

  console.log(`Original lexer: ${originalTime.toFixed(2)}ms`);
  console.log(`Optimized lexer: ${optimizedTime.toFixed(2)}ms`);
  console.log(`Speedup: ${(originalTime / optimizedTime).toFixed(2)}x`);

  return { originalTime, optimizedTime, speedup: originalTime / optimizedTime };
}

export { createOptimizedLexer, createLexerWithFallback, benchmarkLexers, tokenTypes };
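// Benchmark sketch for Node (assumes a Baba Yaga source file on disk; the
// path below is hypothetical):
//
//   import { readFileSync } from 'node:fs';
//   import { benchmarkLexers, createLexerWithFallback } from './lexer-optimized.js';
//
//   const source = readFileSync('./examples/sample.by', 'utf8'); // hypothetical path
//   await benchmarkLexers(source, 500);
//   const lexer = await createLexerWithFallback(source);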