diff options
Diffstat (limited to 'js/baba-yaga/src/legacy/lexer-optimized.js')
-rw-r--r-- | js/baba-yaga/src/legacy/lexer-optimized.js | 357 |
1 files changed, 357 insertions, 0 deletions
// lexer-optimized.js - High-performance regex-based lexer for Baba Yaga.
//
// Tokenizes source text with an ordered table of regex patterns. Patterns are
// compiled once per lexer instance with the sticky ('y') flag so each match
// anchors at the current position without copying the remaining input.

import { LexError, ErrorHelpers } from './error.js';

const tokenTypes = {
  IDENTIFIER: 'IDENTIFIER',
  TYPE: 'TYPE',
  NUMBER: 'NUMBER',
  STRING: 'STRING',
  ARROW: 'ARROW',
  COLON: 'COLON',
  SEMICOLON: 'SEMICOLON',
  COMMA: 'COMMA',
  KEYWORD: 'KEYWORD',
  OPERATOR: 'OPERATOR',
  LPAREN: 'LPAREN',
  RPAREN: 'RPAREN',
  DOT: 'DOT',
  LBRACKET: 'LBRACKET',
  RBRACKET: 'RBRACKET',
  LBRACE: 'LBRACE',
  RBRACE: 'RBRACE',
  EOF: 'EOF',
};

const keywords = new Set(['when', 'is', 'then', 'if', 'Ok', 'Err', 'true', 'false', 'PI', 'INFINITY', 'and', 'or', 'xor']);
const types = new Set(['Int', 'String', 'Result', 'Float', 'Number', 'List', 'Table', 'Bool']);

// Escape sequences recognized inside string literals. Unknown escapes are
// preserved verbatim (backslash + character).
const STRING_ESCAPES = {
  n: '\n',
  t: '\t',
  r: '\r',
  '\\': '\\',
  '"': '"',
};

/**
 * Token pattern definitions, tried in order (longest/most specific first).
 * Each entry has:
 *   - regex:   anchored pattern for the token text
 *   - skip:    true for trivia (whitespace, newlines, comments)
 *   - type:    token type when no `process` function is given
 *   - value:   fixed token value (defaults to the matched text)
 *   - process: (match) => token, for tokens needing extra computation
 */
const TOKEN_PATTERNS = [
  // Whitespace (skip)
  {
    name: 'WHITESPACE',
    regex: /^[ \t\r]+/,
    skip: true
  },

  // Newlines (skip; line/column tracking happens in advance())
  {
    name: 'NEWLINE',
    regex: /^\n/,
    skip: true
  },

  // Comments (skip)
  {
    name: 'COMMENT',
    regex: /^\/\/.*$/m,
    skip: true
  },

  // Multi-character operators (order matters - longest first)
  {
    name: 'ARROW',
    regex: /^->/,
    type: tokenTypes.ARROW
  },

  {
    name: 'STRING_CONCAT',
    regex: /^\.\./,
    type: tokenTypes.OPERATOR,
    value: '..'
  },

  {
    name: 'COMPARISON_OPS',
    regex: /^(>=|<=|!=)/,
    type: tokenTypes.OPERATOR
  },

  // Numbers. NOTE(review): the leading '-?' means `a-1` lexes as
  // IDENTIFIER followed by NUMBER(-1), never as a minus operator;
  // confirm the parser relies on this before changing it.
  {
    name: 'NUMBER',
    regex: /^-?\d+(\.\d+)?/,
    type: tokenTypes.NUMBER,
    process: (match) => {
      const value = parseFloat(match[0]);
      const isFloat = match[0].includes('.');
      return {
        type: tokenTypes.NUMBER,
        value,
        isFloat,
        originalString: match[0]
      };
    }
  },

  // Strings with escape-sequence handling.
  {
    name: 'STRING',
    regex: /^"((?:[^"\\]|\\.)*)"/,
    type: tokenTypes.STRING,
    process: (match) => {
      // Single-pass unescape. Chained .replace() calls are wrong here:
      // the input `\\n` (escaped backslash followed by "n") must decode
      // to the two characters `\` and `n`, not to a newline, which is
      // what replacing "\n" before "\\" produced.
      const value = match[1].replace(
        /\\(.)/g,
        (whole, c) => STRING_ESCAPES[c] ?? whole
      );
      return {
        type: tokenTypes.STRING,
        value
      };
    }
  },

  // Identifiers, keywords, and types share one pattern; the word decides
  // the token type.
  {
    name: 'IDENTIFIER',
    regex: /^[a-zA-Z_][a-zA-Z0-9_]*/,
    process: (match) => {
      const value = match[0];
      if (keywords.has(value)) {
        return { type: tokenTypes.KEYWORD, value };
      }
      if (types.has(value)) {
        return { type: tokenTypes.TYPE, value };
      }
      return { type: tokenTypes.IDENTIFIER, value };
    }
  },

  // Single-character operators (after the multi-character ones above).
  {
    name: 'SINGLE_CHAR_OPS',
    regex: /^[+\-*/%=><]/,
    type: tokenTypes.OPERATOR
  },

  // Punctuation, mapped to its dedicated token type.
  {
    name: 'PUNCTUATION',
    regex: /^[()[\]{}:;,.]/,
    process: (match) => {
      const char = match[0];
      const typeMap = {
        '(': tokenTypes.LPAREN,
        ')': tokenTypes.RPAREN,
        '[': tokenTypes.LBRACKET,
        ']': tokenTypes.RBRACKET,
        '{': tokenTypes.LBRACE,
        '}': tokenTypes.RBRACE,
        ':': tokenTypes.COLON,
        ';': tokenTypes.SEMICOLON,
        ',': tokenTypes.COMMA,
        '.': tokenTypes.DOT
      };
      return { type: typeMap[char], value: char };
    }
  }
];

/**
 * Create a high-performance regex-based lexer over `input`.
 *
 * @param {string} input - Source text to tokenize.
 * @returns {{ nextToken: () => object, allTokens: () => object[] }}
 * @throws {LexError} from nextToken()/allTokens() on an unexpected character.
 */
function createOptimizedLexer(input) {
  let position = 0;
  let line = 1;
  let column = 1;

  // Compile each pattern as a sticky regex: drop the '^' anchor and add the
  // 'y' flag so exec() matches exactly at .lastIndex. This avoids the O(n)
  // input.slice(position) per token that made lexing O(n^2) overall.
  const compiledPatterns = TOKEN_PATTERNS.map((pattern) => ({
    ...pattern,
    compiledRegex: new RegExp(
      pattern.regex.source.replace(/^\^/, ''),
      pattern.regex.flags.includes('y')
        ? pattern.regex.flags
        : pattern.regex.flags + 'y'
    )
  }));

  function getCurrentLocation() {
    return { line, column };
  }

  // Move past `length` characters, updating line/column bookkeeping.
  function advance(length) {
    for (let i = 0; i < length; i++) {
      if (input[position + i] === '\n') {
        line++;
        column = 1;
      } else {
        column++;
      }
    }
    position += length;
  }

  /**
   * Produce the next token (EOF token at end of input).
   * Iterates instead of recursing on skipped trivia so that long runs of
   * whitespace/comments cannot overflow the call stack.
   */
  function nextToken() {
    for (;;) {
      if (position >= input.length) {
        return { type: tokenTypes.EOF, value: '', line, column };
      }

      const startLocation = getCurrentLocation();
      let skipped = false;

      for (const pattern of compiledPatterns) {
        // Sticky regexes are stateful: reset lastIndex before every exec.
        pattern.compiledRegex.lastIndex = position;
        const match = pattern.compiledRegex.exec(input);
        if (!match) {
          continue;
        }

        // Hook for patterns that need to observe lexer state on match.
        if (pattern.onMatch) {
          pattern.onMatch({ line, column });
        }

        advance(match[0].length);

        if (pattern.skip) {
          skipped = true;
          break;
        }

        const token = pattern.process
          ? pattern.process(match)
          : { type: pattern.type, value: pattern.value ?? match[0] };
        token.line = startLocation.line;
        token.column = startLocation.column;
        return token;
      }

      if (skipped) {
        continue; // trivia consumed; scan again from the new position
      }

      // No pattern matched - build a helpful error.
      const char = input[position];
      const suggestions = [];

      // Common character mistakes.
      if (char === '\u201C' || char === '\u201D') { // curly quotes
        suggestions.push('Use straight quotes " instead of curly quotes');
      } else if (char === '"') {
        // A straight quote only reaches here when the STRING pattern
        // failed, i.e. the literal is never closed.
        suggestions.push('Unterminated string - missing closing quote?');
      } else if (char === '\u2013' || char === '\u2014') { // en/em dash
        suggestions.push('Use regular minus - or arrow -> instead of em/en dash');
      } else if (/[^\x00-\x7F]/.test(char)) {
        suggestions.push('Use only ASCII characters in Baba Yaga code');
      } else {
        suggestions.push(`Character "${char}" is not valid in Baba Yaga syntax`);
      }

      throw new LexError(
        `Unexpected character: ${JSON.stringify(char)}`,
        { line, column, length: 1 },
        input,
        suggestions
      );
    }
  }

  /**
   * Tokenize the whole input, returning every token including the final EOF.
   */
  function allTokens() {
    const tokens = [];
    let token;

    do {
      token = nextToken();
      tokens.push(token);
    } while (token.type !== tokenTypes.EOF);

    return tokens;
  }

  return {
    allTokens,
    nextToken
  };
}

/**
 * Create the optimized lexer, falling back to the original implementation
 * if construction fails.
 *
 * NOTE(review): createOptimizedLexer() itself never throws - LexError is
 * raised lazily from nextToken()/allTokens() - so this catch only covers
 * unexpected initialization failures, not lex errors. Confirm whether the
 * fallback was meant to trigger on lex errors as well.
 *
 * @param {string} input - Source text to tokenize.
 * @param {boolean} [useOptimized=true] - Prefer the optimized lexer.
 * @returns {Promise<{ nextToken: Function, allTokens: Function }>}
 */
async function createLexerWithFallback(input, useOptimized = true) {
  if (useOptimized) {
    try {
      return createOptimizedLexer(input);
    } catch (error) {
      console.warn('Falling back to original lexer:', error.message);
      const { createLexer } = await import('./lexer.js');
      return createLexer(input);
    }
  }
  const { createLexer } = await import('./lexer.js');
  return createLexer(input);
}

/**
 * Benchmark the optimized lexer against the original on the same input.
 *
 * @param {string} input - Source text to tokenize repeatedly.
 * @param {number} [iterations=1000] - Timed iterations per lexer.
 * @returns {Promise<{ originalTime: number, optimizedTime: number, speedup: number }>}
 */
async function benchmarkLexers(input, iterations = 1000) {
  console.log(`Benchmarking lexers with ${iterations} iterations...`);

  // Warm up so JIT effects don't skew the first timed runs.
  for (let i = 0; i < 10; i++) {
    createOptimizedLexer(input).allTokens();
  }

  // Benchmark optimized lexer.
  const optimizedStart = performance.now();
  for (let i = 0; i < iterations; i++) {
    createOptimizedLexer(input).allTokens();
  }
  const optimizedTime = performance.now() - optimizedStart;

  // Benchmark original lexer.
  const { createLexer } = await import('./lexer.js');
  const originalStart = performance.now();
  for (let i = 0; i < iterations; i++) {
    createLexer(input).allTokens();
  }
  const originalTime = performance.now() - originalStart;

  console.log(`Original lexer: ${originalTime.toFixed(2)}ms`);
  console.log(`Optimized lexer: ${optimizedTime.toFixed(2)}ms`);
  console.log(`Speedup: ${(originalTime / optimizedTime).toFixed(2)}x`);

  return {
    originalTime,
    optimizedTime,
    speedup: originalTime / optimizedTime
  };
}

export {
  createOptimizedLexer,
  createLexerWithFallback,
  benchmarkLexers,
  tokenTypes
};