about summary refs log tree commit diff stats
path: root/js/baba-yaga/src/core/lexer.js
diff options
context:
space:
mode:
Diffstat (limited to 'js/baba-yaga/src/core/lexer.js')
-rw-r--r--js/baba-yaga/src/core/lexer.js321
1 files changed, 321 insertions, 0 deletions
diff --git a/js/baba-yaga/src/core/lexer.js b/js/baba-yaga/src/core/lexer.js
new file mode 100644
index 0000000..8a2cc65
--- /dev/null
+++ b/js/baba-yaga/src/core/lexer.js
@@ -0,0 +1,321 @@
+// src/core/lexer.js - Optimized lexer (primary implementation)
+
+import { LexError, ErrorHelpers } from './error.js';
+
// Canonical token type names shared with the parser.
// Frozen so a typo'd assignment (e.g. tokenTypes.IDENTIFER = ...) throws
// in strict mode instead of silently adding a bogus type.
const tokenTypes = Object.freeze({
  IDENTIFIER: 'IDENTIFIER',
  TYPE: 'TYPE',
  NUMBER: 'NUMBER',
  STRING: 'STRING',
  ARROW: 'ARROW',
  COLON: 'COLON',
  SEMICOLON: 'SEMICOLON',
  COMMA: 'COMMA',
  KEYWORD: 'KEYWORD',
  OPERATOR: 'OPERATOR',
  LPAREN: 'LPAREN',
  RPAREN: 'RPAREN',
  DOT: 'DOT',
  LBRACKET: 'LBRACKET',
  RBRACKET: 'RBRACKET',
  LBRACE: 'LBRACE',
  RBRACE: 'RBRACE',
  EOF: 'EOF',
});

// Reserved words (checked case-sensitively by the IDENTIFIER pattern below).
const keywords = new Set(['when', 'is', 'then', 'if', 'Ok', 'Err', 'true', 'false', 'PI', 'INFINITY', 'and', 'or', 'xor']);
// Built-in type names, lexed as TYPE tokens rather than identifiers.
const types = new Set(['Int', 'String', 'Result', 'Float', 'Number', 'List', 'Table', 'Bool']);

// Escape sequences recognized inside string literals. Unknown escapes
// (e.g. "\q") are preserved verbatim, backslash included.
const STRING_ESCAPES = {
  n: '\n',
  t: '\t',
  r: '\r',
  '\\': '\\',
  '"': '"',
};

/**
 * Token pattern definitions, each with an anchored regex and an optional
 * processing function.
 *
 * Order matters: patterns are tried top to bottom, so multi-character
 * operators must precede their single-character prefixes, and NUMBER must
 * precede SINGLE_CHAR_OPS so "-1" lexes as a negative literal.
 */
const TOKEN_PATTERNS = [
  // Whitespace (skip)
  {
    name: 'WHITESPACE',
    regex: /^[ \t\r]+/,
    skip: true
  },

  // Newlines (skipped here; line counting happens in the lexer's advance())
  {
    name: 'NEWLINE',
    regex: /^\n/,
    skip: true
  },

  // Line comments (skip)
  {
    name: 'COMMENT',
    regex: /^\/\/.*$/m,
    skip: true
  },

  // Multi-character operators (longest first)
  {
    name: 'ARROW',
    regex: /^->/,
    type: tokenTypes.ARROW
  },

  {
    name: 'STRING_CONCAT',
    regex: /^\.\./,
    type: tokenTypes.OPERATOR,
    value: '..'
  },

  {
    name: 'COMPARISON_OPS',
    regex: /^(>=|<=|!=)/,
    type: tokenTypes.OPERATOR
  },

  // Numbers. NOTE(review): the leading '-' is consumed by the literal, so
  // "a-1" lexes as IDENTIFIER, NUMBER(-1) rather than a subtraction -
  // confirm the parser compensates for this.
  {
    name: 'NUMBER',
    regex: /^-?\d+(\.\d+)?/,
    type: tokenTypes.NUMBER,
    process: (match, lexer) => {
      const value = parseFloat(match[0]);
      const isFloat = match[0].includes('.');
      return {
        type: tokenTypes.NUMBER,
        value,
        isFloat,
        originalString: match[0]
      };
    }
  },

  // String literals with escape-sequence handling
  {
    name: 'STRING',
    regex: /^"((?:[^"\\]|\\.)*)"/,
    type: tokenTypes.STRING,
    process: (match, lexer) => {
      // Decode escapes in a single left-to-right pass. (Chained .replace()
      // calls were buggy: the \n replacement ran before \\, so the source
      // escape \\n decoded to backslash+newline instead of the two literal
      // characters "\" and "n".)
      const value = match[1].replace(/\\(.)/g, (whole, ch) =>
        STRING_ESCAPES[ch] ?? whole
      );

      return {
        type: tokenTypes.STRING,
        value
      };
    }
  },

  // Identifiers, keywords, and types (disambiguated by lookup)
  {
    name: 'IDENTIFIER',
    regex: /^[a-zA-Z_][a-zA-Z0-9_]*/,
    process: (match, lexer) => {
      const value = match[0];

      if (keywords.has(value)) {
        return { type: tokenTypes.KEYWORD, value };
      } else if (types.has(value)) {
        return { type: tokenTypes.TYPE, value };
      } else {
        return { type: tokenTypes.IDENTIFIER, value };
      }
    }
  },

  // Single-character operators
  {
    name: 'SINGLE_CHAR_OPS',
    regex: /^[+\-*/%=><]/,
    type: tokenTypes.OPERATOR
  },

  // Punctuation, mapped to dedicated token types
  {
    name: 'PUNCTUATION',
    regex: /^[()[\]{}:;,.]/,
    process: (match, lexer) => {
      const char = match[0];
      const typeMap = {
        '(': tokenTypes.LPAREN,
        ')': tokenTypes.RPAREN,
        '[': tokenTypes.LBRACKET,
        ']': tokenTypes.RBRACKET,
        '{': tokenTypes.LBRACE,
        '}': tokenTypes.RBRACE,
        ':': tokenTypes.COLON,
        ';': tokenTypes.SEMICOLON,
        ',': tokenTypes.COMMA,
        '.': tokenTypes.DOT
      };

      return {
        type: typeMap[char],
        value: char
      };
    }
  }
];
+
/**
 * High-performance regex-based lexer (primary implementation).
 *
 * @param {string} input - Source text to tokenize.
 * @returns {{nextToken: () => object, allTokens: () => object[]}}
 *   nextToken() yields one token per call (an EOF token at end of input);
 *   allTokens() tokenizes everything into an array ending with EOF.
 * @throws {LexError} From nextToken()/allTokens() when a character matches
 *   no pattern; includes line/column and fix suggestions.
 */
function createOptimizedLexer(input) {
  let position = 0;
  let line = 1;   // 1-based; incremented by advance() on '\n'
  let column = 1; // 1-based; reset by advance() on '\n'

  function getCurrentLocation() {
    return { line, column };
  }

  // Consume `length` characters, keeping line/column bookkeeping in sync.
  function advance(length) {
    for (let i = 0; i < length; i++) {
      if (input[position + i] === '\n') {
        line++;
        column = 1;
      } else {
        column++;
      }
    }
    position += length;
  }

  function nextToken() {
    // Iterate (rather than recurse) past skipped tokens so that long runs
    // of whitespace/comments cannot overflow the call stack.
    for (;;) {
      if (position >= input.length) {
        return {
          type: tokenTypes.EOF,
          value: '',
          line,
          column
        };
      }

      const remaining = input.slice(position);
      const startLocation = getCurrentLocation();

      // Try each pattern in declaration order (longest operators first).
      let skipped = false;
      for (const pattern of TOKEN_PATTERNS) {
        const match = remaining.match(pattern.regex);
        if (!match) continue;

        // Hook for patterns that need to observe lexer state.
        if (pattern.onMatch) {
          pattern.onMatch({ line, column });
        }

        advance(match[0].length);

        if (pattern.skip) {
          skipped = true;
          break; // restart scanning at the new position
        }

        // Build the token via the pattern's processor, or from the simple
        // type/value defaults. (Processors currently ignore their 2nd arg.)
        const token = pattern.process
          ? pattern.process(match, null)
          : { type: pattern.type, value: pattern.value || match[0] };

        token.line = startLocation.line;
        token.column = startLocation.column;
        return token;
      }
      if (skipped) continue;

      // No pattern matched - raise a helpful error.
      const char = remaining[0];
      const suggestions = [];

      if (char === '"') {
        // A straight quote that failed the STRING pattern means the string
        // literal was never closed (or contains a stray trailing escape).
        suggestions.push('Check for an unterminated string literal');
      } else if (char === '\u201C' || char === '\u201D') {
        suggestions.push('Use straight quotes " instead of curly quotes');
      } else if (char === '\u2013' || char === '\u2014') {
        suggestions.push('Use regular minus - or arrow -> instead of em/en dash');
      } else if (/[^\x00-\x7F]/.test(char)) {
        suggestions.push('Use only ASCII characters in Baba Yaga code');
      } else {
        suggestions.push(`Character "${char}" is not valid in Baba Yaga syntax`);
      }

      throw new LexError(
        `Unexpected character: ${JSON.stringify(char)}`,
        { line, column, length: 1 },
        input,
        suggestions
      );
    }
  }

  // Drain the input into an array of tokens, terminated by the EOF token.
  function allTokens() {
    const tokens = [];
    let token;

    do {
      token = nextToken();
      tokens.push(token);
    } while (token.type !== tokenTypes.EOF);

    return tokens;
  }

  return {
    allTokens,
    nextToken
  };
}
+
/**
 * Create a lexer, preferring the optimized implementation and falling back
 * to the legacy one on construction failure.
 *
 * NOTE(review): createOptimizedLexer only sets up closures - it does not
 * scan the input - so it is unlikely to throw here. Lexing errors surface
 * later from nextToken()/allTokens(), where this catch cannot intercept
 * them; confirm whether the fallback path is still reachable/needed.
 *
 * @param {string} input - Source text to tokenize.
 * @param {boolean} [useOptimized=true] - When false, always use the legacy lexer.
 * @returns {Promise<object>} A lexer exposing nextToken()/allTokens().
 */
async function createLexerWithFallback(input, useOptimized = true) {
  if (!useOptimized) {
    const { createLexer } = await import('../legacy/lexer.js');
    return createLexer(input);
  }

  try {
    return createOptimizedLexer(input);
  } catch (error) {
    // Optimized lexer construction failed; degrade to the legacy one.
    console.warn('Falling back to legacy lexer:', error.message);
    const { createLexer } = await import('../legacy/lexer.js');
    return createLexer(input);
  }
}
+
+// Primary exports (optimized by default)
+export { 
+  createOptimizedLexer as createLexer,  // Main export uses optimized version
+  createOptimizedLexer,
+  createLexerWithFallback,
+  tokenTypes 
+};