// lexer-optimized.js - High-performance regex-based lexer

import { LexError, ErrorHelpers } from './error.js';

const tokenTypes = {
  IDENTIFIER: 'IDENTIFIER',
  TYPE: 'TYPE',
  NUMBER: 'NUMBER',
  STRING: 'STRING',
  ARROW: 'ARROW',
  COLON: 'COLON',
  SEMICOLON: 'SEMICOLON',
  COMMA: 'COMMA',
  KEYWORD: 'KEYWORD',
  OPERATOR: 'OPERATOR',
  LPAREN: 'LPAREN',
  RPAREN: 'RPAREN',
  DOT: 'DOT',
  LBRACKET: 'LBRACKET',
  RBRACKET: 'RBRACKET',
  LBRACE: 'LBRACE',
  RBRACE: 'RBRACE',
  EOF: 'EOF',
};

const keywords = new Set([
  'when', 'is', 'then', 'if', 'Ok', 'Err', 'true', 'false',
  'PI', 'INFINITY', 'and', 'or', 'xor',
]);

const types = new Set(['Int', 'String', 'Result', 'Float', 'Number', 'List', 'Table', 'Bool']);

/**
 * Token pattern definitions with regex and processing functions.
 * Order matters: multi-character operators must precede their
 * single-character prefixes. Regex literals are compiled once when the
 * module loads, so no per-lexer "pre-compilation" step is needed.
 */
const TOKEN_PATTERNS = [
  // Whitespace (skip)
  { name: 'WHITESPACE', regex: /^[ \t\r]+/, skip: true },

  // Newlines (skip; line numbers are tracked by advance())
  { name: 'NEWLINE', regex: /^\n/, skip: true },

  // Comments (skip)
  { name: 'COMMENT', regex: /^\/\/.*$/m, skip: true },

  // Multi-character operators (longest first)
  { name: 'ARROW', regex: /^->/, type: tokenTypes.ARROW },
  { name: 'STRING_CONCAT', regex: /^\.\./, type: tokenTypes.OPERATOR, value: '..' },
  { name: 'COMPARISON_OPS', regex: /^(>=|<=|!=)/, type: tokenTypes.OPERATOR },

  // Numbers. The leading `-?` means `1-2` lexes as NUMBER(1), NUMBER(-2):
  // the lexer has no context here, so the parser must account for that case.
  {
    name: 'NUMBER',
    regex: /^-?\d+(\.\d+)?/,
    type: tokenTypes.NUMBER,
    process: (match) => {
      const value = parseFloat(match[0]);
      const isFloat = match[0].includes('.');
      return { type: tokenTypes.NUMBER, value, isFloat, originalString: match[0] };
    },
  },

  // Strings with escape-sequence handling. Escapes are decoded in a single
  // pass: chained .replace() calls would corrupt input such as `\\n`
  // (backslash, backslash, n), producing a backslash plus a newline instead
  // of a literal backslash followed by `n`.
  {
    name: 'STRING',
    regex: /^"((?:[^"\\]|\\.)*)"/,
    type: tokenTypes.STRING,
    process: (match) => {
      const escapes = { n: '\n', t: '\t', r: '\r', '\\': '\\', '"': '"' };
      // Unknown escapes are preserved verbatim.
      const value = match[1].replace(/\\(.)/g, (_, c) => escapes[c] ?? '\\' + c);
      return { type: tokenTypes.STRING, value };
    },
  },

  // Identifiers, keywords, and types
  {
    name: 'IDENTIFIER',
    regex: /^[a-zA-Z_][a-zA-Z0-9_]*/,
    process: (match) => {
      const value = match[0];
      if (keywords.has(value)) return { type: tokenTypes.KEYWORD, value };
      if (types.has(value)) return { type: tokenTypes.TYPE, value };
      return { type: tokenTypes.IDENTIFIER, value };
    },
  },

  // Single-character operators
  { name: 'SINGLE_CHAR_OPS', regex: /^[+\-*/%=><]/, type: tokenTypes.OPERATOR },

  // Punctuation
  {
    name: 'PUNCTUATION',
    regex: /^[()[\]{}:;,.]/,
    process: (match) => {
      const typeMap = {
        '(': tokenTypes.LPAREN, ')': tokenTypes.RPAREN,
        '[': tokenTypes.LBRACKET, ']': tokenTypes.RBRACKET,
        '{': tokenTypes.LBRACE, '}': tokenTypes.RBRACE,
        ':': tokenTypes.COLON, ';': tokenTypes.SEMICOLON,
        ',': tokenTypes.COMMA, '.': tokenTypes.DOT,
      };
      return { type: typeMap[match[0]], value: match[0] };
    },
  },
];

/**
 * High-performance regex-based lexer
 */
function createOptimizedLexer(input) {
  let position = 0;
  let line = 1;
  let column = 1;

  function getCurrentLocation() {
    return { line, column };
  }

  function advance(length) {
    for (let i = 0; i < length; i++) {
      if (input[position + i] === '\n') {
        line++;
        column = 1;
      } else {
        column++;
      }
    }
    position += length;
  }

  function nextToken() {
    // Iterate rather than recurse so long runs of skipped tokens
    // (whitespace, comments) cannot overflow the call stack.
    scan: while (true) {
      if (position >= input.length) {
        return { type: tokenTypes.EOF, value: '', line, column };
      }

      const remaining = input.slice(position);
      const startLocation = getCurrentLocation();

      // Try each pattern in order
      for (const pattern of TOKEN_PATTERNS) {
        const match = remaining.match(pattern.regex);
        if (!match) continue;

        // Optional hook for patterns that need to observe lexer state
        if (pattern.onMatch) {
          pattern.onMatch({ line, column });
        }

        advance(match[0].length);

        // Skipped tokens: rescan from the new position
        if (pattern.skip) continue scan;

        const token = pattern.process
          ? pattern.process(match)
          : { type: pattern.type, value: pattern.value ?? match[0] };

        // Add location information
        token.line = startLocation.line;
        token.column = startLocation.column;
        return token;
      }

      // No pattern matched - report the offending character with a hint
      const char = remaining[0];
      const suggestions = [];
      if (char === '\u201C' || char === '\u201D') {
        suggestions.push('Use straight quotes " instead of curly quotes');
      } else if (char === '\u2013' || char === '\u2014') {
        suggestions.push('Use regular minus - or arrow -> instead of em/en dash');
      } else if (/[^\x00-\x7F]/.test(char)) {
        suggestions.push('Use only ASCII characters in Baba Yaga code');
      } else {
        suggestions.push(`Character "${char}" is not valid in Baba Yaga syntax`);
      }

      throw new LexError(
        `Unexpected character: ${JSON.stringify(char)}`,
        { line, column, length: 1 },
        input,
        suggestions
      );
    }
  }

  function allTokens() {
    const tokens = [];
    let token;
    do {
      token = nextToken();
      tokens.push(token);
    } while (token.type !== tokenTypes.EOF);
    return tokens;
  }

  return { allTokens, nextToken };
}
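// Usage sketch. The snippet below is a hypothetical Baba Yaga fragment,
// assumed only to exercise the patterns above; the expected token shapes
// follow from the pattern table and the location tracking in advance():
//
//   const lexer = createOptimizedLexer('greet : String -> String;');
//   lexer.allTokens();
//   // => [
//   //   { type: 'IDENTIFIER', value: 'greet',  line: 1, column: 1 },
//   //   { type: 'COLON',      value: ':',      line: 1, column: 7 },
//   //   { type: 'TYPE',       value: 'String', line: 1, column: 9 },
//   //   { type: 'ARROW',      value: '->',     line: 1, column: 16 },
//   //   { type: 'TYPE',       value: 'String', line: 1, column: 19 },
//   //   { type: 'SEMICOLON',  value: ';',      line: 1, column: 25 },
//   //   { type: 'EOF',        value: '',       line: 1, column: 26 },
//   // ]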
/**
 * Lexer factory with fallback to the original implementation. Tokenization
 * happens eagerly so that lex errors surface inside the try block; wrapping
 * only lexer construction (which never throws) would leave the fallback
 * path unreachable.
 */
async function createLexerWithFallback(input, useOptimized = true) {
  if (useOptimized) {
    try {
      const tokens = createOptimizedLexer(input).allTokens();
      let index = 0;
      return {
        allTokens: () => tokens,
        // Keep returning the final EOF token once the stream is exhausted
        nextToken: () => tokens[Math.min(index++, tokens.length - 1)],
      };
    } catch (error) {
      // If the optimized lexer fails, fall back to the original
      console.warn('Falling back to original lexer:', error.message);
      const { createLexer } = await import('./lexer.js');
      return createLexer(input);
    }
  }
  const { createLexer } = await import('./lexer.js');
  return createLexer(input);
}

/**
 * Benchmark function to compare lexer performance
 */
async function benchmarkLexers(input, iterations = 1000) {
  console.log(`Benchmarking lexers with ${iterations} iterations...`);

  const { createLexer } = await import('./lexer.js');

  // Warm up both lexers so JIT compilation does not skew either timing
  for (let i = 0; i < 10; i++) {
    createOptimizedLexer(input).allTokens();
    createLexer(input).allTokens();
  }

  // Benchmark optimized lexer
  const optimizedStart = performance.now();
  for (let i = 0; i < iterations; i++) {
    createOptimizedLexer(input).allTokens();
  }
  const optimizedTime = performance.now() - optimizedStart;

  // Benchmark original lexer
  const originalStart = performance.now();
  for (let i = 0; i < iterations; i++) {
    createLexer(input).allTokens();
  }
  const originalTime = performance.now() - originalStart;

  console.log(`Original lexer: ${originalTime.toFixed(2)}ms`);
  console.log(`Optimized lexer: ${optimizedTime.toFixed(2)}ms`);
  console.log(`Speedup: ${(originalTime / optimizedTime).toFixed(2)}x`);

  return { originalTime, optimizedTime, speedup: originalTime / optimizedTime };
}

export { createOptimizedLexer, createLexerWithFallback, benchmarkLexers, tokenTypes };
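// Benchmark sketch for Node (assumes a Baba Yaga source file on disk; the
// path below is hypothetical):
//
//   import { readFileSync } from 'node:fs';
//   import { benchmarkLexers, createLexerWithFallback } from './lexer-optimized.js';
//
//   const source = readFileSync('./examples/sample.by', 'utf8'); // hypothetical path
//   await benchmarkLexers(source, 500);
//   const lexer = await createLexerWithFallback(source);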