/**
 * Pattern Inference Engine
 *
 * Infers regular expression patterns from user examples.
 * Uses an extensible library of common token patterns with solid defaults.
 */

/** A named, prioritised candidate pattern that examples are tested against. */
export interface TokenPattern {
  /** Unique, machine-friendly identifier of the pattern. */
  name: string;
  /** Human-readable explanation of what the pattern accepts. */
  description: string;
  /** Regular expression used to test a whole example string. */
  regex: RegExp;
  /** Sample strings for this pattern; also used to boost the confidence score. */
  examples: string[];
  /** Higher priority patterns are tried first. */
  priority: number;
}

/** Outcome of {@link inferPattern}. */
export interface InferenceResult {
  /** The selected pattern, or `null` when no candidate fits the examples. */
  pattern: TokenPattern | null;
  /** 0-1 score indicating match quality (0 when no pattern was found). */
  confidence: number;
  /** Valid examples accepted by the selected pattern (empty when none was found). */
  matchedExamples: string[];
  /** The invalid examples passed in by the caller. */
  rejectedExamples: string[];
}

/**
 * Default token pattern library with common programming language constructs.
 *
 * Several entries overlap (for example `js_identifier`, priority 11, accepts
 * everything `c_identifier`, priority 10, accepts). Because candidates are
 * tried in descending priority, the lower-priority entry of such a pair is
 * only selected when an invalid example rules out the higher-priority one.
 */
export const DEFAULT_PATTERNS: TokenPattern[] = [
  // Identifiers
  {
    name: 'c_identifier',
    description: 'C-style identifier (letters, digits, underscore, must start with letter/underscore)',
    regex: /^[a-zA-Z_][a-zA-Z0-9_]*$/,
    examples: ['myVar', 'userName', '_private', 'MAX_SIZE'],
    priority: 10
  },
  {
    name: 'js_identifier',
    description: 'JavaScript-style identifier (letters, digits, $, _, must start with letter/$/_)',
    regex: /^[A-Za-z_$][A-Za-z0-9_$]*$/,
    examples: ['x', 'var1', '$var', '_var', 'Var3', 'BananaFruitStand'],
    priority: 11
  },
  {
    name: 'kebab_identifier',
    description: 'Kebab-case identifier (letters, digits, hyphens)',
    regex: /^[a-zA-Z][a-zA-Z0-9-]*[a-zA-Z0-9]$/,
    examples: ['my-var', 'user-name', 'max-size'],
    priority: 8
  },
  {
    name: 'camel_identifier',
    description: 'CamelCase identifier (letters and digits, no separators)',
    regex: /^[a-zA-Z][a-zA-Z0-9]*$/,
    examples: ['myVar', 'userName', 'maxSize'],
    priority: 9
  },

  // Numbers
  {
    name: 'number_general',
    description: 'Integer or floating point number (optional sign)',
    regex: /^[+-]?(?:\d*\.\d+|\d+\.\d*|\d+)$/,
    examples: ['1', '-7', '1.24', '10000', '+0.5', '2.'],
    priority: 16
  },
  {
    name: 'integer',
    description: 'Integer number (optional sign, digits)',
    regex: /^[+-]?\d+$/,
    examples: ['42', '-17', '+123', '0'],
    priority: 15
  },
  {
    name: 'float',
    description: 'Floating point number (optional sign, decimal point)',
    regex: /^[+-]?\d*\.\d+$/,
    examples: ['3.14', '-2.5', '+0.123', '.5'],
    priority: 14
  },
  {
    name: 'scientific',
    description: 'Scientific notation number',
    regex: /^[+-]?\d*\.?\d+[eE][+-]?\d+$/,
    examples: ['1e10', '3.14e-2', '-2.5E+3'],
    priority: 12
  },
  {
    name: 'hex_number',
    description: 'Hexadecimal number (0x prefix)',
    regex: /^0[xX][0-9a-fA-F]+$/,
    examples: ['0xFF', '0x123ABC', '0X00'],
    priority: 13
  },

  // Strings
  {
    name: 'double_quoted_string',
    description: 'Double-quoted string literal',
    regex: /^".*"$/,
    examples: ['"hello"', '"world"', '""'],
    priority: 11
  },
  {
    name: 'single_quoted_string',
    description: 'Single-quoted string literal',
    regex: /^'.*'$/,
    examples: ["'hello'", "'world'", "''"],
    priority: 11
  },
  {
    name: 'backtick_string',
    description: 'Backtick-quoted string literal (template strings)',
    regex: /^`.*`$/,
    examples: ['`hello`', '`world ${var}`', '``'],
    priority: 7
  },

  // Comments
  {
    name: 'c_line_comment',
    description: 'C-style line comment (// prefix)',
    regex: /^\/\/.*$/,
    examples: ['// comment', '// TODO: fix this'],
    priority: 16
  },
  {
    name: 'hash_line_comment',
    description: 'Hash line comment (# prefix)',
    regex: /^#.*$/,
    examples: ['# comment', '# TODO: fix this'],
    priority: 16
  },
  {
    name: 'semicolon_line_comment',
    description: 'Semicolon line comment (; prefix)',
    regex: /^;.*$/,
    examples: ['; comment', '; TODO: fix this'],
    priority: 16
  },

  // Special patterns
  {
    name: 'boolean',
    description: 'Boolean literal',
    regex: /^(true|false)$/,
    examples: ['true', 'false'],
    priority: 17
  },
  {
    name: 'null_literal',
    description: 'Null/nil literal',
    regex: /^(null|nil|None|undefined)$/,
    examples: ['null', 'nil', 'None', 'undefined'],
    priority: 17
  }
];

/**
 * Test `text` against `regex` without being affected by earlier matches.
 *
 * A RegExp created with the `g` or `y` flag remembers `lastIndex` between
 * `test()` calls, so reusing one object across several strings would start
 * later matches in the middle of the input. Resetting `lastIndex` before the
 * call makes each test independent; resetting it afterwards avoids handing a
 * half-advanced regex back to the caller who supplied it.
 */
function testStateless(regex: RegExp, text: string): boolean {
  regex.lastIndex = 0;
  const matched = regex.test(text);
  regex.lastIndex = 0;
  return matched;
}

/**
 * Infer a pattern from valid and invalid examples.
 *
 * Candidates are the custom patterns followed by {@link DEFAULT_PATTERNS},
 * tried in descending `priority` order. The first candidate that accepts ALL
 * valid examples and NO invalid examples wins.
 *
 * @param validExamples - Strings the inferred pattern must accept.
 * @param invalidExamples - Strings the inferred pattern must reject.
 * @param customPatterns - Extra candidates; on equal priority they are tried
 *   before the defaults (relies on the stable sort guaranteed since ES2019).
 * @returns The winning pattern with a confidence score, or a result with
 *   `pattern: null` and `confidence: 0` when there are no valid examples or
 *   no candidate fits.
 */
export function inferPattern(
  validExamples: string[],
  invalidExamples: string[] = [],
  customPatterns: TokenPattern[] = []
): InferenceResult {
  const noMatch: InferenceResult = {
    pattern: null,
    confidence: 0,
    matchedExamples: [],
    rejectedExamples: invalidExamples
  };

  if (validExamples.length === 0) {
    return noMatch;
  }

  // Spread into a fresh array first so sorting never reorders the caller's
  // array or DEFAULT_PATTERNS.
  const candidates = [...customPatterns, ...DEFAULT_PATTERNS].sort(
    (a, b) => b.priority - a.priority
  );

  for (const pattern of candidates) {
    const acceptsAllValid = validExamples.every(example =>
      testStateless(pattern.regex, example)
    );
    if (!acceptsAllValid) continue;

    const acceptsSomeInvalid = invalidExamples.some(example =>
      testStateless(pattern.regex, example)
    );
    if (acceptsSomeInvalid) continue;

    return {
      pattern,
      confidence: calculateConfidence(validExamples, invalidExamples, pattern),
      // Every valid example matched, so the matched list is a copy of the input.
      matchedExamples: [...validExamples],
      rejectedExamples: invalidExamples
    };
  }

  // No pattern found
  return noMatch;
}

/**
 * Calculate confidence score for a pattern match.
 *
 * Starts at 0.8 and adds bonuses for the amount of evidence; the result is
 * capped at 1.0.
 *
 * @param validExamples - Examples the pattern accepted.
 * @param invalidExamples - Examples the pattern rejected.
 * @param pattern - The pattern that was selected.
 * @returns A score between 0.8 and 1.0.
 */
function calculateConfidence(
  validExamples: string[],
  invalidExamples: string[],
  pattern: TokenPattern
): number {
  let confidence = 0.8; // Base confidence

  // Boost confidence for more valid examples
  if (validExamples.length >= 3) confidence += 0.1;
  if (validExamples.length >= 5) confidence += 0.05;

  // Boost confidence for having invalid examples that were correctly rejected
  if (invalidExamples.length > 0) confidence += 0.05;

  // Boost confidence for each valid example that is literally one of the
  // pattern's own sample strings.
  const patternExampleMatches = validExamples.filter(ex =>
    pattern.examples.includes(ex)
  );
  if (patternExampleMatches.length > 0) {
    confidence += 0.05 * patternExampleMatches.length;
  }

  return Math.min(confidence, 1.0);
}

/**
 * Generate a custom regex pattern from examples (fallback when inference fails).
 *
 * Simple approach: builds an anchored alternation of the literal examples with
 * regex metacharacters escaped. This is a fallback - not as robust as proper
 * pattern matching, since it accepts exactly the given strings and nothing else.
 *
 * @param validExamples - Strings the generated pattern should accept.
 * @param invalidExamples - Accepted for signature symmetry; not read by the
 *   current implementation.
 * @returns The regex source string, or `''` when there are no valid examples.
 */
export function generateCustomPattern(
  validExamples: string[],
  invalidExamples: string[] = []
): string {
  if (validExamples.length === 0) return '';

  const escapedExamples = validExamples.map(ex =>
    ex.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  );

  return `^(${escapedExamples.join('|')})$`;
}

/**
 * Validate that a pattern works correctly with given examples.
 *
 * @param pattern - Regex source string, compiled without flags.
 * @param validExamples - Strings the pattern must match.
 * @param invalidExamples - Strings the pattern must not match.
 * @returns `isValid` is true only when no errors were collected; `errors`
 *   holds one message per failing example, or a single message when the
 *   pattern itself does not compile.
 */
export function validatePattern(
  pattern: string,
  validExamples: string[],
  invalidExamples: string[] = []
): { isValid: boolean; errors: string[] } {
  const errors: string[] = [];

  try {
    const regex = new RegExp(pattern);

    // Check valid examples
    for (const example of validExamples) {
      if (!regex.test(example)) {
        errors.push(`Pattern does not match valid example: "${example}"`);
      }
    }

    // Check invalid examples
    for (const example of invalidExamples) {
      if (regex.test(example)) {
        errors.push(`Pattern incorrectly matches invalid example: "${example}"`);
      }
    }
  } catch (e) {
    // new RegExp throws on malformed source; report it instead of propagating.
    errors.push(`Invalid regular expression: ${e instanceof Error ? e.message : 'Unknown error'}`);
  }

  return {
    isValid: errors.length === 0,
    errors
  };
}