diff options
Diffstat (limited to 'tree-sitter/dsk/dsk-cli/src/utils/inference.ts')
-rw-r--r-- | tree-sitter/dsk/dsk-cli/src/utils/inference.ts | 286 |
1 files changed, 286 insertions, 0 deletions
/**
 * Pattern Inference Engine
 *
 * Infers regular expression patterns from user examples.
 * Uses an extensible library of common token patterns with solid defaults.
 */

/**
 * A named, prioritized regular expression describing one kind of token.
 */
export interface TokenPattern {
  /** Unique, machine-friendly identifier of the pattern. */
  name: string;
  /** Human-readable explanation of what the pattern accepts. */
  description: string;
  /** Expression an example must match to be accepted by this pattern. */
  regex: RegExp;
  /** Sample inputs that this pattern is expected to match. */
  examples: string[];
  /** Higher priority patterns are tried first. */
  priority: number;
}

/**
 * Outcome of an inference run.
 */
export interface InferenceResult {
  /** The selected pattern, or `null` when no candidate fits the examples. */
  pattern: TokenPattern | null;
  /** 0-1 score indicating match quality (0 when no pattern was found). */
  confidence: number;
  /** Valid examples matched by the selected pattern (empty when none was found). */
  matchedExamples: string[];
  /** The invalid examples that were supplied by the caller. */
  rejectedExamples: string[];
}

/**
 * Default token pattern library with common programming language constructs.
 *
 * Candidates are tried in descending `priority`; entries with equal priority
 * keep the relative order they have in this array.
 */
export const DEFAULT_PATTERNS: TokenPattern[] = [
  // Identifiers
  {
    name: 'c_identifier',
    description: 'C-style identifier (letters, digits, underscore, must start with letter/underscore)',
    regex: /^[a-zA-Z_][a-zA-Z0-9_]*$/,
    examples: ['myVar', 'userName', '_private', 'MAX_SIZE'],
    priority: 10
  },
  {
    name: 'js_identifier',
    description: 'JavaScript-style identifier (letters, digits, $, _, must start with letter/$/_)',
    regex: /^[A-Za-z_$][A-Za-z0-9_$]*$/,
    examples: ['x', 'var1', '$var', '_var', 'Var3', 'BananaFruitStand'],
    priority: 11
  },
  {
    name: 'kebab_identifier',
    description: 'Kebab-case identifier (letters, digits, hyphens)',
    regex: /^[a-zA-Z][a-zA-Z0-9-]*[a-zA-Z0-9]$/,
    examples: ['my-var', 'user-name', 'max-size'],
    priority: 8
  },
  {
    name: 'camel_identifier',
    description: 'CamelCase identifier (letters and digits, no separators)',
    regex: /^[a-zA-Z][a-zA-Z0-9]*$/,
    examples: ['myVar', 'userName', 'maxSize'],
    priority: 9
  },

  // Numbers
  {
    name: 'number_general',
    description: 'Integer or floating point number (optional sign)',
    regex: /^[+-]?(?:\d*\.\d+|\d+\.\d*|\d+)$/,
    examples: ['1', '-7', '1.24', '10000', '+0.5', '2.'],
    priority: 16
  },
  {
    name: 'integer',
    description: 'Integer number (optional sign, digits)',
    regex: /^[+-]?\d+$/,
    examples: ['42', '-17', '+123', '0'],
    priority: 15
  },
  {
    name: 'float',
    description: 'Floating point number (optional sign, decimal point)',
    regex: /^[+-]?\d*\.\d+$/,
    examples: ['3.14', '-2.5', '+0.123', '.5'],
    priority: 14
  },
  {
    name: 'scientific',
    description: 'Scientific notation number',
    regex: /^[+-]?\d*\.?\d+[eE][+-]?\d+$/,
    examples: ['1e10', '3.14e-2', '-2.5E+3'],
    priority: 12
  },
  {
    name: 'hex_number',
    description: 'Hexadecimal number (0x prefix)',
    regex: /^0[xX][0-9a-fA-F]+$/,
    examples: ['0xFF', '0x123ABC', '0X00'],
    priority: 13
  },

  // Strings
  {
    name: 'double_quoted_string',
    description: 'Double-quoted string literal',
    regex: /^".*"$/,
    examples: ['"hello"', '"world"', '""'],
    priority: 11
  },
  {
    name: 'single_quoted_string',
    description: 'Single-quoted string literal',
    regex: /^'.*'$/,
    examples: ["'hello'", "'world'", "''"],
    priority: 11
  },
  {
    name: 'backtick_string',
    description: 'Backtick-quoted string literal (template strings)',
    regex: /^`.*`$/,
    examples: ['`hello`', '`world ${var}`', '``'],
    priority: 7
  },

  // Comments
  {
    name: 'c_line_comment',
    description: 'C-style line comment (// prefix)',
    regex: /^\/\/.*$/,
    examples: ['// comment', '// TODO: fix this'],
    priority: 16
  },
  {
    name: 'hash_line_comment',
    description: 'Hash line comment (# prefix)',
    regex: /^#.*$/,
    examples: ['# comment', '# TODO: fix this'],
    priority: 16
  },
  {
    name: 'semicolon_line_comment',
    description: 'Semicolon line comment (; prefix)',
    regex: /^;.*$/,
    examples: ['; comment', '; TODO: fix this'],
    priority: 16
  },

  // Special patterns
  {
    name: 'boolean',
    description: 'Boolean literal',
    regex: /^(true|false)$/,
    examples: ['true', 'false'],
    priority: 17
  },
  {
    name: 'null_literal',
    description: 'Null/nil literal',
    regex: /^(null|nil|None|undefined)$/,
    examples: ['null', 'nil', 'None', 'undefined'],
    priority: 17
  }
];

/**
 * Test `example` against `regex` without depending on, or disturbing, the
 * regex's `lastIndex`.
 *
 * `RegExp.prototype.test` is stateful for global (`g`) and sticky (`y`)
 * expressions: it starts at `lastIndex` and advances it on success. Reusing
 * such an expression across several examples would otherwise yield spurious
 * non-matches, so the search is forced to start at 0 and the caller's
 * `lastIndex` is restored afterwards.
 */
function matchesExample(regex: RegExp, example: string): boolean {
  if (!regex.global && !regex.sticky) {
    // Stateless expression: `test` neither reads nor writes `lastIndex`.
    return regex.test(example);
  }

  const savedLastIndex = regex.lastIndex;
  regex.lastIndex = 0;
  try {
    return regex.test(example);
  } finally {
    regex.lastIndex = savedLastIndex;
  }
}

/**
 * Infer a pattern from valid and invalid examples.
 *
 * Candidates are the custom patterns followed by {@link DEFAULT_PATTERNS},
 * ordered by descending priority. The first candidate that matches ALL valid
 * examples and NO invalid example wins.
 *
 * @param validExamples - Strings the resulting pattern must accept.
 * @param invalidExamples - Strings the resulting pattern must reject.
 * @param customPatterns - Extra candidates considered alongside the defaults.
 * @returns The winning pattern with its confidence, or a result whose
 *   `pattern` is `null` and `confidence` is 0 when there are no valid
 *   examples or no candidate fits.
 */
export function inferPattern(
  validExamples: string[],
  invalidExamples: string[] = [],
  customPatterns: TokenPattern[] = []
): InferenceResult {
  const noMatch: InferenceResult = {
    pattern: null,
    confidence: 0,
    matchedExamples: [],
    rejectedExamples: invalidExamples
  };

  if (validExamples.length === 0) {
    return noMatch;
  }

  // The spread creates a fresh array, so sorting never mutates the inputs.
  // Custom patterns are listed first so they win priority ties (stable sort).
  const candidates = [...customPatterns, ...DEFAULT_PATTERNS]
    .sort((a, b) => b.priority - a.priority);

  for (const candidate of candidates) {
    const acceptsAllValid = validExamples.every(example =>
      matchesExample(candidate.regex, example)
    );
    if (!acceptsAllValid) continue;

    const acceptsSomeInvalid = invalidExamples.some(example =>
      matchesExample(candidate.regex, example)
    );
    if (acceptsSomeInvalid) continue;

    return {
      pattern: candidate,
      confidence: calculateConfidence(validExamples, invalidExamples, candidate),
      matchedExamples: [...validExamples],
      rejectedExamples: invalidExamples
    };
  }

  return noMatch;
}

/**
 * Calculate confidence score for a pattern match.
 *
 * Starts from a base of 0.8 and adds bonuses for the number of valid
 * examples, for the presence of invalid examples, and for each valid example
 * that is literally one of the pattern's own examples. Capped at 1.0.
 */
function calculateConfidence(
  validExamples: string[],
  invalidExamples: string[],
  pattern: TokenPattern
): number {
  let confidence = 0.8; // Base confidence

  // Boost confidence for more valid examples
  if (validExamples.length >= 3) confidence += 0.1;
  if (validExamples.length >= 5) confidence += 0.05;

  // Boost confidence for having invalid examples that were correctly rejected
  if (invalidExamples.length > 0) confidence += 0.05;

  // Boost confidence if examples match the pattern's own examples
  const ownExampleHits = validExamples.filter(example =>
    pattern.examples.includes(example)
  ).length;
  confidence += 0.05 * ownExampleHits;

  return Math.min(confidence, 1.0);
}

/**
 * Generate a custom regex pattern from examples (fallback when inference fails).
 *
 * The result is an anchored alternation of the literal examples, e.g.
 * `^(foo|bar)$`, with regex metacharacters escaped. Duplicate examples are
 * emitted only once.
 *
 * @param validExamples - Literal strings the pattern must accept.
 * @param invalidExamples - Currently unused by this fallback.
 * @returns The pattern source, or an empty string when there are no valid
 *   examples.
 */
export function generateCustomPattern(
  validExamples: string[],
  invalidExamples: string[] = []
): string {
  if (validExamples.length === 0) return '';

  // Simple approach: create alternation of literal examples
  // This is a fallback - not as robust as proper pattern matching
  const alternatives = Array.from(new Set(validExamples)).map(example =>
    example.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  );

  return `^(${alternatives.join('|')})$`;
}

/**
 * Validate that a pattern works correctly with given examples.
 *
 * @param pattern - Regular expression source, compiled without flags.
 * @param validExamples - Strings the pattern must match.
 * @param invalidExamples - Strings the pattern must not match.
 * @returns `isValid` is true when `errors` is empty. `errors` holds one
 *   message per failing example, or a single message when `pattern` does not
 *   compile.
 */
export function validatePattern(
  pattern: string,
  validExamples: string[],
  invalidExamples: string[] = []
): { isValid: boolean; errors: string[] } {
  const errors: string[] = [];

  try {
    const regex = new RegExp(pattern);

    // Check valid examples
    for (const example of validExamples) {
      if (!regex.test(example)) {
        errors.push(`Pattern does not match valid example: "${example}"`);
      }
    }

    // Check invalid examples
    for (const example of invalidExamples) {
      if (regex.test(example)) {
        errors.push(`Pattern incorrectly matches invalid example: "${example}"`);
      }
    }
  } catch (e) {
    errors.push(`Invalid regular expression: ${e instanceof Error ? e.message : 'Unknown error'}`);
  }

  return {
    isValid: errors.length === 0,
    errors
  };
}