about summary refs log tree commit diff stats
path: root/tree-sitter/dsk/dsk-cli/src/utils/inference.ts
diff options
context:
space:
mode:
Diffstat (limited to 'tree-sitter/dsk/dsk-cli/src/utils/inference.ts')
-rw-r--r--tree-sitter/dsk/dsk-cli/src/utils/inference.ts286
1 files changed, 286 insertions, 0 deletions
diff --git a/tree-sitter/dsk/dsk-cli/src/utils/inference.ts b/tree-sitter/dsk/dsk-cli/src/utils/inference.ts
new file mode 100644
index 0000000..f49e176
--- /dev/null
+++ b/tree-sitter/dsk/dsk-cli/src/utils/inference.ts
@@ -0,0 +1,286 @@
+/**
+ * Pattern Inference Engine
+ * 
+ * Infers regular expression patterns from user examples.
+ * Uses an extensible library of common token patterns with solid defaults.
+ */
+
+/**
+ * A named, prioritized regular expression that can be offered as the
+ * inferred pattern for a set of examples.
+ */
+export interface TokenPattern {
+  /** Identifier for the pattern (e.g. 'c_identifier'). */
+  name: string;
+  /** Human-readable explanation of what the pattern accepts. */
+  description: string;
+  /** Expression that inferPattern tests against each example. */
+  regex: RegExp;
+  /** Sample strings; calculateConfidence raises the score when a user example equals one of them. */
+  examples: string[];
+  priority: number; // Higher priority patterns are tried first
+}
+
+/**
+ * Outcome of inferPattern.
+ */
+export interface InferenceResult {
+  /** The selected pattern, or null when no candidate fit the examples. */
+  pattern: TokenPattern | null;
+  confidence: number; // 0-1 score indicating match quality
+  /** Valid examples matched by the selected pattern; empty when pattern is null. */
+  matchedExamples: string[];
+  /** The invalid examples exactly as they were passed to inferPattern. */
+  rejectedExamples: string[];
+}
+
+/**
+ * Default token pattern library with common programming language constructs.
+ *
+ * inferPattern sorts candidates by descending `priority` and returns the
+ * first one that matches every valid example and no invalid example, so the
+ * priority numbers below decide which of several overlapping patterns wins.
+ * Every regex is anchored with ^...$ and therefore has to match the whole
+ * example string.
+ */
+export const DEFAULT_PATTERNS: TokenPattern[] = [
+  // Identifiers
+  // js_identifier (11) is tried before c_identifier (10), camel_identifier (9)
+  // and kebab_identifier (8).
+  {
+    name: 'c_identifier',
+    description: 'C-style identifier (letters, digits, underscore, must start with letter/underscore)',
+    regex: /^[a-zA-Z_][a-zA-Z0-9_]*$/,
+    examples: ['myVar', 'userName', '_private', 'MAX_SIZE'],
+    priority: 10
+  },
+  {
+    name: 'js_identifier',
+    description: 'JavaScript-style identifier (letters, digits, $, _, must start with letter/$/_)',
+    regex: /^[A-Za-z_$][A-Za-z0-9_$]*$/,
+    examples: ['x', 'var1', '$var', '_var', 'Var3', 'BananaFruitStand'],
+    priority: 11
+  },
+  {
+    name: 'kebab_identifier', 
+    description: 'Kebab-case identifier (letters, digits, hyphens)',
+    // The first and last characters are matched by separate classes, so a
+    // single-character example does not match this pattern.
+    regex: /^[a-zA-Z][a-zA-Z0-9-]*[a-zA-Z0-9]$/,
+    examples: ['my-var', 'user-name', 'max-size'],
+    priority: 8
+  },
+  {
+    name: 'camel_identifier',
+    description: 'CamelCase identifier (letters and digits, no separators)',
+    regex: /^[a-zA-Z][a-zA-Z0-9]*$/,
+    examples: ['myVar', 'userName', 'maxSize'],
+    priority: 9
+  },
+
+  // Numbers
+  // number_general (16) accepts everything that integer (15) and float (14)
+  // accept, so those two are only selected when an invalid example rules
+  // number_general out.
+  {
+    name: 'number_general',
+    description: 'Integer or floating point number (optional sign)',
+    regex: /^[+-]?(?:\d*\.\d+|\d+\.\d*|\d+)$/,
+    examples: ['1', '-7', '1.24', '10000', '+0.5', '2.'],
+    priority: 16
+  },
+  {
+    name: 'integer',
+    description: 'Integer number (optional sign, digits)',
+    regex: /^[+-]?\d+$/,
+    examples: ['42', '-17', '+123', '0'],
+    priority: 15
+  },
+  {
+    name: 'float',
+    description: 'Floating point number (optional sign, decimal point)',
+    regex: /^[+-]?\d*\.\d+$/,
+    examples: ['3.14', '-2.5', '+0.123', '.5'],
+    priority: 14
+  },
+  {
+    name: 'scientific',
+    description: 'Scientific notation number',
+    regex: /^[+-]?\d*\.?\d+[eE][+-]?\d+$/,
+    examples: ['1e10', '3.14e-2', '-2.5E+3'],
+    priority: 12
+  },
+  {
+    name: 'hex_number',
+    description: 'Hexadecimal number (0x prefix)',
+    regex: /^0[xX][0-9a-fA-F]+$/,
+    examples: ['0xFF', '0x123ABC', '0X00'],
+    priority: 13
+  },
+
+  // Strings
+  // NOTE(review): `.*` is greedy and does not stop at an inner quote, so an
+  // example such as `"a" + "b"` also matches double_quoted_string. `.` also
+  // never matches a line terminator, so multi-line examples are rejected.
+  // Confirm both are intended.
+  {
+    name: 'double_quoted_string',
+    description: 'Double-quoted string literal',
+    regex: /^".*"$/,
+    examples: ['"hello"', '"world"', '""'],
+    priority: 11
+  },
+  {
+    name: 'single_quoted_string',
+    description: 'Single-quoted string literal',
+    regex: /^'.*'$/,
+    examples: ["'hello'", "'world'", "''"],
+    priority: 11
+  },
+  {
+    name: 'backtick_string',
+    description: 'Backtick-quoted string literal (template strings)',
+    regex: /^`.*`$/,
+    examples: ['`hello`', '`world ${var}`', '``'],
+    priority: 7
+  },
+
+  // Comments
+  // Each pattern accepts any text after its prefix, up to the end of the line.
+  {
+    name: 'c_line_comment',
+    description: 'C-style line comment (// prefix)',
+    regex: /^\/\/.*$/,
+    examples: ['// comment', '// TODO: fix this'],
+    priority: 16
+  },
+  {
+    name: 'hash_line_comment',
+    description: 'Hash line comment (# prefix)',
+    regex: /^#.*$/,
+    examples: ['# comment', '# TODO: fix this'],
+    priority: 16
+  },
+  {
+    name: 'semicolon_line_comment',
+    description: 'Semicolon line comment (; prefix)',
+    regex: /^;.*$/,
+    examples: ['; comment', '; TODO: fix this'],
+    priority: 16
+  },
+
+  // Special patterns
+  // Priority 17 is the highest in this table, so these keyword literals are
+  // tried before the identifier patterns that would also match them.
+  {
+    name: 'boolean',
+    description: 'Boolean literal',
+    regex: /^(true|false)$/,
+    examples: ['true', 'false'],
+    priority: 17
+  },
+  {
+    name: 'null_literal',
+    description: 'Null/nil literal',
+    regex: /^(null|nil|None|undefined)$/,
+    examples: ['null', 'nil', 'None', 'undefined'],
+    priority: 17
+  }
+];
+
+/**
+ * Infer a pattern from valid and invalid examples.
+ *
+ * Candidates are the custom patterns followed by DEFAULT_PATTERNS, ordered by
+ * descending priority (ties keep that custom-first order because the sort is
+ * stable). The first candidate that matches every valid example and none of
+ * the invalid ones is returned.
+ *
+ * @param validExamples - Strings the pattern must accept; when empty, no
+ *   pattern is inferred.
+ * @param invalidExamples - Strings the pattern must reject.
+ * @param customPatterns - Extra candidates considered alongside the defaults.
+ * @returns The chosen pattern with its confidence, or `pattern: null` and
+ *   `confidence: 0` when nothing fits. `rejectedExamples` is always the
+ *   `invalidExamples` array that was passed in.
+ */
+export function inferPattern(
+  validExamples: string[],
+  invalidExamples: string[] = [],
+  customPatterns: TokenPattern[] = []
+): InferenceResult {
+  if (validExamples.length === 0) {
+    return {
+      pattern: null,
+      confidence: 0,
+      matchedExamples: [],
+      rejectedExamples: invalidExamples
+    };
+  }
+
+  // A RegExp carrying the `g` or `y` flag remembers `lastIndex` between
+  // test() calls, which would make the verdict for one example depend on the
+  // example tested before it. Resetting `lastIndex` before each call makes
+  // every check start at index 0; resetting it afterwards leaves the caller's
+  // regex in a clean state.
+  const matches = (regex: RegExp, example: string): boolean => {
+    regex.lastIndex = 0;
+    const result = regex.test(example);
+    regex.lastIndex = 0;
+    return result;
+  };
+
+  // Combine default patterns with custom patterns (copy, so neither input
+  // array is reordered by the sort).
+  const candidates = [...customPatterns, ...DEFAULT_PATTERNS]
+    .sort((a, b) => b.priority - a.priority);
+
+  for (const pattern of candidates) {
+    // Pattern must match ALL valid examples...
+    if (!validExamples.every(example => matches(pattern.regex, example))) {
+      continue;
+    }
+    // ...and NO invalid examples.
+    if (invalidExamples.some(example => matches(pattern.regex, example))) {
+      continue;
+    }
+
+    return {
+      pattern,
+      confidence: calculateConfidence(validExamples, invalidExamples, pattern),
+      // Every valid example matched, so the matched list is a copy of the input.
+      matchedExamples: [...validExamples],
+      rejectedExamples: invalidExamples
+    };
+  }
+
+  // No pattern found
+  return {
+    pattern: null,
+    confidence: 0,
+    matchedExamples: [],
+    rejectedExamples: invalidExamples
+  };
+}
+
+/**
+ * Score how trustworthy a successful pattern match is.
+ *
+ * Starts from 0.8 and adds bonuses for a larger set of valid examples, for
+ * the presence of invalid examples, and for each valid example that is
+ * literally one of the pattern's own examples. The result is capped at 1.0.
+ *
+ * @param validExamples - Examples the pattern matched.
+ * @param invalidExamples - Examples the pattern rejected.
+ * @param pattern - The pattern that was selected.
+ * @returns A score between 0.8 and 1.0 inclusive.
+ */
+function calculateConfidence(
+  validExamples: string[],
+  invalidExamples: string[],
+  pattern: TokenPattern
+): number {
+  const sampleCount = validExamples.length;
+
+  // Bonuses are added one at a time, in a fixed order, so the floating-point
+  // result does not depend on how the expression is grouped.
+  let score = 0.8;
+  score += sampleCount >= 3 ? 0.1 : 0;
+  score += sampleCount >= 5 ? 0.05 : 0;
+  score += invalidExamples.length > 0 ? 0.05 : 0;
+
+  // Count the user examples that coincide with the pattern's built-in ones.
+  let canonicalHits = 0;
+  for (const sample of validExamples) {
+    if (pattern.examples.includes(sample)) {
+      canonicalHits += 1;
+    }
+  }
+  score += 0.05 * canonicalHits;
+
+  return score > 1.0 ? 1.0 : score;
+}
+
+/**
+ * Build a fallback regex source that accepts exactly the given examples.
+ *
+ * Used when no library pattern fits: every valid example is escaped so it is
+ * matched literally, and the escaped strings are joined into one anchored
+ * alternation.
+ *
+ * @param validExamples - Strings the resulting pattern must accept.
+ * @param invalidExamples - Accepted for signature compatibility; not consulted.
+ * @returns The regex source, or '' when there are no valid examples.
+ */
+export function generateCustomPattern(
+  validExamples: string[],
+  invalidExamples: string[] = []
+): string {
+  if (validExamples.length === 0) {
+    return '';
+  }
+
+  // Neutralise every regex metacharacter so each example matches only itself.
+  const literalBranches: string[] = [];
+  for (const sample of validExamples) {
+    literalBranches.push(sample.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
+  }
+
+  const alternation = literalBranches.join('|');
+  return '^(' + alternation + ')$';
+}
+
+/**
+ * Check a regex source string against examples it should and should not match.
+ *
+ * @param pattern - Regular expression source, compiled without flags.
+ * @param validExamples - Strings the pattern is expected to match.
+ * @param invalidExamples - Strings the pattern is expected to reject.
+ * @returns `isValid` is true when no problems were found; `errors` lists one
+ *   message per failing example (valid examples first), or a single message
+ *   when the pattern cannot be compiled.
+ */
+export function validatePattern(
+  pattern: string,
+  validExamples: string[],
+  invalidExamples: string[] = []
+): { isValid: boolean; errors: string[] } {
+  const problems: string[] = [];
+
+  try {
+    // Compilation happens inside the try so a malformed pattern is reported
+    // as an error entry instead of propagating to the caller.
+    const compiled = new RegExp(pattern);
+
+    // Examples that should be accepted
+    for (let i = 0; i < validExamples.length; i++) {
+      const sample = validExamples[i];
+      if (compiled.test(sample)) continue;
+      problems.push(`Pattern does not match valid example: "${sample}"`);
+    }
+
+    // Examples that should be refused
+    for (let i = 0; i < invalidExamples.length; i++) {
+      const sample = invalidExamples[i];
+      if (!compiled.test(sample)) continue;
+      problems.push(`Pattern incorrectly matches invalid example: "${sample}"`);
+    }
+  } catch (failure) {
+    const reason = failure instanceof Error ? failure.message : 'Unknown error';
+    problems.push(`Invalid regular expression: ${reason}`);
+  }
+
+  return { isValid: problems.length === 0, errors: problems };
+}