about summary refs log tree commit diff stats
path: root/bash/talk-to-computer/rag_config.sh
diff options
context:
space:
mode:
Diffstat (limited to 'bash/talk-to-computer/rag_config.sh')
-rw-r--r-- bash/talk-to-computer/rag_config.sh 118
1 files changed, 118 insertions, 0 deletions
diff --git a/bash/talk-to-computer/rag_config.sh b/bash/talk-to-computer/rag_config.sh
new file mode 100644
index 0000000..27b724b
--- /dev/null
+++ b/bash/talk-to-computer/rag_config.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+# RAG (Retrieval-Augmented Generation) Configuration
+# This file configures the RAG system for corpus-based knowledge augmentation
+
+# --- Corpus Configuration (relative paths; they resolve against the current working directory) ---
+CORPUS_DIR="corpus"
+CORPUS_REGISTRY="${CORPUS_DIR}/corpus_registry.txt"
+CORPUS_CACHE_FILE="${CORPUS_DIR}/.corpus_cache"
+CORPUS_CACHE_TTL=3600  # Cache TTL in seconds (1 hour)
+
+# --- Search Configuration ---
+MAX_SEARCH_RESULTS=5
+MIN_CONTENT_LENGTH=50  # NOTE(review): unit (characters vs. bytes) is not stated here — confirm with the consumer
+MAX_CONTENT_LENGTH=5000  # NOTE(review): same unit question as MIN_CONTENT_LENGTH
+SEARCH_CONTEXT_LINES=3  # Lines of context around search matches
+
+# --- Topic Classification ---
+# Keywords that trigger specific topic matching (format: topic|space-separated keywords, one topic per line)
+TOPIC_KEYWORDS_FILE="${CORPUS_DIR}/.topic_keywords"
+
+# Seed the topic keywords file with defaults if it doesn't exist (the redirect fails when $CORPUS_DIR is missing)
+if [ ! -f "$TOPIC_KEYWORDS_FILE" ]; then
+    cat > "$TOPIC_KEYWORDS_FILE" << 'EOF'
+programming|bash shell scripting code algorithm programming software development
+lil|decker lil language terse programming scripting deck
+science|physics chemistry biology research scientific experiment
+physics|quantum relativity mechanics thermodynamics energy force
+literature|book author writing novel poem analysis criticism
+general|knowledge fact information general misc miscellaneous
+EOF
+fi
+
+# --- File Processing ---
+# Supported file extensions and their processing commands (format: ext|command, one entry per line)
+FILE_PROCESSORS_FILE="${CORPUS_DIR}/.file_processors"
+
+# Seed the file processors list with defaults if it doesn't exist (the redirect fails when $CORPUS_DIR is missing)
+if [ ! -f "$FILE_PROCESSORS_FILE" ]; then
+    cat > "$FILE_PROCESSORS_FILE" << 'EOF'
+txt|cat
+md|cat
+html|cat
+EOF
+fi
+
+# --- Search Tools ---
+# Commands used for searching different file types. NOTE(review): the escaped quotes inside GREP_CMD stay literal under a plain $GREP_CMD expansion (only eval re-parses them) — confirm how consumers invoke it
+GREP_CMD="grep -r -i --include=\"*.txt\" --include=\"*.md\" --include=\"*.html\""
+SED_CMD="sed"
+AWK_CMD="awk"
+
+# --- RAG Behavior ---
+RAG_ENABLED=true  # stored as the plain string "true"
+RAG_CONFIDENCE_THRESHOLD=0.7  # Minimum confidence to trigger RAG (fractional, so bash integer arithmetic cannot compare it directly)
+RAG_MAX_CONTEXT_LENGTH=4000  # Maximum context to include in prompt
+RAG_CACHE_ENABLED=true  # stored as the plain string "true"
+
+# --- Debug and Logging ---
+RAG_DEBUG=false  # stored as the plain string "false"
+RAG_LOG_FILE="logs/rag_system.log"  # relative path; nothing in this file creates the logs/ directory
+
+# --- Utility Functions ---
+
+# Check that the RAG prerequisites exist; prints a report to stdout and returns 0 if all are present, 1 otherwise
+check_rag_system() {
+    local issues=()  # one human-readable message per missing prerequisite
+
+    # Check if corpus directory exists
+    if [ ! -d "$CORPUS_DIR" ]; then
+        issues+=("Corpus directory not found: $CORPUS_DIR")
+    fi
+
+    # Check if registry exists
+    if [ ! -f "$CORPUS_REGISTRY" ]; then
+        issues+=("Corpus registry not found: $CORPUS_REGISTRY")
+    fi
+
+    # Check if corpus manager exists (looked up relative to the current working directory)
+    if [ ! -f "corpus_manager.sh" ]; then
+        issues+=("Corpus manager not found: corpus_manager.sh")
+    fi
+
+    # Report: list every collected issue and fail, or confirm that nothing is missing
+    if [ ${#issues[@]} -gt 0 ]; then
+        echo "❌ RAG System Issues Found:"
+        for issue in "${issues[@]}"; do
+            echo "   - $issue"
+        done
+        return 1
+    else
+        echo "✅ RAG System is properly configured"
+        return 0
+    fi
+}
+
+# Print corpus statistics to stdout, or an error line if the registry file is missing (the exit status does not signal that case)
+get_corpus_stats() {
+    if [ -f "$CORPUS_REGISTRY" ]; then
+        local topic_count=$(grep -c "|" "$CORPUS_REGISTRY")  # number of registry lines containing "|", reported as topics
+        local file_count=$(find "$CORPUS_DIR" -type f \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) 2>/dev/null | wc -l)  # .txt/.md/.html files anywhere under $CORPUS_DIR; find errors are silenced
+        echo "📊 Corpus Statistics:"
+        echo "   Topics: $topic_count"
+        echo "   Files: $file_count"
+    else
+        echo "❌ No corpus registry found"
+    fi
+}
+
+# Export configuration for use by other scripts (TOPIC_KEYWORDS_FILE and FILE_PROCESSORS_FILE are not in these lists)
+export CORPUS_DIR CORPUS_REGISTRY CORPUS_CACHE_FILE CORPUS_CACHE_TTL
+export MAX_SEARCH_RESULTS MIN_CONTENT_LENGTH MAX_CONTENT_LENGTH SEARCH_CONTEXT_LINES
+export RAG_ENABLED RAG_CONFIDENCE_THRESHOLD RAG_MAX_CONTEXT_LENGTH RAG_CACHE_ENABLED
+export RAG_DEBUG RAG_LOG_FILE
+export GREP_CMD SED_CMD AWK_CMD
+
+# Make utility functions available to child bash processes (export -f is a bash-specific feature)
+export -f check_rag_system get_corpus_stats