#!/bin/bash

# RAG (Retrieval-Augmented Generation) Configuration
#
# Configures the RAG system for corpus-based knowledge augmentation.
# Defines and exports the settings below plus two helper functions
# (check_rag_system, get_corpus_stats) for use by other scripts.
#
# All paths are relative to the current working directory.
#
# Side effects at load time: when the corpus directory exists, the
# topic-keyword and file-processor tables are created with default
# contents if they are missing. Existing files are never overwritten.

# --- Corpus Configuration ---
CORPUS_DIR="corpus"
CORPUS_REGISTRY="${CORPUS_DIR}/corpus_registry.txt"
CORPUS_CACHE_FILE="${CORPUS_DIR}/.corpus_cache"
CORPUS_CACHE_TTL=3600 # Cache TTL in seconds (1 hour)

# --- Search Configuration ---
MAX_SEARCH_RESULTS=5
MIN_CONTENT_LENGTH=50
MAX_CONTENT_LENGTH=5000
SEARCH_CONTEXT_LINES=3 # Lines of context around search matches

# --- Topic Classification ---
# Keywords that trigger specific topic matching (format: topic|keywords)
TOPIC_KEYWORDS_FILE="${CORPUS_DIR}/.topic_keywords"

# Initialize the topic keywords file if it doesn't exist.
# The directory test avoids a failed redirect (and its error message) when the
# corpus directory is missing; check_rag_system reports that condition instead.
if [[ -d "$CORPUS_DIR" && ! -f "$TOPIC_KEYWORDS_FILE" ]]; then
  cat > "$TOPIC_KEYWORDS_FILE" << 'EOF'
programming|bash shell scripting code algorithm programming software development
lil|decker lil language terse programming scripting deck
science|physics chemistry biology research scientific experiment
physics|quantum relativity mechanics thermodynamics energy force
literature|book author writing novel poem analysis criticism
general|knowledge fact information general misc miscellaneous
EOF
fi

# --- File Processing ---
# Supported file extensions and their processing commands (format: ext|command)
FILE_PROCESSORS_FILE="${CORPUS_DIR}/.file_processors"

# Initialize the file processors table if it doesn't exist (same directory
# guard as above).
if [[ -d "$CORPUS_DIR" && ! -f "$FILE_PROCESSORS_FILE" ]]; then
  cat > "$FILE_PROCESSORS_FILE" << 'EOF'
txt|cat
md|cat
html|cat
EOF
fi

# --- Search Tools ---
# Commands used for searching different file types.
# NOTE(review): the escaped quotes below are part of the string value. If a
# consumer expands this as a plain $GREP_CMD (without eval), grep receives the
# quote characters literally inside each --include glob. The value is left
# unchanged because the consumers are not visible here -- confirm how it is
# invoked before changing it.
GREP_CMD="grep -r -i --include=\"*.txt\" --include=\"*.md\" --include=\"*.html\""
SED_CMD="sed"
AWK_CMD="awk"

# --- RAG Behavior ---
RAG_ENABLED=true
RAG_CONFIDENCE_THRESHOLD=0.7 # Minimum confidence to trigger RAG
RAG_MAX_CONTEXT_LENGTH=4000  # Maximum context to include in prompt
RAG_CACHE_ENABLED=true

# --- Debug and Logging ---
RAG_DEBUG=false
RAG_LOG_FILE="logs/rag_system.log"

# --- Utility Functions ---

#######################################
# Check if the RAG system is properly configured.
# Globals:   CORPUS_DIR (read), CORPUS_REGISTRY (read)
# Arguments: none
# Outputs:   a success line, or a header followed by one line per problem,
#            on stdout
# Returns:   0 when nothing is missing, 1 otherwise
#######################################
check_rag_system() {
  local -a issues=()
  local issue # keep the loop variable out of the caller's scope

  # Check if corpus directory exists
  if [[ ! -d "$CORPUS_DIR" ]]; then
    issues+=("Corpus directory not found: $CORPUS_DIR")
  fi

  # Check if registry exists
  if [[ ! -f "$CORPUS_REGISTRY" ]]; then
    issues+=("Corpus registry not found: $CORPUS_REGISTRY")
  fi

  # Check if corpus manager exists (looked up in the current directory)
  if [[ ! -f "corpus_manager.sh" ]]; then
    issues+=("Corpus manager not found: corpus_manager.sh")
  fi

  # Report issues
  if (( ${#issues[@]} > 0 )); then
    echo "❌ RAG System Issues Found:"
    for issue in "${issues[@]}"; do
      echo " - $issue"
    done
    return 1
  fi

  echo "✅ RAG System is properly configured"
  return 0
}

#######################################
# Print corpus statistics.
# Topics is the number of registry lines containing "|"; Files is the number
# of *.txt, *.md and *.html files found anywhere under CORPUS_DIR.
# Globals:   CORPUS_DIR (read), CORPUS_REGISTRY (read)
# Arguments: none
# Outputs:   the statistics, or a "not found" line, on stdout
#######################################
get_corpus_stats() {
  if [[ ! -f "$CORPUS_REGISTRY" ]]; then
    echo "❌ No corpus registry found"
    return
  fi

  local topic_count file_count

  # grep -c prints 0 but exits 1 when nothing matches; declaring the variable
  # separately keeps that status visible so it can be handled explicitly.
  topic_count=$(grep -c -F -- "|" "$CORPUS_REGISTRY") || topic_count=0

  # find errors are intentionally discarded; an unreadable subtree just
  # contributes no files to the count.
  file_count=$(find "$CORPUS_DIR" -type f \
    \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) 2>/dev/null | wc -l)
  # Some wc implementations pad the number with blanks; strip them.
  file_count=${file_count//[[:space:]]/}

  echo "📊 Corpus Statistics:"
  echo " Topics: $topic_count"
  echo " Files: $file_count"
}

# Export configuration for use by other scripts
export CORPUS_DIR CORPUS_REGISTRY CORPUS_CACHE_FILE CORPUS_CACHE_TTL
export MAX_SEARCH_RESULTS MIN_CONTENT_LENGTH MAX_CONTENT_LENGTH SEARCH_CONTEXT_LINES
export RAG_ENABLED RAG_CONFIDENCE_THRESHOLD RAG_MAX_CONTEXT_LENGTH RAG_CACHE_ENABLED
export RAG_DEBUG RAG_LOG_FILE
export GREP_CMD SED_CMD AWK_CMD

# Make utility functions available
export -f check_rag_system get_corpus_stats