diff options
Diffstat (limited to 'bash/talk-to-computer/rag_config.sh')
-rw-r--r-- | bash/talk-to-computer/rag_config.sh | 118 |
1 files changed, 118 insertions, 0 deletions
#!/bin/bash
# File: bash/talk-to-computer/rag_config.sh
#
# RAG (Retrieval-Augmented Generation) Configuration
# This file configures the RAG system for corpus-based knowledge augmentation.
#
# All paths below are relative, so they resolve against the current working
# directory of whichever shell sources this file.

# --- Corpus Configuration ---
CORPUS_DIR="corpus"
CORPUS_REGISTRY="${CORPUS_DIR}/corpus_registry.txt"
CORPUS_CACHE_FILE="${CORPUS_DIR}/.corpus_cache"
CORPUS_CACHE_TTL=3600 # Cache TTL in seconds (1 hour)

# --- Search Configuration ---
MAX_SEARCH_RESULTS=5
MIN_CONTENT_LENGTH=50
MAX_CONTENT_LENGTH=5000
SEARCH_CONTEXT_LINES=3 # Lines of context around search matches

# --- Topic Classification ---
# Keywords that trigger specific topic matching (format: topic|keywords)
TOPIC_KEYWORDS_FILE="${CORPUS_DIR}/.topic_keywords"

# Seed the topic keywords file if it doesn't exist yet.
# The directory test keeps the redirection from failing (and printing
# "No such file or directory") when the corpus directory is missing;
# check_rag_system reports that condition instead.
if [ -d "$CORPUS_DIR" ] && [ ! -f "$TOPIC_KEYWORDS_FILE" ]; then
  cat > "$TOPIC_KEYWORDS_FILE" << 'EOF'
programming|bash shell scripting code algorithm programming software development
lil|decker lil language terse programming scripting deck
science|physics chemistry biology research scientific experiment
physics|quantum relativity mechanics thermodynamics energy force
literature|book author writing novel poem analysis criticism
general|knowledge fact information general misc miscellaneous
EOF
fi

# --- File Processing ---
# Supported file extensions and their processing commands (format: ext|command)
FILE_PROCESSORS_FILE="${CORPUS_DIR}/.file_processors"

# Seed the file processors table if it doesn't exist yet (same directory
# guard as above).
if [ -d "$CORPUS_DIR" ] && [ ! -f "$FILE_PROCESSORS_FILE" ]; then
  cat > "$FILE_PROCESSORS_FILE" << 'EOF'
txt|cat
md|cat
html|cat
EOF
fi

# --- Search Tools ---
# Commands used for searching different file types.
# NOTE(review): GREP_CMD contains literal double-quote characters. They are
# only removed if the string is re-parsed by the shell (e.g. via eval); a
# plain unquoted $GREP_CMD expansion passes the quotes through to grep as part
# of the --include patterns. Confirm how callers invoke it before changing.
GREP_CMD="grep -r -i --include=\"*.txt\" --include=\"*.md\" --include=\"*.html\""
SED_CMD="sed"
AWK_CMD="awk"

# --- RAG Behavior ---
RAG_ENABLED=true
RAG_CONFIDENCE_THRESHOLD=0.7 # Minimum confidence to trigger RAG
RAG_MAX_CONTEXT_LENGTH=4000  # Maximum context to include in prompt
RAG_CACHE_ENABLED=true

# --- Debug and Logging ---
RAG_DEBUG=false
RAG_LOG_FILE="logs/rag_system.log"

# --- Utility Functions ---

#######################################
# Check if the RAG system is properly configured.
# Globals:   CORPUS_DIR (read), CORPUS_REGISTRY (read)
# Arguments: none
# Outputs:   a status report on stdout (one line per problem found)
# Returns:   0 if the corpus directory, the registry file and
#            ./corpus_manager.sh all exist; 1 otherwise
#######################################
check_rag_system() {
  local issues=()
  local issue

  # Check if corpus directory exists
  if [ ! -d "$CORPUS_DIR" ]; then
    issues+=("Corpus directory not found: $CORPUS_DIR")
  fi

  # Check if registry exists
  if [ ! -f "$CORPUS_REGISTRY" ]; then
    issues+=("Corpus registry not found: $CORPUS_REGISTRY")
  fi

  # Check if corpus manager exists (looked up in the current directory)
  if [ ! -f "corpus_manager.sh" ]; then
    issues+=("Corpus manager not found: corpus_manager.sh")
  fi

  # Report issues
  if [ ${#issues[@]} -gt 0 ]; then
    echo "❌ RAG System Issues Found:"
    for issue in "${issues[@]}"; do
      echo " - $issue"
    done
    return 1
  else
    echo "✅ RAG System is properly configured"
    return 0
  fi
}

#######################################
# Print corpus statistics.
# Globals:   CORPUS_DIR (read), CORPUS_REGISTRY (read)
# Arguments: none
# Outputs:   on stdout, the number of registry lines containing "|" and the
#            number of *.txt / *.md / *.html files under CORPUS_DIR, or an
#            error line when the registry file is absent
#######################################
get_corpus_stats() {
  # Declared separately from the assignments so 'local' does not mask the
  # exit status of the command substitutions.
  local topic_count file_count

  if [ -f "$CORPUS_REGISTRY" ]; then
    # grep -c exits 1 when it counts zero lines but still prints "0"; it
    # prints nothing only when the file cannot be read, hence the default.
    topic_count=$(grep -c "|" "$CORPUS_REGISTRY" 2>/dev/null)
    topic_count=${topic_count:-0}

    file_count=$(find "$CORPUS_DIR" -type f \
      \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) 2>/dev/null | wc -l)
    # BSD/macOS wc pads the number with leading blanks; strip them.
    file_count=${file_count//[[:space:]]/}

    echo "📊 Corpus Statistics:"
    echo " Topics: $topic_count"
    echo " Files: $file_count"
  else
    echo "❌ No corpus registry found"
  fi
}

# Export configuration for use by other scripts
export CORPUS_DIR CORPUS_REGISTRY CORPUS_CACHE_FILE CORPUS_CACHE_TTL
export MAX_SEARCH_RESULTS MIN_CONTENT_LENGTH MAX_CONTENT_LENGTH SEARCH_CONTEXT_LINES
export RAG_ENABLED RAG_CONFIDENCE_THRESHOLD RAG_MAX_CONTEXT_LENGTH RAG_CACHE_ENABLED
export RAG_DEBUG RAG_LOG_FILE
export GREP_CMD SED_CMD AWK_CMD

# Make utility functions available to child bash processes
export -f check_rag_system get_corpus_stats