blob: 27b724b58e8dcef324c5b8aacccea9dd17b46455 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
#!/bin/bash
# RAG (Retrieval-Augmented Generation) Configuration
# This file configures the RAG system for corpus-based knowledge augmentation
# --- Corpus Configuration ---
# All corpus paths are relative, so they resolve against the working
# directory of whichever shell sources this file.
CORPUS_DIR="corpus"
CORPUS_REGISTRY="${CORPUS_DIR}/corpus_registry.txt"
CORPUS_CACHE_FILE="${CORPUS_DIR}/.corpus_cache"
CORPUS_CACHE_TTL=3600 # Cache TTL in seconds (1 hour)

# --- Search Configuration ---
MAX_SEARCH_RESULTS=5
MIN_CONTENT_LENGTH=50
MAX_CONTENT_LENGTH=5000
SEARCH_CONTEXT_LINES=3 # Lines of context around search matches

# --- Topic Classification ---
# Keywords that trigger specific topic matching, one topic per line
# (format: topic|keywords, keywords separated by spaces)
TOPIC_KEYWORDS_FILE="${CORPUS_DIR}/.topic_keywords"

# Seed the topic keywords file with defaults if it doesn't exist.
# An existing file is never overwritten.
if [ ! -f "$TOPIC_KEYWORDS_FILE" ]; then
  # The redirection below fails with "No such file or directory" when the
  # corpus directory is missing (e.g. first run), so create it first.
  mkdir -p -- "$CORPUS_DIR"
  cat > "$TOPIC_KEYWORDS_FILE" << 'EOF'
programming|bash shell scripting code algorithm programming software development
lil|decker lil language terse programming scripting deck
science|physics chemistry biology research scientific experiment
physics|quantum relativity mechanics thermodynamics energy force
literature|book author writing novel poem analysis criticism
general|knowledge fact information general misc miscellaneous
EOF
fi

# --- File Processing ---
# Supported file extensions and their processing commands, one per line
# (format: ext|command)
FILE_PROCESSORS_FILE="${CORPUS_DIR}/.file_processors"

# Seed the file processors table with defaults if it doesn't exist.
# An existing file is never overwritten.
if [ ! -f "$FILE_PROCESSORS_FILE" ]; then
  # Same guard as above: the target directory must exist before writing.
  mkdir -p -- "$CORPUS_DIR"
  cat > "$FILE_PROCESSORS_FILE" << 'EOF'
txt|cat
md|cat
html|cat
EOF
fi

# --- Search Tools ---
# Commands used for searching different file types.
# NOTE(review): GREP_CMD stores literal double quotes around each --include
# glob. They are only interpreted as quoting if the string is re-parsed
# (e.g. via eval); a plain $GREP_CMD expansion hands the quote characters to
# grep as part of the pattern. Consumers are not visible from this file, so
# the value is kept unchanged -- confirm how callers invoke it.
GREP_CMD="grep -r -i --include=\"*.txt\" --include=\"*.md\" --include=\"*.html\""
SED_CMD="sed"
AWK_CMD="awk"

# --- RAG Behavior ---
# Boolean settings are the plain strings "true"/"false".
RAG_ENABLED=true
RAG_CONFIDENCE_THRESHOLD=0.7 # Minimum confidence to trigger RAG
RAG_MAX_CONTEXT_LENGTH=4000 # Maximum context to include in prompt
RAG_CACHE_ENABLED=true

# --- Debug and Logging ---
RAG_DEBUG=false
RAG_LOG_FILE="logs/rag_system.log" # Relative path; this file does not create logs/
# --- Utility Functions ---
#######################################
# Check if the RAG system is properly configured.
# Verifies that the corpus directory, the corpus registry file and
# corpus_manager.sh (looked up in the current working directory) exist.
# Globals:   CORPUS_DIR (read), CORPUS_REGISTRY (read)
# Arguments: none
# Outputs:   a success line, or a header plus one line per problem, on stdout
# Returns:   0 if every check passed, 1 if at least one problem was found
#######################################
check_rag_system() {
  local issues=()
  # Loop variable is declared local so that calling this (exported) function
  # does not overwrite a variable named "issue" in the caller's shell.
  local issue

  # Check if corpus directory exists
  if [ ! -d "$CORPUS_DIR" ]; then
    issues+=("Corpus directory not found: $CORPUS_DIR")
  fi

  # Check if registry exists
  if [ ! -f "$CORPUS_REGISTRY" ]; then
    issues+=("Corpus registry not found: $CORPUS_REGISTRY")
  fi

  # Check if corpus manager exists (relative to the current directory)
  if [ ! -f "corpus_manager.sh" ]; then
    issues+=("Corpus manager not found: corpus_manager.sh")
  fi

  # Report issues
  if [ "${#issues[@]}" -gt 0 ]; then
    echo "❌ RAG System Issues Found:"
    for issue in "${issues[@]}"; do
      echo " - $issue"
    done
    return 1
  fi

  echo "✅ RAG System is properly configured"
  return 0
}
#######################################
# Get corpus statistics.
# Topics = number of registry lines containing a "|" separator.
# Files  = number of *.txt / *.md / *.html files under CORPUS_DIR, not
#          counting the registry file itself (it lives inside the corpus
#          directory and has a .txt extension, so it would otherwise
#          inflate the count by one).
# Globals:   CORPUS_DIR (read), CORPUS_REGISTRY (read)
# Arguments: none
# Outputs:   the statistics, or a "no registry" message, on stdout
# Returns:   0 in both cases
#######################################
get_corpus_stats() {
  if [ ! -f "$CORPUS_REGISTRY" ]; then
    echo "❌ No corpus registry found"
    return 0
  fi

  # Declarations are kept separate from the command substitutions so the
  # assignments do not hide the commands' exit statuses behind `local`.
  local topic_count file_count

  # grep -c exits non-zero when it counts 0 lines; that is not an error here.
  topic_count=$(grep -c "|" "$CORPUS_REGISTRY") || true
  topic_count=${topic_count:-0}

  # find errors (e.g. unreadable subdirectories) are deliberately silenced.
  file_count=$(
    find "$CORPUS_DIR" -type f \
      \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) \
      ! -path "$CORPUS_REGISTRY" 2>/dev/null | wc -l
  )
  # Some wc implementations pad the number with spaces; strip them.
  file_count=${file_count//[[:space:]]/}

  echo "📊 Corpus Statistics:"
  echo " Topics: $topic_count"
  echo " Files: $file_count"
}
# Publish the configuration to child processes, so scripts launched from a
# shell that sourced this file see the same settings.
export \
  CORPUS_DIR \
  CORPUS_REGISTRY \
  CORPUS_CACHE_FILE \
  CORPUS_CACHE_TTL \
  MAX_SEARCH_RESULTS \
  MIN_CONTENT_LENGTH \
  MAX_CONTENT_LENGTH \
  SEARCH_CONTEXT_LINES \
  RAG_ENABLED \
  RAG_CONFIDENCE_THRESHOLD \
  RAG_MAX_CONTEXT_LENGTH \
  RAG_CACHE_ENABLED \
  RAG_DEBUG \
  RAG_LOG_FILE \
  GREP_CMD \
  SED_CMD \
  AWK_CMD

# Publish the helper functions as well (bash-only: export -f).
export -f \
  check_rag_system \
  get_corpus_stats
|