#!/bin/bash

# Corpus Manager - Manages RAG corpus discovery and maintenance
# This script provides utilities for managing the knowledge corpus
#
# Layout: topics are directories under "<script dir>/corpus"; the registry is
# a pipe-delimited text file (TOPIC|PATH|KEYWORDS|DESCRIPTION) stored inside
# that directory. PATH is written relative to the script directory.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CORPUS_DIR="${SCRIPT_DIR}/corpus"
REGISTRY_FILE="${CORPUS_DIR}/corpus_registry.txt"

# Colors for output (interpreted by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# --- Internal helpers ---

# Print $1 with its first character upper-cased.
# `tr` is used instead of ${var^} so the script keeps working on bash 3.x.
_capitalize() {
  local word="$1"
  printf '%s%s' \
    "$(printf '%s' "${word:0:1}" | tr '[:lower:]' '[:upper:]')" \
    "${word:1}"
}

# Succeed if the corpus directory exists; otherwise report on stderr and fail.
_require_corpus_dir() {
  if [[ ! -d "$CORPUS_DIR" ]]; then
    echo -e "${RED}Corpus directory not found: $CORPUS_DIR${NC}" >&2
    return 1
  fi
}

# Print one field of the first registry entry that belongs to a topic.
# Arguments: $1 - topic name, $2 - 1-based field number
# Returns:   0 and prints the field when an entry is found, 1 otherwise.
#
# An entry belongs to the topic when its TOPIC field equals the name exactly.
# Nested directories are registered under their parent's name, so when no
# TOPIC field matches, the first entry whose PATH ends in "/<topic>" is used.
# The name is compared as a literal string (never as a regex) and comment
# lines are skipped, so the "# Format: TOPIC|..." header can never match.
_registry_field() {
  local topic="$1" field="$2"
  [[ -n "$topic" && -f "$REGISTRY_FILE" ]] || return 1

  awk -F'|' -v topic="$topic" -v field="$field" '
    /^[ \t]*#/ || NF < 2 { next }
    $1 == topic { print $field; found = 1; exit }
    !have_fallback {
      n = split($2, parts, "/")
      if (parts[n] == topic) { fallback = $field; have_fallback = 1 }
    }
    END {
      if (found) exit 0
      if (have_fallback) { print fallback; exit 0 }
      exit 1
    }
  ' "$REGISTRY_FILE"
}

# Print the directory of a topic as a path usable from any working directory.
# Registry paths are relative to SCRIPT_DIR, so relative values are anchored
# there; absolute values are passed through.
# Arguments: $1 - topic name
# Returns:   1 (printing nothing) when the topic is not in the registry.
_resolve_corpus_dir() {
  local registered
  registered=$(get_corpus_path "$1") || return 1
  [[ -n "$registered" ]] || return 1

  case "$registered" in
    /*) printf '%s\n' "$registered" ;;
    *) printf '%s\n' "${SCRIPT_DIR}/${registered}" ;;
  esac
}

# --- Corpus Discovery Functions ---

# Print a "Found topic directory: NAME" line for every directory directly
# under the corpus root.
# Returns: 1 if the corpus directory does not exist.
discover_corpus() {
  echo -e "${BLUE}🔍 Discovering corpus structure...${NC}"
  _require_corpus_dir || return 1

  # Depth limits select direct children only (and come before -type, where
  # GNU find expects them), so a nested directory that merely sits under
  # another directory called "corpus" is not reported.
  local dir
  find "$CORPUS_DIR" -mindepth 1 -maxdepth 1 -type d | while IFS= read -r dir; do
    echo "Found topic directory: ${dir##*/}"
  done
}

# Generate topic keywords based on the directory name.
# Arguments: $1 - topic (directory) name
# Outputs:   a comma-separated keyword list. Known topics have a fixed list;
#            any other name is split into keywords at '-' and '_'.
generate_topic_keywords() {
  local topic_name="$1"
  local keywords=""

  case "$topic_name" in
    "programming")
      keywords="bash,shell,scripting,programming,lil,algorithm,code,software,development"
      ;;
    "science")
      keywords="physics,chemistry,biology,science,research,scientific"
      ;;
    "literature")
      keywords="books,authors,literature,writing,analysis"
      ;;
    "lil")
      keywords="decker,lil,language,programming,scripting,terse,deck"
      ;;
    "physics")
      keywords="quantum,relativity,physics,mechanics,thermodynamics"
      ;;
    *)
      # Generate keywords from directory name
      keywords="${topic_name//[-_]/,}"
      ;;
  esac

  echo "$keywords"
}

# Rebuild the corpus registry from the directory tree.
# Copies an existing registry to "<registry>.backup", then writes the header
# and one TOPIC|PATH|KEYWORDS|DESCRIPTION line per directory found under the
# corpus root, sorted by path. For nested directories TOPIC is the name of
# the immediate parent directory.
# Returns: 1 if the corpus directory is missing or the registry cannot be
#          written; success is only reported when the write succeeded.
update_registry() {
  echo -e "${BLUE}📝 Updating corpus registry...${NC}"
  _require_corpus_dir || return 1

  # Backup existing registry
  if [[ -f "$REGISTRY_FILE" ]]; then
    cp "$REGISTRY_FILE" "${REGISTRY_FILE}.backup"
  fi

  local dir topic_name relative_path keywords description parent_dir parent_topic

  # Header and entries share one redirection, so the file is opened once.
  {
    cat << 'EOF'
# Corpus Registry - Auto-generated by corpus_manager.sh
# Format: TOPIC|PATH|KEYWORDS|DESCRIPTION
# This file is automatically maintained - do not edit manually
EOF

    find "$CORPUS_DIR" -mindepth 1 -type d | sort | while IFS= read -r dir; do
      topic_name="${dir##*/}"
      # The prefix is quoted so glob characters in SCRIPT_DIR stay literal.
      relative_path="${dir#"${SCRIPT_DIR}"/}"
      keywords=$(generate_topic_keywords "$topic_name")
      description="$(_capitalize "$topic_name") topics and resources"

      # Determine parent topic for hierarchical structure
      parent_dir="${dir%/*}"
      parent_topic=""
      if [[ "$parent_dir" != "$CORPUS_DIR" ]]; then
        parent_topic="${parent_dir##*/}"
        description="$(_capitalize "$topic_name") subset of ${parent_topic}"
      fi

      printf '%s|%s|%s|%s\n' \
        "${parent_topic:-$topic_name}" "$relative_path" "$keywords" "$description"
    done
  } > "$REGISTRY_FILE" || {
    echo -e "${RED}Failed to write registry: $REGISTRY_FILE${NC}" >&2
    return 1
  }

  echo -e "${GREEN}✅ Registry updated successfully${NC}"
}

# --- Corpus Query Functions ---

# Check if corpus exists for a given topic.
# Arguments: $1 - topic name
# Returns:   0 if the registry has an entry for the topic, 1 otherwise
#            (including when the registry file is missing).
corpus_exists() {
  _registry_field "$1" 1 > /dev/null
}

# Get corpus path for a topic.
# Arguments: $1 - topic name
# Outputs:   the PATH field exactly as stored in the registry.
# Returns:   1 (printing nothing) if the topic is not registered.
get_corpus_path() {
  _registry_field "$1" 2
}

# Get corpus keywords for a topic.
# Arguments: $1 - topic name
# Outputs:   the KEYWORDS field of the topic's registry entry.
# Returns:   1 (printing nothing) if the topic is not registered.
get_corpus_keywords() {
  _registry_field "$1" 3
}

# List all available topics as "• TOPIC/PATH - DESCRIPTION", sorted.
# Returns: 1 if the registry file does not exist.
list_topics() {
  echo -e "${BLUE}📚 Available Corpus Topics:${NC}"
  echo "----------------------------------------"

  if [[ ! -f "$REGISTRY_FILE" ]]; then
    echo -e "${RED}No corpus registry found. Run 'update' first.${NC}"
    return 1
  fi

  # Comment and blank lines are skipped by content rather than by counting
  # header lines, so a changed header cannot leak into the listing.
  awk -F'|' '
    /^[ \t]*#/ || /^[ \t]*$/ { next }
    { print "• " $1 "/" $2 " - " $4 }
  ' "$REGISTRY_FILE" | sort
}

# --- Corpus Content Functions ---

# Count the .txt, .md and .html files (recursively) in a topic's directory.
# Arguments: $1 - topic name
# Outputs:   the count as a bare integer; "0" when the topic is unknown or
#            its directory does not exist.
count_corpus_files() {
  local topic="$1"
  local corpus_path
  corpus_path=$(_resolve_corpus_dir "$topic")

  if [[ -n "$corpus_path" && -d "$corpus_path" ]]; then
    # Some `wc` implementations pad the number with blanks; strip them.
    find "$corpus_path" -type f \
      \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) \
      | wc -l | tr -d '[:space:]'
    echo
  else
    echo "0"
  fi
}

# Print a sorted list of the .txt, .md and .html files in a topic's directory.
# Arguments: $1 - topic name
# Returns:   1 if the topic's directory cannot be found.
list_corpus_files() {
  local topic="$1"
  local corpus_path
  corpus_path=$(_resolve_corpus_dir "$topic")

  if [[ -z "$corpus_path" || ! -d "$corpus_path" ]]; then
    echo -e "${RED}Corpus directory not found: $corpus_path${NC}"
    return 1
  fi

  echo -e "${BLUE}📄 Files in $topic corpus:${NC}"
  find "$corpus_path" -type f \
    \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) | sort
}

# --- Template and Setup Functions ---

# Create template files for a new topic.
# Creates "<corpus>/<topic>" if needed and writes README.md and example.md
# into it. A file that already exists is left untouched so that running the
# command on an established topic cannot destroy its content.
# Arguments: $1 - topic name
# Returns:   1 if the topic directory cannot be created.
create_topic_template() {
  local topic="$1"
  local corpus_path="$CORPUS_DIR/$topic"

  echo -e "${BLUE}🛠️ Creating template for topic: $topic${NC}"

  # Create directory if it doesn't exist
  if ! mkdir -p "$corpus_path"; then
    echo -e "${RED}Could not create directory: $corpus_path${NC}" >&2
    return 1
  fi

  # The here-documents are unquoted on purpose: $topic is expanded, and the
  # backticks are escaped so they reach the file literally.
  if [[ -e "$corpus_path/README.md" ]]; then
    echo -e "${YELLOW}Keeping existing file: $corpus_path/README.md${NC}"
  else
    cat > "$corpus_path/README.md" << EOF
# $topic Corpus

This directory contains documentation and resources for $topic.

## File Format Guidelines

- Use **Markdown (.md)** for structured content with headers
- Use **Plain text (.txt)** for simple notes and documentation
- Use **HTML (.html)** for rich content and formatting
- File names should be descriptive: \`topic_concept_name.md\`

## Content Organization

- Group related concepts in single files
- Use clear, descriptive headers
- Include code examples where relevant
- Add cross-references between related topics

## Adding New Content

1. Create new .md, .txt, or .html files in this directory
2. Run \`./corpus_manager.sh update\` to update the registry
3. Test with corpus queries
EOF
  fi

  if [[ -e "$corpus_path/example.md" ]]; then
    echo -e "${YELLOW}Keeping existing file: $corpus_path/example.md${NC}"
  else
    cat > "$corpus_path/example.md" << EOF
# Example $topic Content

This is an example file showing the expected format for $topic content.

## Introduction

Add your content here using standard Markdown formatting.

## Key Concepts

- Concept 1
- Concept 2
- Concept 3

## Examples

\`\`\`bash
# Code examples go here
echo "Hello, $topic!"
\`\`\`

## References

- Link to relevant resources
- Additional reading materials
EOF
  fi

  echo -e "${GREEN}✅ Template created in: $corpus_path${NC}"
  echo -e "${YELLOW}💡 Tip: Edit the files and run 'update' to refresh the registry${NC}"
}

# --- Main Command Interface ---

# Dispatch a command-line invocation.
# Arguments: $1 - command (defaults to "help"), $2 - topic where required
# Returns:   the status of the selected command; 1 when a required topic
#            argument is missing or `exists` finds no corpus.
# The dispatch lives in a function because `local` is only valid inside one.
main() {
  case "${1:-help}" in
    "discover")
      discover_corpus
      ;;
    "update")
      update_registry
      ;;
    "list")
      list_topics
      ;;
    "files")
      if [[ -n "$2" ]]; then
        list_corpus_files "$2"
      else
        echo -e "${RED}Usage: $0 files <topic>${NC}"
        return 1
      fi
      ;;
    "count")
      if [[ -n "$2" ]]; then
        local count
        count=$(count_corpus_files "$2")
        echo -e "${BLUE}📊 $2 corpus has $count files${NC}"
      else
        echo -e "${RED}Usage: $0 count <topic>${NC}"
        return 1
      fi
      ;;
    "template")
      if [[ -n "$2" ]]; then
        create_topic_template "$2"
      else
        echo -e "${RED}Usage: $0 template <topic>${NC}"
        return 1
      fi
      ;;
    "exists")
      if [[ -n "$2" ]]; then
        if corpus_exists "$2"; then
          echo -e "${GREEN}✅ Corpus exists for topic: $2${NC}"
        else
          echo -e "${RED}❌ No corpus found for topic: $2${NC}"
          return 1
        fi
      else
        echo -e "${RED}Usage: $0 exists <topic>${NC}"
        return 1
      fi
      ;;
    "help" | *)
      echo -e "${BLUE}📚 Corpus Manager${NC}"
      echo "Manage the RAG knowledge corpus"
      echo ""
      echo -e "${YELLOW}Usage: $0 <command> [arguments]${NC}"
      echo ""
      echo "Commands:"
      echo "  discover          Discover corpus structure"
      echo "  update            Update corpus registry"
      echo "  list              List all available topics"
      echo "  files <topic>     List files in a topic corpus"
      echo "  count <topic>     Count files in a topic corpus"
      echo "  exists <topic>    Check if corpus exists for topic"
      echo "  template <topic>  Create template files for new topic"
      echo "  help              Show this help message"
      echo ""
      echo "Examples:"
      echo "  $0 update"
      echo "  $0 list"
      echo "  $0 template physics"
      echo "  $0 exists programming"
      ;;
  esac
}

main "$@"