diff options
Diffstat (limited to 'bash/talk-to-computer/corpus_manager.sh')
-rwxr-xr-x | bash/talk-to-computer/corpus_manager.sh | 303 |
1 files changed, 303 insertions, 0 deletions
#!/bin/bash
#
# Corpus Manager - Manages RAG corpus discovery and maintenance.
# File: bash/talk-to-computer/corpus_manager.sh (mode 100755)
#
# Provides utilities for managing the knowledge corpus stored in the
# "corpus" directory next to this script. Topics are directories; the
# registry is a pipe-delimited text file generated by the "update" command.
#
# Usage: corpus_manager.sh <command> [arguments]   (see "help")

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CORPUS_DIR="${SCRIPT_DIR}/corpus"
REGISTRY_FILE="${CORPUS_DIR}/corpus_registry.txt"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# --- Corpus Discovery Functions ---

# Print every top-level topic directory found directly under CORPUS_DIR.
# Returns 1 if CORPUS_DIR does not exist.
discover_corpus() {
  echo -e "${BLUE}🔍 Discovering corpus structure...${NC}"

  if [[ ! -d "$CORPUS_DIR" ]]; then
    echo -e "${RED}Corpus directory not found: $CORPUS_DIR${NC}"
    return 1
  fi

  # Only direct children are topics; comparing against CORPUS_DIR itself
  # (rather than the literal name "corpus") avoids false hits on nested
  # directories that happen to be called "corpus".
  local dir
  while IFS= read -r dir; do
    echo "Found topic directory: $(basename "$dir")"
  done < <(find "$CORPUS_DIR" -mindepth 1 -maxdepth 1 -type d | sort)
}

# Print a comma-separated keyword list for a topic.
# Arguments: $1 - topic (directory) name
# Known topics get a curated list; any other name is split on '-' and '_'.
generate_topic_keywords() {
  local topic_name="$1"
  local keywords=""

  case "$topic_name" in
    "programming")
      keywords="bash,shell,scripting,programming,lil,algorithm,code,software,development"
      ;;
    "science")
      keywords="physics,chemistry,biology,science,research,scientific"
      ;;
    "literature")
      keywords="books,authors,literature,writing,analysis"
      ;;
    "lil")
      keywords="decker,lil,language,programming,scripting,terse,deck"
      ;;
    "physics")
      keywords="quantum,relativity,physics,mechanics,thermodynamics"
      ;;
    *)
      # Generate keywords from directory name
      keywords="${topic_name//[-_]/,}"
      ;;
  esac

  echo "$keywords"
}

# Regenerate REGISTRY_FILE from the directory tree under CORPUS_DIR.
# The previous registry (if any) is copied to "<registry>.backup" first.
# Each line is TOPIC|PATH|KEYWORDS|DESCRIPTION, where PATH is relative to
# SCRIPT_DIR and TOPIC is the parent directory name for nested topics.
# Returns 1 if CORPUS_DIR is missing or the registry cannot be written.
update_registry() {
  echo -e "${BLUE}📝 Updating corpus registry...${NC}"

  if [[ ! -d "$CORPUS_DIR" ]]; then
    echo -e "${RED}Corpus directory not found: $CORPUS_DIR${NC}"
    return 1
  fi

  # Backup existing registry
  if [[ -f "$REGISTRY_FILE" ]]; then
    cp -- "$REGISTRY_FILE" "${REGISTRY_FILE}.backup" || return 1
  fi

  # Header and entries are written through one redirection so the file is
  # opened once and a write failure is detected in a single place.
  {
    cat << 'EOF'
# Corpus Registry - Auto-generated by corpus_manager.sh
# Format: TOPIC|PATH|KEYWORDS|DESCRIPTION
# This file is automatically maintained - do not edit manually

EOF

    # -mindepth precedes -type: GNU find warns when it follows a test.
    find "$CORPUS_DIR" -mindepth 1 -type d | sort | while IFS= read -r dir; do
      local topic_name relative_path keywords capitalized description
      local parent_dir parent_topic=""

      topic_name=$(basename "$dir")
      relative_path=${dir#"${SCRIPT_DIR}/"}
      keywords=$(generate_topic_keywords "$topic_name")
      # tr (not ${var^}) keeps this working on bash 3.2.
      capitalized="$(printf '%s' "${topic_name:0:1}" | tr '[:lower:]' '[:upper:]')${topic_name:1}"
      description="${capitalized} topics and resources"

      # Determine parent topic for hierarchical structure
      parent_dir=$(dirname "$dir")
      if [[ "$parent_dir" != "$CORPUS_DIR" ]]; then
        parent_topic=$(basename "$parent_dir")
        description="${capitalized} subset of ${parent_topic}"
      fi

      printf '%s|%s|%s|%s\n' \
        "${parent_topic:-$topic_name}" "$relative_path" "$keywords" "$description"
    done
  } > "$REGISTRY_FILE" || {
    echo -e "${RED}Failed to write registry: $REGISTRY_FILE${NC}"
    return 1
  }

  echo -e "${GREEN}✅ Registry updated successfully${NC}"
}

# --- Corpus Query Functions ---

# Print the registry line for a topic; return 1 when there is none.
# Arguments: $1 - topic name (matched literally, never as a regex)
# An exact match on the TOPIC field wins. Otherwise the first entry whose
# TOPIC field ends with the given name is used, which preserves the suffix
# matching of earlier versions. Comment lines are never matched.
_registry_line() {
  local topic="$1" line name fallback=""

  [[ -n "$topic" && -f "$REGISTRY_FILE" ]] || return 1

  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ "$line" == \#* || "$line" != *'|'* ]] && continue
    name=${line%%|*}
    if [[ "$name" == "$topic" ]]; then
      printf '%s\n' "$line"
      return 0
    fi
    if [[ -z "$fallback" && "$name" == *"$topic" ]]; then
      fallback=$line
    fi
  done < "$REGISTRY_FILE"

  [[ -n "$fallback" ]] || return 1
  printf '%s\n' "$fallback"
}

# Check if corpus exists for a given topic.
# Returns 0 when the registry has an entry for $1, non-zero otherwise.
corpus_exists() {
  local topic="$1"
  _registry_line "$topic" > /dev/null
}

# Get corpus path for a topic (as stored in the registry, i.e. relative to
# SCRIPT_DIR). Prints nothing and returns 1 when the topic is unknown.
get_corpus_path() {
  local topic="$1" line
  line=$(_registry_line "$topic") || return 1
  printf '%s\n' "$line" | cut -d'|' -f2
}

# Get corpus keywords for a topic.
# Prints nothing and returns 1 when the topic is unknown.
get_corpus_keywords() {
  local topic="$1" line
  line=$(_registry_line "$topic") || return 1
  printf '%s\n' "$line" | cut -d'|' -f3
}

# List all available topics as "• TOPIC/PATH - DESCRIPTION", sorted.
# Returns 1 when the registry file does not exist.
list_topics() {
  echo -e "${BLUE}📚 Available Corpus Topics:${NC}"
  echo "----------------------------------------"

  if [[ ! -f "$REGISTRY_FILE" ]]; then
    echo -e "${RED}No corpus registry found. Run 'update' first.${NC}"
    return 1
  fi

  # Filter on content instead of a fixed line count: the header is three
  # comment lines plus a blank line, and the blank one must not be printed.
  awk -F'|' '!/^#/ && NF >= 4 {print "• " $1 "/" $2 " - " $4}' "$REGISTRY_FILE" | sort
}

# --- Corpus Content Functions ---

# Print the absolute directory for a topic; return 1 if the topic is unknown.
# Registry paths are relative to SCRIPT_DIR, so they are anchored there
# instead of being interpreted against the caller's working directory.
_resolve_corpus_dir() {
  local rel
  rel=$(get_corpus_path "$1") || return 1
  [[ -n "$rel" ]] || return 1
  case "$rel" in
    /*) printf '%s\n' "$rel" ;;
    *) printf '%s\n' "${SCRIPT_DIR}/${rel}" ;;
  esac
}

# Count the .txt, .md and .html files in a topic's corpus directory.
# Prints "0" when the topic is unknown or its directory is missing.
count_corpus_files() {
  local topic="$1" corpus_path n

  if corpus_path=$(_resolve_corpus_dir "$topic") && [[ -d "$corpus_path" ]]; then
    n=$(find "$corpus_path" -type f \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) | wc -l)
    # Arithmetic expansion strips the padding some wc implementations add.
    echo "$((n))"
  else
    echo "0"
  fi
}

# List the .txt, .md and .html files of a topic (absolute paths, sorted).
# Returns 1 when the topic's directory cannot be found.
list_corpus_files() {
  local topic="$1" corpus_path=""

  if corpus_path=$(_resolve_corpus_dir "$topic") && [[ -d "$corpus_path" ]]; then
    echo -e "${BLUE}📄 Files in $topic corpus:${NC}"
    find "$corpus_path" -type f \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) | sort
  else
    echo -e "${RED}Corpus directory not found: $corpus_path${NC}"
    return 1
  fi
}

# --- Template and Setup Functions ---

# Create README.md and example.md starter files for a new topic.
# Arguments: $1 - topic name; becomes a directory under CORPUS_DIR
# Existing files are kept as they are, so re-running the command on a
# populated topic never destroys content. Returns 1 on an invalid topic
# name or when the directory cannot be created.
create_topic_template() {
  local topic="$1"
  local corpus_path="$CORPUS_DIR/$topic"

  # Reject empty names and ".." components so nothing is written outside
  # the corpus directory.
  case "/$topic/" in
    // | */../*)
      echo -e "${RED}Invalid topic name: $topic${NC}"
      return 1
      ;;
  esac

  echo -e "${BLUE}🛠️ Creating template for topic: $topic${NC}"

  # Create directory if it doesn't exist
  mkdir -p "$corpus_path" || return 1

  if [[ -e "$corpus_path/README.md" ]]; then
    echo -e "${YELLOW}Keeping existing file: $corpus_path/README.md${NC}"
  else
    cat > "$corpus_path/README.md" << EOF
# $topic Corpus

This directory contains documentation and resources for $topic.

## File Format Guidelines

- Use **Markdown (.md)** for structured content with headers
- Use **Plain text (.txt)** for simple notes and documentation
- Use **HTML (.html)** for rich content and formatting
- File names should be descriptive: \`topic_concept_name.md\`

## Content Organization

- Group related concepts in single files
- Use clear, descriptive headers
- Include code examples where relevant
- Add cross-references between related topics

## Adding New Content

1. Create new .md, .txt, or .html files in this directory
2. Run \`./corpus_manager.sh update\` to update the registry
3. Test with corpus queries
EOF
  fi

  if [[ -e "$corpus_path/example.md" ]]; then
    echo -e "${YELLOW}Keeping existing file: $corpus_path/example.md${NC}"
  else
    cat > "$corpus_path/example.md" << EOF
# Example $topic Content

This is an example file showing the expected format for $topic content.

## Introduction

Add your content here using standard Markdown formatting.

## Key Concepts

- Concept 1
- Concept 2
- Concept 3

## Examples

\`\`\`bash
# Code examples go here
echo "Hello, $topic!"
\`\`\`

## References

- Link to relevant resources
- Additional reading materials
EOF
  fi

  echo -e "${GREEN}✅ Template created in: $corpus_path${NC}"
  echo -e "${YELLOW}💡 Tip: Edit the files and run 'update' to refresh the registry${NC}"
}

# --- Main Command Interface ---

# Dispatch a command-line invocation.
# Arguments: $1 - command (defaults to "help"), $2 - topic where required
# Returns the status of the executed command; 1 on a missing argument.
# The dispatcher is a function because `local` is only valid inside one.
main() {
  case "${1:-help}" in
    "discover")
      discover_corpus
      ;;
    "update")
      update_registry
      ;;
    "list")
      list_topics
      ;;
    "files")
      if [[ -n "${2:-}" ]]; then
        list_corpus_files "$2"
      else
        echo -e "${RED}Usage: $0 files <topic>${NC}"
        return 1
      fi
      ;;
    "count")
      if [[ -n "${2:-}" ]]; then
        local count
        count=$(count_corpus_files "$2")
        echo -e "${BLUE}📊 $2 corpus has $count files${NC}"
      else
        echo -e "${RED}Usage: $0 count <topic>${NC}"
        return 1
      fi
      ;;
    "template")
      if [[ -n "${2:-}" ]]; then
        create_topic_template "$2"
      else
        echo -e "${RED}Usage: $0 template <topic>${NC}"
        return 1
      fi
      ;;
    "exists")
      if [[ -n "${2:-}" ]]; then
        if corpus_exists "$2"; then
          echo -e "${GREEN}✅ Corpus exists for topic: $2${NC}"
        else
          echo -e "${RED}❌ No corpus found for topic: $2${NC}"
          return 1
        fi
      else
        echo -e "${RED}Usage: $0 exists <topic>${NC}"
        return 1
      fi
      ;;
    "help" | *)
      echo -e "${BLUE}📚 Corpus Manager${NC}"
      echo "Manage the RAG knowledge corpus"
      echo ""
      echo -e "${YELLOW}Usage: $0 <command> [arguments]${NC}"
      echo ""
      echo "Commands:"
      echo " discover Discover corpus structure"
      echo " update Update corpus registry"
      echo " list List all available topics"
      echo " files <topic> List files in a topic corpus"
      echo " count <topic> Count files in a topic corpus"
      echo " exists <topic> Check if corpus exists for topic"
      echo " template <topic> Create template files for new topic"
      echo " help Show this help message"
      echo ""
      echo "Examples:"
      echo " $0 update"
      echo " $0 list"
      echo " $0 template physics"
      echo " $0 exists programming"
      ;;
  esac
}

main "$@"