about summary refs log tree commit diff stats
path: root/bash/talk-to-computer/corpus_manager.sh
diff options
context:
space:
mode:
Diffstat (limited to 'bash/talk-to-computer/corpus_manager.sh')
-rwxr-xr-x  bash/talk-to-computer/corpus_manager.sh  303
1 files changed, 303 insertions, 0 deletions
diff --git a/bash/talk-to-computer/corpus_manager.sh b/bash/talk-to-computer/corpus_manager.sh
new file mode 100755
index 0000000..47c743c
--- /dev/null
+++ b/bash/talk-to-computer/corpus_manager.sh
@@ -0,0 +1,303 @@
+#!/bin/bash
+
+# Corpus Manager: discovery and maintenance utilities for the RAG
+# knowledge corpus.
+
+# ANSI color escapes for terminal output. They are stored as literal
+# backslash sequences and rendered by `echo -e` at the call sites.
+NC='\033[0m' # No Color
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+
+# Locations, all anchored at the directory that contains this script.
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+CORPUS_DIR=$SCRIPT_DIR/corpus
+REGISTRY_FILE=$CORPUS_DIR/corpus_registry.txt
+
+# --- Corpus Discovery Functions ---
+
+discover_corpus() {
+    echo -e "${BLUE}🔍 Discovering corpus structure...${NC}"
+
+    # Find all directories under corpus/
+    find "$CORPUS_DIR" -type d -mindepth 1 | while read -r dir; do
+        local topic_name=$(basename "$dir")
+        local parent_topic=$(basename "$(dirname "$dir")")
+
+        # Skip if this is the corpus root
+        if [ "$parent_topic" = "corpus" ]; then
+            echo "Found topic directory: $topic_name"
+        fi
+    done
+}
+
+# Generate topic keywords based on directory name and content
+generate_topic_keywords() {
+    local topic_name="$1"
+    local keywords=""
+
+    case "$topic_name" in
+        "programming")
+            keywords="bash,shell,scripting,programming,lil,algorithm,code,software,development"
+            ;;
+        "science")
+            keywords="physics,chemistry,biology,science,research,scientific"
+            ;;
+        "literature")
+            keywords="books,authors,literature,writing,analysis"
+            ;;
+        "lil")
+            keywords="decker,lil,language,programming,scripting,terse,deck"
+            ;;
+        "physics")
+            keywords="quantum,relativity,physics,mechanics,thermodynamics"
+            ;;
+        *)
+            # Generate keywords from directory name
+            keywords=$(echo "$topic_name" | sed 's/[-_]/,/g')
+            ;;
+    esac
+
+    echo "$keywords"
+}
+
+# Update the corpus registry
+update_registry() {
+    echo -e "${BLUE}📝 Updating corpus registry...${NC}"
+
+    # Backup existing registry
+    if [ -f "$REGISTRY_FILE" ]; then
+        cp "$REGISTRY_FILE" "${REGISTRY_FILE}.backup"
+    fi
+
+    # Create new registry header
+    cat > "$REGISTRY_FILE" << 'EOF'
+# Corpus Registry - Auto-generated by corpus_manager.sh
+# Format: TOPIC|PATH|KEYWORDS|DESCRIPTION
+# This file is automatically maintained - do not edit manually
+
+EOF
+
+    # Find all directories and generate registry entries
+    find "$CORPUS_DIR" -type d -mindepth 1 | sort | while read -r dir; do
+        local topic_name=$(basename "$dir")
+        local relative_path="${dir#${SCRIPT_DIR}/}"
+        local keywords=$(generate_topic_keywords "$topic_name")
+        local description="$(echo "${topic_name:0:1}" | tr '[:lower:]' '[:upper:]')${topic_name:1} topics and resources"
+
+        # Determine parent topic for hierarchical structure
+        local parent_dir=$(dirname "$dir")
+        local parent_topic=""
+
+        if [ "$parent_dir" != "$CORPUS_DIR" ]; then
+            parent_topic=$(basename "$parent_dir")
+            description="$(echo "${topic_name:0:1}" | tr '[:lower:]' '[:upper:]')${topic_name:1} subset of ${parent_topic}"
+        fi
+
+        # Add to registry
+        echo "${parent_topic:-$topic_name}|$relative_path|$keywords|$description" >> "$REGISTRY_FILE"
+    done
+
+    echo -e "${GREEN}✅ Registry updated successfully${NC}"
+}
+
+# --- Corpus Query Functions ---
+
+# Check if corpus exists for a given topic
+corpus_exists() {
+    local topic="$1"
+    grep -q "^[^|]*${topic}|" "$REGISTRY_FILE" 2>/dev/null
+    return $?
+}
+
+# Get corpus path for a topic
+get_corpus_path() {
+    local topic="$1"
+    grep "^[^|]*${topic}|" "$REGISTRY_FILE" | head -1 | cut -d'|' -f2
+}
+
+# Get corpus keywords for a topic
+get_corpus_keywords() {
+    local topic="$1"
+    grep "^[^|]*${topic}|" "$REGISTRY_FILE" | head -1 | cut -d'|' -f3
+}
+
+# List all available topics
+list_topics() {
+    echo -e "${BLUE}📚 Available Corpus Topics:${NC}"
+    echo "----------------------------------------"
+
+    if [ ! -f "$REGISTRY_FILE" ]; then
+        echo -e "${RED}No corpus registry found. Run 'update' first.${NC}"
+        return 1
+    fi
+
+    awk -F'|' 'NR>3 {print "• " $1 "/" $2 " - " $4}' "$REGISTRY_FILE" | sort
+}
+
+# --- Corpus Content Functions ---
+
+# Count files in a corpus directory
+count_corpus_files() {
+    local topic="$1"
+    local corpus_path=$(get_corpus_path "$topic")
+
+    if [ -d "$corpus_path" ]; then
+        find "$corpus_path" -type f \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) | wc -l
+    else
+        echo "0"
+    fi
+}
+
+# Get corpus file list
+list_corpus_files() {
+    local topic="$1"
+    local corpus_path=$(get_corpus_path "$topic")
+
+    if [ -d "$corpus_path" ]; then
+        echo -e "${BLUE}📄 Files in $topic corpus:${NC}"
+        find "$corpus_path" -type f \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) | sort
+    else
+        echo -e "${RED}Corpus directory not found: $corpus_path${NC}"
+    fi
+}
+
+# --- Template and Setup Functions ---
+
+# Create template files for a new topic
+create_topic_template() {
+    local topic="$1"
+    local corpus_path="$CORPUS_DIR/$topic"
+
+    echo -e "${BLUE}🛠️  Creating template for topic: $topic${NC}"
+
+    # Create directory if it doesn't exist
+    mkdir -p "$corpus_path"
+
+    # Create template files
+    cat > "$corpus_path/README.md" << EOF
+# $topic Corpus
+
+This directory contains documentation and resources for $topic.
+
+## File Format Guidelines
+
+- Use **Markdown (.md)** for structured content with headers
+- Use **Plain text (.txt)** for simple notes and documentation
+- Use **HTML (.html)** for rich content and formatting
+- File names should be descriptive: \`topic_concept_name.md\`
+
+## Content Organization
+
+- Group related concepts in single files
+- Use clear, descriptive headers
+- Include code examples where relevant
+- Add cross-references between related topics
+
+## Adding New Content
+
+1. Create new .md, .txt, or .html files in this directory
+2. Run \`./corpus_manager.sh update\` to update the registry
+3. Test with corpus queries
+EOF
+
+    cat > "$corpus_path/example.md" << EOF
+# Example $topic Content
+
+This is an example file showing the expected format for $topic content.
+
+## Introduction
+
+Add your content here using standard Markdown formatting.
+
+## Key Concepts
+
+- Concept 1
+- Concept 2
+- Concept 3
+
+## Examples
+
+\`\`\`bash
+# Code examples go here
+echo "Hello, $topic!"
+\`\`\`
+
+## References
+
+- Link to relevant resources
+- Additional reading materials
+EOF
+
+    echo -e "${GREEN}✅ Template created in: $corpus_path${NC}"
+    echo -e "${YELLOW}💡 Tip: Edit the files and run 'update' to refresh the registry${NC}"
+}
+
+# --- Main Command Interface ---
+
+# Dispatch on the first command-line argument; with no argument, or an
+# unrecognized one, print the help text. $2 is the topic for the commands
+# that need one.
+case "${1:-help}" in
+    "discover")
+        discover_corpus
+        ;;
+    "update")
+        update_registry
+        ;;
+    "list")
+        list_topics
+        ;;
+    "files")
+        if [ -n "$2" ]; then
+            list_corpus_files "$2"
+        else
+            echo -e "${RED}Usage: $0 files <topic>${NC}"
+        fi
+        ;;
+    "count")
+        if [ -n "$2" ]; then
+            # `local` is only valid inside a function; at top level it fails
+            # and the count is never assigned, so the result is inlined here.
+            echo -e "${BLUE}📊 $2 corpus has $(count_corpus_files "$2") files${NC}"
+        else
+            echo -e "${RED}Usage: $0 count <topic>${NC}"
+        fi
+        ;;
+    "template")
+        if [ -n "$2" ]; then
+            create_topic_template "$2"
+        else
+            echo -e "${RED}Usage: $0 template <topic>${NC}"
+        fi
+        ;;
+    "exists")
+        if [ -n "$2" ]; then
+            if corpus_exists "$2"; then
+                echo -e "${GREEN}✅ Corpus exists for topic: $2${NC}"
+            else
+                echo -e "${RED}❌ No corpus found for topic: $2${NC}"
+            fi
+        else
+            echo -e "${RED}Usage: $0 exists <topic>${NC}"
+        fi
+        ;;
+    "help"|*)
+        echo -e "${BLUE}📚 Corpus Manager${NC}"
+        echo "Manage the RAG knowledge corpus"
+        echo ""
+        echo -e "${YELLOW}Usage: $0 <command> [arguments]${NC}"
+        echo ""
+        echo "Commands:"
+        echo "  discover         Discover corpus structure"
+        echo "  update           Update corpus registry"
+        echo "  list             List all available topics"
+        echo "  files <topic>    List files in a topic corpus"
+        echo "  count <topic>    Count files in a topic corpus"
+        echo "  exists <topic>   Check if corpus exists for topic"
+        echo "  template <topic> Create template files for new topic"
+        echo "  help             Show this help message"
+        echo ""
+        echo "Examples:"
+        echo "  $0 update"
+        echo "  $0 list"
+        echo "  $0 template physics"
+        echo "  $0 exists programming"
+        ;;
+esac