about summary refs log tree commit diff stats
path: root/bash/talk-to-computer/corpus_manager.sh
blob: 47c743cf5efdc4e4a56dc535e71f85676923c02a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#!/bin/bash

# Corpus Manager - Manages RAG corpus discovery and maintenance
# This script provides utilities for managing the knowledge corpus

# All paths are anchored to the directory that contains this script.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
CORPUS_DIR="$SCRIPT_DIR/corpus"
REGISTRY_FILE="$CORPUS_DIR/corpus_registry.txt"

# ANSI color codes. They are stored as literal backslash sequences, so they
# only turn into real escape characters when printed with `echo -e`.
NC="\033[0m"            # reset (No Color)
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"

# --- Corpus Discovery Functions ---

# Print the name of every top-level topic directory in the corpus.
#
# Globals: CORPUS_DIR (read), BLUE/RED/NC (read)
# Outputs: one "Found topic directory: <name>" line per direct child
#          directory of CORPUS_DIR, sorted by path
# Returns: 1 when CORPUS_DIR does not exist, otherwise the pipeline status
discover_corpus() {
    echo -e "${BLUE}🔍 Discovering corpus structure...${NC}"

    if [[ ! -d "$CORPUS_DIR" ]]; then
        echo -e "${RED}Corpus directory not found: $CORPUS_DIR${NC}"
        return 1
    fi

    # Only direct children are topics. Limiting the depth (instead of
    # comparing the parent's basename with "corpus") keeps a nested directory
    # that merely happens to be called "corpus" from being reported.
    # Depth options come before -type because GNU find warns otherwise.
    local dir
    find "$CORPUS_DIR" -mindepth 1 -maxdepth 1 -type d | sort | while IFS= read -r dir; do
        echo "Found topic directory: ${dir##*/}"
    done
}

# Generate the comma-separated keyword list for a topic directory name.
#
# Arguments: $1 - topic (directory) name
# Outputs:   the keyword list on stdout. Known topics get a curated list;
#            any other name is split into keywords at '-' and '_'.
generate_topic_keywords() {
    local topic_name="$1"
    local keywords

    case "$topic_name" in
        programming)
            keywords="bash,shell,scripting,programming,lil,algorithm,code,software,development"
            ;;
        science)
            keywords="physics,chemistry,biology,science,research,scientific"
            ;;
        literature)
            keywords="books,authors,literature,writing,analysis"
            ;;
        lil)
            keywords="decker,lil,language,programming,scripting,terse,deck"
            ;;
        physics)
            keywords="quantum,relativity,physics,mechanics,thermodynamics"
            ;;
        *)
            # Derive keywords from the directory name. Parameter expansion is
            # used instead of `echo | sed` because echo would swallow names
            # such as "-n" or "-e" as options and produce no keywords.
            keywords="${topic_name//[-_]/,}"
            ;;
    esac

    printf '%s\n' "$keywords"
}

# Rebuild the corpus registry from the directories found under CORPUS_DIR.
#
# Globals: SCRIPT_DIR, CORPUS_DIR, REGISTRY_FILE (read); colors (read)
# Effects: creates CORPUS_DIR if needed, copies an existing registry to
#          "<registry>.backup", then rewrites the registry with a four-line
#          header followed by one TOPIC|PATH|KEYWORDS|DESCRIPTION line per
#          directory. PATH is relative to SCRIPT_DIR. For nested directories
#          TOPIC is the name of the parent directory.
# Returns: 0 on success, 1 if the corpus directory or registry cannot be
#          written
update_registry() {
    echo -e "${BLUE}📝 Updating corpus registry...${NC}"

    # The registry lives inside the corpus directory, so make sure it exists
    # instead of failing on the redirect below.
    if ! mkdir -p "$CORPUS_DIR"; then
        echo -e "${RED}❌ Cannot create corpus directory: $CORPUS_DIR${NC}"
        return 1
    fi

    # Backup existing registry
    if [[ -f "$REGISTRY_FILE" ]]; then
        cp "$REGISTRY_FILE" "${REGISTRY_FILE}.backup"
    fi

    local dir topic_name relative_path keywords initial description
    local parent_dir parent_topic

    {
        # Registry header: three comment lines and one blank line
        printf '%s\n' \
            '# Corpus Registry - Auto-generated by corpus_manager.sh' \
            '# Format: TOPIC|PATH|KEYWORDS|DESCRIPTION' \
            '# This file is automatically maintained - do not edit manually' \
            ''

        # -mindepth precedes -type because GNU find warns about depth options
        # that appear after a test.
        find "$CORPUS_DIR" -mindepth 1 -type d | sort | while IFS= read -r dir; do
            topic_name="${dir##*/}"
            # The prefix is quoted so it is stripped literally even when the
            # script path contains glob characters such as '[' or '*'.
            relative_path=${dir#"$SCRIPT_DIR"/}
            keywords=$(generate_topic_keywords "$topic_name")
            # Capitalize the first letter (tr keeps this working on bash 3.2)
            initial=$(printf '%s' "${topic_name:0:1}" | tr '[:lower:]' '[:upper:]')

            # Determine parent topic for hierarchical structure
            parent_dir="${dir%/*}"
            if [[ "$parent_dir" == "$CORPUS_DIR" ]]; then
                parent_topic="$topic_name"
                description="${initial}${topic_name:1} topics and resources"
            else
                parent_topic="${parent_dir##*/}"
                description="${initial}${topic_name:1} subset of ${parent_topic}"
            fi

            printf '%s|%s|%s|%s\n' \
                "$parent_topic" "$relative_path" "$keywords" "$description"
        done
    } > "$REGISTRY_FILE" || {
        echo -e "${RED}❌ Failed to write registry: $REGISTRY_FILE${NC}"
        return 1
    }

    echo -e "${GREEN}✅ Registry updated successfully${NC}"
}

# --- Corpus Query Functions ---

# Check if corpus exists for a given topic
#
# Arguments: $1 - topic name; compared literally, never as a regex
# Globals:   REGISTRY_FILE (read)
# Returns:   0 when the TOPIC field of some registry entry ends with the
#            given name, non-zero otherwise (including an empty topic or a
#            missing registry)
corpus_exists() {
    local topic="$1"
    local line

    [[ -n "$topic" && -r "$REGISTRY_FILE" ]] || return 1

    while IFS= read -r line || [[ -n "$line" ]]; do
        # Header comments and lines without a field separator are not entries
        [[ "$line" == \#* || "$line" != *'|'* ]] && continue
        # Suffix match on the first field, as the previous grep pattern did
        [[ "${line%%|*}" == *"$topic" ]] && return 0
    done < "$REGISTRY_FILE"

    return 1
}

# Get corpus path for a topic
#
# Arguments: $1 - topic name; compared literally, never as a regex
# Globals:   REGISTRY_FILE (read)
# Outputs:   the PATH field of the first registry entry whose TOPIC field
#            ends with the given name; nothing when there is no such entry
# Returns:   0 when an entry was found, 1 otherwise (including an empty
#            topic or a missing registry)
get_corpus_path() {
    local topic="$1"
    local line entry_topic entry_path _rest

    [[ -n "$topic" && -r "$REGISTRY_FILE" ]] || return 1

    while IFS= read -r line || [[ -n "$line" ]]; do
        # Header comments and lines without a field separator are not entries
        [[ "$line" == \#* || "$line" != *'|'* ]] && continue
        IFS='|' read -r entry_topic entry_path _rest <<< "$line"
        if [[ "$entry_topic" == *"$topic" ]]; then
            printf '%s\n' "$entry_path"
            return 0
        fi
    done < "$REGISTRY_FILE"

    return 1
}

# Get corpus keywords for a topic
#
# Arguments: $1 - topic name; compared literally, never as a regex
# Globals:   REGISTRY_FILE (read)
# Outputs:   the KEYWORDS field of the first registry entry whose TOPIC
#            field ends with the given name; nothing when there is none
# Returns:   0 when an entry was found, 1 otherwise (including an empty
#            topic or a missing registry)
get_corpus_keywords() {
    local topic="$1"
    local line entry_topic _path entry_keywords _rest

    [[ -n "$topic" && -r "$REGISTRY_FILE" ]] || return 1

    while IFS= read -r line || [[ -n "$line" ]]; do
        # Header comments and lines without a field separator are not entries
        [[ "$line" == \#* || "$line" != *'|'* ]] && continue
        IFS='|' read -r entry_topic _path entry_keywords _rest <<< "$line"
        if [[ "$entry_topic" == *"$topic" ]]; then
            printf '%s\n' "$entry_keywords"
            return 0
        fi
    done < "$REGISTRY_FILE"

    return 1
}

# List all available topics
#
# Globals: REGISTRY_FILE (read), BLUE/RED/NC (read)
# Outputs: a heading followed by one sorted "• TOPIC/PATH - DESCRIPTION"
#          line per registry entry
# Returns: 1 when the registry file does not exist
list_topics() {
    echo -e "${BLUE}📚 Available Corpus Topics:${NC}"
    echo "----------------------------------------"

    if [[ ! -f "$REGISTRY_FILE" ]]; then
        echo -e "${RED}No corpus registry found. Run 'update' first.${NC}"
        return 1
    fi

    # Select entries by content rather than by line number: the header is
    # three comment lines plus a blank line, and skipping a fixed count of
    # three lines printed that blank line as an empty "• / - " entry.
    awk -F'|' '!/^#/ && NF >= 2 {print "• " $1 "/" $2 " - " $4}' "$REGISTRY_FILE" | sort
}

# --- Corpus Content Functions ---

# Count files in a corpus directory
#
# Arguments: $1 - topic name, resolved through get_corpus_path
# Globals:   SCRIPT_DIR (read)
# Outputs:   the number of .txt, .md and .html files below the topic's
#            directory (searched recursively), or 0 when the directory
#            cannot be found
count_corpus_files() {
    local topic="$1"
    local corpus_path file_count
    corpus_path=$(get_corpus_path "$topic")

    # Registry paths are stored relative to the script directory; anchor
    # them there so the count does not depend on the caller's working
    # directory.
    if [[ -n "$corpus_path" && "$corpus_path" != /* ]]; then
        corpus_path="${SCRIPT_DIR}/${corpus_path}"
    fi

    if [[ -d "$corpus_path" ]]; then
        file_count=$(find "$corpus_path" -type f \
            \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) | wc -l)
        # Some wc implementations pad the number with spaces; strip them
        echo "${file_count//[[:space:]]/}"
    else
        echo "0"
    fi
}

# Get corpus file list
#
# Arguments: $1 - topic name, resolved through get_corpus_path
# Globals:   SCRIPT_DIR (read), BLUE/RED/NC (read)
# Outputs:   a heading and the sorted .txt, .md and .html files below the
#            topic's directory, or an error message when it is not found
# Returns:   1 when the corpus directory cannot be found
list_corpus_files() {
    local topic="$1"
    local corpus_path
    corpus_path=$(get_corpus_path "$topic")

    # Registry paths are relative to the script directory, so the lookup runs
    # from there (in a subshell, leaving the caller's directory untouched).
    # The printed paths stay in the same relative form as the registry.
    if [[ -n "$corpus_path" ]] \
        && (cd "$SCRIPT_DIR" 2>/dev/null && [[ -d "$corpus_path" ]]); then
        echo -e "${BLUE}📄 Files in $topic corpus:${NC}"
        (
            cd "$SCRIPT_DIR" \
                && find "$corpus_path" -type f \
                    \( -name "*.txt" -o -name "*.md" -o -name "*.html" \) | sort
        )
    else
        echo -e "${RED}Corpus directory not found: $corpus_path${NC}"
        return 1
    fi
}

# --- Template and Setup Functions ---

# Create template files for a new topic
#
# Arguments: $1 - topic name; becomes a directory below CORPUS_DIR
# Globals:   CORPUS_DIR (read), colors (read)
# Effects:   creates CORPUS_DIR/<topic> with a README.md and an example.md.
#            Files that already exist are kept untouched, so re-running the
#            command never discards content that was edited afterwards.
# Returns:   1 when the topic is empty or the directory cannot be created
create_topic_template() {
    local topic="$1"
    local corpus_path="$CORPUS_DIR/$topic"
    local readme_file="$corpus_path/README.md"
    local example_file="$corpus_path/example.md"

    if [[ -z "$topic" ]]; then
        echo -e "${RED}❌ Topic name must not be empty${NC}"
        return 1
    fi

    echo -e "${BLUE}🛠️  Creating template for topic: $topic${NC}"

    # Create directory if it doesn't exist
    if ! mkdir -p "$corpus_path"; then
        echo -e "${RED}❌ Could not create directory: $corpus_path${NC}"
        return 1
    fi

    # Create template files (unquoted heredoc delimiter: $topic is expanded,
    # escaped backticks become literal backticks)
    if [[ -e "$readme_file" ]]; then
        echo -e "${YELLOW}⚠️  Keeping existing file: $readme_file${NC}"
    else
        cat > "$readme_file" << EOF
# $topic Corpus

This directory contains documentation and resources for $topic.

## File Format Guidelines

- Use **Markdown (.md)** for structured content with headers
- Use **Plain text (.txt)** for simple notes and documentation
- Use **HTML (.html)** for rich content and formatting
- File names should be descriptive: \`topic_concept_name.md\`

## Content Organization

- Group related concepts in single files
- Use clear, descriptive headers
- Include code examples where relevant
- Add cross-references between related topics

## Adding New Content

1. Create new .md, .txt, or .html files in this directory
2. Run \`./corpus_manager.sh update\` to update the registry
3. Test with corpus queries
EOF
    fi

    if [[ -e "$example_file" ]]; then
        echo -e "${YELLOW}⚠️  Keeping existing file: $example_file${NC}"
    else
        cat > "$example_file" << EOF
# Example $topic Content

This is an example file showing the expected format for $topic content.

## Introduction

Add your content here using standard Markdown formatting.

## Key Concepts

- Concept 1
- Concept 2
- Concept 3

## Examples

\`\`\`bash
# Code examples go here
echo "Hello, $topic!"
\`\`\`

## References

- Link to relevant resources
- Additional reading materials
EOF
    fi

    echo -e "${GREEN}✅ Template created in: $corpus_path${NC}"
    echo -e "${YELLOW}💡 Tip: Edit the files and run 'update' to refresh the registry${NC}"
}

# --- Main Command Interface ---

# Dispatch a command-line sub-command to the matching corpus function.
#
# Arguments: $1 - command (defaults to "help"; unknown commands show help)
#            $2 - topic name, required by files/count/template/exists
# Returns:   1 when a required topic argument is missing, otherwise the
#            status of the last command run for the sub-command
#
# The dispatch lives in a function because `local` is only valid inside one;
# at the top level of the script the "count" command failed on it and
# printed an empty file count.
main() {
    local command="${1:-help}"
    local topic="${2:-}"
    local count

    case "$command" in
        "discover")
            discover_corpus
            ;;
        "update")
            update_registry
            ;;
        "list")
            list_topics
            ;;
        "files")
            if [[ -n "$topic" ]]; then
                list_corpus_files "$topic"
            else
                echo -e "${RED}Usage: $0 files <topic>${NC}"
                return 1
            fi
            ;;
        "count")
            if [[ -n "$topic" ]]; then
                count=$(count_corpus_files "$topic")
                echo -e "${BLUE}📊 $topic corpus has $count files${NC}"
            else
                echo -e "${RED}Usage: $0 count <topic>${NC}"
                return 1
            fi
            ;;
        "template")
            if [[ -n "$topic" ]]; then
                create_topic_template "$topic"
            else
                echo -e "${RED}Usage: $0 template <topic>${NC}"
                return 1
            fi
            ;;
        "exists")
            if [[ -n "$topic" ]]; then
                if corpus_exists "$topic"; then
                    echo -e "${GREEN}✅ Corpus exists for topic: $topic${NC}"
                else
                    echo -e "${RED}❌ No corpus found for topic: $topic${NC}"
                fi
            else
                echo -e "${RED}Usage: $0 exists <topic>${NC}"
                return 1
            fi
            ;;
        "help"|*)
            echo -e "${BLUE}📚 Corpus Manager${NC}"
            echo "Manage the RAG knowledge corpus"
            echo ""
            echo -e "${YELLOW}Usage: $0 <command> [arguments]${NC}"
            echo ""
            echo "Commands:"
            echo "  discover         Discover corpus structure"
            echo "  update           Update corpus registry"
            echo "  list             List all available topics"
            echo "  files <topic>    List files in a topic corpus"
            echo "  count <topic>    Count files in a topic corpus"
            echo "  exists <topic>   Check if corpus exists for topic"
            echo "  template <topic> Create template files for new topic"
            echo "  help             Show this help message"
            echo ""
            echo "Examples:"
            echo "  $0 update"
            echo "  $0 list"
            echo "  $0 template physics"
            echo "  $0 exists programming"
            ;;
    esac
}

main "$@"