-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathstorage_setup.py
More file actions
457 lines (414 loc) · 17.9 KB
/
storage_setup.py
File metadata and controls
457 lines (414 loc) · 17.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
# --- File: ./storage_setup.py (UPDATED FOR 768 DIMENSIONS) ---
import sqlite3
import sqlite_vec
from pathlib import Path
DATABASE_FILE = "knowledge_base.db"
def create_unified_index(db_path, is_bake_operation=False):
"""
Creates the complete database schema for the Redleaf Engine.
This function establishes the "Unified Index" in a single SQLite file.
"""
print(f"--- Setting up the Unified Index at {db_path} ---")
conn = sqlite3.connect(db_path)
# --- Load sqlite-vec extension for native vector search ---
conn.enable_load_extension(True)
sqlite_vec.load(conn)
conn.enable_load_extension(False)
cursor = conn.cursor()
# --- Set up essential PRAGMA for performance and integrity ---
cursor.execute("PRAGMA foreign_keys = ON;")
cursor.execute("PRAGMA journal_mode = WAL;")
# === 0. Application Settings Table ===
print("Creating App Settings table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS app_settings (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
);
""")
# Set default values for settings
cursor.execute("INSERT OR IGNORE INTO app_settings (key, value) VALUES ('max_workers', '1');")
cursor.execute("INSERT OR IGNORE INTO app_settings (key, value) VALUES ('html_parsing_mode', 'generic');")
cursor.execute("INSERT OR IGNORE INTO app_settings (key, value) VALUES ('use_gpu', 'false');")
# === 1. Document Registry ===
print("Creating Document Registry...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY,
relative_path TEXT NOT NULL UNIQUE,
file_hash TEXT NOT NULL,
file_type TEXT NOT NULL,
status TEXT NOT NULL,
status_message TEXT,
added_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
processed_at TIMESTAMP,
file_modified_at TIMESTAMP,
color TEXT,
page_count INTEGER,
file_size_bytes INTEGER,
duration_seconds INTEGER,
linked_audio_path TEXT,
linked_video_path TEXT,
linked_audio_url TEXT,
last_audio_position REAL DEFAULT 0.0,
audio_offset_seconds REAL DEFAULT 0.0,
last_pdf_zoom REAL,
last_pdf_page INTEGER,
-- Optimized Read Columns (Denormalization)
cached_comment_count INTEGER DEFAULT 0,
cached_tag_count INTEGER DEFAULT 0
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_doc_status ON documents (status);")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_doc_file_type ON documents (file_type);")
# High-performance indexes for sorting/filtering
cursor.execute("CREATE INDEX IF NOT EXISTS idx_docs_processed_at ON documents(processed_at DESC)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_docs_rel_path ON documents(relative_path COLLATE NOCASE)")
# === 2. Content Index (Full-Text Search) ===
print("Creating Content Index (FTS5)...")
cursor.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS content_index USING fts5(
doc_id UNINDEXED, page_number, page_content,
tokenize = 'porter unicode61'
);
""")
# === 3. Metadata Index (Extracted Entities) ===
print("Creating Metadata Index...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS entities (
id INTEGER PRIMARY KEY,
text TEXT NOT NULL,
label TEXT NOT NULL,
UNIQUE(text, label)
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_entity_label ON entities (label);")
cursor.execute("""
CREATE TABLE IF NOT EXISTS entity_appearances (
doc_id INTEGER NOT NULL,
entity_id INTEGER NOT NULL,
page_number INTEGER NOT NULL,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE,
FOREIGN KEY (entity_id) REFERENCES entities(id) ON DELETE CASCADE,
PRIMARY KEY (doc_id, entity_id, page_number)
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_appearance_entity_id ON entity_appearances (entity_id);")
# === 4. Curation, User, and Tagging Layer ===
print("Creating Curation, User, and Tagging Layer...")
print("Creating Users table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS users (
id INTEGER PRIMARY KEY,
username TEXT NOT NULL UNIQUE,
password_hash TEXT NOT NULL,
role TEXT NOT NULL, -- 'admin' or 'user'
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("Creating Invitation Tokens table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS invitation_tokens (
id INTEGER PRIMARY KEY,
token_value TEXT NOT NULL UNIQUE,
created_by_user_id INTEGER NOT NULL,
claimed_by_user_id INTEGER UNIQUE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
claimed_at TIMESTAMP,
FOREIGN KEY (created_by_user_id) REFERENCES users(id) ON DELETE CASCADE,
FOREIGN KEY (claimed_by_user_id) REFERENCES users(id) ON DELETE SET NULL
);
""")
print("Creating Catalogs table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS catalogs (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE,
description TEXT,
catalog_type TEXT NOT NULL DEFAULT 'user', -- 'user', 'podcast', 'favorites'
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_catalog_type ON catalogs (catalog_type);")
if not is_bake_operation:
try:
cursor.execute(
"INSERT INTO catalogs (name, description, catalog_type) VALUES (?, ?, ?)",
('⭐ Favorites', 'Documents you have marked as a favorite.', 'favorites')
)
print("Created default '⭐ Favorites' catalog.")
except sqlite3.IntegrityError:
print("'⭐ Favorites' catalog already exists.")
print("Creating Document-Catalogs link table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS document_catalogs (
doc_id INTEGER NOT NULL,
catalog_id INTEGER NOT NULL,
added_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE,
FOREIGN KEY (catalog_id) REFERENCES catalogs(id) ON DELETE CASCADE,
PRIMARY KEY (doc_id, catalog_id)
);
""")
print("Creating Document Curation (Private Notes) table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS document_curation (
doc_id INTEGER NOT NULL,
user_id INTEGER NOT NULL,
note TEXT,
updated_at TIMESTAMP,
PRIMARY KEY (doc_id, user_id),
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE,
FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_curation_user_doc ON document_curation(doc_id, user_id)")
print("Creating Document Comments table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS document_comments (
id INTEGER PRIMARY KEY,
doc_id INTEGER NOT NULL,
user_id INTEGER NOT NULL,
comment_text TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE,
FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_comments_doc_lookup ON document_comments(doc_id)")
print("Creating Tags table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE
);
""")
print("Creating Document-Tags link table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS document_tags (
doc_id INTEGER NOT NULL,
tag_id INTEGER NOT NULL,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tags(id) ON DELETE CASCADE,
PRIMARY KEY (doc_id, tag_id)
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_doctags_tag_id ON document_tags (tag_id);")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_tags_doc_lookup ON document_tags(doc_id)")
# === 5. Aggregated View Cache ===
print("Creating Aggregated View Cache...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS browse_cache (
entity_id INTEGER PRIMARY KEY,
entity_text TEXT NOT NULL,
entity_label TEXT NOT NULL,
document_count INTEGER NOT NULL,
appearance_count INTEGER NOT NULL,
FOREIGN KEY (entity_id) REFERENCES entities(id) ON DELETE CASCADE
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_cache_label_count ON browse_cache (entity_label, document_count DESC);")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_cache_label_text ON browse_cache (entity_label, entity_text COLLATE NOCASE);")
# === 6. Entity Relationship Graph ===
print("Creating Entity Relationship Graph...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS entity_relationships (
id INTEGER PRIMARY KEY,
subject_entity_id INTEGER NOT NULL,
object_entity_id INTEGER NOT NULL,
relationship_phrase TEXT NOT NULL,
doc_id INTEGER NOT NULL,
page_number INTEGER NOT NULL,
FOREIGN KEY (subject_entity_id) REFERENCES entities(id) ON DELETE CASCADE,
FOREIGN KEY (object_entity_id) REFERENCES entities(id) ON DELETE CASCADE,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_subject ON entity_relationships (subject_entity_id);")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_object ON entity_relationships (object_entity_id);")
# === 6b. SRT Cue Index ===
print("Creating SRT Cue Index...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS srt_cues (
id INTEGER PRIMARY KEY,
doc_id INTEGER NOT NULL,
sequence INTEGER NOT NULL,
timestamp TEXT NOT NULL,
dialogue TEXT NOT NULL,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE,
UNIQUE(doc_id, sequence)
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_srt_cues_doc_id ON srt_cues (doc_id);")
# === 7. Archived Relationships ===
print("Creating Archived Relationships table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS archived_relationships (
subject_entity_id INTEGER NOT NULL,
object_entity_id INTEGER NOT NULL,
relationship_phrase TEXT NOT NULL,
archived_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (subject_entity_id, object_entity_id, relationship_phrase)
);
""")
# === 8. Synthesis & Reporting Layer ===
print("Creating Synthesis & Reporting Layer...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS document_metadata (
doc_id INTEGER PRIMARY KEY,
csl_json TEXT,
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_by_user_id INTEGER,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE,
FOREIGN KEY (updated_by_user_id) REFERENCES users(id) ON DELETE SET NULL
);
""")
print("Creating Email Metadata table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS email_metadata (
doc_id INTEGER PRIMARY KEY,
from_address TEXT,
to_addresses TEXT,
cc_addresses TEXT,
subject TEXT,
sent_at TIMESTAMP,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE
);
""")
cursor.execute("""
CREATE TABLE IF NOT EXISTS synthesis_reports (
id INTEGER PRIMARY KEY,
title TEXT NOT NULL,
content_json TEXT,
owner_id INTEGER NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP,
FOREIGN KEY (owner_id) REFERENCES users(id) ON DELETE CASCADE
);
""")
cursor.execute("""
CREATE TABLE IF NOT EXISTS synthesis_citations (
id INTEGER PRIMARY KEY,
citation_instance_uuid TEXT NOT NULL UNIQUE,
report_id INTEGER NOT NULL,
source_doc_id INTEGER NOT NULL,
page_number INTEGER,
quoted_text TEXT,
prefix TEXT,
suffix TEXT,
suppress_author BOOLEAN DEFAULT FALSE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (report_id) REFERENCES synthesis_reports(id) ON DELETE CASCADE,
FOREIGN KEY (source_doc_id) REFERENCES documents(id) ON DELETE CASCADE
);
""")
# === 9. Vector Search Index (Updated for sqlite-vec) ===
print("Creating Vector Metadata Tables...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS embedding_chunks (
id INTEGER PRIMARY KEY,
doc_id INTEGER NOT NULL,
page_number INTEGER NOT NULL,
chunk_text TEXT NOT NULL,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_embedding_doc_id ON embedding_chunks (doc_id);")
cursor.execute("""
CREATE TABLE IF NOT EXISTS super_embedding_chunks (
id INTEGER PRIMARY KEY,
doc_id INTEGER NOT NULL,
page_number INTEGER NOT NULL,
entity_id INTEGER NOT NULL,
chunk_text TEXT NOT NULL,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE,
FOREIGN KEY (entity_id) REFERENCES entities(id) ON DELETE CASCADE
);
""")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_super_embedding_doc_id ON super_embedding_chunks (doc_id);")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_super_embedding_entity_id ON super_embedding_chunks (entity_id);")
print("Creating sqlite-vec Virtual Tables...")
# --- CHANGED TO 768 DIMENSIONS ---
cursor.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS vec_embedding_chunks USING vec0(
chunk_id INTEGER PRIMARY KEY,
embedding float[768]
);
""")
cursor.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS vec_super_embedding_chunks USING vec0(
chunk_id INTEGER PRIMARY KEY,
embedding float[768]
);
""")
# === 10. USER-DRIVEN WEIGHTING ===
print("Creating Boosted Relationships table...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS boosted_relationships (
id INTEGER PRIMARY KEY,
user_id INTEGER NOT NULL,
source_entity_id INTEGER NOT NULL,
target_entity_id INTEGER NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(user_id, source_entity_id, target_entity_id),
FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE,
FOREIGN KEY (source_entity_id) REFERENCES entities(id) ON DELETE CASCADE,
FOREIGN KEY (target_entity_id) REFERENCES entities(id) ON DELETE CASCADE
);
""")
# === 11. AUTOMATIC MAINTENANCE TRIGGERS ===
print("Installing automatic maintenance triggers...")
# Comments
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS trg_comment_added AFTER INSERT ON document_comments
BEGIN UPDATE documents SET cached_comment_count = cached_comment_count + 1 WHERE id = NEW.doc_id; END;
""")
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS trg_comment_deleted AFTER DELETE ON document_comments
BEGIN UPDATE documents SET cached_comment_count = MAX(0, cached_comment_count - 1) WHERE id = OLD.doc_id; END;
""")
# Tags
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS trg_tag_added AFTER INSERT ON document_tags
BEGIN UPDATE documents SET cached_tag_count = cached_tag_count + 1 WHERE id = NEW.doc_id; END;
""")
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS trg_tag_deleted AFTER DELETE ON document_tags
BEGIN UPDATE documents SET cached_tag_count = MAX(0, cached_tag_count - 1) WHERE id = OLD.doc_id; END;
""")
# --- Vector Garbage Collection Triggers ---
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS trg_delete_vec_embedding AFTER DELETE ON embedding_chunks
BEGIN
DELETE FROM vec_embedding_chunks WHERE chunk_id = OLD.id;
END;
""")
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS trg_delete_vec_super_embedding AFTER DELETE ON super_embedding_chunks
BEGIN
DELETE FROM vec_super_embedding_chunks WHERE chunk_id = OLD.id;
END;
""")
conn.commit()
conn.close()
print("--- Unified Index setup is complete. ---")
if __name__ == '__main__':
db_file = Path(DATABASE_FILE)
if db_file.exists():
response = input(f"Are you sure you want to delete the existing database at '{db_file}'? [y/N] ")
if response.lower() == 'y':
db_file.unlink()
print(f"Removed existing database file: {db_file}")
create_unified_index(db_file)
if db_file.exists():
print(f"\nSuccessfully created '{db_file}'.")
else:
print(f"\nError: Database file '{db_file}' was not created.")
else:
print("Aborting. No changes made.")
else:
create_unified_index(db_file)
if db_file.exists():
print(f"\nSuccessfully created '{db_file}'.")
else:
print(f"\nError: Database file '{db_file}' was not created.")