-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvectorstore.py
More file actions
90 lines (72 loc) · 2.69 KB
/
vectorstore.py
File metadata and controls
90 lines (72 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""Embedding model and Chroma vector store."""
from __future__ import annotations
import gc
import shutil
import time
from pathlib import Path
from typing import TYPE_CHECKING
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
if TYPE_CHECKING:
from langchain_core.documents import Document
from config import Settings
def _rmtree_robust(path: Path) -> None:
    """Delete the directory tree at ``path``, retrying while it is locked.

    Intended for Windows, where Chroma may keep file handles open for a short
    while after the store object is dropped. Does nothing when ``path`` does
    not exist. After the retries are exhausted, one last removal is attempted
    with errors ignored, so this function does not raise ``PermissionError``.
    """
    if path.exists():
        # Presumably lets unreferenced objects that still hold file handles be
        # finalized before the first removal attempt.
        gc.collect()
        time.sleep(0.2)
        for retry_no in range(1, 9):
            try:
                shutil.rmtree(path)
            except PermissionError:
                gc.collect()
                # Linear back-off: 0.4 s, 0.8 s, ... up to 3.2 s.
                time.sleep(0.4 * retry_no)
            else:
                return
        # Last resort: remove whatever can be removed and swallow the rest.
        shutil.rmtree(path, ignore_errors=True)
def build_embeddings(settings: Settings) -> HuggingFaceEmbeddings:
    """Create the embedding model named by ``settings.embedding_model_name``.

    Embeddings come from local sentence-transformers models (OpenRouter does
    not expose an embeddings API).
    """
    model_name = settings.embedding_model_name
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return embedder
def _persist_has_chroma(persist_dir: Path) -> bool:
    """Tell whether ``persist_dir`` contains a ``chroma.sqlite3`` file.

    The presence of that file is taken as the marker of a Chroma 0.4+ SQLite
    store; its contents are not inspected.
    """
    sqlite_marker = persist_dir.joinpath("chroma.sqlite3")
    return sqlite_marker.is_file()
def open_vectorstore(persist_dir: Path, embedding: HuggingFaceEmbeddings) -> Chroma:
    """Return a Chroma store backed by the index persisted in ``persist_dir``.

    ``embedding`` is passed to Chroma as its embedding function.
    """
    store_location = str(persist_dir)
    return Chroma(embedding_function=embedding, persist_directory=store_location)
def build_vectorstore(
    splits: list[Document],
    embedding: HuggingFaceEmbeddings,
    persist_dir: Path,
) -> Chroma:
    """Index ``splits`` into a new Chroma store persisted in ``persist_dir``.

    A directory already present at ``persist_dir`` is removed via
    ``_rmtree_robust`` and then recreated (including missing parents) before
    the documents are embedded with ``embedding`` and written.
    """
    stale_store_present = persist_dir.exists()
    if stale_store_present:
        _rmtree_robust(persist_dir)
    persist_dir.mkdir(parents=True, exist_ok=True)

    store = Chroma.from_documents(
        persist_directory=str(persist_dir),
        embedding=embedding,
        documents=splits,
    )
    return store
def ensure_vectorstore(
    settings: Settings,
    embedding: HuggingFaceEmbeddings,
    persist_dir: Path,
    *,
    force_rebuild: bool = False,
) -> Chroma:
    """Return a Chroma store for ``persist_dir``, building it from PDFs if needed.

    An existing store (detected by ``_persist_has_chroma``) is opened as-is.
    When there is none, when opening it fails, or when ``force_rebuild`` is
    set, the directory is wiped and the index is rebuilt from the PDFs loaded
    through ``settings``.

    Args:
        settings: Passed to ``load_pdf_documents`` to locate the source PDFs.
        embedding: Embedding model used both to open and to build the store.
        persist_dir: Directory holding (or that will hold) the Chroma index.
        force_rebuild: Delete any existing index first and re-ingest.

    Returns:
        The opened or freshly built Chroma store.
    """
    import logging

    if force_rebuild:
        # _rmtree_robust returns immediately for a missing path and runs
        # gc.collect() itself, so no extra guard or collection is needed here.
        _rmtree_robust(persist_dir)

    # is_file() is False for a missing directory, so no separate exists() check.
    if _persist_has_chroma(persist_dir):
        try:
            return open_vectorstore(persist_dir, embedding)
        except Exception:
            # Deliberate best-effort recovery: an unreadable store is discarded
            # and rebuilt. Log the cause so the deletion is not silent.
            logging.getLogger(__name__).warning(
                "Could not open Chroma store at %s; deleting it and rebuilding.",
                persist_dir,
                exc_info=True,
            )
            _rmtree_robust(persist_dir)

    # Deferred import: only needed on the rebuild path.
    # NOTE(review): may also be avoiding an import cycle — confirm.
    from documents import load_pdf_documents, split_documents

    docs = load_pdf_documents(settings)
    splits = split_documents(docs)
    return build_vectorstore(splits, embedding, persist_dir)