-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerate_vector_db.py
More file actions
49 lines (41 loc) · 1.73 KB
/
generate_vector_db.py
File metadata and controls
49 lines (41 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
from sentence_transformers import SentenceTransformer
from chromadb import PersistentClient
# Settings for the vector store and the embedding model used by this script.
_VECTOR_DB_PATH = "./vector_db"
_COLLECTION_NAME = "codehelper_csharp"
_EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Persistent ChromaDB client (the newer ChromaDB client architecture).
client = PersistentClient(path=_VECTOR_DB_PATH)

# Reuse the collection when it already exists, otherwise create it.
collection = client.get_or_create_collection(_COLLECTION_NAME)

# Multilingual embedding model (chosen for Spanish and English support).
model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
# Basic sentence-based chunking helper.
def split_text(text, max_length=300):
    """Split *text* into chunks built from whole sentences.

    The text is cut on the literal separator ". " and the pieces are
    greedily packed back together. A sentence is added to the current chunk
    while the chunk length plus the sentence length stays below
    *max_length*; otherwise the current chunk is closed and the sentence
    starts a new one. A single sentence longer than *max_length* is kept
    whole in its own chunk rather than being cut.

    Args:
        text: The text to split.
        max_length: Soft upper bound, in characters, used when deciding
            whether another sentence still fits in the current chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = ""
    for sentence in text.split(". "):
        # Blank pieces appear for empty input or text ending in ". ";
        # turning them into a lone "." would create junk chunks.
        if not sentence.strip():
            continue

        # The split consumed the ". " separator, so restore it, but do not
        # double the punctuation of a sentence that already ends with one.
        if sentence.rstrip().endswith((".", "!", "?")):
            piece = sentence + " "
        else:
            piece = sentence + ". "

        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += piece
        else:
            # Nothing accumulated yet when the very first sentence is
            # already too long; never emit an empty chunk in that case.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = piece

    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
# Read every .txt file in ./data (skipping the index file), split it into
# chunks, embed the chunks and store them in the collection.
data_dir = "./data"
doc_id = 0

# Number of chunks embedded and written per call. Batching avoids one model
# call and one database call per chunk while keeping each request small.
_BATCH_SIZE = 128

# os.listdir() returns entries in arbitrary order; sorting keeps the
# "chunk_N" ids tied to the same content from one run to the next.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith(".txt") or "Index" in filename:
        continue

    filepath = os.path.join(data_dir, filename)
    with open(filepath, "r", encoding="utf-8") as file:
        content = file.read()

    # Drop empty chunks: they carry no text worth embedding or retrieving.
    chunks = [chunk for chunk in split_text(content) if chunk]

    for start in range(0, len(chunks), _BATCH_SIZE):
        batch = chunks[start:start + _BATCH_SIZE]
        # Encoding a list returns one embedding row per input chunk.
        embeddings = model.encode(batch).tolist()
        ids = [f"chunk_{doc_id + offset}" for offset in range(len(batch))]
        # upsert (not add) so that re-running the script against the
        # existing persistent collection refreshes entries whose ids are
        # already present.
        collection.upsert(
            documents=batch,
            embeddings=embeddings,
            ids=ids,
        )
        doc_id += len(batch)

# Done: the persistent client stores the data under vector_db/.
print(f"✅ Base vectorial creada con {doc_id} fragmentos.")