|
4 | 4 | import time |
5 | 5 | import logging |
6 | 6 | import os |
| 7 | +import multiprocessing |
| 8 | +import psutil |
| 9 | +from concurrent.futures import ThreadPoolExecutor |
7 | 10 | from InstructorEmbedding import INSTRUCTOR |
8 | 11 |
|
9 | 12 | # Configure logs |
|
12 | 15 |
|
13 | 16 | app = FastAPI() |
14 | 17 |
|
| 18 | +def get_system_resources(): |
| 19 | + """Obtiene información sobre los recursos del sistema""" |
| 20 | + cpu_count = multiprocessing.cpu_count() |
| 21 | + memory = psutil.virtual_memory() |
| 22 | + return { |
| 23 | + "cpu_count": cpu_count, |
| 24 | + "memory_total": memory.total, |
| 25 | + "memory_available": memory.available |
| 26 | + } |
| 27 | + |
| 28 | +def calculate_optimal_parameters(system_resources): |
| 29 | + """Calcula los parámetros óptimos basados en los recursos del sistema""" |
| 30 | + # Usamos el 75% de los núcleos disponibles para workers |
| 31 | + max_workers = max(1, int(system_resources["cpu_count"] * 0.75)) |
| 32 | + |
| 33 | + # Derive the chunk size from the available memory: |
| 34 | + # assume each chunk costs roughly 1 MB |
| 35 | + # and keep a 50% safety margin |
| 36 | + memory_per_chunk = 1024 * 1024 # 1 MB per chunk |
| 37 | + available_chunks = int((system_resources["memory_available"] * 0.5) / memory_per_chunk) |
| 38 | + chunk_size = max(4, min(32, available_chunks // max_workers)) |
| 39 | + |
| 40 | + logger.info(f"System resources: {system_resources}") |
| 41 | + logger.info(f"Calculated parameters - max_workers: {max_workers}, chunk_size: {chunk_size}") |
| 42 | + |
| 43 | + return max_workers, chunk_size |
| 44 | + |
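As a sanity check on the heuristic above, here is the arithmetic for a hypothetical host with 8 cores and 4 GiB of memory available (illustrative figures, not from this commit):

    # Hypothetical host: 8 cores, 4 GiB of memory currently available
    cpu_count = 8
    memory_available = 4 * 1024**3
    max_workers = max(1, int(cpu_count * 0.75))                       # -> 6
    available_chunks = int((memory_available * 0.5) / (1024 * 1024))  # -> 2048
    chunk_size = max(4, min(32, available_chunks // max_workers))     # min(32, 341) -> 32

In practice chunk_size sits at the 32 cap on any reasonably provisioned machine; the max(4, ...) floor only kicks in on severely memory-constrained hosts.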
| 45 | +# Get the system resources and compute optimal parameters at startup |
| 46 | +system_resources = get_system_resources() |
| 47 | +max_workers, chunk_size = calculate_optimal_parameters(system_resources) |
| 48 | + |
15 | 49 | # Global model state |
16 | 50 | model_state = { |
17 | 51 | "status": "starting", # starting, downloading, loading, warming_up, ready, error |
18 | 52 | "progress": 0, |
19 | 53 | "message": "Starting up...", |
20 | | - "model": None |
| 54 | + "model": None, |
| 55 | + "executor": ThreadPoolExecutor(max_workers=max_workers), |
| 56 | + "chunk_size": chunk_size, |
| 57 | + "system_resources": system_resources |
21 | 58 | } |
22 | 59 |
|
23 | 60 | # Request schema for the endpoint |
@@ -99,5 +136,34 @@ async def vectorize(req: VectorizeRequest): |
99 | 136 |
|
100 | 137 | # Pair each text with its corresponding instruction |
101 | 138 | pairs = [[instruction, text] for instruction, text in zip(req.instructions, req.texts)] |
102 | | - embeddings = model_state["model"].encode(pairs) |
103 | | - return {"embeddings": embeddings.tolist()} |
| 139 | + |
| 140 | + # Split the pairs into chunks for parallel processing |
| 141 | + chunks = [pairs[i:i + model_state["chunk_size"]] for i in range(0, len(pairs), model_state["chunk_size"])] |
| 142 | + |
| 143 | + # Encode the chunks in parallel on the thread pool |
| 144 | + futures = [] |
| 145 | + for chunk in chunks: |
| 146 | + future = model_state["executor"].submit(model_state["model"].encode, chunk) |
| 147 | + futures.append(future) |
| 148 | + |
| 149 | + # Collect results in submission order so embeddings stay aligned with inputs |
| 150 | + embeddings = [] |
| 151 | + for future in futures: |
| 152 | + embeddings.extend(future.result().tolist()) |
| 153 | + |
| 154 | + return {"embeddings": embeddings} |
| 155 | + |
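One caveat with the loop above: `future.result()` blocks the event loop while each chunk finishes, so concurrent requests stall even though the handler is declared `async`. The worker threads can still encode in parallel because PyTorch generally releases the GIL during tensor ops, but a non-blocking variant would await the pool instead of joining it. A minimal sketch under that assumption (the helper name is hypothetical, not part of this commit):

    import asyncio

    async def encode_chunks(chunks):
        loop = asyncio.get_running_loop()
        # Submit every chunk to the existing pool and await them together;
        # asyncio.gather returns results in submission order, keeping alignment.
        tasks = [
            loop.run_in_executor(model_state["executor"], model_state["model"].encode, chunk)
            for chunk in chunks
        ]
        results = await asyncio.gather(*tasks)
        return [vec for batch in results for vec in batch.tolist()]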
| 156 | +@app.get("/system-info") |
| 157 | +async def get_system_info(): |
| 158 | + """Endpoint para obtener información del sistema y parámetros actuales""" |
| 159 | + return { |
| 160 | + "system_resources": model_state["system_resources"], |
| 161 | + "parameters": { |
| 162 | + "max_workers": model_state["executor"]._max_workers, |
| 163 | + "chunk_size": model_state["chunk_size"] |
| 164 | + } |
| 165 | + } |
| 166 | + |
| 167 | +@app.on_event("shutdown") |
| 168 | +async def shutdown_event(): |
| 169 | + model_state["executor"].shutdown(wait=True) |
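For reference, a hypothetical client session against this service (assuming it is served on localhost:8000; the instruction string is an example, not from this commit):

    import requests

    resp = requests.post(
        "http://localhost:8000/vectorize",
        json={
            "texts": ["FastAPI services are easy to deploy."],
            "instructions": ["Represent the sentence for retrieval:"],
        },
    )
    print(len(resp.json()["embeddings"]))  # one vector per input text

    # Inspect the auto-tuned parameters the service chose at startup
    print(requests.get("http://localhost:8000/system-info").json())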