-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
101 lines (82 loc) · 2.92 KB
/
main.py
File metadata and controls
101 lines (82 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
FastAPI application that uses a Hugging Face cross-encoder reranker
to rank documents based on their similarity to a given query.
POST /api/v1/rerank
Request: { "query": "...", "documents": [ { "id": ..., "text": "..." }, ... ] }
Response: { "data": [ { "id": ..., "similarity": ... }, ... ] }
"""
import os
import logging
from uuid import UUID
from typing import List, Union
from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Configuration: every setting can be overridden through an environment variable.
host = os.environ.get("HOST", "127.0.0.1")
port = int(os.environ.get("PORT", "8787"))
max_length = int(os.environ.get("MAX_LENGTH", "512"))
model_name = os.environ.get("MODEL", "BAAI/bge-reranker-v2-m3")

# An unset or empty DEVICE falls back to CUDA when available, otherwise CPU.
device = os.environ.get("DEVICE")
if not device:
    device = "cuda" if torch.cuda.is_available() else "cpu"

# Log the effective settings once at startup.
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
logging.info("host: %s", host)
logging.info("port: %d", port)
logging.info("max_length: %d", max_length)
logging.info("model: %s", model_name)
logging.info("device: %s", device)

# Load model: tokenizer and weights are loaded once at import time and shared
# by every request.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
model.eval()  # evaluation mode; the model is only used for scoring

app = FastAPI()
# Data models
class Document(BaseModel):
    """A candidate document submitted for reranking."""

    # Caller-supplied identifier; copied into the matching response entry.
    id: Union[int, str, UUID]
    # Text that is paired with the query and scored.
    text: str
class RequestData(BaseModel):
    """Body of a rerank request: one query plus the documents to score."""

    # Query that every document is compared against.
    query: str
    # Candidate documents, in the order supplied by the caller.
    documents: List[Document]

    def construct_pairs(self):
        """Return one ``[query, document text]`` pair per document, in input order."""
        query = self.query
        return [[query, document.text] for document in self.documents]
class ResponseData(BaseModel):
    """One scored document in the rerank response."""

    # Identifier taken from the request document this score belongs to.
    id: Union[int, str, UUID]
    # Score the model assigned to the (query, document) pair.
    similarity: float
# Endpoint
@app.post("/api/v1/rerank")
async def rerank_documents(request: RequestData):
    """Score every document against the query and return them best-first.

    Documents whose text is empty or whitespace-only are dropped before
    scoring and therefore do not appear in the response.

    Args:
        request: The query and the candidate documents.

    Returns:
        ``{"data": [...]}`` where the list holds one ``ResponseData`` per
        scored document, sorted by descending ``similarity``. ``similarity``
        is the raw model logit (no activation is applied). The list is empty
        when no documents were supplied or all of them were blank.
    """
    logging.info(
        "Reranking %d documents for query: %.50s...",
        len(request.documents),
        request.query,
    )

    # Guard: no documents → return empty list
    if not request.documents:
        logging.info("No documents supplied, returning empty result.")
        return {"data": []}

    # Blank texts carry nothing to score, so they are skipped.
    docs = [doc for doc in request.documents if (doc.text or "").strip()]
    if not docs:
        logging.info("All documents empty after filtering, returning empty result.")
        return {"data": []}

    pairs = [[request.query, doc.text] for doc in docs]
    with torch.no_grad():
        inputs = tokenizer(
            pairs,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_length,
        ).to(device)
        logits = model(**inputs, return_dict=True).logits
        # Flatten to one score per pair and copy them to Python floats in a
        # single transfer, instead of one .item() call per document.
        # NOTE(review): assumes the model emits a single logit per pair, so the
        # flattened scores line up one-to-one with `docs` — confirm for other models.
        scores = logits.view(-1).float().tolist()

    response = [
        ResponseData(id=doc.id, similarity=score)
        for doc, score in zip(docs, scores)
    ]
    response.sort(key=lambda r: r.similarity, reverse=True)
    return {"data": response}
# Basic healthcheck
@app.get("/healthz")
async def healthz():
    """Return a fixed ``{"status": "ok"}`` payload; the model is not invoked."""
    payload = {"status": "ok"}
    return payload
# Entrypoint
if __name__ == "__main__":
    # Imported here so uvicorn is only required when the file is run as a script.
    import uvicorn

    # Serve the app on the configured host and port.
    uvicorn.run(app, host=host, port=port)