FastAPIReranker/main.py at main · ToeiRei/FastAPIReranker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
FastAPI application that uses a Hugging Face cross-encoder reranker
to rank documents based on their similarity to a given query.

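POST /api/v1/rerank
Request: { "query": "...", "documents": [ { "id": ..., "text": "..." }, ... ] }
Response: { "data": [ { "id": ..., "similarity": ... }, ... ] }
    Entries are sorted by similarity, highest first. Documents whose text is
    empty or whitespace-only are omitted from the response.

GET /healthz
Response: { "status": "ok" }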
"""

import os
import logging
from uuid import UUID
from typing import List, Union
from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Configuration
# Each setting is read from the environment and falls back to a built-in default.
host = os.environ.get("HOST", "127.0.0.1")
port = int(os.environ.get("PORT", "8787"))
max_length = int(os.environ.get("MAX_LENGTH", "512"))
model_name = os.environ.get("MODEL", "BAAI/bge-reranker-v2-m3")

# An unset or empty DEVICE means "choose automatically": CUDA when torch
# reports it as available, otherwise the CPU.
device = os.environ.get("DEVICE")
if not device:
    device = "cuda" if torch.cuda.is_available() else "cpu"

# Echo the effective settings once, at import time.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logging.info("host: %s", host)
logging.info("port: %d", port)
logging.info("max_length: %d", max_length)
logging.info("model: %s", model_name)
logging.info("device: %s", device)

# Load model
# Tokenizer and model are created once when the module is imported; the model
# is moved to the selected device and switched to eval mode.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
model.eval()

app = FastAPI()

# Data models
class Document(BaseModel):
    # One candidate passage submitted for reranking.

    # Caller-supplied identifier (int, str or UUID). The rerank endpoint copies
    # it into the matching response entry so results can be mapped to inputs.
    id: Union[int, str, UUID]
    # Passage text that is paired with the query and scored.
    text: str

class RequestData(BaseModel):
    # Body of a rerank request: one query plus the documents to score against it.
    query: str
    documents: List[Document]

    def construct_pairs(self):
        """Return one ``[query, document text]`` pair per document, in input order."""
        texts = [document.text for document in self.documents]
        return [[self.query, text] for text in texts]

class ResponseData(BaseModel):
    # One scored document in the rerank response.

    # Identifier taken from the corresponding input document.
    id: Union[int, str, UUID]
    # Score the model assigned to the (query, document) pair; the endpoint
    # sorts its results on this value, highest first.
    similarity: float

# Endpoint
@app.post("/api/v1/rerank")
async def rerank_documents(request: RequestData):
    """Score every non-blank document against the query and rank the results.

    Args:
        request: The query and the candidate documents.

    Returns:
        A dict ``{"data": [...]}`` whose list holds one ``ResponseData`` per
        scored document, ordered by ``similarity`` from highest to lowest.
        Documents whose text is empty or whitespace-only are left out; the
        list is empty when nothing remains to score.
    """
    logging.info("Reranking %d documents for query: %.50s...", len(request.documents), request.query)

    # Nothing to rank: answer immediately without touching the model.
    if not request.documents:
        logging.info("No documents supplied, returning empty result.")
        return {"data": []}

    # Blank texts are dropped before tokenizing, so they never appear in the
    # response.
    docs = [doc for doc in request.documents if (doc.text or "").strip()]
    if not docs:
        logging.info("All documents empty after filtering, returning empty result.")
        return {"data": []}

    pairs = [[request.query, doc.text] for doc in docs]

    # NOTE(review): the tokenizer and model calls below are synchronous and this
    # coroutine never awaits, so the event loop is presumably blocked for the
    # duration of the forward pass. Confirm whether that is acceptable or
    # whether the work should be offloaded to a worker thread.
    with torch.no_grad():
        inputs = tokenizer(
            pairs,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_length,
        ).to(device)
        logits = model(**inputs, return_dict=True).logits
        # NOTE(review): view(-1) assumes one logit per pair. A model emitting
        # several logits per pair would yield more scores than documents and
        # zip() below would pair them up incorrectly. Confirm for non-default
        # MODEL values.
        # A single tolist() converts all scores at once, rather than calling
        # .item() on one 0-d tensor per document.
        similarities = logits.view(-1).float().tolist()

    response = [
        ResponseData(id=doc.id, similarity=similarity)
        for doc, similarity in zip(docs, similarities)
    ]
    response.sort(key=lambda r: r.similarity, reverse=True)

    return {"data": response}

# Basic healthcheck
@app.get("/healthz")
async def healthz():
    # Liveness probe: replies with a constant body and does not use the model.
    health_payload = {"status": "ok"}
    return health_payload

# Entrypoint
if __name__ == "__main__":
    # uvicorn is imported only when this file is executed directly as a script.
    from uvicorn import run as serve

    serve(app, host=host, port=port)