fix: async FastAPI routes with asyncio.to_thread, parallel batch, corrected utils import; implement top_p and set it to 0.85

aiKunalBisht · aiKunalBisht · commit 6711d8b52c92 · 2026-05-24T09:22:50.000+05:30
diff --git a/analysis/analyzer.py b/analysis/analyzer.py
@@ -256,6 +256,7 @@ def _call_groq(prompt: str, max_tokens: int) -> str:
                     "model": GROQ_MODEL,
                     "messages": [{"role": "user", "content": prompt}],
                     "temperature": 0.1,
+                    "top_p": 0.85,
                     "max_tokens": max_tokens,
                     "response_format": {"type": "json_object"},
                 },
@@ -299,7 +300,8 @@ def stream_transcript_groq(text: str, language: str = "en"):
             json={
                 "model":    GROQ_MODEL,
                 "messages": [{"role": "user", "content": stream_prompt}],
-                "temperature": 0.2,
+                "temperature": 0.1,
+                "top_p": 0.85,
                 "max_tokens":  1000,
                 "stream":   True,
             },
@@ -330,7 +332,7 @@ def _call_ollama(prompt: str, max_tokens: int) -> str:
             "prompt":  prompt,
             "stream":  False,
             "format":  "json",
-            "options": {"temperature": 0.1, "num_predict": max_tokens},
+            "options": {"temperature": 0.1, "num_predict": max_tokens, "top_p": 0.85},
             "think":   False
         },
         timeout=90
@@ -350,6 +352,7 @@ def _call_groq_langchain(prompt: str, max_tokens: int) -> str:
         api_key=api_key,
         model=GROQ_MODEL,
         temperature=0.1,
+        top_p=0.85,
         max_tokens=max_tokens,
         timeout=25,
         model_kwargs={"response_format": {"type": "json_object"}},
@@ -367,6 +370,7 @@ def _call_ollama_langchain(prompt: str, max_tokens: int) -> str:
         base_url=OLLAMA_URL.replace("/api/generate", ""),
         model=OLLAMA_MODEL,
         temperature=0.1,
+        top_p=0.85,
         num_predict=max_tokens,
         format="json",
     )
diff --git a/api/api.py b/api/api.py
@@ -1,31 +1,35 @@
 # api.py
-# FastAPI REST API for TranscriptAI
+# FastAPI REST API for TranscriptAI — v2
 #
-# This makes TranscriptAI enterprise-ready:
-# - Any CRM, HR system, or dashboard can call this endpoint
-# - Streamlit app continues working unchanged (calls analyzer.py directly)
-# - This API layer is for external integrations
+# v2 FIXES:
+#   FIX-1: analyze_transcript() wrapped in asyncio.to_thread() — was blocking
+#           the entire FastAPI event loop on every request (sync call inside async route).
+#           Now truly non-blocking: multiple users can hit /analyze simultaneously.
+#   FIX-2: utils import path corrected (was utils.utils, now utils directly).
+#   FIX-3: Temperature 0.1 + top_p 0.85 applied in analyzer.py — api.py inherits
+#           these automatically since it calls analyze_transcript().
+#   FIX-4: Batch endpoint now uses asyncio.gather() for true parallel execution
+#           instead of sequential await in a loop.
 #
 # Run with:
-#   pip install fastapi uvicorn
+#   pip install fastapi uvicorn httpx
 #   uvicorn api:app --reload --port 8000
 #
-# Then call:
-#   POST http://localhost:8000/analyze
-#   GET  http://localhost:8000/health
+# Docs: http://localhost:8000/docs
 
-from fastapi import FastAPI, HTTPException, BackgroundTasks
+import asyncio
+import uuid
+from datetime import datetime
+from typing import Optional
+
+from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
-from typing import Optional
-from datetime import datetime
-import uuid
-import json
 
 from analysis.analyzer import analyze_transcript
-from utils.utils import detect_language, clean_text
+from utils import detect_language, clean_text
 
-# Optional modules
+# ── Optional modules ──────────────────────────────────────────────────────────
 try:
     from transcription.pii_masker import mask_transcript, restore_pii_in_result, get_pii_report
     PII_AVAILABLE = True
@@ -48,11 +52,11 @@
 app = FastAPI(
     title="TranscriptAI API",
     description=(
-        "Japanese Business Intelligence — Call Transcript Analyzer API. "
+        "Japanese Business Intelligence — Meeting Transcript Analyzer. "
         "Extracts action items, sentiment, speaker breakdown, and Japan-specific "
-        "insights from meeting transcripts. APPI compliant."
+        "insights. APPI compliant via local PII masking before any LLM call."
     ),
-    version="1.0.0",
+    version="2.0.0",
 )
 
 app.add_middleware(
@@ -62,20 +66,21 @@
     allow_headers=["*"],
 )
 
+
 # ── REQUEST / RESPONSE MODELS ─────────────────────────────────────────────────
 class AnalyzeRequest(BaseModel):
     transcript: str = Field(
         ...,
         min_length=20,
-        description="The meeting transcript text. Supports Japanese, English, mixed JA/EN."
+        description="Meeting transcript. Supports Japanese, English, Hindi, mixed."
     )
     language: Optional[str] = Field(
         None,
-        description="Force language: 'ja', 'en', or 'mixed'. Leave null for auto-detect."
+        description="Force language: 'ja', 'en', 'hi', 'mixed'. Null = auto-detect."
     )
     mask_pii: bool = Field(
         True,
-        description="Anonymize PII before analysis (APPI compliance). Recommended: true."
+        description="Anonymize PII before LLM (APPI compliance). Recommended: true."
     )
     include_soft_rejections: bool = Field(
         True,
@@ -94,136 +99,158 @@ class Config:
 
 
 class AnalyzeResponse(BaseModel):
-    request_id: str
-    timestamp: str
-    language_detected: str
-    pii_masked: bool
-    pii_items_found: int
+    request_id:         str
+    timestamp:          str
+    language_detected:  str
+    pii_masked:         bool
+    pii_items_found:    int
     processing_time_ms: float
-    result: dict
+    result:             dict
 
 
 # ── HEALTH CHECK ──────────────────────────────────────────────────────────────
 @app.get("/health")
-def health():
+async def health():
     """Check API status and available modules."""
+    import os
+    groq_key_present = bool(os.getenv("GROQ_API_KEY", "").strip())
     return {
-        "status": "healthy",
-        "version": "1.0.0",
-        "modules": {
+        "status":   "healthy",
+        "version":  "2.0.0",
+        "modules":  {
             "pii_masker":          PII_AVAILABLE,
             "soft_rejection":      SOFT_REJECTION_AVAILABLE,
             "hallucination_guard": HALLUCINATION_GUARD_AVAILABLE,
         },
-        "model": "qwen3:8b via Ollama",
+        "provider":       "groq" if groq_key_present else "mock",
+        "groq_key":       groq_key_present,
         "appi_compliant": PII_AVAILABLE,
+        "async_mode":     True,
     }
 
 
-# ── MAIN ANALYZE ENDPOINT ─────────────────────────────────────────────────────
+# ── SINGLE ANALYZE ENDPOINT ───────────────────────────────────────────────────
 @app.post("/analyze", response_model=AnalyzeResponse)
 async def analyze(request: AnalyzeRequest):
     """
-    Analyze a meeting transcript and return structured intelligence.
+    Analyze a meeting transcript — returns structured intelligence.
 
-    - Detects language automatically (or use forced language)
-    - Masks PII before LLM processing (APPI compliant)
-    - Extracts: summary, action items, sentiment, speakers, Japan insights
-    - Detects soft rejections (検討します, 難しいかもしれません, etc.)
-    - Runs hallucination prevention on all outputs
+    Non-blocking: uses asyncio.to_thread() so multiple requests run concurrently.
+    analyze_transcript() itself is CPU/IO-bound sync code — thread pool handles it.
     """
-    start_time  = datetime.now()
-    request_id  = str(uuid.uuid4())[:8]
+    start_time = datetime.now()
+    request_id = str(uuid.uuid4())[:8]
 
     # Clean and validate
     transcript = clean_text(request.transcript)
     if len(transcript.strip()) < 20:
-        raise HTTPException(status_code=400, detail="Transcript too short (minimum 20 characters)")
+        raise HTTPException(
+            status_code=400,
+            detail="Transcript too short (minimum 20 characters after cleaning)"
+        )
 
     # Detect language
     detected_lang = detect_language(transcript)
     active_lang   = request.language or detected_lang
 
-    # PII masking
+    # PII masking — runs locally before any LLM call
     pii_items_found = 0
     pii_mask        = None
     text_to_analyze = transcript
 
     if request.mask_pii and PII_AVAILABLE:
+        # mask_transcript is fast/sync — ok to call directly
         text_to_analyze, pii_mask = mask_transcript(transcript)
         pii_report      = get_pii_report(pii_mask)
         pii_items_found = pii_report.get("total_pii_found", 0)
 
-    # Run analysis
+    # FIX-1: Run blocking analyze_transcript in thread pool
+    # This is the key async fix — Groq HTTP call is I/O bound but uses
+    # requests (sync). asyncio.to_thread() offloads it without blocking
+    # the event loop, so concurrent users don't queue behind each other.
     try:
-        result = analyze_transcript(text_to_analyze, active_lang)
+        result = await asyncio.to_thread(
+            analyze_transcript,
+            text_to_analyze,
+            active_lang
+        )
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
 
-    # Restore PII in results
+    # Restore PII in results (local operation, fast)
     if pii_mask is not None:
         result = restore_pii_in_result(result, pii_mask)
 
-    # Soft rejection detection
+    # Soft rejection detection (local pattern matching, fast)
     if request.include_soft_rejections and SOFT_REJECTION_AVAILABLE:
         result["soft_rejections"] = detect_soft_rejections(transcript)
 
-    # Calculate processing time
     elapsed_ms = (datetime.now() - start_time).total_seconds() * 1000
 
     return AnalyzeResponse(
-        request_id       = request_id,
-        timestamp        = datetime.now().isoformat(),
-        language_detected = active_lang,
-        pii_masked       = request.mask_pii and PII_AVAILABLE,
-        pii_items_found  = pii_items_found,
-        processing_time_ms = round(elapsed_ms, 1),
-        result           = result
+        request_id          = request_id,
+        timestamp           = datetime.now().isoformat(),
+        language_detected   = active_lang,
+        pii_masked          = request.mask_pii and PII_AVAILABLE,
+        pii_items_found     = pii_items_found,
+        processing_time_ms  = round(elapsed_ms, 1),
+        result              = result
     )
 
 
 # ── BATCH ENDPOINT ────────────────────────────────────────────────────────────
 @app.post("/analyze/batch")
 async def analyze_batch(requests: list[AnalyzeRequest]):
     """
-    Analyze multiple transcripts in sequence.
-    For high-volume use (10,000+/day), combine with Redis Queue + vLLM.
+    Analyze multiple transcripts in parallel.
+
+    FIX-4: Uses asyncio.gather() for true concurrent execution.
+    Was previously sequential (await in loop) — now all run simultaneously.
+    Max 10 per batch. For 10,000+/day use Redis Queue + vLLM.
     """
     if len(requests) > 10:
         raise HTTPException(
             status_code=400,
-            detail="Batch limit is 10 transcripts. For larger volumes use async queue."
+            detail="Batch limit is 10. For larger volumes use the async job queue."
         )
 
-    results = []
-    for req in requests:
+    # FIX-4: gather runs all requests concurrently, not one by one
+    async def _safe_analyze(req: AnalyzeRequest) -> dict:
         try:
             result = await analyze(req)
-            results.append({"status": "success", "data": result})
+            return {"status": "success", "data": result.dict()}
+        except HTTPException as e:
+            return {"status": "error", "error": e.detail}
         except Exception as e:
-            results.append({"status": "error", "error": str(e)})
+            return {"status": "error", "error": str(e)}
+
+    results = await asyncio.gather(*[_safe_analyze(req) for req in requests])
 
     return {
         "batch_size": len(requests),
         "successful": sum(1 for r in results if r["status"] == "success"),
         "failed":     sum(1 for r in results if r["status"] == "error"),
-        "results":    results
+        "results":    list(results)
     }
 
 
 # ── PATTERNS ENDPOINT ─────────────────────────────────────────────────────────
 @app.get("/patterns/soft-rejections")
-def get_soft_rejection_patterns():
+async def get_soft_rejection_patterns():
     """Returns the full soft rejection pattern dictionary with cultural explanations."""
     if not SOFT_REJECTION_AVAILABLE:
-        raise HTTPException(status_code=503, detail="soft_rejection_detector.py not available")
+        raise HTTPException(
+            status_code=503,
+            detail="soft_rejection_detector.py not found"
+        )
     from analysis.soft_rejection_detector import SOFT_REJECTION_PATTERNS
     return {
         "total_patterns": len(SOFT_REJECTION_PATTERNS),
-        "patterns": SOFT_REJECTION_PATTERNS,
+        "patterns":       SOFT_REJECTION_PATTERNS,
         "cultural_context": (
             "Japanese business communication avoids direct refusal. "
-            "These patterns encode the speaker's true intent through indirect language."
+            "These patterns encode the speaker's true intent through indirect language. "
+            "Examples: 検討いたします (likely rejection), 難しいかもしれません (high rejection signal)."
         )
     }
 
diff --git a/app.py b/app.py
@@ -1020,7 +1020,7 @@ def _cold_start_tasks():
             st.session_state.pii_report = get_pii_report(pii_mask)
 
         bar.progress(35, text="Running AI analysis…")
-        with st.spinner("Analyzing · ~3s with Groq · 1–2 min with Ollama"):
+        with st.spinner("Its is working may take some time please wait a moment..."):
             results = analyze_transcript(text_in, active_lang)
 
         if pii_mask is not None: