diff --git a/cv_eval/llm_scorer.py b/cv_eval/llm_scorer.py
index 970ea7a..3a4ec24 100644
--- a/cv_eval/llm_scorer.py
+++ b/cv_eval/llm_scorer.py
@@ -304,15 +304,27 @@ def unified_evaluate(self, cv_text: str, jd_text: str = "") -> dict:
         else:
             prompt = CV_ONLY_EVALUATION_PROMPT.format(cv_text=cv_text)
 
-        raw = self._call_llm(prompt)
-        cleaned = self._extract_json_from_response(raw)
-        return json.loads(cleaned)
+        return self._generate_and_parse_json(prompt)
 
     # ---------- CV only (legacy alias) ----------
     def evaluate_cv_only(self, cv_text: str) -> dict:
         return self.unified_evaluate(cv_text=cv_text, jd_text="")
 
     # ---------- Internals ----------
+    def _generate_and_parse_json(self, prompt: str) -> dict:
+        """Call the LLM and parse its JSON reply; retry on invalid JSON, re-raising JSONDecodeError after 3 tries."""
+        max_attempts, raw = 3, ""  # raw is pre-bound so the final error log can never hit an unbound name
+        for attempt in range(1, max_attempts + 1):
+            try:
+                raw = self._call_llm(prompt)
+                return json.loads(self._extract_json_from_response(raw))
+            except json.JSONDecodeError as e:
+                if attempt == max_attempts:  # out of retries: log the payload once and propagate the error
+                    logger.error("Final JSON parsing failure. Raw response: %s", raw)
+                    raise
+                logger.warning("JSON parsing failed (attempt %d/%d): %s. Retrying.", attempt, max_attempts, e)
+        raise ValueError("Failed to generate valid JSON")  # defensive only; unreachable while max_attempts >= 1
+
     def _call_llm(self, prompt: str) -> str:
         for attempt in range(3):
             try:
@@ -324,6 +336,7 @@ def _call_llm(self, prompt: str) -> str:
                     ],
                     temperature=self.temperature,
                     max_tokens=3500,
+                    response_format={"type": "json_object"},
                 )
                 return resp.choices[0].message.content.strip()
             except Exception as e:
@@ -337,9 +350,7 @@ def improvement(self, cv_text: str, jd_text: str) -> dict:
             raise ValueError("Both CV text and JD text are required for improvement")
 
         prompt = IMPROVEMENT_PROMPT.format(cv_text=cv_text, jd_text=jd_text)
-        raw = self._call_llm(prompt)
-        cleaned = self._extract_json_from_response(raw)
-        return json.loads(cleaned)
+        return self._generate_and_parse_json(prompt)
 
 
     @staticmethod
diff --git a/debug_groq.py b/debug_groq.py
new file mode 100644
index 0000000..f928eb6
--- /dev/null
+++ b/debug_groq.py
@@ -0,0 +1,20 @@
+
+"""Debug helper: print what the installed Groq SDK client exposes, in particular its ``audio`` attribute."""
+try:
+    from groq import Groq
+
+    print(f"Groq imported successfully: {Groq}")
+    # Dummy key: only used to construct the client so its attributes can be inspected
+    client = Groq(api_key="gsk_dummy")
+    print(f"Client type: {type(client)}")
+    print(f"Client dir: {dir(client)}")
+
+    if hasattr(client, 'audio'):
+        print("Client has 'audio' attribute")
+        print(f"Audio type: {type(client.audio)}")
+        print(f"Audio dir: {dir(client.audio)}")
+    else:
+        print("Client MISSING 'audio' attribute")
+
+except Exception as e:
+    print(f"Error: {e}")
diff --git a/debug_voice_import.py b/debug_voice_import.py
new file mode 100644
index 0000000..e191bdc
--- /dev/null
+++ b/debug_voice_import.py
@@ -0,0 +1,16 @@
+
+"""Debug helper: report whether ``interview.voice_analyzer`` imports cleanly, with a traceback if not."""
+import sys
+import traceback
+
+print(f"Python executable: {sys.executable}")
+try:
+    print("Attempting to import interview.voice_analyzer...")
+    from interview.voice_analyzer import voice_analyzer
+    print(f"Successfully imported voice_analyzer: {voice_analyzer}")
+except ImportError:
+    print("Caught ImportError!")
+    traceback.print_exc()
+except Exception:
+    print("Caught unexpected exception!")
+    traceback.print_exc()
diff --git a/interview/session_manager.py b/interview/session_manager.py
index 8d92880..5eca608 100644
--- a/interview/session_manager.py
+++ b/interview/session_manager.py
@@ -544,7 +544,12 @@ def _combine_text_voice_scores(self, tech_eval: Dict, voice_eval: Dict) -> Dict:
         # If fallback local 'raw' exists, try to use its finer-grained breakdown for suggestions
         raw_text_eval = tech_eval.get("raw") or {}
 
-        voice_score = voice_eval.get("voice_scores", {}).get("total", 0.0)  # Out of 6
+        voice_data = voice_eval.get("voice_scores", {})
+        # Handle both flat (legacy) and nested (new) structures
+        if "raw" in voice_data:
+             voice_score = voice_data.get("raw", {}).get("total", 0.0)
+        else:
+             voice_score = voice_data.get("total", 0.0)
 
         # If no voice data, slightly penalize overall outcome
         if voice_score == 0.0:
diff --git a/interview/speech_to_text.py b/interview/speech_to_text.py
index c7770ad..2b02bbb 100644
--- a/interview/speech_to_text.py
+++ b/interview/speech_to_text.py
@@ -248,13 +248,20 @@ def convert_audio_to_text(self, audio_data: bytes, language: Optional[str] = Non
 
             if isinstance(response, dict):
                 text = response.get("text")
-
                 if not text and "segments" in response:
                     text = " ".join(
                         seg.get("text", "") for seg in response.get("segments", [])
                     )
             else:
+                # With Groq Python SDK v1.0+, response is a Transcription object
+                # It might have a 'text' attribute.
                 text = getattr(response, "text", None)
+                if text is None:
+                     logger.warning(f"ASR response object content: {dir(response)}")
+            
+            # Explicitly check for string "None." which might be a hallucination or artifact
+            if text == "None." or text == "None":
+                 text = ""
 
             text = (text or "").strip()
             if not text:
diff --git a/interview/voice_analyzer.py b/interview/voice_analyzer.py
index e7f9d1a..87f036b 100644
--- a/interview/voice_analyzer.py
+++ b/interview/voice_analyzer.py
@@ -63,11 +63,15 @@ def analyze_voice(
                 return self._fail("no_audio_data")
 
             # -------- IN-MEMORY AUDIO DECODE (NO TEMP FILES) --------
+            # logger.info(f"[VoiceAnalyzer] Decoding {len(audio_data)} bytes...")
             audio_buffer = io.BytesIO(audio_data)
             y, sr = sf.read(audio_buffer, dtype="float32")
 
             if y is None or len(y) == 0:
+                logger.warning("[VoiceAnalyzer] sf.read returned empty array")
                 return self._fail("empty_audio_after_decode")
+            
+            # logger.info(f"[VoiceAnalyzer] Decoded: {len(y)} samples at {sr}Hz (Duration: {len(y)/sr:.2f}s)")
 
             # Convert to mono if needed
             if y.ndim > 1:
@@ -75,11 +79,18 @@ def analyze_voice(
 
             # Resample if needed
             if sr != self.sample_rate:
+                # logger.info(f"[VoiceAnalyzer] Resampling from {sr} to {self.sample_rate}")
                 y = librosa.resample(y, orig_sr=sr, target_sr=self.sample_rate)
                 sr = self.sample_rate
 
             analysis = self._analyze_audio_features(y, sr, transcript)
             analysis["analysis_ok"] = True
+            
+            # Check if metrics are all zero, which is suspicious
+            metrics = analysis.get("voice_metrics", {})
+            if metrics.get("duration", 0) == 0:
+                logger.warning(f"[VoiceAnalyzer] Processed audio but duration is 0. Metrics: {metrics}")
+
             return analysis
 
         except Exception as e:
@@ -341,3 +352,7 @@ def _fail(self, code: str) -> Dict[str, Any]:
                 "wpm_source": "none",
             },
         }
+
+
+# Shared module-level instance, built at import time (use: from interview.voice_analyzer import voice_analyzer)
+voice_analyzer = VoiceAnalyzer()
diff --git a/requirements.txt b/requirements.txt
index e6318d2..3881de2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,7 +35,7 @@ langchain-openai==0.3.31
 langgraph==0.6.6
 
 # LLM provider for CV evaluation
-groq==0.4.2
+groq>=1.0.0
 
 # Vector embeddings and search
 sentence-transformers==3.0.1
diff --git a/tests/test_api.py.disabled b/tests/test_api.py.disabled
new file mode 100644
index 0000000..20c5511
--- /dev/null
+++ b/tests/test_api.py.disabled
@@ -0,0 +1,449 @@
+import pytest
+import uuid
+from unittest.mock import patch, MagicMock
+from fastapi.testclient import TestClient
+from sqlalchemy.orm import Session
+
+from core.models import User, Session as SessionModel, Artifact, Question, Answer, Score
+from apps.api.deps.auth import get_password_hash
+
+
+class TestAuthEndpoints:
+    """Tests for the /v1/auth register, login and me endpoints."""
+    
+    def test_register_user(self, client: TestClient, db_session: Session):
+        """POST /v1/auth/register returns 200 and the new user (with an id, without hashed_password)."""
+        user_data = {
+            "email": "newuser@example.com",
+            "password": "newpassword123",
+            "full_name": "New User"
+        }
+        
+        response = client.post("/v1/auth/register", json=user_data)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert data["email"] == user_data["email"]
+        assert data["full_name"] == user_data["full_name"]
+        assert "id" in data
+        assert "hashed_password" not in data
+    
+    def test_login_user(self, client: TestClient, db_session: Session):
+        """POST /v1/auth/login with form-encoded credentials returns a bearer access token."""
+        # Seed the user directly through the DB session (no call to /register)
+        user = User(
+            email="loginuser@example.com",
+            hashed_password=get_password_hash("loginpassword123"),
+            full_name="Login User"
+        )
+        db_session.add(user)
+        db_session.commit()
+        
+        login_data = {
+            "username": "loginuser@example.com",
+            "password": "loginpassword123"
+        }
+        
+        response = client.post("/v1/auth/login", data=login_data)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert "access_token" in data
+        assert data["token_type"] == "bearer"
+    
+    def test_get_current_user(self, client: TestClient, db_session: Session):
+        """GET /v1/auth/me with a bearer token returns the logged-in user's email and full_name."""
+        # Seed a user in the DB, then log in to obtain a bearer token
+        user = User(
+            email="currentuser@example.com",
+            hashed_password=get_password_hash("currentpassword123"),
+            full_name="Current User"
+        )
+        db_session.add(user)
+        db_session.commit()
+        
+        login_data = {
+            "username": "currentuser@example.com",
+            "password": "currentpassword123"
+        }
+        
+        login_response = client.post("/v1/auth/login", data=login_data)
+        token = login_response.json()["access_token"]
+        
+        headers = {"Authorization": f"Bearer {token}"}
+        response = client.get("/v1/auth/me", headers=headers)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert data["email"] == user.email
+        assert data["full_name"] == user.full_name
+
+
+class TestSessionEndpoints:
+    """Tests for the /v1/sessions endpoints: create, fetch, next question, answer and report."""
+    
+    def test_create_session(self, client: TestClient, db_session: Session):
+        """POST /v1/sessions with InterviewGraph.create_plan patched returns the session in "planned" status."""
+        # Seed a user in the DB, then log in to obtain a bearer token
+        user = User(
+            email="sessionuser@example.com",
+            hashed_password=get_password_hash("sessionpassword123"),
+            full_name="Session User"
+        )
+        db_session.add(user)
+        db_session.commit()
+        
+        login_response = client.post("/v1/auth/login", data={
+            "username": "sessionuser@example.com",
+            "password": "sessionpassword123"
+        })
+        token = login_response.json()["access_token"]
+        
+        headers = {"Authorization": f"Bearer {token}"}
+        session_data = {
+            "role": "Software Engineer",
+            "industry": "Technology",
+            "company": "TestCorp"
+        }
+        
+        with patch('interview.graph.InterviewGraph.create_plan') as mock_plan:
+            mock_plan.return_value = {
+                "total_questions": 10,
+                "competencies": ["technical", "behavioral"],
+                "estimated_duration": 45
+            }
+            
+            response = client.post("/v1/sessions", json=session_data, headers=headers)
+            assert response.status_code == 200
+            
+            data = response.json()
+            assert data["role"] == session_data["role"]
+            assert data["industry"] == session_data["industry"]
+            assert data["company"] == session_data["company"]
+            assert data["status"] == "planned"
+            assert "id" in data
+    
+    def test_get_session(self, client: TestClient, db_session: Session):
+        """GET /v1/sessions/{id} returns the stored session's id (as a string) and role."""
+        # Seed a user and a planned session, then log in to obtain a bearer token
+        user = User(
+            email="getsessionuser@example.com",
+            hashed_password=get_password_hash("getsessionpassword123"),
+            full_name="Get Session User"
+        )
+        db_session.add(user)
+        db_session.commit()
+        
+        session = SessionModel(
+            user_id=user.id,
+            role="Software Engineer",
+            industry="Technology",
+            company="TestCorp",
+            status="planned"
+        )
+        db_session.add(session)
+        db_session.commit()
+        
+        login_response = client.post("/v1/auth/login", data={
+            "username": "getsessionuser@example.com",
+            "password": "getsessionpassword123"
+        })
+        token = login_response.json()["access_token"]
+        
+        headers = {"Authorization": f"Bearer {token}"}
+        response = client.get(f"/v1/sessions/{session.id}", headers=headers)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert data["id"] == str(session.id)
+        assert data["role"] == session.role
+    
+    def test_get_next_question(self, client: TestClient, db_session: Session):
+        """GET /v1/sessions/{id}/next-question returns the seeded question's id, text, competency and difficulty."""
+        # Seed a user, an in-progress session and one question, then log in for a token
+        user = User(
+            email="nextquestionuser@example.com",
+            hashed_password=get_password_hash("nextquestionpassword123"),
+            full_name="Next Question User"
+        )
+        db_session.add(user)
+        db_session.commit()
+        
+        session = SessionModel(
+            user_id=user.id,
+            role="Software Engineer",
+            industry="Technology",
+            company="TestCorp",
+            status="in_progress"
+        )
+        db_session.add(session)
+        db_session.commit()
+        
+        question = Question(
+            session_id=session.id,
+            competency="technical",
+            difficulty="medium",
+            text="What is the time complexity of binary search?",
+            meta={"signals_expected": ["algorithmic thinking"], "pitfalls": ["not considering edge cases"]}
+        )
+        db_session.add(question)
+        db_session.commit()
+        
+        login_response = client.post("/v1/auth/login", data={
+            "username": "nextquestionuser@example.com",
+            "password": "nextquestionpassword123"
+        })
+        token = login_response.json()["access_token"]
+        
+        headers = {"Authorization": f"Bearer {token}"}
+        response = client.get(f"/v1/sessions/{session.id}/next-question", headers=headers)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert data["question_id"] == str(question.id)
+        assert data["text"] == question.text
+        assert data["competency"] == question.competency
+        assert data["difficulty"] == question.difficulty
+    
+    def test_submit_answer(self, client: TestClient, db_session: Session):
+        """POST /v1/sessions/{id}/answer with enqueue_scoring_job patched returns the success message and job id."""
+        # Seed a user, an in-progress session and one question, then log in for a token
+        user = User(
+            email="submitansweruser@example.com",
+            hashed_password=get_password_hash("submitanswerpassword123"),
+            full_name="Submit Answer User"
+        )
+        db_session.add(user)
+        db_session.commit()
+        
+        session = SessionModel(
+            user_id=user.id,
+            role="Software Engineer",
+            industry="Technology",
+            company="TestCorp",
+            status="in_progress"
+        )
+        db_session.add(session)
+        db_session.commit()
+        
+        question = Question(
+            session_id=session.id,
+            competency="technical",
+            difficulty="medium",
+            text="What is the time complexity of binary search?",
+            meta={"signals_expected": ["algorithmic thinking"], "pitfalls": ["not considering edge cases"]}
+        )
+        db_session.add(question)
+        db_session.commit()
+        
+        login_response = client.post("/v1/auth/login", data={
+            "username": "submitansweruser@example.com",
+            "password": "submitanswerpassword123"
+        })
+        token = login_response.json()["access_token"]
+        
+        headers = {"Authorization": f"Bearer {token}"}
+        answer_data = {
+            "question_id": str(question.id),
+            "text": "Binary search has O(log n) time complexity because it divides the search space in half with each iteration."
+        }
+        
+        with patch('apps.worker.jobs.enqueue_scoring_job') as mock_enqueue:
+            mock_enqueue.return_value = "job-123"
+            
+            response = client.post(f"/v1/sessions/{session.id}/answer", json=answer_data, headers=headers)
+            assert response.status_code == 200
+            
+            data = response.json()
+            assert data["message"] == "Answer submitted successfully"
+            assert data["job_id"] == "job-123"
+    
+    def test_get_session_report(self, client: TestClient, db_session: Session):
+        """GET /v1/sessions/{id}/report for a completed, scored session returns summary, detailed_scores and recommendations."""
+        # Seed a user and a completed session with one question, answer and score, then log in for a token
+        user = User(
+            email="reportuser@example.com",
+            hashed_password=get_password_hash("reportpassword123"),
+            full_name="Report User"
+        )
+        db_session.add(user)
+        db_session.commit()
+        
+        session = SessionModel(
+            user_id=user.id,
+            role="Software Engineer",
+            industry="Technology",
+            company="TestCorp",
+            status="completed"
+        )
+        db_session.add(session)
+        db_session.commit()
+        
+        question = Question(
+            session_id=session.id,
+            competency="technical",
+            difficulty="medium",
+            text="What is the time complexity of binary search?",
+            meta={"signals_expected": ["algorithmic thinking"], "pitfalls": ["not considering edge cases"]}
+        )
+        db_session.add(question)
+        db_session.commit()
+        
+        answer = Answer(
+            session_id=session.id,
+            question_id=question.id,
+            text="Binary search has O(log n) time complexity.",
+            meta={}
+        )
+        db_session.add(answer)
+        db_session.commit()
+        
+        score = Score(
+            answer_id=answer.id,
+            rubric_json={
+                "scores": {
+                    "clarity": 4.0,
+                    "structure": 3.5,
+                    "depth_specificity": 4.0,
+                    "role_fit": 4.5,
+                    "technical": 4.0,
+                    "communication": 4.0,
+                    "ownership": 3.5
+                },
+                "rationale": "Good technical explanation with clear structure",
+                "action_items": ["Provide more specific examples"],
+                "exemplar_snippet": None,
+                "meta": {}
+            },
+            clarity=4.0,
+            structure=3.5,
+            depth_specificity=4.0,
+            role_fit=4.5,
+            technical=4.0,
+            communication=4.0,
+            ownership=3.5
+        )
+        db_session.add(score)
+        db_session.commit()
+        
+        login_response = client.post("/v1/auth/login", data={
+            "username": "reportuser@example.com",
+            "password": "reportpassword123"
+        })
+        token = login_response.json()["access_token"]
+        
+        headers = {"Authorization": f"Bearer {token}"}
+        response = client.get(f"/v1/sessions/{session.id}/report", headers=headers)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert data["session_id"] == str(session.id)
+        assert data["status"] == "completed"
+        assert "summary" in data
+        assert "detailed_scores" in data
+        assert "recommendations" in data
+
+
+class TestUploadEndpoints:
+    """Tests for the /v1/uploads/cv and /v1/uploads/jd file upload endpoints."""
+    
+    def test_upload_cv(self, client: TestClient, db_session: Session, tmp_path):
+        """Uploading a plain-text CV to /v1/uploads/cv returns 200 with an artifact_id and a success message."""
+        # Seed a user in the DB, then log in to obtain a bearer token
+        user = User(
+            email="uploadcvuser@example.com",
+            hashed_password=get_password_hash("uploadcvpassword123"),
+            full_name="Upload CV User"
+        )
+        db_session.add(user)
+        db_session.commit()
+        
+        login_response = client.post("/v1/auth/login", data={
+            "username": "uploadcvuser@example.com",
+            "password": "uploadcvpassword123"
+        })
+        token = login_response.json()["access_token"]
+        
+        headers = {"Authorization": f"Bearer {token}"}
+        
+        # Write a small plain-text CV under the tmp_path fixture directory
+        cv_content = "John Doe\nSoftware Engineer\n5 years experience\nPython, JavaScript"
+        cv_file = tmp_path / "test_cv.txt"
+        cv_file.write_text(cv_content)
+        
+        with open(cv_file, "rb") as f:
+            files = {"file": ("test_cv.txt", f, "text/plain")}
+            response = client.post("/v1/uploads/cv", files=files, headers=headers)
+        
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert "artifact_id" in data
+        assert data["message"] == "CV uploaded successfully"
+    
+    def test_upload_jd(self, client: TestClient, db_session: Session, tmp_path):
+        """Uploading a plain-text JD to /v1/uploads/jd returns 200 with an artifact_id and a success message."""
+        # Seed a user in the DB, then log in to obtain a bearer token
+        user = User(
+            email="uploadjduser@example.com",
+            hashed_password=get_password_hash("uploadjdpassword123"),
+            full_name="Upload JD User"
+        )
+        db_session.add(user)
+        db_session.commit()
+        
+        login_response = client.post("/v1/auth/login", data={
+            "username": "uploadjduser@example.com",
+            "password": "uploadjdpassword123"
+        })
+        token = login_response.json()["access_token"]
+        
+        headers = {"Authorization": f"Bearer {token}"}
+        
+        # Write a small plain-text job description under the tmp_path fixture directory
+        jd_content = "Software Engineer Position\nRequirements: Python, 3+ years experience\nResponsibilities: Develop web applications"
+        jd_file = tmp_path / "test_jd.txt"
+        jd_file.write_text(jd_content)
+        
+        with open(jd_file, "rb") as f:
+            files = {"file": ("test_jd.txt", f, "text/plain")}
+            response = client.post("/v1/uploads/jd", files=files, headers=headers)
+        
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert "artifact_id" in data
+        assert data["message"] == "JD uploaded successfully"
+
+
+class TestHealthEndpoint:
+    """Checks for the /healthz health-check endpoint."""
+
+    def test_health_check(self, client: TestClient):
+        """GET /healthz answers 200 with a "healthy" status plus timestamp and version keys."""
+        resp = client.get("/healthz")
+        assert resp.status_code == 200
+
+        payload = resp.json()
+        assert payload["status"] == "healthy"
+        for required_key in ("timestamp", "version"):
+            assert required_key in payload
+
+
+class TestErrorHandling:
+    """Tests for the API's 422, 401 and 404 error responses."""
+    
+    def test_invalid_json_422(self, client: TestClient):
+        """Expect 422 for a non-JSON body. NOTE(review): sent without auth; may get 401 first - confirm."""
+        response = client.post("/v1/sessions", data="invalid json")
+        assert response.status_code == 422
+    
+    def test_unauthorized_401(self, client: TestClient):
+        """Expect 401 when GET /v1/sessions is called without an Authorization header."""
+        response = client.get("/v1/sessions")
+        assert response.status_code == 401
+    
+    def test_not_found_404(self, client: TestClient):
+        """Expect 404 for a random session id. NOTE(review): sent without auth; may get 401 first - confirm."""
+        response = client.get(f"/v1/sessions/{uuid.uuid4()}")
+        assert response.status_code == 404
diff --git a/tests/test_evaluator.py.disabled b/tests/test_evaluator.py.disabled
new file mode 100644
index 0000000..796c389
--- /dev/null
+++ b/tests/test_evaluator.py.disabled
@@ -0,0 +1,277 @@
+import pytest
+from interview.evaluate.rules import RulesBasedEvaluator
+from interview.evaluate.judge import LLMAsJudgeEvaluator
+from core.schemas import ScoreDetail
+
+
+class TestRulesBasedEvaluator:
+    """Test the rules-based evaluator for determinism and accuracy"""
+    
+    def setup_method(self):
+        """Set up evaluator for each test"""
+        self.evaluator = RulesBasedEvaluator()
+    
+    def test_star_pattern_detection(self):
+        """Test STAR method pattern detection"""
+        answer_with_star = """
+        When I was working as a software engineer at TechCorp, my responsibility was to 
+        optimize the database queries. I analyzed the slow queries and implemented indexing 
+        strategies. As a result, query performance improved by 40% and user satisfaction increased.
+        """
+        
+        star_scores = self.evaluator._analyze_star_patterns(answer_with_star)
+        
+        assert star_scores["situation"] > 0.5
+        assert star_scores["task"] > 0.5
+        assert star_scores["action"] > 0.5
+        assert star_scores["result"] > 0.5
+    
+    def test_metrics_detection(self):
+        """Test that the percentage regex finds a metric in the answer."""
+        import re  # function-scope import: this module has no file-level `import re`
+
+        answer_with_metrics = "I improved performance by 25% and reduced costs by $50,000 annually."
+        # NOTE(review): exercises the regex only; self.evaluator is not called here.
+        assert re.search(r'\d+\s*%', answer_with_metrics) is not None
+    
+    def test_tradeoff_detection(self):
+        """Test that trade-off phrasing is found by the case-insensitive regex."""
+        import re  # function-scope import: this module has no file-level `import re`
+
+        answer_with_tradeoff = "We had to balance speed with accuracy. On the other hand, we could have prioritized quality over time."
+        # NOTE(review): exercises the regex only; self.evaluator is not called here.
+        assert re.search(r'trade.?off|on\s+the\s+other\s+hand', answer_with_tradeoff, re.IGNORECASE) is not None
+    
+    def test_clarity_scoring(self):
+        """Test clarity score calculation"""
+        clear_answer = "I implemented a caching layer using Redis to improve response times. The solution reduced API latency from 200ms to 50ms."
+        unclear_answer = "I did some stuff with caching and it made things faster."
+        
+        clear_score = self.evaluator._calculate_clarity_score(clear_answer, {"situation": 0.8, "task": 0.7, "action": 0.9, "result": 0.8})
+        unclear_score = self.evaluator._calculate_clarity_score(unclear_answer, {"situation": 0.2, "task": 0.3, "action": 0.1, "result": 0.2})
+        
+        assert clear_score > unclear_score
+        assert clear_score >= 3.0  # Good clarity should score 3+
+        assert unclear_score <= 2.5  # Poor clarity should score 2.5 or below
+    
+    def test_structure_scoring(self):
+        """Test structure score calculation"""
+        structured_answer = "First, I analyzed the problem. Then, I designed a solution. Finally, I implemented and tested it."
+        unstructured_answer = "I did this and that and then some other things happened and it worked out."
+        
+        structured_score = self.evaluator._calculate_structure_score(structured_answer, {"situation": 0.8, "task": 0.7, "action": 0.9, "result": 0.8})
+        unstructured_score = self.evaluator._calculate_structure_score(unstructured_answer, {"situation": 0.2, "task": 0.3, "action": 0.1, "result": 0.2})
+        
+        assert structured_score > unstructured_score
+        assert structured_score >= 3.0
+        assert unstructured_score <= 2.5
+    
+    def test_depth_specificity_scoring(self):
+        """Test depth and specificity scoring"""
+        specific_answer = "I used Redis with TTL of 300 seconds, implemented cache invalidation on database updates, and added monitoring with Prometheus metrics."
+        vague_answer = "I added some caching and monitoring."
+        
+        specific_score = self.evaluator._calculate_depth_specificity_score(specific_answer, {"situation": 0.8, "task": 0.7, "action": 0.9, "result": 0.8})
+        vague_score = self.evaluator._calculate_depth_specificity_score(vague_answer, {"situation": 0.2, "task": 0.3, "action": 0.1, "result": 0.2})
+        
+        assert specific_score > vague_score
+        assert specific_score >= 3.5
+        assert vague_score <= 2.0
+    
+    def test_evaluation_determinism(self):
+        """Test that evaluations are deterministic (same input = same output)"""
+        answer_text = "I implemented a microservices architecture using Docker and Kubernetes. The solution improved scalability by 60% and reduced deployment time from hours to minutes."
+        question_meta = {
+            "competency": "technical",
+            "difficulty": "medium",
+            "signals_expected": ["architecture", "containerization", "metrics"],
+            "pitfalls": ["over-engineering", "lack of monitoring"]
+        }
+        
+        # Run evaluation multiple times
+        evaluation1 = self.evaluator.evaluate_answer(answer_text, question_meta)
+        evaluation2 = self.evaluator.evaluate_answer(answer_text, question_meta)
+        evaluation3 = self.evaluator.evaluate_answer(answer_text, question_meta)
+        
+        # Scores should be identical
+        assert evaluation1.scores.dict() == evaluation2.scores.dict()
+        assert evaluation2.scores.dict() == evaluation3.scores.dict()
+        
+        # Rationale should be identical
+        assert evaluation1.rationale == evaluation2.rationale
+        assert evaluation2.rationale == evaluation3.rationale
+    
+    def test_complete_evaluation(self):
+        """Test complete evaluation with all rubric dimensions"""
+        answer_text = """
+        When I was leading the migration from monolithic to microservices architecture, 
+        my responsibility was to ensure zero downtime during the transition. I implemented 
+        a blue-green deployment strategy with database migration scripts and rollback procedures. 
+        The result was a successful migration with only 15 minutes of planned maintenance window, 
+        and we achieved 99.9% uptime during the process.
+        """
+        
+        question_meta = {
+            "competency": "leadership",
+            "difficulty": "hard",
+            "signals_expected": ["planning", "risk management", "metrics"],
+            "pitfalls": ["lack of testing", "poor communication"]
+        }
+        
+        evaluation = self.evaluator.evaluate_answer(answer_text, question_meta)
+        
+        # Check all rubric dimensions are scored
+        assert 0 <= evaluation.scores.clarity <= 5
+        assert 0 <= evaluation.scores.structure <= 5
+        assert 0 <= evaluation.scores.depth_specificity <= 5
+        assert 0 <= evaluation.scores.role_fit <= 5
+        assert 0 <= evaluation.scores.technical <= 5
+        assert 0 <= evaluation.scores.communication <= 5
+        assert 0 <= evaluation.scores.ownership <= 5
+        
+        # Check that rationale is provided
+        assert len(evaluation.rationale) > 50
+        
+        # Check that action items are provided
+        assert len(evaluation.action_items) > 0
+        
+        # This answer should score well due to STAR structure and specific metrics
+        assert evaluation.scores.structure >= 3.5
+        assert evaluation.scores.depth_specificity >= 3.5
+
+
+class TestLLMAsJudgeEvaluator:
+    """Test the LLM-as-judge evaluator interface"""
+    
+    def setup_method(self):
+        """Set up evaluator for each test"""
+        self.evaluator = LLMAsJudgeEvaluator()
+    
+    @pytest.mark.asyncio
+    async def test_local_baseline_evaluation(self):
+        """Test local baseline evaluation functionality"""
+        answer_text = "I implemented caching to improve performance by 30%."
+        question_meta = {
+            "competency": "technical",
+            "difficulty": "medium",
+            "signals_expected": ["optimization", "metrics"],
+            "pitfalls": ["over-caching", "stale data"]
+        }
+        
+        evaluation = await self.evaluator._local_baseline_evaluation(
+            answer_text, question_meta, "How did you optimize performance?"
+        )
+        
+        assert isinstance(evaluation, ScoreDetail)
+        assert 0 <= evaluation.scores.clarity <= 5
+        assert 0 <= evaluation.scores.structure <= 5
+        assert 0 <= evaluation.scores.depth_specificity <= 5
+        assert 0 <= evaluation.scores.role_fit <= 5
+        assert 0 <= evaluation.scores.technical <= 5
+        assert 0 <= evaluation.scores.communication <= 5
+        assert 0 <= evaluation.scores.ownership <= 5
+    
+    @pytest.mark.asyncio
+    async def test_evaluation_consistency(self):
+        """Test that LLM evaluator maintains consistency with rules evaluator"""
+        answer_text = "I used Redis caching with TTL and monitoring to improve API response times from 200ms to 50ms."
+        question_meta = {
+            "competency": "technical",
+            "difficulty": "medium",
+            "signals_expected": ["caching", "monitoring", "metrics"],
+            "pitfalls": ["cache invalidation", "memory usage"]
+        }
+        
+        # Test local baseline evaluation
+        evaluation = await self.evaluator.evaluate_answer(
+            answer_text, question_meta, "How did you implement caching?"
+        )
+        
+        # Should return a valid ScoreDetail
+        assert isinstance(evaluation, ScoreDetail)
+        assert hasattr(evaluation.scores, 'clarity')
+        assert hasattr(evaluation.scores, 'technical')
+        assert hasattr(evaluation.scores, 'depth_specificity')
+        
+        # Technical answer with specific metrics should score well
+        assert evaluation.scores.technical >= 3.0
+        assert evaluation.scores.depth_specificity >= 3.0
+    
+    def test_evaluator_initialization(self):
+        """Test the evaluator's default configuration and its rules-evaluator member."""
+        # Identity check: the flag must be the bool True, not merely equal to it (1, 1.0, ...)
+        assert self.evaluator.use_local_baseline is True
+
+        # Should have rules evaluator instance
+        assert hasattr(self.evaluator, 'rules_evaluator')
+        assert isinstance(self.evaluator.rules_evaluator, RulesBasedEvaluator)
+
+
+class TestEvaluationEdgeCases:
+    """Test evaluator behavior with edge cases"""
+    
+    def setup_method(self):
+        """Set up evaluators for each test"""
+        self.rules_evaluator = RulesBasedEvaluator()
+        self.llm_evaluator = LLMAsJudgeEvaluator()
+    
+    def test_empty_answer(self):
+        """Test evaluation of empty or very short answers"""
+        empty_answer = ""
+        short_answer = "Yes."
+        
+        question_meta = {
+            "competency": "behavioral",
+            "difficulty": "easy",
+            "signals_expected": ["communication"],
+            "pitfalls": ["lack of detail"]
+        }
+        
+        empty_eval = self.rules_evaluator.evaluate_answer(empty_answer, question_meta)
+        short_eval = self.rules_evaluator.evaluate_answer(short_answer, question_meta)
+        
+        # Empty answers should score very low
+        assert empty_eval.scores.clarity <= 1.0
+        assert empty_eval.scores.depth_specificity <= 1.0
+        
+        # Short answers should score low but not as low as empty
+        assert short_eval.scores.clarity <= 2.0
+        assert short_eval.scores.depth_specificity <= 2.0
+    
+    def test_very_long_answer(self):
+        """Test evaluation of very long, verbose answers"""
+        long_answer = "I would like to tell you about this time when I was working at a company and there was this project that I was involved with and it was really interesting because we had to solve this problem that was quite challenging and I remember that I spent a lot of time thinking about it and discussing it with my colleagues and we came up with several different approaches and we had to evaluate each one carefully and consider the pros and cons of each approach and think about the trade-offs involved and how it would impact the users and the system as a whole and what the long-term implications might be and whether it would be scalable and maintainable and cost-effective and all of those factors that are important when making architectural decisions and so we went through this process and eventually we decided on the best approach and we implemented it and it worked out really well and the users were happy and the system performed better and we were able to handle more load and it was more reliable and we had fewer bugs and the maintenance was easier and overall it was a great success and I learned a lot from that experience and I think it really helped me grow as an engineer and understand the importance of careful planning and consideration of all the factors involved in making technical decisions."
+        
+        question_meta = {
+            "competency": "communication",
+            "difficulty": "medium",
+            "signals_expected": ["clarity", "conciseness"],
+            "pitfalls": ["verbosity", "lack of structure"]
+        }
+        
+        evaluation = self.rules_evaluator.evaluate_answer(long_answer, question_meta)
+        
+        # Long answers might score well on depth but poorly on clarity/structure
+        assert evaluation.scores.depth_specificity >= 3.0  # Lots of detail
+        assert evaluation.scores.clarity <= 3.0  # Hard to follow
+        assert evaluation.scores.structure <= 3.0  # Poor structure
+    
+    def test_technical_jargon(self):
+        """Test evaluation of answers with technical jargon"""
+        jargon_answer = "I implemented a distributed caching layer using Redis Cluster with consistent hashing, implemented circuit breaker pattern with Hystrix, and added distributed tracing with Jaeger for observability."
+        
+        question_meta = {
+            "competency": "technical",
+            "difficulty": "hard",
+            "signals_expected": ["technical depth", "architecture knowledge"],
+            "pitfalls": ["over-complication", "lack of explanation"]
+        }
+        
+        evaluation = self.rules_evaluator.evaluate_answer(jargon_answer, question_meta)
+        
+        # Technical depth should score well
+        assert evaluation.scores.technical >= 3.5
+        assert evaluation.scores.depth_specificity >= 3.5
+        
+        # But clarity might suffer if jargon isn't explained
+        assert evaluation.scores.clarity <= 4.0
diff --git a/tests/test_followup.py.disabled b/tests/test_followup.py.disabled
new file mode 100644
index 0000000..46d21b5
--- /dev/null
+++ b/tests/test_followup.py.disabled
@@ -0,0 +1,387 @@
+import pytest
+from interview.followup import FollowUpGenerator
+from core.schemas import ScoreDetail, RubricScore
+
+
+class TestFollowUpGenerator:
+    """Test the follow-up question generator"""
+    
+    def setup_method(self):
+        """Set up generator for each test"""
+        self.generator = FollowUpGenerator()
+    
+    def test_identify_improvement_areas(self):
+        """Test identification of areas needing improvement"""
+        # Create a score detail with some low scores
+        scores = RubricScore(
+            clarity=2.0,
+            structure=3.5,
+            depth_specificity=1.5,
+            role_fit=4.0,
+            technical=3.0,
+            communication=2.5,
+            ownership=4.5
+        )
+        
+        evaluation = ScoreDetail(
+            scores=scores,
+            rationale="The answer lacked depth and clarity",
+            action_items=["Provide more specific examples", "Improve structure"],
+            exemplar_snippet=None,
+            meta={}
+        )
+        
+        improvement_areas = self.generator._identify_improvement_areas(evaluation)
+        
+        # Should identify areas with scores below 3.0
+        assert "clarity" in improvement_areas
+        assert "depth_specificity" in improvement_areas
+        assert "communication" in improvement_areas
+        
+        # Should not include areas with good scores
+        assert "role_fit" not in improvement_areas
+        assert "ownership" not in improvement_areas
+    
+    def test_generate_clarity_follow_up(self):
+        """Test generation of clarity-focused follow-up questions"""
+        scores = RubricScore(
+            clarity=2.0,
+            structure=3.0,
+            depth_specificity=3.0,
+            role_fit=4.0,
+            technical=4.0,
+            communication=2.5,
+            ownership=3.5
+        )
+        
+        evaluation = ScoreDetail(
+            scores=scores,
+            rationale="The answer was unclear and lacked structure",
+            action_items=["Be more specific", "Use concrete examples"],
+            exemplar_snippet=None,
+            meta={}
+        )
+        
+        question_meta = {
+            "competency": "communication",
+            "difficulty": "medium",
+            "signals_expected": ["clarity", "specificity"],
+            "pitfalls": ["vagueness", "lack of examples"]
+        }
+        
+        follow_up = self.generator._generate_area_follow_up("clarity", evaluation, question_meta)
+        
+        assert follow_up is not None
+        assert "text" in follow_up
+        assert "competency" in follow_up
+        assert "difficulty" in follow_up
+        assert "meta" in follow_up
+        
+        # Should be related to clarity improvement
+        assert "clarity" in follow_up["text"].lower() or "specific" in follow_up["text"].lower()
+    
+    def test_generate_structure_follow_up(self):
+        """Test generation of structure-focused follow-up questions"""
+        scores = RubricScore(
+            clarity=3.0,
+            structure=1.5,
+            depth_specificity=3.0,
+            role_fit=4.0,
+            technical=4.0,
+            communication=3.0,
+            ownership=3.5
+        )
+        
+        evaluation = ScoreDetail(
+            scores=scores,
+            rationale="The answer lacked proper structure and organization",
+            action_items=["Organize thoughts better", "Use clear sections"],
+            exemplar_snippet=None,
+            meta={}
+        )
+        
+        question_meta = {
+            "competency": "communication",
+            "difficulty": "medium",
+            "signals_expected": ["organization", "structure"],
+            "pitfalls": ["rambling", "lack of flow"]
+        }
+        
+        follow_up = self.generator._generate_area_follow_up("structure", evaluation, question_meta)
+        
+        assert follow_up is not None
+        assert "text" in follow_up
+        assert "competency" in follow_up
+        assert "difficulty" in follow_up
+        
+        # Should be related to structure improvement
+        assert any(word in follow_up["text"].lower() for word in ["structure", "organize", "step", "process"])
+    
+    def test_generate_depth_follow_up(self):
+        """Test generation of depth-focused follow-up questions"""
+        scores = RubricScore(
+            clarity=3.5,
+            structure=3.0,
+            depth_specificity=1.0,
+            role_fit=4.0,
+            technical=3.5,
+            communication=3.0,
+            ownership=3.5
+        )
+        
+        evaluation = ScoreDetail(
+            scores=scores,
+            rationale="The answer was too superficial and lacked detail",
+            action_items=["Provide more depth", "Include specific examples"],
+            exemplar_snippet=None,
+            meta={}
+        )
+        
+        question_meta = {
+            "competency": "technical",
+            "difficulty": "medium",
+            "signals_expected": ["technical depth", "specifics"],
+            "pitfalls": ["superficial", "lack of detail"]
+        }
+        
+        follow_up = self.generator._generate_area_follow_up("depth_specificity", evaluation, question_meta)
+        
+        assert follow_up is not None
+        assert "text" in follow_up
+        assert "competency" in follow_up
+        assert "difficulty" in follow_up
+        
+        # Should be related to depth improvement
+        assert any(word in follow_up["text"].lower() for word in ["detail", "specific", "depth", "example", "how"])
+    
+    def test_generate_multiple_follow_ups(self):
+        """Test generation of multiple follow-up questions"""
+        scores = RubricScore(
+            clarity=2.0,
+            structure=2.0,
+            depth_specificity=2.0,
+            role_fit=4.0,
+            technical=3.0,
+            communication=2.5,
+            ownership=3.5
+        )
+        
+        evaluation = ScoreDetail(
+            scores=scores,
+            rationale="Multiple areas need improvement",
+            action_items=["Improve clarity", "Better structure", "More depth"],
+            exemplar_snippet=None,
+            meta={}
+        )
+        
+        question_meta = {
+            "competency": "communication",
+            "difficulty": "medium",
+            "signals_expected": ["clarity", "structure", "depth"],
+            "pitfalls": ["unclear", "unstructured", "superficial"]
+        }
+        
+        follow_ups = self.generator.generate_follow_ups(evaluation, question_meta, max_follow_ups=3)
+        
+        assert len(follow_ups) <= 3
+        assert len(follow_ups) > 0
+        
+        # Should cover different improvement areas
+        competencies = [f["competency"] for f in follow_ups]
+        assert len(set(competencies)) >= 1  # At least one competency
+        
+        # All follow-ups should have required fields
+        for follow_up in follow_ups:
+            assert "text" in follow_up
+            assert "competency" in follow_up
+            assert "difficulty" in follow_up
+            assert "meta" in follow_up
+    
+    def test_follow_up_meta_information(self):
+        """Test that follow-up questions include proper meta information"""
+        scores = RubricScore(
+            clarity=2.0,
+            structure=3.0,
+            depth_specificity=3.0,
+            role_fit=4.0,
+            technical=4.0,
+            communication=2.5,
+            ownership=3.5
+        )
+        
+        evaluation = ScoreDetail(
+            scores=scores,
+            rationale="Clarity needs improvement",
+            action_items=["Be more specific"],
+            exemplar_snippet=None,
+            meta={}
+        )
+        
+        question_meta = {
+            "competency": "communication",
+            "difficulty": "medium",
+            "signals_expected": ["clarity", "specificity"],
+            "pitfalls": ["vagueness"]
+        }
+        
+        follow_up = self.generator._generate_area_follow_up("clarity", evaluation, question_meta)
+        
+        assert follow_up is not None
+        assert "meta" in follow_up
+        
+        # Meta should include relevant information
+        meta = follow_up["meta"]
+        assert "signals_expected" in meta
+        assert "pitfalls" in meta
+        assert "improvement_area" in meta
+        assert meta["improvement_area"] == "clarity"
+    
+    def test_follow_up_difficulty_adjustment(self):
+        """Test that follow-up difficulty is appropriately adjusted"""
+        scores = RubricScore(
+            clarity=1.0,  # Very low score
+            structure=3.0,
+            depth_specificity=3.0,
+            role_fit=4.0,
+            technical=4.0,
+            communication=2.0,
+            ownership=3.5
+        )
+        
+        evaluation = ScoreDetail(
+            scores=scores,
+            rationale="Very poor clarity",
+            action_items=["Start with basics"],
+            exemplar_snippet=None,
+            meta={}
+        )
+        
+        question_meta = {
+            "competency": "communication",
+            "difficulty": "hard",  # Original question was hard
+            "signals_expected": ["clarity"],
+            "pitfalls": ["unclear"]
+        }
+        
+        follow_up = self.generator._generate_area_follow_up("clarity", evaluation, question_meta)
+        
+        assert follow_up is not None
+        # Follow-up should be easier than original question
+        assert follow_up["difficulty"] in ["easy", "medium"]
+    
+    def test_no_improvement_areas(self):
+        """Test behavior when no improvement areas are identified"""
+        scores = RubricScore(
+            clarity=4.5,
+            structure=4.0,
+            depth_specificity=4.0,
+            role_fit=4.5,
+            technical=4.0,
+            communication=4.0,
+            ownership=4.5
+        )
+        
+        evaluation = ScoreDetail(
+            scores=scores,
+            rationale="Excellent answer across all dimensions",
+            action_items=["Continue this level of performance"],
+            exemplar_snippet=None,
+            meta={}
+        )
+        
+        question_meta = {
+            "competency": "technical",
+            "difficulty": "medium",
+            "signals_expected": ["technical depth"],
+            "pitfalls": ["over-confidence"]
+        }
+        
+        follow_ups = self.generator.generate_follow_ups(evaluation, question_meta, max_follow_ups=3)
+        
+        # Should still generate some follow-ups for continued assessment
+        assert len(follow_ups) > 0
+        
+        # Follow-ups should be more challenging
+        for follow_up in follow_ups:
+            assert follow_up["difficulty"] in ["medium", "hard"]
+    
+    def test_follow_up_content_quality(self):
+        """Test that generated follow-ups have meaningful content"""
+        scores = RubricScore(
+            clarity=2.0,
+            structure=3.0,
+            depth_specificity=3.0,
+            role_fit=4.0,
+            technical=4.0,
+            communication=2.5,
+            ownership=3.5
+        )
+        
+        evaluation = ScoreDetail(
+            scores=scores,
+            rationale="Clarity needs improvement",
+            action_items=["Be more specific"],
+            exemplar_snippet=None,
+            meta={}
+        )
+        
+        question_meta = {
+            "competency": "communication",
+            "difficulty": "medium",
+            "signals_expected": ["clarity", "specificity"],
+            "pitfalls": ["vagueness"]
+        }
+        
+        follow_up = self.generator._generate_area_follow_up("clarity", evaluation, question_meta)
+        
+        assert follow_up is not None
+        
+        # Text should be substantial
+        assert len(follow_up["text"]) > 20
+        
+        # Should be a question
+        assert "?" in follow_up["text"]
+        
+        # Should be relevant to the improvement area
+        text_lower = follow_up["text"].lower()
+        assert any(word in text_lower for word in ["clarify", "specific", "explain", "describe", "how", "what"])
+    
+    def test_follow_up_competency_mapping(self):
+        """Test that follow-ups are mapped to appropriate competencies"""
+        scores = RubricScore(
+            clarity=2.0,
+            structure=2.0,
+            depth_specificity=3.0,
+            role_fit=4.0,
+            technical=3.0,
+            communication=2.5,
+            ownership=3.5
+        )
+        
+        evaluation = ScoreDetail(
+            scores=scores,
+            rationale="Multiple areas need improvement",
+            action_items=["Improve clarity and structure"],
+            exemplar_snippet=None,
+            meta={}
+        )
+        
+        question_meta = {
+            "competency": "technical",
+            "difficulty": "medium",
+            "signals_expected": ["technical depth"],
+            "pitfalls": ["lack of detail"]
+        }
+        
+        follow_ups = self.generator.generate_follow_ups(evaluation, question_meta, max_follow_ups=2)
+        
+        assert len(follow_ups) > 0
+        
+        # Only checks that at least one competency is present; a mix of competencies is not asserted
+        competencies = [f["competency"] for f in follow_ups]
+        assert len(set(competencies)) >= 1  # At least one competency
+        
+        # NOTE(review): list `in` is exact-match, so "technical depth" alone would not satisfy this - confirm meta contents
+        technical_follow_ups = [f for f in follow_ups if f["competency"] == "technical"]
+        if technical_follow_ups:
+            assert all("technical" in f["meta"]["signals_expected"] for f in technical_follow_ups)
diff --git a/tests/test_interview_flow.py b/tests/test_interview_flow.py
new file mode 100644
index 0000000..1a3319e
--- /dev/null
+++ b/tests/test_interview_flow.py
@@ -0,0 +1,99 @@
+
+import requests
+import json
+import time
+import os
+
+BASE_URL = os.environ.get("INTERVIEW_BASE_URL", "http://localhost:8080")  # override for non-local servers
+AUDIO_FILE_PATH = os.environ.get("INTERVIEW_TEST_AUDIO", "/Volumes/Data/ai-interview-rahat/test.mp3")  # sample answer audio
+
+def test_interview_flow():
+    """Smoke-test the start -> answer -> report flow against the server at BASE_URL."""
+    print(f"Testing Interview Flow at {BASE_URL}")
+    # 1. Health Check
+    try:
+        response = requests.get(f"{BASE_URL}/healthz")
+        assert response.status_code == 200, f"Health check failed: {response.text}"
+        print("✅ Health check passed")
+    except Exception as e:
+        print(f"❌ Health check failed: {e}")
+        print(f"Make sure the server is running at {BASE_URL}")
+        return
+
+    # 2. Start Interview Session
+    user_id = "test_user_123"
+    session_id = f"test_session_{int(time.time())}"
+
+    start_payload = {
+        "user_id": user_id,
+        "session_id": session_id,
+        "role_title": "Python Developer",
+        "company_name": "Tech Corp",
+        "industry": "Technology"
+    }
+
+    print(f"\nStarting Interview Session (ID: {session_id})...")
+    response = requests.post(f"{BASE_URL}/v1/interview/start", json=start_payload)
+    if response.status_code != 200:
+        print(f"❌ Failed to start interview: {response.status_code} - {response.text}")
+        return
+    else:
+        print("✅ Interview session started")
+        # print(f"Response: {json.dumps(response.json(), indent=2)}")
+        first_question = response.json().get("first_question")
+        print(f"First Question: {first_question}")
+
+    # 3. Simulate answering questions
+    rounds_to_test = 2
+
+    if not os.path.exists(AUDIO_FILE_PATH):
+        print(f"⚠️ Warning: Audio file not found at {AUDIO_FILE_PATH}. Skipping answer submission test.")
+        return
+
+    for i in range(rounds_to_test):
+        print(f"\n--- Round {i+1} ---")
+
+        # Prepare multipart/form-data upload
+        with open(AUDIO_FILE_PATH, "rb") as audio_file:
+            files = {
+                "audio_file": ("test.mp3", audio_file, "audio/mpeg")
+            }
+            data = {
+                "user_id": user_id,
+                "session_id": session_id
+            }
+
+            print("Submitting answer...")
+            start_time = time.time()
+            response = requests.post(f"{BASE_URL}/v1/interview/answer", data=data, files=files)
+            duration = time.time() - start_time
+
+            if response.status_code == 200:
+                result = response.json()
+                print(f"✅ Answer submitted in {duration:.2f}s")
+
+                next_q = result.get("next_question")
+                if next_q:
+                    print(f"Next Question: {next_q}")
+                else:
+                    print("No next question (End of interview?)")
+
+                tech_eval = result.get("technical", {})
+                print(f"Technical Depth: {tech_eval.get('technical_depth')}")
+                print(f"Feedback: {result.get('evaluation', {}).get('feedback')}")
+            else:
+                print(f"❌ Failed to submit answer: {response.status_code} - {response.text}")
+                break
+
+    # 4. Generate Report
+    print(f"\nGenerating Report for {session_id}...")
+    response = requests.get(f"{BASE_URL}/v1/interview/report/{user_id}/{session_id}")
+    if response.status_code == 200:
+        report = response.json()
+        print("✅ Report generated successfully")
+        print(f"Average Scores: {json.dumps(report.get('avg_scores'), indent=2)}")
+    else:
+        print(f"❌ Failed to generate report: {response.status_code} - {response.text}")
+
+if __name__ == "__main__":
+    test_interview_flow()
diff --git a/tests/test_interview_flow_draft.py b/tests/test_interview_flow_draft.py
new file mode 100644
index 0000000..52f3042
--- /dev/null
+++ b/tests/test_interview_flow_draft.py
@@ -0,0 +1,77 @@
+
+import requests
+import json
+import time
+
+BASE_URL = "http://localhost:8000"  # root URL that every request below is sent to
+
+def test_interview_flow():
+    """Smoke-test the interview API of a server running at BASE_URL.
+
+    Draft: checks health, starts a session, then reads the session state for
+    three rounds. Answer submission is not implemented yet (see the TODO).
+    Progress is printed; returns early if the health check or start fails.
+    """
+    # Bound every HTTP call so an unresponsive server cannot hang the run.
+    timeout = 10
+    print(f"Testing Interview Flow at {BASE_URL}")
+
+    # 1. Health Check
+    try:
+        response = requests.get(f"{BASE_URL}/healthz", timeout=timeout)
+        assert response.status_code == 200, f"Health check failed: {response.text}"
+        print("✅ Health check passed")
+    except Exception as e:
+        print(f"❌ Health check failed: {e}")
+        return
+
+    # 2. Start Interview Session
+    # Using a dummy user_id and session_id for testing
+    user_id = "test_user_123"
+    session_id = "test_session_456"
+
+    start_payload = {
+        "user_id": user_id,
+        "session_id": session_id,
+        "role_title": "Python Developer",
+        "company_name": "Tech Corp",
+        "industry": "Technology"
+    }
+
+    print("\nStarting Interview Session...")
+    response = requests.post(f"{BASE_URL}/v1/interview/start", json=start_payload, timeout=timeout)
+    if response.status_code != 200:
+        print(f"❌ Failed to start interview: {response.status_code} - {response.text}")
+        return
+    print("✅ Interview session started")
+    print(f"Response: {response.json()}")
+
+    # 3. Simulate answering a few questions
+    # We will simulate 3 rounds of QA
+    for i in range(3):
+        print(f"\n--- Round {i+1} ---")
+
+        # Get current state to know the question (optional, as start/answer returns it)
+        state_response = requests.get(
+            f"{BASE_URL}/v1/interview/state/{user_id}/{session_id}", timeout=timeout
+        )
+        if state_response.status_code == 200:
+            history = state_response.json().get("history")
+            current_question = history[-1]["question"] if history else "No question found"
+            print(f"Current Question: {current_question}")
+
+        # Prepared answer for this round; it is not sent anywhere yet.
+        answer_payload = {
+            "user_id": user_id,
+            "session_id": session_id,
+            "answer_text": "I am a skilled Python developer with experience in FastAPI and Django. I have built scalable web applications and worked with microservices.",
+            "audio_enabled": False,
+        }
+
+        # TODO: submit answer_payload. Per the API structure notes,
+        # POST /v1/interview/answer is for audio answers while
+        # /v1/sessions/{id}/answer takes text; confirm in interview_routes.py
+        # which endpoint accepts a text-only answer before wiring this up.
+
+if __name__ == "__main__":  # allow running this draft directly as a script
+    test_interview_flow()
diff --git a/tests/test_llm_evaluation.py.disabled b/tests/test_llm_evaluation.py.disabled
new file mode 100644
index 0000000..0f70dd1
--- /dev/null
+++ b/tests/test_llm_evaluation.py.disabled
@@ -0,0 +1,218 @@
+import pytest
+import json
+from unittest.mock import Mock, patch
+from cv_eval.engine import CVEvaluationEngine
+from cv_eval.llm_scorer import LLMScorer
+from cv_eval.schemas import CVEvaluationRequest, ScoreResult, SubScore
+
+
+class TestLLMScorer:
+    """Test LLM scorer functionality.
+
+    Each test patches os.environ so the result does not depend on whether
+    GROQ_API_KEY is set in the shell that runs the suite.
+    """
+
+    def test_llm_scorer_initialization(self):
+        """Test LLM scorer initialization with Groq"""
+        with patch.dict('os.environ', {'GROQ_API_KEY': 'test-key'}):
+            scorer = LLMScorer()
+            assert scorer.model == "llama3-8b-8192"
+
+    def test_llm_scorer_initialization_failure(self):
+        """Test LLM scorer initialization failure without API key"""
+        with patch.dict('os.environ', {}, clear=True):  # hide any real key
+            with pytest.raises(ValueError, match="GROQ_API_KEY environment variable not set"):
+                LLMScorer()
+
+    @patch.dict('os.environ', {'GROQ_API_KEY': 'test-key'})
+    def test_extract_json_from_response(self):
+        """Test JSON extraction from LLM response"""
+        scorer = LLMScorer()
+        # Test with markdown code blocks
+        response = "Here's the result:\n```json\n{\"test\": \"value\"}\n```\nEnd"
+        assert scorer._extract_json_from_response(response) == '{"test": "value"}'
+
+        # Test with plain JSON
+        response = '{"test": "value"}'
+        assert scorer._extract_json_from_response(response) == '{"test": "value"}'
+
+        # Test with extra content
+        response = "Some text {\"test\": \"value\"} more text"
+        assert scorer._extract_json_from_response(response) == '{"test": "value"}'
+
+    @patch.dict('os.environ', {'GROQ_API_KEY': 'test-key'})
+    def test_parse_llm_response(self):
+        """Test LLM response parsing"""
+        scorer = LLMScorer()
+        # Valid response
+        response = '{"cv_quality": {"overall_score": 85.0, "band": "Strong", "subscores": []}}'
+        result = scorer._parse_llm_response(response, "cv_quality")
+        assert "cv_quality" in result
+        assert result["cv_quality"]["overall_score"] == 85.0
+
+        # Invalid JSON
+        with pytest.raises(ValueError):
+            scorer._parse_llm_response("invalid json", "cv_quality")
+
+        # Missing expected key
+        with pytest.raises(ValueError):
+            scorer._parse_llm_response('{"wrong_key": {}}', "cv_quality")
+
+    @patch.dict('os.environ', {'GROQ_API_KEY': 'test-key'})
+    def test_convert_to_score_result(self):
+        """Test conversion of LLM response to ScoreResult"""
+        scorer = LLMScorer()
+        data = {
+            "overall_score": 85.0,
+            "band": "Strong",
+            "subscores": [{
+                "dimension": "ats_structure",
+                "score": 8.0,
+                "max_score": 10.0,
+                "evidence": ["email present", "phone present"],
+            }],
+        }
+
+        result = scorer._convert_to_score_result(data)
+        assert isinstance(result, ScoreResult)
+        assert result.overall_score == 85.0
+        assert result.band == "Strong"
+        assert len(result.subscores) == 1
+        assert result.subscores[0].dimension == "ats_structure"
+        assert result.subscores[0].score == 8.0
+
+
+class TestCVEvaluationEngine:
+    """Test CV evaluation engine with LLM integration.
+
+    NOTE(review): LLMScorer is patched in cv_eval.llm_scorer, the module this
+    file imports it from. If cv_eval.engine binds the class at import time
+    (``from ... import LLMScorer``), the target must be
+    'cv_eval.engine.LLMScorer' instead - confirm against engine.py.
+    """
+
+    def test_engine_initialization_with_llm(self):
+        """Test engine initialization with LLM enabled"""
+        with patch.dict('os.environ', {'GROQ_API_KEY': 'test-key'}):
+            engine = CVEvaluationEngine(use_llm=True)
+            assert engine.use_llm is True
+            assert engine.llm_scorer is not None
+
+    def test_engine_initialization_without_llm(self):
+        """Test engine initialization without LLM"""
+        engine = CVEvaluationEngine(use_llm=False)
+        assert engine.use_llm is False
+        assert engine.llm_scorer is None
+
+    def test_engine_initialization_llm_failure(self):
+        """Test engine initialization when LLM fails"""
+        # Hide any real GROQ_API_KEY so the LLM scorer cannot be constructed.
+        with patch.dict('os.environ', {}, clear=True):
+            engine = CVEvaluationEngine(use_llm=True)
+        assert engine.use_llm is False  # Should fall back to heuristic
+
+    @patch('cv_eval.llm_scorer.LLMScorer')
+    def test_evaluate_cv_quality_llm_success(self, mock_llm_scorer_class):
+        """Test CV quality evaluation with successful LLM call"""
+        # Mock LLM scorer
+        mock_scorer = Mock()
+        mock_scorer.score_cv_quality.return_value = ScoreResult(
+            overall_score=85.0, band="Strong", subscores=[]
+        )
+        mock_llm_scorer_class.return_value = mock_scorer
+
+        with patch.dict('os.environ', {'GROQ_API_KEY': 'test-key'}):
+            engine = CVEvaluationEngine(use_llm=True)
+            cv_text = "Test CV content"
+            result = engine.evaluate_cv_quality(cv_text)
+
+            assert result.overall_score == 85.0
+            assert result.band == "Strong"
+            mock_scorer.score_cv_quality.assert_called_once_with(cv_text)
+
+    @patch('cv_eval.llm_scorer.LLMScorer')
+    def test_evaluate_cv_quality_llm_failure_fallback(self, mock_llm_scorer_class):
+        """Test CV quality evaluation with LLM failure and heuristic fallback"""
+        # Mock LLM scorer to raise exception
+        mock_scorer = Mock()
+        mock_scorer.score_cv_quality.side_effect = Exception("LLM API error")
+        mock_llm_scorer_class.return_value = mock_scorer
+
+        with patch.dict('os.environ', {'GROQ_API_KEY': 'test-key'}):
+            engine = CVEvaluationEngine(use_llm=True)
+            cv_text = "JOHN DOE\nSoftware Engineer\njohn.doe@email.com"
+            result = engine.evaluate_cv_quality(cv_text)
+
+            # Should fall back to heuristic scoring
+            assert isinstance(result, ScoreResult)
+            assert result.overall_score >= 0
+            assert result.band in ["Excellent", "Strong", "Partial", "Weak"]
+
+    @patch('cv_eval.llm_scorer.LLMScorer')
+    def test_evaluate_jd_match_llm_success(self, mock_llm_scorer_class):
+        """Test JD match evaluation with successful LLM call"""
+        # Mock LLM scorer
+        mock_scorer = Mock()
+        mock_scorer.score_jd_match.return_value = ScoreResult(
+            overall_score=80.0, band="Strong", subscores=[]
+        )
+        mock_llm_scorer_class.return_value = mock_scorer
+
+        with patch.dict('os.environ', {'GROQ_API_KEY': 'test-key'}):
+            engine = CVEvaluationEngine(use_llm=True)
+            cv_text = "Test CV content"
+            jd_text = "Test JD content"
+            result = engine.evaluate_jd_match(cv_text, jd_text, include_constraints=True)
+
+            assert result.overall_score == 80.0
+            assert result.band == "Strong"
+            mock_scorer.score_jd_match.assert_called_once_with(cv_text, jd_text, True)
+
+    def test_calculate_fit_index(self):
+        """Test fit index calculation"""
+        engine = CVEvaluationEngine(use_llm=False)
+
+        # Test with constraints included
+        fit_index = engine.calculate_fit_index(80.0, 85.0, include_constraints=True)
+        expected = 0.6 * 85.0 + 0.4 * 80.0
+        assert abs(fit_index - expected) < 1e-6
+
+        # Test with constraints excluded (NOTE(review): same weights expected - confirm)
+        fit_index = engine.calculate_fit_index(80.0, 85.0, include_constraints=False)
+        expected = 0.6 * 85.0 + 0.4 * 80.0
+        assert abs(fit_index - expected) < 1e-6
+
+    def test_get_score_band(self):
+        """Test score band calculation"""
+        engine = CVEvaluationEngine(use_llm=False)
+
+        assert engine._get_score_band(95.0) == "Excellent"
+        assert engine._get_score_band(85.0) == "Strong"
+        assert engine._get_score_band(70.0) == "Partial"
+        assert engine._get_score_band(50.0) == "Weak"
+
+    def test_full_evaluation_workflow(self):
+        """Test complete evaluation workflow"""
+        engine = CVEvaluationEngine(use_llm=False)  # Use heuristic for testing
+
+        request = CVEvaluationRequest(
+            cv_text="JOHN DOE\nSoftware Engineer\njohn.doe@email.com",
+            jd_text="Senior Software Engineer\nRequirements: Python, AWS",
+            include_constraints=True
+        )
+        result = engine.evaluate(request)
+
+        assert isinstance(result.cv_quality, ScoreResult)
+        assert isinstance(result.jd_match, ScoreResult)
+        assert isinstance(result.fit_index, float)
+        assert 0 <= result.fit_index <= 100
+        assert result.band in ["Excellent", "Strong", "Partial", "Weak"]
+
+        # Verify fit index calculation
+        expected_fit_index = 0.6 * result.jd_match.overall_score + 0.4 * result.cv_quality.overall_score
+        assert abs(result.fit_index - expected_fit_index) < 1e-6
+
+
+if __name__ == "__main__":  # run only this file's tests when executed directly
+    pytest.main([__file__])