Skip to content

Commit 62934ab

Browse files
abrichr and claude authored
fix: add strict mode to prevent silent fallback degradation during benchmarking (#154)
When strict=True, components that previously degraded silently now raise errors instead, ensuring benchmarking/training runs use the intended system configuration (e.g., PII scrubbing active, VLM extraction working).

- ScrubMiddleware: raise ImportError if openadapt-privacy missing, re-raise on scrubbing failure
- extract_workflow(): raise ValueError on VLM parse failure, re-raise on VLM call failure
- generate_transcript(): re-raise on VLM call failure, raise ValueError if parser returns only placeholders

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 101289b commit 62934ab

3 files changed

Lines changed: 71 additions & 13 deletions

File tree

openadapt_evals/adapters/scrub_middleware.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,17 +52,22 @@ class ScrubMiddleware(BenchmarkAdapter):
5252
scrub_text: Whether to redact detected text PII (default True).
5353
scrub_images: Whether to redact detected image PII such as faces
5454
(default True).
55+
strict: When True, raise errors instead of silently falling back
56+
to unscrubbed data. Use during benchmarking/training to ensure
57+
scrubbing is actually active.
5558
"""
5659

5760
def __init__(
5861
self,
5962
adapter: BenchmarkAdapter,
6063
scrub_text: bool = True,
6164
scrub_images: bool = True,
65+
strict: bool = False,
6266
):
6367
self._adapter = adapter
6468
self._scrub_text = scrub_text
6569
self._scrub_images = scrub_images
70+
self._strict = strict
6671
self._provider = None # lazy-loaded
6772
self._provider_load_attempted = False
6873
self.last_original_screenshot: bytes | None = None
@@ -156,6 +161,8 @@ def _scrub(self, screenshot_bytes: bytes) -> bytes:
156161
try:
157162
return provider.scrub_image(screenshot_bytes)
158163
except Exception:
164+
if self._strict:
165+
raise
159166
logger.warning(
160167
"PII scrubbing failed, returning original screenshot",
161168
exc_info=True,
@@ -178,6 +185,12 @@ def _get_provider(self):
178185
logger.info("PII scrubbing enabled via openadapt-privacy (Presidio)")
179186
return self._provider
180187
except ImportError:
188+
if self._strict:
189+
raise ImportError(
190+
"openadapt-privacy is not installed and strict mode is enabled. "
191+
"PII scrubbing cannot be skipped. "
192+
"Install with: pip install openadapt-privacy"
193+
)
181194
logger.warning(
182195
"openadapt-privacy is not installed. "
183196
"PII scrubbing is DISABLED — screenshots will pass through unmodified. "

openadapt_evals/workflow/pipeline/extract.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ def extract_workflow(
259259
model: str = "gpt-4.1-mini",
260260
provider: str = "openai",
261261
recording_source: RecordingSource = RecordingSource.NATIVE_CAPTURE,
262+
strict: bool = False,
262263
) -> Workflow:
263264
"""Extract a structured Workflow from an EpisodeTranscript.
264265
@@ -270,6 +271,9 @@ def extract_workflow(
270271
model: VLM model name.
271272
provider: VLM provider (``"openai"`` or ``"anthropic"``).
272273
recording_source: Source of the original recording.
274+
strict: When True, raise errors instead of falling back to 1:1
275+
transcript-to-step mapping. Use during benchmarking/training
276+
to ensure VLM extraction is actually working.
273277
274278
Returns:
275279
A Workflow with WorkflowStep entries derived from the transcript.
@@ -293,16 +297,31 @@ def extract_workflow(
293297
len(transcript.entries),
294298
)
295299

296-
raw = vlm_call(
297-
prompt,
298-
model=model,
299-
provider=provider,
300-
max_tokens=4096,
301-
)
300+
try:
301+
raw = vlm_call(
302+
prompt,
303+
model=model,
304+
provider=provider,
305+
max_tokens=4096,
306+
)
307+
except Exception:
308+
if strict:
309+
raise
310+
logger.warning(
311+
"VLM call failed for transcript %s, using fallback",
312+
transcript.transcript_id,
313+
exc_info=True,
314+
)
315+
return _build_fallback_workflow(transcript, recording_source)
302316

303317
parsed = _parse_extraction_response(raw, transcript)
304318

305319
if parsed is None:
320+
if strict:
321+
raise ValueError(
322+
f"VLM extraction failed to parse response for transcript "
323+
f"{transcript.transcript_id}. Raw response: {raw!r:.500}"
324+
)
306325
logger.warning(
307326
"VLM extraction failed for transcript %s, using fallback",
308327
transcript.transcript_id,

openadapt_evals/workflow/pipeline/transcript.py

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ def generate_transcript(
141141
model: str = "gpt-4.1-mini",
142142
provider: str = "openai",
143143
batch_size: int = 6,
144+
strict: bool = False,
144145
) -> EpisodeTranscript:
145146
"""Generate a natural language transcript from a recording session.
146147
@@ -152,6 +153,9 @@ def generate_transcript(
152153
model: VLM model name.
153154
provider: VLM provider ("openai" or "anthropic").
154155
batch_size: Number of actions per VLM call.
156+
strict: When True, raise errors instead of returning partial
157+
or placeholder results. Use during benchmarking/training
158+
to ensure VLM transcript generation is actually working.
155159
156160
Returns:
157161
EpisodeTranscript with one TranscriptEntry per action.
@@ -178,16 +182,38 @@ def generate_transcript(
178182
batch_num, idx, end_idx - 1, len(images) if images else 0,
179183
)
180184

181-
raw = vlm_call(
182-
prompt,
183-
images=images,
184-
model=model,
185-
provider=provider,
186-
max_tokens=2048,
187-
)
185+
try:
186+
raw = vlm_call(
187+
prompt,
188+
images=images,
189+
model=model,
190+
provider=provider,
191+
max_tokens=2048,
192+
)
193+
except Exception:
194+
if strict:
195+
raise
196+
logger.warning(
197+
"VLM call failed for batch %d (actions %d-%d), skipping",
198+
batch_num, idx, end_idx - 1,
199+
exc_info=True,
200+
)
201+
idx = end_idx if end_idx >= len(session.actions) else end_idx - overlap
202+
batch_num += 1
203+
continue
188204

189205
parsed = _parse_transcript_response(raw, len(batch_actions))
190206

207+
if strict and all(
208+
entry.get("vlm_confidence", 1.0) == 0.0
209+
and entry.get("narration") == "Action performed"
210+
for entry in parsed
211+
):
212+
raise ValueError(
213+
f"VLM transcript parsing returned only placeholders for batch "
214+
f"{batch_num} (actions {idx}-{end_idx - 1}). Raw response: {raw!r:.500}"
215+
)
216+
191217
# Create TranscriptEntry objects
192218
for i, (action, entry_data) in enumerate(zip(batch_actions, parsed)):
193219
entry = TranscriptEntry(

0 commit comments

Comments (0)