Skip to content

Commit 62934ab

Browse files
abrichr and claude authored
fix: add strict mode to prevent silent fallback degradation during benchmarking (#154)
When strict=True, components that previously degraded silently now raise errors instead, ensuring benchmarking/training runs use the intended system configuration (e.g., PII scrubbing active, VLM extraction working).

- ScrubMiddleware: raise ImportError if openadapt-privacy missing, re-raise on scrubbing failure
- extract_workflow(): raise ValueError on VLM parse failure, re-raise on VLM call failure
- generate_transcript(): re-raise on VLM call failure, raise ValueError if parser returns only placeholders

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 101289b commit 62934ab

3 files changed

Lines changed: 71 additions & 13 deletions

File tree

openadapt_evals/adapters/scrub_middleware.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,17 +52,22 @@ class ScrubMiddleware(BenchmarkAdapter):
5252
scrub_text: Whether to redact detected text PII (default True).
5353
scrub_images: Whether to redact detected image PII such as faces
5454
(default True).
55+
strict: When True, raise errors instead of silently falling back
56+
to unscrubbed data. Use during benchmarking/training to ensure
57+
scrubbing is actually active.
5558
"""
5659

5760
def __init__(
5861
self,
5962
adapter: BenchmarkAdapter,
6063
scrub_text: bool = True,
6164
scrub_images: bool = True,
65+
strict: bool = False,
6266
):
6367
self._adapter = adapter
6468
self._scrub_text = scrub_text
6569
self._scrub_images = scrub_images
70+
self._strict = strict
6671
self._provider = None # lazy-loaded
6772
self._provider_load_attempted = False
6873
self.last_original_screenshot: bytes | None = None
@@ -156,6 +161,8 @@ def _scrub(self, screenshot_bytes: bytes) -> bytes:
156161
try:
157162
return provider.scrub_image(screenshot_bytes)
158163
except Exception:
164+
if self._strict:
165+
raise
159166
logger.warning(
160167
"PII scrubbing failed, returning original screenshot",
161168
exc_info=True,
@@ -178,6 +185,12 @@ def _get_provider(self):
178185
logger.info("PII scrubbing enabled via openadapt-privacy (Presidio)")
179186
return self._provider
180187
except ImportError:
188+
if self._strict:
189+
raise ImportError(
190+
"openadapt-privacy is not installed and strict mode is enabled. "
191+
"PII scrubbing cannot be skipped. "
192+
"Install with: pip install openadapt-privacy"
193+
)
181194
logger.warning(
182195
"openadapt-privacy is not installed. "
183196
"PII scrubbing is DISABLED — screenshots will pass through unmodified. "

openadapt_evals/workflow/pipeline/extract.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ def extract_workflow(
259259
model: str = "gpt-4.1-mini",
260260
provider: str = "openai",
261261
recording_source: RecordingSource = RecordingSource.NATIVE_CAPTURE,
262+
strict: bool = False,
262263
) -> Workflow:
263264
"""Extract a structured Workflow from an EpisodeTranscript.
264265
@@ -270,6 +271,9 @@ def extract_workflow(
270271
model: VLM model name.
271272
provider: VLM provider (``"openai"`` or ``"anthropic"``).
272273
recording_source: Source of the original recording.
274+
strict: When True, raise errors instead of falling back to 1:1
275+
transcript-to-step mapping. Use during benchmarking/training
276+
to ensure VLM extraction is actually working.
273277
274278
Returns:
275279
A Workflow with WorkflowStep entries derived from the transcript.
@@ -293,16 +297,31 @@ def extract_workflow(
293297
len(transcript.entries),
294298
)
295299

296-
raw = vlm_call(
297-
prompt,
298-
model=model,
299-
provider=provider,
300-
max_tokens=4096,
301-
)
300+
try:
301+
raw = vlm_call(
302+
prompt,
303+
model=model,
304+
provider=provider,
305+
max_tokens=4096,
306+
)
307+
except Exception:
308+
if strict:
309+
raise
310+
logger.warning(
311+
"VLM call failed for transcript %s, using fallback",
312+
transcript.transcript_id,
313+
exc_info=True,
314+
)
315+
return _build_fallback_workflow(transcript, recording_source)
302316

303317
parsed = _parse_extraction_response(raw, transcript)
304318

305319
if parsed is None:
320+
if strict:
321+
raise ValueError(
322+
f"VLM extraction failed to parse response for transcript "
323+
f"{transcript.transcript_id}. Raw response: {raw!r:.500}"
324+
)
306325
logger.warning(
307326
"VLM extraction failed for transcript %s, using fallback",
308327
transcript.transcript_id,

openadapt_evals/workflow/pipeline/transcript.py

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ def generate_transcript(
141141
model: str = "gpt-4.1-mini",
142142
provider: str = "openai",
143143
batch_size: int = 6,
144+
strict: bool = False,
144145
) -> EpisodeTranscript:
145146
"""Generate a natural language transcript from a recording session.
146147
@@ -152,6 +153,9 @@ def generate_transcript(
152153
model: VLM model name.
153154
provider: VLM provider ("openai" or "anthropic").
154155
batch_size: Number of actions per VLM call.
156+
strict: When True, raise errors instead of returning partial
157+
or placeholder results. Use during benchmarking/training
158+
to ensure VLM transcript generation is actually working.
155159
156160
Returns:
157161
EpisodeTranscript with one TranscriptEntry per action.
@@ -178,16 +182,38 @@ def generate_transcript(
178182
batch_num, idx, end_idx - 1, len(images) if images else 0,
179183
)
180184

181-
raw = vlm_call(
182-
prompt,
183-
images=images,
184-
model=model,
185-
provider=provider,
186-
max_tokens=2048,
187-
)
185+
try:
186+
raw = vlm_call(
187+
prompt,
188+
images=images,
189+
model=model,
190+
provider=provider,
191+
max_tokens=2048,
192+
)
193+
except Exception:
194+
if strict:
195+
raise
196+
logger.warning(
197+
"VLM call failed for batch %d (actions %d-%d), skipping",
198+
batch_num, idx, end_idx - 1,
199+
exc_info=True,
200+
)
201+
idx = end_idx if end_idx >= len(session.actions) else end_idx - overlap
202+
batch_num += 1
203+
continue
188204

189205
parsed = _parse_transcript_response(raw, len(batch_actions))
190206

207+
if strict and all(
208+
entry.get("vlm_confidence", 1.0) == 0.0
209+
and entry.get("narration") == "Action performed"
210+
for entry in parsed
211+
):
212+
raise ValueError(
213+
f"VLM transcript parsing returned only placeholders for batch "
214+
f"{batch_num} (actions {idx}-{end_idx - 1}). Raw response: {raw!r:.500}"
215+
)
216+
191217
# Create TranscriptEntry objects
192218
for i, (action, entry_data) in enumerate(zip(batch_actions, parsed)):
193219
entry = TranscriptEntry(

0 commit comments

Comments (0)