Skip to content

Commit ea03276

Browse files
committed
feat(extract): record --reason on bulk extractions
Adds a `--reason` option whose free-text rationale is stored in the db_events row, so future reviewers can see what motivated bulk re-runs. The option is required for any non-dry-run extraction touching more than 100 manpages.
1 parent 2e1b045 commit ea03276

3 files changed

Lines changed: 114 additions & 0 deletions

File tree

explainshell/extraction/report.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,4 @@ class ExtractionReport(BaseModel):
154154
failures: list[FailureEntry] = Field(default_factory=list)
155155
skips: list[SkipEntry] = Field(default_factory=list)
156156
batch_manifest: dict[str, Any] | None = None
157+
reason: str | None = None

explainshell/manager.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,15 @@ def _require_db(ctx: click.Context, *, must_exist: bool = False) -> str:
572572
default=False,
573573
help="Process only files whose gz size exceeds 2048 bytes (route to a capable model).",
574574
)
575+
@click.option(
576+
"--reason",
577+
default=None,
578+
help=(
579+
"Why this extraction is being run. Recorded in the db_events row so future "
580+
"reviewers can see what motivated bulk re-runs. Required for runs with "
581+
"more than 100 manpages."
582+
),
583+
)
575584
@click.argument("files", nargs=-1, required=True)
576585
@click.pass_context
577586
def extract(
@@ -587,6 +596,7 @@ def extract(
587596
debug: bool,
588597
small_only: bool,
589598
large_only: bool,
599+
reason: str | None,
590600
) -> None:
591601
"""Extract options from manpages and store in DB."""
592602
try:
@@ -629,6 +639,11 @@ def extract(
629639
raise click.UsageError(str(e))
630640
if not gz_files:
631641
raise click.UsageError("No .gz files found.")
642+
if not dry_run and len(gz_files) > 100 and not reason:
643+
raise click.UsageError(
644+
f"--reason is required when extracting more than 100 manpages "
645+
f"(got {len(gz_files)})"
646+
)
632647

633648
if drop:
634649
answer = input("Really drop all data? (y/n) ").strip().lower()
@@ -871,6 +886,7 @@ def on_result(gz_path: str, entry: ExtractionResult) -> None:
871886
failures=failures,
872887
skips=skips,
873888
batch_manifest=manifest.to_dict() if manifest is not None else None,
889+
reason=reason,
874890
)
875891
_write_report(run_dir, report)
876892

@@ -1215,6 +1231,7 @@ def show_events(ctx: click.Context, limit: int) -> None:
12151231
f" db: {report.db_after.manpages}({dm:+d})"
12161232
f" mappings={report.db_after.mappings}({dmap:+d})"
12171233
)
1234+
click.echo(f" reason: {report.reason or '(no reason provided)'}")
12181235
else:
12191236
meta = ev.get("metadata", {})
12201237
for k, v in meta.items():

tests/test_manager.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,66 @@ def test_dry_run_requires_existing_db(self):
392392
self.assertNotEqual(result.exit_code, 0)
393393
self.assertIn("Database not found", result.output)
394394

395+
@patch("explainshell.util.collect_gz_files")
396+
def test_reason_required_for_bulk_extraction(self, mock_collect):
397+
mock_collect.return_value = [f"/fake/page-{i}.1.gz" for i in range(101)]
398+
with _temp_db() as db_path:
399+
Store.create(db_path).close()
400+
runner = CliRunner()
401+
result = runner.invoke(
402+
cli,
403+
[
404+
"--db",
405+
db_path,
406+
"extract",
407+
"--mode",
408+
"llm:test-model",
409+
*[f"/fake/page-{i}.1.gz" for i in range(101)],
410+
],
411+
)
412+
self.assertNotEqual(result.exit_code, 0)
413+
self.assertIn("--reason is required", result.output)
414+
self.assertIn("got 101", result.output)
415+
416+
@patch("explainshell.util.collect_gz_files")
417+
def test_reason_gate_skipped_for_dry_run(self, mock_collect):
418+
mock_collect.return_value = [f"/fake/page-{i}.1.gz" for i in range(101)]
419+
with _temp_db() as db_path:
420+
Store.create(db_path).close()
421+
runner = CliRunner()
422+
result = runner.invoke(
423+
cli,
424+
[
425+
"--db",
426+
db_path,
427+
"extract",
428+
"--mode",
429+
"llm:test-model",
430+
"--dry-run",
431+
*[f"/fake/page-{i}.1.gz" for i in range(101)],
432+
],
433+
)
434+
self.assertNotIn("--reason is required", result.output)
435+
436+
@patch("explainshell.util.collect_gz_files")
437+
def test_reason_gate_skipped_under_threshold(self, mock_collect):
438+
mock_collect.return_value = [f"/fake/page-{i}.1.gz" for i in range(100)]
439+
with _temp_db() as db_path:
440+
Store.create(db_path).close()
441+
runner = CliRunner()
442+
result = runner.invoke(
443+
cli,
444+
[
445+
"--db",
446+
db_path,
447+
"extract",
448+
"--mode",
449+
"llm:test-model",
450+
*[f"/fake/page-{i}.1.gz" for i in range(100)],
451+
],
452+
)
453+
self.assertNotIn("--reason is required", result.output)
454+
395455
@patch("explainshell.extraction.common.gz_sha256", side_effect=lambda p: p)
396456
@patch("explainshell.manager.run")
397457
@patch("explainshell.manager.make_extractor")
@@ -2257,6 +2317,32 @@ def test_show_events_extraction(self):
22572317
self.assertIn("model: openai/gpt-5", result.output)
22582318
self.assertIn("result: ok=10 skip=5 fail=1", result.output)
22592319
self.assertIn("db: 110(+10) mappings=220(+20)", result.output)
2320+
self.assertIn("reason: (no reason provided)", result.output)
2321+
2322+
def test_show_events_extraction_with_reason(self):
2323+
self.store.log_event(
2324+
"extraction",
2325+
{
2326+
"version": 1,
2327+
"command": "extract",
2328+
"timestamp": "2026-05-04T10:00:00+00:00",
2329+
"git": {"commit": "abc123", "commit_short": "abc", "dirty": False},
2330+
"config": {"mode": "llm", "model": "openai/gpt-5"},
2331+
"elapsed_seconds": 5.0,
2332+
"summary": {"succeeded": 10, "skipped": 0, "failed": 0},
2333+
"db_before": {"manpages": 100, "mappings": 200},
2334+
"db_after": {"manpages": 110, "mappings": 220},
2335+
"reason": "fix garbled emphasis after mandoc 89d8f45",
2336+
},
2337+
)
2338+
2339+
runner = CliRunner()
2340+
result = runner.invoke(cli, ["--db", self.db_path, "show", "events"])
2341+
2342+
self.assertEqual(result.exit_code, 0)
2343+
self.assertIn(
2344+
"reason: fix garbled emphasis after mandoc 89d8f45", result.output
2345+
)
22602346

22612347
def test_show_events_limit(self):
22622348
for i in range(5):
@@ -2531,6 +2617,16 @@ def test_none_fields_excluded(self) -> None:
25312617

25322618
data = self._read_report()
25332619
self.assertNotIn("batch_manifest", data)
2620+
self.assertNotIn("reason", data)
2621+
2622+
def test_reason_embedded(self) -> None:
2623+
"""reason string is included in the JSON when provided."""
2624+
2625+
report = self._make_report(reason="bulk re-run after mandoc emphasis fix")
2626+
_write_report(self._run_dir, report)
2627+
2628+
data = self._read_report()
2629+
self.assertEqual(data["reason"], "bulk re-run after mandoc emphasis fix")
25342630

25352631
def test_batch_manifest_embedded(self) -> None:
25362632
"""batch_manifest dict is included when provided."""

0 commit comments

Comments
 (0)