feat(extract): record --reason on bulk extractions

idank · idank · commit ea032763a091 · 2026-05-04T21:55:26.000+03:00
Store a free-text rationale in the db_events row so future reviewers
can see what motivated bulk re-runs. Required for any non-dry-run
extraction touching >100 manpages.
diff --git a/explainshell/extraction/report.py b/explainshell/extraction/report.py
@@ -154,3 +154,4 @@ class ExtractionReport(BaseModel):
     failures: list[FailureEntry] = Field(default_factory=list)
     skips: list[SkipEntry] = Field(default_factory=list)
     batch_manifest: dict[str, Any] | None = None
+    reason: str | None = None
diff --git a/explainshell/manager.py b/explainshell/manager.py
@@ -572,6 +572,15 @@ def _require_db(ctx: click.Context, *, must_exist: bool = False) -> str:
     default=False,
     help="Process only files whose gz size exceeds 2048 bytes (route to a capable model).",
 )
+@click.option(
+    "--reason",
+    default=None,
+    help=(
+        "Why this extraction is being run. Recorded in the db_events row so future "
+        "reviewers can see what motivated bulk re-runs. Required for runs with "
+        "more than 100 manpages."
+    ),
+)
 @click.argument("files", nargs=-1, required=True)
 @click.pass_context
 def extract(
@@ -587,6 +596,7 @@ def extract(
     debug: bool,
     small_only: bool,
     large_only: bool,
+    reason: str | None,
 ) -> None:
     """Extract options from manpages and store in DB."""
     try:
@@ -629,6 +639,11 @@ def extract(
         raise click.UsageError(str(e))
     if not gz_files:
         raise click.UsageError("No .gz files found.")
+    if not dry_run and len(gz_files) > 100 and not reason:
+        raise click.UsageError(
+            f"--reason is required when extracting more than 100 manpages "
+            f"(got {len(gz_files)})"
+        )
 
     if drop:
         answer = input("Really drop all data? (y/n) ").strip().lower()
@@ -871,6 +886,7 @@ def on_result(gz_path: str, entry: ExtractionResult) -> None:
         failures=failures,
         skips=skips,
         batch_manifest=manifest.to_dict() if manifest is not None else None,
+        reason=reason,
     )
     _write_report(run_dir, report)
 
@@ -1215,6 +1231,7 @@ def show_events(ctx: click.Context, limit: int) -> None:
                 f"  db:       {report.db_after.manpages}({dm:+d})"
                 f" mappings={report.db_after.mappings}({dmap:+d})"
             )
+            click.echo(f"  reason:   {report.reason or '(no reason provided)'}")
         else:
             meta = ev.get("metadata", {})
             for k, v in meta.items():
diff --git a/tests/test_manager.py b/tests/test_manager.py
@@ -392,6 +392,66 @@ def test_dry_run_requires_existing_db(self):
         self.assertNotEqual(result.exit_code, 0)
         self.assertIn("Database not found", result.output)
 
+    @patch("explainshell.util.collect_gz_files")
+    def test_reason_required_for_bulk_extraction(self, mock_collect):
+        mock_collect.return_value = [f"/fake/page-{i}.1.gz" for i in range(101)]
+        with _temp_db() as db_path:
+            Store.create(db_path).close()
+            runner = CliRunner()
+            result = runner.invoke(
+                cli,
+                [
+                    "--db",
+                    db_path,
+                    "extract",
+                    "--mode",
+                    "llm:test-model",
+                    *[f"/fake/page-{i}.1.gz" for i in range(101)],
+                ],
+            )
+        self.assertNotEqual(result.exit_code, 0)
+        self.assertIn("--reason is required", result.output)
+        self.assertIn("got 101", result.output)
+
+    @patch("explainshell.util.collect_gz_files")
+    def test_reason_gate_skipped_for_dry_run(self, mock_collect):
+        mock_collect.return_value = [f"/fake/page-{i}.1.gz" for i in range(101)]
+        with _temp_db() as db_path:
+            Store.create(db_path).close()
+            runner = CliRunner()
+            result = runner.invoke(
+                cli,
+                [
+                    "--db",
+                    db_path,
+                    "extract",
+                    "--mode",
+                    "llm:test-model",
+                    "--dry-run",
+                    *[f"/fake/page-{i}.1.gz" for i in range(101)],
+                ],
+            )
+        self.assertNotIn("--reason is required", result.output)
+
+    @patch("explainshell.util.collect_gz_files")
+    def test_reason_gate_skipped_under_threshold(self, mock_collect):
+        mock_collect.return_value = [f"/fake/page-{i}.1.gz" for i in range(100)]
+        with _temp_db() as db_path:
+            Store.create(db_path).close()
+            runner = CliRunner()
+            result = runner.invoke(
+                cli,
+                [
+                    "--db",
+                    db_path,
+                    "extract",
+                    "--mode",
+                    "llm:test-model",
+                    *[f"/fake/page-{i}.1.gz" for i in range(100)],
+                ],
+            )
+        self.assertNotIn("--reason is required", result.output)
+
     @patch("explainshell.extraction.common.gz_sha256", side_effect=lambda p: p)
     @patch("explainshell.manager.run")
     @patch("explainshell.manager.make_extractor")
@@ -2257,6 +2317,32 @@ def test_show_events_extraction(self):
         self.assertIn("model:    openai/gpt-5", result.output)
         self.assertIn("result:   ok=10 skip=5 fail=1", result.output)
         self.assertIn("db:       110(+10) mappings=220(+20)", result.output)
+        self.assertIn("reason:   (no reason provided)", result.output)
+
+    def test_show_events_extraction_with_reason(self):
+        self.store.log_event(
+            "extraction",
+            {
+                "version": 1,
+                "command": "extract",
+                "timestamp": "2026-05-04T10:00:00+00:00",
+                "git": {"commit": "abc123", "commit_short": "abc", "dirty": False},
+                "config": {"mode": "llm", "model": "openai/gpt-5"},
+                "elapsed_seconds": 5.0,
+                "summary": {"succeeded": 10, "skipped": 0, "failed": 0},
+                "db_before": {"manpages": 100, "mappings": 200},
+                "db_after": {"manpages": 110, "mappings": 220},
+                "reason": "fix garbled emphasis after mandoc 89d8f45",
+            },
+        )
+
+        runner = CliRunner()
+        result = runner.invoke(cli, ["--db", self.db_path, "show", "events"])
+
+        self.assertEqual(result.exit_code, 0)
+        self.assertIn(
+            "reason:   fix garbled emphasis after mandoc 89d8f45", result.output
+        )
 
     def test_show_events_limit(self):
         for i in range(5):
@@ -2531,6 +2617,16 @@ def test_none_fields_excluded(self) -> None:
 
         data = self._read_report()
         self.assertNotIn("batch_manifest", data)
+        self.assertNotIn("reason", data)
+
+    def test_reason_embedded(self) -> None:
+        """reason string is included in the JSON when provided."""
+
+        report = self._make_report(reason="bulk re-run after mandoc emphasis fix")
+        _write_report(self._run_dir, report)
+
+        data = self._read_report()
+        self.assertEqual(data["reason"], "bulk re-run after mandoc emphasis fix")
 
     def test_batch_manifest_embedded(self) -> None:
         """batch_manifest dict is included when provided."""