cleanup, test fixes

TheTechromancer · TheTechromancer · commit 20af84e2d7aa · 2026-04-29T15:24:03.000-04:00
diff --git a/bbot/core/config/models.py b/bbot/core/config/models.py
@@ -108,9 +108,18 @@ class BaseModuleConfig(BaseModel):
 
     model_config = STRICT
 
-    batch_size: Optional[int] = None
-    module_threads: Optional[int] = None
-    module_timeout: Optional[int] = None
+    batch_size: Optional[int] = Field(
+        default=None,
+        description="The number of events to process in a single batch (only applies to batch modules)",
+    )
+    module_threads: Optional[int] = Field(
+        default=None,
+        description="How many event handlers to run in parallel",
+    )
+    module_timeout: Optional[int] = Field(
+        default=None,
+        description="Max time in seconds to spend handling each event or batch of events",
+    )
 
 
 class BBOTConfig(BaseSettings):
diff --git a/bbot/scanner/preset/args.py b/bbot/scanner/preset/args.py
@@ -1,11 +1,10 @@
-import re
 import yaml
 import logging
 import argparse
 
 from bbot.errors import *
-from bbot.core.config.merge import dotted_get, dotted_set
-from bbot.core.helpers.misc import chain_lists, get_closest_match, get_keys_in_dot_syntax
+from bbot.core.config.merge import dotted_set
+from bbot.core.helpers.misc import chain_lists
 
 
 def _parse_cli_value(raw: str):
@@ -14,8 +13,12 @@ def _parse_cli_value(raw: str):
 
     YAML safe_load handles `true`/`false`/`null`/ints/floats and quoted strings
     the way users expect when they write `web.spider_distance=2` or
-    `modules.stdout.event_fields='[type, data]'`.
+    `modules.stdout.event_fields='[type, data]'`. An empty RHS (`-c key=`) is
+    treated as an empty string rather than None — matching the "clear this
+    value" intent users normally have.
     """
+    if raw == "":
+        return ""
     try:
         return yaml.safe_load(raw)
     except yaml.YAMLError:
@@ -47,19 +50,7 @@ def parse_dotted_cli(entries):
 log = logging.getLogger("bbot.presets.args")
 
 
-universal_module_options = {
-    "batch_size": "The number of events to process in a single batch (only applies to batch modules)",
-    "module_threads": "How many event handlers to run in parallel",
-    "module_timeout": "Max time in seconds to spend handling each event or batch of events",
-}
-
-
 class BBOTArgs:
-    # module config options to exclude from validation
-    exclude_from_validation = re.compile(
-        r".*modules\.[a-z0-9_]+\.(?:" + "|".join(universal_module_options.keys()) + ")$"
-    )
-
     scan_examples = [
         (
             "Subdomains",
@@ -491,16 +482,16 @@ def sanitize_args(self):
             self.parsed.preset += ["fast"]
 
     def validate(self):
-        # validate config options
-        sentinel = object()
-        all_options = set(get_keys_in_dot_syntax(self.preset.core.default_config))
-        for c in self.parsed.config:
-            c = c.split("=")[0].strip()
-            v = dotted_get(self.preset.core.default_config, c, default=sentinel)
-            # if option isn't in the default config
-            if v is sentinel:
-                # skip if it's excluded from validation
-                if self.exclude_from_validation.match(c):
-                    continue
-                # otherwise, ensure it exists as a module option
-                raise ValidationError(get_closest_match(c, all_options, msg="config option"))
+        """
+        Validate the CLI `-c key=value` arguments against the composite
+        preset schema. Catches typos like `bbot -c modules.shoudn.api_key=x`
+        with a closest-match suggestion.
+        """
+        from .validate import validate_preset
+
+        if not self.parsed.config:
+            return
+        cli_dict = parse_dotted_cli(self.parsed.config)
+        errs = validate_preset({"config": cli_dict}, module_loader=self.preset.module_loader)
+        if errs:
+            raise ValidationError("\n".join(str(e) for e in errs))
diff --git a/bbot/scanner/preset/validate.py b/bbot/scanner/preset/validate.py
@@ -26,7 +26,7 @@
 
 from pydantic import ValidationError
 
-from bbot.core.helpers.misc import get_closest_match
+from bbot.core.helpers.misc import get_closest_match, get_keys_in_dot_syntax
 
 
 log = logging.getLogger("bbot.presets.validate")
@@ -72,18 +72,25 @@ def _classify_loc(loc: tuple) -> tuple[str, str]:
     return ("preset", ".".join(parts))
 
 
-def _format_msg(err: dict, known_modules: set | None = None) -> str:
+def _format_msg(err: dict, known_modules: set | None = None, known_paths: set | None = None) -> str:
     kind = err["type"]
     input_value = err.get("input")
     loc = err["loc"]
     field = str(loc[-1]) if loc else ""
     path = ".".join(str(p) for p in loc)
 
     if kind == "extra_forbidden":
-        # Special-case unknown module name (config.modules.<bad>) so users get
-        # a suggestion rather than "Unknown option".
+        # Special-case unknown module name (config.modules.<bad>) — users get
+        # a suggestion drawn from the set of known module names.
         if len(loc) == 3 and loc[0] == "config" and loc[1] == "modules":
             return get_closest_match(field, known_modules or set(), msg="module")
+        # For everything else, suggest from the known dotted-path universe
+        # (`web.spier_distance` → `web.spider_distance`).
+        if known_paths:
+            # strip the leading "config." prefix when matching, since
+            # default_config dotted paths don't include it
+            lookup_path = ".".join(str(p) for p in loc[1:]) if loc and loc[0] == "config" else path
+            return get_closest_match(lookup_path, known_paths, msg="config option")
         msg = f"Unknown option: {field!r}"
         if isinstance(input_value, (str, int, bool, float)):
             msg += f" (value: {input_value!r})"
@@ -114,11 +121,19 @@ def _format_msg(err: dict, known_modules: set | None = None) -> str:
     return err["msg"] if err.get("msg") else f"validation error at {path}"
 
 
-def _format_errors(exc: ValidationError, known_modules: set | None = None) -> list[PresetValidationError]:
+def _format_errors(
+    exc: ValidationError,
+    known_modules: set | None = None,
+    known_paths: set | None = None,
+) -> list[PresetValidationError]:
     out: list[PresetValidationError] = []
     for err in exc.errors():
         where, path = _classify_loc(err["loc"])
-        out.append(PresetValidationError(where=where, path=path, message=_format_msg(err, known_modules)))
+        out.append(
+            PresetValidationError(
+                where=where, path=path, message=_format_msg(err, known_modules, known_paths)
+            )
+        )
     return out
 
 
@@ -168,14 +183,17 @@ def validate_preset(preset_dict: Any, module_loader=None) -> list[PresetValidati
 
     errors: list[PresetValidationError] = []
     known_modules = set(module_loader.all_module_choices)
+    # Universe of valid dotted config paths, used for "did you mean ...?"
+    # suggestions on unknown global-config keys.
+    known_paths = set(get_keys_in_dot_syntax(module_loader.core.default_config))
 
     # Validate against the composite schema (rebuilt automatically if new
     # module_dirs were just preloaded above). Closest-match suggestions
-    # for unknown module names are produced inside the formatter.
+    # for unknown module names and config options are produced inside the formatter.
     try:
         module_loader.validation_schema.model_validate(preset_dict)
     except ValidationError as e:
-        errors.extend(_format_errors(e, known_modules=known_modules))
+        errors.extend(_format_errors(e, known_modules=known_modules, known_paths=known_paths))
 
     # Module names listed in top-level `modules`/`output_modules`/`exclude_modules`
     # aren't covered by the composite schema (they're a list of strings, not a
diff --git a/bbot/scripts/docs.py b/bbot/scripts/docs.py
@@ -182,12 +182,12 @@ def update_individual_module_options():
     assert len(bbot_output_module_table.splitlines()) > 10
     update_md_files("BBOT OUTPUT MODULES", bbot_output_module_table)
 
-    # BBOT universal module options
-    from bbot.scanner.preset.args import universal_module_options
+    # BBOT universal module options (sourced from BaseModuleConfig)
+    from bbot.core.config.models import BaseModuleConfig
 
     universal_module_options_table = ""
-    for option, description in universal_module_options.items():
-        universal_module_options_table += f"**{option}**: {description}\n"
+    for name, field in BaseModuleConfig.model_fields.items():
+        universal_module_options_table += f"**{name}**: {field.description}\n"
     update_md_files("BBOT UNIVERSAL MODULE OPTIONS", universal_module_options_table)
 
     # BBOT module options
diff --git a/bbot/test/test_step_1/test_cli.py b/bbot/test/test_step_1/test_cli.py
@@ -466,13 +466,15 @@ def test_cli_config_validation(monkeypatch, caplog):
     monkeypatch.setattr(sys, "exit", lambda *args, **kwargs: True)
     monkeypatch.setattr(os, "_exit", lambda *args, **kwargs: True)
 
-    # incorrect module option
+    # incorrect module name nested under modules.* — surfaces as an unknown
+    # module with a closest-match suggestion (more useful than the legacy
+    # "Could not find config option ..." phrasing)
     caplog.clear()
     assert not caplog.text
     monkeypatch.setattr("sys.argv", ["bbot", "-c", "modules.ipnegibhor.num_bits=4"])
     cli.main()
-    assert 'Could not find config option "modules.ipnegibhor.num_bits"' in caplog.text
-    assert 'Did you mean "modules.ipneighbor.num_bits"?' in caplog.text
+    assert 'Could not find module "ipnegibhor"' in caplog.text
+    assert 'Did you mean "ipneighbor"?' in caplog.text
 
     # incorrect global option
     caplog.clear()