From c5e0469a8e86ab29f37a46ad3b190a57a1634b4e Mon Sep 17 00:00:00 2001 From: Shreel Shah Date: Mon, 15 Jun 2026 14:36:54 -0400 Subject: [PATCH 1/2] fix: repair invalid trufflehog_detectors.yaml and support custom detector files The shipped configs/trufflehog_detectors.yaml was invalid YAML, so load_config_from_file() always threw, silently fell back to 3 built-in detectors, and never loaded the file's detectors (GitHub issue: custom detectors not applied; 'mapping values are not allowed here, line 6'). Two structural bugs fixed: - detector item keys were over-indented (the line-6 parse error) - detectors were a bare top-level sequence instead of being nested under the top-level 'detectors:' key the loader reads Also: - Make the fallback loud: log at ERROR that custom detectors are NOT applied, instead of masking a broken config. - Support an optional, git-ignored custom_trufflehog_detectors.yaml that is merged into the shipped detectors (custom wins on name clash), so customers add their own without editing the shipped file. Adds a .sample template and .gitignore entry. Co-authored-by: Isaac --- .gitignore | 3 + .../custom_trufflehog_detectors.yaml.sample | 30 +++++ configs/trufflehog_detectors.yaml | 35 +++--- .../scan_secrets/notebook_secret_scan.py | 110 +++++++++++++----- 4 files changed, 138 insertions(+), 40 deletions(-) create mode 100644 configs/custom_trufflehog_detectors.yaml.sample diff --git a/.gitignore b/.gitignore index f0601ce2b..505aadee5 100644 --- a/.gitignore +++ b/.gitignore @@ -196,3 +196,6 @@ docs/*Implementation_Plan.html # SAT test framework generated reports tests/automated/reports/ + +# Customer-supplied custom TruffleHog detectors (merged at runtime, not shipped) +configs/custom_trufflehog_detectors.yaml diff --git a/configs/custom_trufflehog_detectors.yaml.sample b/configs/custom_trufflehog_detectors.yaml.sample new file mode 100644 index 000000000..983f8fcd3 --- /dev/null +++ b/configs/custom_trufflehog_detectors.yaml.sample @@ -0,0 +1,30 @@ +# Custom TruffleHog Detectors (customer-supplied) — TEMPLATE +# +# Copy this file to `custom_trufflehog_detectors.yaml` in this same configs/ +# directory and add your own detectors. SAT loads it automatically and merges +# its `detectors:` into the shipped trufflehog_detectors.yaml at scan time: +# - detectors here are ADDED to the shipped set +# - a detector whose `name` matches a shipped one OVERRIDES it +# - any settings.excluded_detectors here are appended to the shipped list +# +# The real custom_trufflehog_detectors.yaml is git-ignored, so it survives +# SAT upgrades and is never overwritten. +# +# Formatting rules (the shipped file had a bug here once — keep these): +# - everything lives under the top-level `detectors:` key +# - each item's keys (name, description, keywords, regex) align at the +# same indentation +# - single-quote regex values so backslashes (e.g. \b) stay literal + +detectors: + - name: AcmeApiKey + description: "Example: Acme Corp API key" + keywords: + - acme + regex: + id: '(?i)\b(acme_[a-z0-9]{32})\b' + +# Optionally exclude additional built-in TruffleHog detectors: +# settings: +# excluded_detectors: +# - SomeNoisyDetector diff --git a/configs/trufflehog_detectors.yaml b/configs/trufflehog_detectors.yaml index 8c8ce5152..970786720 100644 --- a/configs/trufflehog_detectors.yaml +++ b/configs/trufflehog_detectors.yaml @@ -1,52 +1,59 @@ # TruffleHog Custom Detectors Configuration # This file contains custom detector definitions for TruffleHog secret scanning # Used by the SAT (Security Analysis Tool) TruffleHog integration +# +# Detectors live under the top-level `detectors:` key. Each item's keys +# (name, description, keywords, regex) must align at the same indentation. +# To add your OWN detectors without editing this shipped file, create +# `custom_trufflehog_detectors.yaml` in this same configs/ directory — SAT +# merges its `detectors:` into the list below (custom wins on name clash). -- name: DapiToken +detectors: + - name: DapiToken description: "Databricks DAPI token detector" keywords: - dapi regex: - id: (?i)\b(dapi[a-h0-9]{32}) - -- name: DkeaToken + id: '(?i)\b(dapi[a-h0-9]{32})' + + - name: DkeaToken description: "Databricks DKEA token detector" keywords: - dkea regex: - id: (?i)\b(dkea[a-h0-9]{32}) + id: '(?i)\b(dkea[a-h0-9]{32})' -- name: DsapiToken - description: "Databricks Databricks Scoped API Token detector" + - name: DsapiToken + description: "Databricks Scoped API Token detector" keywords: - dsapi regex: - id: (?i)\b(dsapi[a-h0-9]{32}) + id: '(?i)\b(dsapi[a-h0-9]{32})' -- name: DoseToken + - name: DoseToken description: "Databricks DOSE token detector" keywords: - dose regex: - id: (?i)\b(dose[a-h0-9]{32}) + id: '(?i)\b(dose[a-h0-9]{32})' # Configuration settings for TruffleHog scanning settings: excluded_detectors: - DatabricksToken # Exclude built-in Databricks token detector to avoid false positives - + scan_options: no_update: true json_output: true - + rate_limiting: api_sleep_seconds: 10 # Sleep between API calls to prevent rate limiting - + file_paths: temp_config: "/tmp/trufflehog_config.yaml" temp_notebooks: "/tmp/notebooks" results_log: "/tmp/trufflehog_scan_results.json" - + search_settings: page_size: 50 days_back: 1 # Number of days back to search for modified notebooks diff --git a/notebooks/Includes/scan_secrets/notebook_secret_scan.py b/notebooks/Includes/scan_secrets/notebook_secret_scan.py index 9e1ef82f7..1e1c7406c 100644 --- a/notebooks/Includes/scan_secrets/notebook_secret_scan.py +++ b/notebooks/Includes/scan_secrets/notebook_secret_scan.py @@ -183,37 +183,95 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +def _default_config() -> Dict[str, Any]: + """Built-in fallback used only when the shipped config can't be loaded.""" + return { + "detectors": [ + {"name": "DkeaToken", "keywords": ["dkea"], "regex": {"id": "(?i)\\b(dkea[a-h0-9]{32})"}}, + {"name": "DapiToken", "keywords": ["dapi"], "regex": {"id": "(?i)\\b(dapi[a-h0-9]{32})"}}, + {"name": "DoseToken", "keywords": ["dose"], "regex": {"id": "(?i)\\b(dose[a-h0-9]{32})"}} + ], + "settings": { + "excluded_detectors": ["DatabricksToken"], + "rate_limiting": {"api_sleep_seconds": 10}, + "search_settings": {"page_size": 50, "days_back": 1} + } + } + + +def _merge_custom_detectors(config_data: Dict[str, Any], config_folder: str) -> Dict[str, Any]: + """Merge an optional customer-supplied detector file into `config_data`. + + Looks for `custom_trufflehog_detectors.yaml` in the same configs/ folder. + Its `detectors:` are merged into the shipped list (custom wins on a name + clash) and any `settings.excluded_detectors` are appended. The file is + optional; if it's missing or invalid we keep the shipped config and carry + on (logging the reason) rather than failing the scan. Lets customers add + their own detectors without editing the file that ships with SAT. + """ + custom_path = f"{config_folder}/custom_trufflehog_detectors.yaml" + if not os.path.exists(custom_path): + return config_data + + try: + with open(custom_path, 'r') as file: + custom = yaml.safe_load(file) + except Exception as e: + logger.error(f"Custom detector file {custom_path} failed to parse — ignoring it: {str(e)}") + return config_data + + if not isinstance(custom, dict): + logger.warning(f"Custom detector file {custom_path} is not a YAML mapping — ignoring it") + return config_data + + custom_detectors = custom.get("detectors") or [] + if custom_detectors: + by_name = {d.get("name"): d for d in config_data.get("detectors", []) if isinstance(d, dict)} + for d in custom_detectors: + if isinstance(d, dict) and d.get("name"): + by_name[d["name"]] = d # custom overrides built-in of the same name + config_data["detectors"] = list(by_name.values()) + logger.info(f"Merged {len(custom_detectors)} custom detector(s) from {custom_path}") + + custom_excluded = (custom.get("settings") or {}).get("excluded_detectors") or [] + if custom_excluded: + existing = config_data.setdefault("settings", {}).setdefault("excluded_detectors", []) + for x in custom_excluded: + if x not in existing: + existing.append(x) + + return config_data + + def load_config_from_file(): - """Load configuration from the external YAML file in the configs directory.""" + """Load detector/scan configuration from the YAML files in the configs directory. + + Loads the shipped `trufflehog_detectors.yaml`, then merges an optional + customer-supplied `custom_trufflehog_detectors.yaml` if present. Falls back + to a minimal built-in config only if the shipped file can't be loaded — + and logs that loudly, since a silent fallback previously masked a broken + config and left scans running with just the built-in detectors. + """ + config_folder = getConfigPath() + config_path = f"{config_folder}/trufflehog_detectors.yaml" + + logger.info(f"Loading configuration from: {config_path}") try: - # Get the current notebook's directory and construct path to config file - notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get() - config_folder = getConfigPath() - config_path = f"{config_folder}/trufflehog_detectors.yaml" - - logger.info(f"Loading configuration from: {config_path}") - - # Read the configuration file with open(config_path, 'r') as file: config_data = yaml.safe_load(file) - - return config_data + if not isinstance(config_data, dict) or "detectors" not in config_data: + raise ValueError( + "config must be a YAML mapping with a top-level 'detectors:' key" + ) except Exception as e: - logger.warning(f"Could not load external config file: {str(e)}") - logger.info("Using default configuration") - # Fallback to default configuration - return { - "detectors": [ - {"name": "DkeaToken", "keywords": ["dkea"], "regex": {"id": "(?i)\\b(dkea[a-h0-9]{32})"}}, - {"name": "DapiToken", "keywords": ["dapi"], "regex": {"id": "(?i)\\b(dapi[a-h0-9]{32})"}}, - {"name": "DoseToken", "keywords": ["dose"], "regex": {"id": "(?i)\\b(dose[a-h0-9]{32})"}} - ], - "settings": { - "excluded_detectors": ["DatabricksToken"], - "rate_limiting": {"api_sleep_seconds": 10}, - "search_settings": {"page_size": 50, "days_back": 1} - } - } + logger.error( + f"Could not load {config_path}: {str(e)}. " + f"FALLING BACK to built-in detectors only — custom detectors will NOT be applied. " + f"Fix the YAML and re-run to scan with the full detector set." + ) + return _default_config() + + return _merge_custom_detectors(config_data, config_folder) def create_trufflehog_config(config_data: Dict[str, Any]) -> str: """Create TruffleHog configuration file from loaded config data.""" From 783318c777614737be986e39b54ad9b2252a8d46 Mon Sep 17 00:00:00 2001 From: Shreel Shah Date: Mon, 15 Jun 2026 14:44:32 -0400 Subject: [PATCH 2/2] docs: keep detector config comments functional and customer-facing Remove references to past behavior/bugs from comments and the fallback log message; describe only what the files and code do. Co-authored-by: Isaac --- configs/custom_trufflehog_detectors.yaml.sample | 2 +- notebooks/Includes/scan_secrets/notebook_secret_scan.py | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/configs/custom_trufflehog_detectors.yaml.sample b/configs/custom_trufflehog_detectors.yaml.sample index 983f8fcd3..bb8b939b0 100644 --- a/configs/custom_trufflehog_detectors.yaml.sample +++ b/configs/custom_trufflehog_detectors.yaml.sample @@ -10,7 +10,7 @@ # The real custom_trufflehog_detectors.yaml is git-ignored, so it survives # SAT upgrades and is never overwritten. # -# Formatting rules (the shipped file had a bug here once — keep these): +# Formatting rules: # - everything lives under the top-level `detectors:` key # - each item's keys (name, description, keywords, regex) align at the # same indentation diff --git a/notebooks/Includes/scan_secrets/notebook_secret_scan.py b/notebooks/Includes/scan_secrets/notebook_secret_scan.py index 1e1c7406c..e85df8e24 100644 --- a/notebooks/Includes/scan_secrets/notebook_secret_scan.py +++ b/notebooks/Includes/scan_secrets/notebook_secret_scan.py @@ -248,9 +248,7 @@ def load_config_from_file(): Loads the shipped `trufflehog_detectors.yaml`, then merges an optional customer-supplied `custom_trufflehog_detectors.yaml` if present. Falls back - to a minimal built-in config only if the shipped file can't be loaded — - and logs that loudly, since a silent fallback previously masked a broken - config and left scans running with just the built-in detectors. + to a minimal built-in config if the shipped file cannot be loaded. """ config_folder = getConfigPath() config_path = f"{config_folder}/trufflehog_detectors.yaml" @@ -266,8 +264,7 @@ def load_config_from_file(): except Exception as e: logger.error( f"Could not load {config_path}: {str(e)}. " - f"FALLING BACK to built-in detectors only — custom detectors will NOT be applied. " - f"Fix the YAML and re-run to scan with the full detector set." + f"Using built-in detectors only; fix the file to load the full detector set." ) return _default_config()