diff --git a/.gitignore b/.gitignore index f0601ce2..505aadee 100644 --- a/.gitignore +++ b/.gitignore @@ -196,3 +196,6 @@ docs/*Implementation_Plan.html # SAT test framework generated reports tests/automated/reports/ + +# Customer-supplied custom TruffleHog detectors (merged at runtime, not shipped) +configs/custom_trufflehog_detectors.yaml diff --git a/configs/custom_trufflehog_detectors.yaml.sample b/configs/custom_trufflehog_detectors.yaml.sample new file mode 100644 index 00000000..bb8b939b --- /dev/null +++ b/configs/custom_trufflehog_detectors.yaml.sample @@ -0,0 +1,30 @@ +# Custom TruffleHog Detectors (customer-supplied) — TEMPLATE +# +# Copy this file to `custom_trufflehog_detectors.yaml` in this same configs/ +# directory and add your own detectors. SAT loads it automatically and merges +# its `detectors:` into the shipped trufflehog_detectors.yaml at scan time: +# - detectors here are ADDED to the shipped set +# - a detector whose `name` matches a shipped one OVERRIDES it +# - any settings.excluded_detectors here are appended to the shipped list +# +# The real custom_trufflehog_detectors.yaml is git-ignored, so it survives +# SAT upgrades and is never overwritten. +# +# Formatting rules: +# - everything lives under the top-level `detectors:` key +# - each item's keys (name, description, keywords, regex) align at the +# same indentation +# - single-quote regex values so backslashes (e.g. \b) stay literal + +detectors: + - name: AcmeApiKey + description: "Example: Acme Corp API key" + keywords: + - acme + regex: + id: '(?i)\b(acme_[a-z0-9]{32})\b' + +# Optionally exclude additional built-in TruffleHog detectors: +# settings: +# excluded_detectors: +# - SomeNoisyDetector diff --git a/configs/trufflehog_detectors.yaml b/configs/trufflehog_detectors.yaml index 8c8ce515..97078672 100644 --- a/configs/trufflehog_detectors.yaml +++ b/configs/trufflehog_detectors.yaml @@ -1,52 +1,59 @@ # TruffleHog Custom Detectors Configuration # This file contains custom detector definitions for TruffleHog secret scanning # Used by the SAT (Security Analysis Tool) TruffleHog integration +# +# Detectors live under the top-level `detectors:` key. Each item's keys +# (name, description, keywords, regex) must align at the same indentation. +# To add your OWN detectors without editing this shipped file, create +# `custom_trufflehog_detectors.yaml` in this same configs/ directory — SAT +# merges its `detectors:` into the list below (custom wins on name clash). -- name: DapiToken +detectors: + - name: DapiToken description: "Databricks DAPI token detector" keywords: - dapi regex: - id: (?i)\b(dapi[a-h0-9]{32}) - -- name: DkeaToken + id: '(?i)\b(dapi[a-h0-9]{32})' + + - name: DkeaToken description: "Databricks DKEA token detector" keywords: - dkea regex: - id: (?i)\b(dkea[a-h0-9]{32}) + id: '(?i)\b(dkea[a-h0-9]{32})' -- name: DsapiToken - description: "Databricks Databricks Scoped API Token detector" + - name: DsapiToken + description: "Databricks Scoped API Token detector" keywords: - dsapi regex: - id: (?i)\b(dsapi[a-h0-9]{32}) + id: '(?i)\b(dsapi[a-h0-9]{32})' -- name: DoseToken + - name: DoseToken description: "Databricks DOSE token detector" keywords: - dose regex: - id: (?i)\b(dose[a-h0-9]{32}) + id: '(?i)\b(dose[a-h0-9]{32})' # Configuration settings for TruffleHog scanning settings: excluded_detectors: - DatabricksToken # Exclude built-in Databricks token detector to avoid false positives - + scan_options: no_update: true json_output: true - + rate_limiting: api_sleep_seconds: 10 # Sleep between API calls to prevent rate limiting - + file_paths: temp_config: "/tmp/trufflehog_config.yaml" temp_notebooks: "/tmp/notebooks" results_log: "/tmp/trufflehog_scan_results.json" - + search_settings: page_size: 50 days_back: 1 # Number of days back to search for modified notebooks diff --git a/notebooks/Includes/scan_secrets/notebook_secret_scan.py b/notebooks/Includes/scan_secrets/notebook_secret_scan.py index 9e1ef82f..e85df8e2 100644 --- a/notebooks/Includes/scan_secrets/notebook_secret_scan.py +++ b/notebooks/Includes/scan_secrets/notebook_secret_scan.py @@ -183,37 +183,92 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +def _default_config() -> Dict[str, Any]: + """Built-in fallback used only when the shipped config can't be loaded.""" + return { + "detectors": [ + {"name": "DkeaToken", "keywords": ["dkea"], "regex": {"id": "(?i)\\b(dkea[a-h0-9]{32})"}}, + {"name": "DapiToken", "keywords": ["dapi"], "regex": {"id": "(?i)\\b(dapi[a-h0-9]{32})"}}, + {"name": "DoseToken", "keywords": ["dose"], "regex": {"id": "(?i)\\b(dose[a-h0-9]{32})"}} + ], + "settings": { + "excluded_detectors": ["DatabricksToken"], + "rate_limiting": {"api_sleep_seconds": 10}, + "search_settings": {"page_size": 50, "days_back": 1} + } + } + + +def _merge_custom_detectors(config_data: Dict[str, Any], config_folder: str) -> Dict[str, Any]: + """Merge an optional customer-supplied detector file into `config_data`. + + Looks for `custom_trufflehog_detectors.yaml` in the same configs/ folder. + Its `detectors:` are merged into the shipped list (custom wins on a name + clash) and any `settings.excluded_detectors` are appended. The file is + optional; if it's missing or invalid we keep the shipped config and carry + on (logging the reason) rather than failing the scan. Lets customers add + their own detectors without editing the file that ships with SAT. + """ + custom_path = f"{config_folder}/custom_trufflehog_detectors.yaml" + if not os.path.exists(custom_path): + return config_data + + try: + with open(custom_path, 'r') as file: + custom = yaml.safe_load(file) + except Exception as e: + logger.error(f"Custom detector file {custom_path} failed to parse — ignoring it: {str(e)}") + return config_data + + if not isinstance(custom, dict): + logger.warning(f"Custom detector file {custom_path} is not a YAML mapping — ignoring it") + return config_data + + custom_detectors = custom.get("detectors") or [] + if custom_detectors: + by_name = {d.get("name"): d for d in config_data.get("detectors", []) if isinstance(d, dict)} + for d in custom_detectors: + if isinstance(d, dict) and d.get("name"): + by_name[d["name"]] = d # custom overrides built-in of the same name + config_data["detectors"] = list(by_name.values()) + logger.info(f"Merged {len(custom_detectors)} custom detector(s) from {custom_path}") + + custom_excluded = (custom.get("settings") or {}).get("excluded_detectors") or [] + if custom_excluded: + existing = config_data.setdefault("settings", {}).setdefault("excluded_detectors", []) + for x in custom_excluded: + if x not in existing: + existing.append(x) + + return config_data + + def load_config_from_file(): - """Load configuration from the external YAML file in the configs directory.""" + """Load detector/scan configuration from the YAML files in the configs directory. + + Loads the shipped `trufflehog_detectors.yaml`, then merges an optional + customer-supplied `custom_trufflehog_detectors.yaml` if present. Falls back + to a minimal built-in config if the shipped file cannot be loaded. + """ + config_folder = getConfigPath() + config_path = f"{config_folder}/trufflehog_detectors.yaml" + + logger.info(f"Loading configuration from: {config_path}") try: - # Get the current notebook's directory and construct path to config file - notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get() - config_folder = getConfigPath() - config_path = f"{config_folder}/trufflehog_detectors.yaml" - - logger.info(f"Loading configuration from: {config_path}") - - # Read the configuration file with open(config_path, 'r') as file: config_data = yaml.safe_load(file) - - return config_data + if not isinstance(config_data, dict) or "detectors" not in config_data: + raise ValueError( + "config must be a YAML mapping with a top-level 'detectors:' key" + ) except Exception as e: - logger.warning(f"Could not load external config file: {str(e)}") - logger.info("Using default configuration") - # Fallback to default configuration - return { - "detectors": [ - {"name": "DkeaToken", "keywords": ["dkea"], "regex": {"id": "(?i)\\b(dkea[a-h0-9]{32})"}}, - {"name": "DapiToken", "keywords": ["dapi"], "regex": {"id": "(?i)\\b(dapi[a-h0-9]{32})"}}, - {"name": "DoseToken", "keywords": ["dose"], "regex": {"id": "(?i)\\b(dose[a-h0-9]{32})"}} - ], - "settings": { - "excluded_detectors": ["DatabricksToken"], - "rate_limiting": {"api_sleep_seconds": 10}, - "search_settings": {"page_size": 50, "days_back": 1} - } - } + logger.error( + f"Could not load {config_path}: {str(e)}. " + f"Using built-in detectors only; fix the file to load the full detector set." + ) + return _default_config() + + return _merge_custom_detectors(config_data, config_folder) def create_trufflehog_config(config_data: Dict[str, Any]) -> str: """Create TruffleHog configuration file from loaded config data."""