Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -196,3 +196,6 @@ docs/*Implementation_Plan.html

# SAT test framework generated reports
tests/automated/reports/

# Customer-supplied custom TruffleHog detectors (merged at runtime, not shipped)
configs/custom_trufflehog_detectors.yaml
30 changes: 30 additions & 0 deletions configs/custom_trufflehog_detectors.yaml.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Custom TruffleHog Detectors (customer-supplied) — TEMPLATE
#
# Copy this file to `custom_trufflehog_detectors.yaml` in this same configs/
# directory and add your own detectors. SAT loads it automatically and merges
# its `detectors:` into the shipped trufflehog_detectors.yaml at scan time:
# - detectors here are ADDED to the shipped set
# - a detector whose `name` matches a shipped one OVERRIDES it
# - any settings.excluded_detectors here are appended to the shipped list
#
# The real custom_trufflehog_detectors.yaml is git-ignored, so it survives
# SAT upgrades and is never overwritten.
#
# Formatting rules:
# - everything lives under the top-level `detectors:` key
# - each item's keys (name, description, keywords, regex) align at the
# same indentation
# - single-quote regex values so backslashes (e.g. \b) stay literal

# Every custom detector is one list item under the top-level `detectors:` key.
detectors:
  - name: AcmeApiKey
    description: "Example: Acme Corp API key"
    # Keywords are listed one per line under `keywords:`.
    keywords:
      - acme
    regex:
      # Single-quoted so the backslashes in \b stay literal.
      id: '(?i)\b(acme_[a-z0-9]{32})\b'

# Optionally exclude additional built-in TruffleHog detectors:
# settings:
# excluded_detectors:
# - SomeNoisyDetector
35 changes: 21 additions & 14 deletions configs/trufflehog_detectors.yaml
Original file line number Diff line number Diff line change
@@ -1,52 +1,59 @@
# TruffleHog Custom Detectors Configuration
# This file contains custom detector definitions for TruffleHog secret scanning
# Used by the SAT (Security Analysis Tool) TruffleHog integration
#
# Detectors live under the top-level `detectors:` key. Each item's keys
# (name, description, keywords, regex) must align at the same indentation.
# To add your OWN detectors without editing this shipped file, create
# `custom_trufflehog_detectors.yaml` in this same configs/ directory — SAT
# merges its `detectors:` into the list below (custom wins on name clash).

- name: DapiToken
detectors:
- name: DapiToken
description: "Databricks DAPI token detector"
keywords:
- dapi
regex:
id: (?i)\b(dapi[a-h0-9]{32})
- name: DkeaToken
id: '(?i)\b(dapi[a-h0-9]{32})'

- name: DkeaToken
description: "Databricks DKEA token detector"
keywords:
- dkea
regex:
id: (?i)\b(dkea[a-h0-9]{32})
id: '(?i)\b(dkea[a-h0-9]{32})'

- name: DsapiToken
description: "Databricks Databricks Scoped API Token detector"
- name: DsapiToken
description: "Databricks Scoped API Token detector"
keywords:
- dsapi
regex:
id: (?i)\b(dsapi[a-h0-9]{32})
id: '(?i)\b(dsapi[a-h0-9]{32})'

- name: DoseToken
- name: DoseToken
description: "Databricks DOSE token detector"
keywords:
- dose
regex:
id: (?i)\b(dose[a-h0-9]{32})
id: '(?i)\b(dose[a-h0-9]{32})'

# Configuration settings for TruffleHog scanning
settings:
excluded_detectors:
- DatabricksToken # Exclude built-in Databricks token detector to avoid false positives

scan_options:
no_update: true
json_output: true

rate_limiting:
api_sleep_seconds: 10 # Sleep between API calls to prevent rate limiting

file_paths:
temp_config: "/tmp/trufflehog_config.yaml"
temp_notebooks: "/tmp/notebooks"
results_log: "/tmp/trufflehog_scan_results.json"

search_settings:
page_size: 50
days_back: 1 # Number of days back to search for modified notebooks
Expand Down
107 changes: 81 additions & 26 deletions notebooks/Includes/scan_secrets/notebook_secret_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,37 +183,92 @@
# Configure root logging at INFO level with a "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

def _default_config() -> Dict[str, Any]:
"""Built-in fallback used only when the shipped config can't be loaded."""
return {
"detectors": [
{"name": "DkeaToken", "keywords": ["dkea"], "regex": {"id": "(?i)\\b(dkea[a-h0-9]{32})"}},
{"name": "DapiToken", "keywords": ["dapi"], "regex": {"id": "(?i)\\b(dapi[a-h0-9]{32})"}},
{"name": "DoseToken", "keywords": ["dose"], "regex": {"id": "(?i)\\b(dose[a-h0-9]{32})"}}
],
"settings": {
"excluded_detectors": ["DatabricksToken"],
"rate_limiting": {"api_sleep_seconds": 10},
"search_settings": {"page_size": 50, "days_back": 1}
}
}


def _merge_custom_detectors(config_data: Dict[str, Any], config_folder: str) -> Dict[str, Any]:
"""Merge an optional customer-supplied detector file into `config_data`.

Looks for `custom_trufflehog_detectors.yaml` in the same configs/ folder.
Its `detectors:` are merged into the shipped list (custom wins on a name
clash) and any `settings.excluded_detectors` are appended. The file is
optional; if it's missing or invalid we keep the shipped config and carry
on (logging the reason) rather than failing the scan. Lets customers add
their own detectors without editing the file that ships with SAT.
"""
custom_path = f"{config_folder}/custom_trufflehog_detectors.yaml"
if not os.path.exists(custom_path):
return config_data

try:
with open(custom_path, 'r') as file:
custom = yaml.safe_load(file)
except Exception as e:
logger.error(f"Custom detector file {custom_path} failed to parse — ignoring it: {str(e)}")
return config_data

if not isinstance(custom, dict):
logger.warning(f"Custom detector file {custom_path} is not a YAML mapping — ignoring it")
return config_data

custom_detectors = custom.get("detectors") or []
if custom_detectors:
by_name = {d.get("name"): d for d in config_data.get("detectors", []) if isinstance(d, dict)}
for d in custom_detectors:
if isinstance(d, dict) and d.get("name"):
by_name[d["name"]] = d # custom overrides built-in of the same name
config_data["detectors"] = list(by_name.values())
logger.info(f"Merged {len(custom_detectors)} custom detector(s) from {custom_path}")

custom_excluded = (custom.get("settings") or {}).get("excluded_detectors") or []
if custom_excluded:
existing = config_data.setdefault("settings", {}).setdefault("excluded_detectors", [])
for x in custom_excluded:
if x not in existing:
existing.append(x)

return config_data


def load_config_from_file() -> Dict[str, Any]:
    """Load detector/scan configuration from the YAML files in the configs directory.

    Loads the shipped `trufflehog_detectors.yaml`, then merges an optional
    customer-supplied `custom_trufflehog_detectors.yaml` if present. Falls back
    to a minimal built-in config if the shipped file cannot be loaded.

    Returns:
        The configuration mapping: either the shipped config with custom
        detectors merged in, or the built-in fallback from `_default_config()`.
    """
    config_folder = getConfigPath()
    config_path = f"{config_folder}/trufflehog_detectors.yaml"

    logger.info(f"Loading configuration from: {config_path}")
    try:
        with open(config_path, 'r', encoding='utf-8') as file:
            config_data = yaml.safe_load(file)
        # A file that parses but has the wrong shape is treated like a load
        # failure, so the fallback below applies and the problem is logged.
        if not isinstance(config_data, dict) or "detectors" not in config_data:
            raise ValueError(
                "config must be a YAML mapping with a top-level 'detectors:' key"
            )
    except Exception as e:
        # Deliberately broad: any read/parse/shape problem falls back to the
        # built-in detectors instead of aborting the scan.
        logger.error(
            f"Could not load {config_path}: {str(e)}. "
            f"Using built-in detectors only; fix the file to load the full detector set."
        )
        return _default_config()

    # Merge only after the shipped file has been validated.
    return _merge_custom_detectors(config_data, config_folder)

def create_trufflehog_config(config_data: Dict[str, Any]) -> str:
"""Create TruffleHog configuration file from loaded config data."""
Expand Down