Skip to content

Commit 0f87c5c

Browse files
committed
Harden secrets model dump dedupe
Signed-off-by: lucarlig <luca.carlig@ibm.com>
1 parent 76555e7 commit 0f87c5c

2 files changed

Lines changed: 166 additions & 10 deletions

File tree

plugins/rust/python-package/secrets_detection/src/object_model.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// SPDX-License-Identifier: Apache-2.0
33

44
use pyo3::prelude::*;
5-
use pyo3::types::{PyAny, PyDict, PyFrozenSet, PyList, PySet, PyTuple};
5+
use pyo3::types::{PyAny, PyDict, PyFrozenSet, PyList, PySet, PyString, PyTuple};
66

77
pub struct InspectedObjectState<'py> {
88
pub rebuild_state: Option<Bound<'py, PyDict>>,
@@ -19,7 +19,9 @@ pub fn inspect_object_state<'py>(
1919
if let Ok(model_dump) = container.call_method0("model_dump") {
2020
if let Ok(model_state) = model_dump.cast::<PyDict>() {
2121
serialized_state = Some(model_state.clone().into_any());
22-
mappings.push(model_state)?;
22+
if dict_has_only_exact_string_keys(model_state) {
23+
mappings.push(model_state)?;
24+
}
2325
} else if !model_dump.is(container) {
2426
serialized_state = Some(model_dump);
2527
}
@@ -244,6 +246,11 @@ fn merge_state_into(target: &Bound<'_, PyDict>, source: &Bound<'_, PyDict>) -> P
244246
Ok(())
245247
}
246248

249+
fn dict_has_only_exact_string_keys(dict: &Bound<'_, PyDict>) -> bool {
250+
dict.iter()
251+
.all(|(key, _)| key.is_exact_instance_of::<PyString>())
252+
}
253+
247254
struct MappingStateAccumulator<'py> {
248255
py: Python<'py>,
249256
state: Option<Bound<'py, PyDict>>,

plugins/rust/python-package/secrets_detection/src/scanner/python_scan.rs

Lines changed: 157 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ fn scan_container_inner<'py>(
3333
) -> PyResult<(usize, Bound<'py, PyAny>, Bound<'py, PyList>)> {
3434
let findings = PyList::empty(py);
3535

36-
if let Ok(text) = container.extract::<String>() {
36+
if container.is_exact_instance_of::<PyString>() {
37+
let text = container.extract::<String>()?;
3738
let (matches, redacted) = detect_and_redact(&text, config);
3839
for finding in &matches {
3940
let finding_dict = PyDict::new(py);
@@ -137,7 +138,7 @@ fn scan_container_inner<'py>(
137138
for finding in child_findings.iter() {
138139
findings.append(finding)?;
139140
}
140-
if count > 0 || !redacted_state.eq(&state_any)? {
141+
if count > 0 || !same_safe_value(&redacted_state, &state_any, &mut HashSet::new())? {
141142
apply_object_state(py, &target, &redacted_state)?;
142143
rebuilt = Some(target.into_any());
143144
}
@@ -245,15 +246,19 @@ fn serialized_dict_duplicates_rebuild_state(
245246
let Some(rebuild_value) = rebuild_dict.get_item(&key)? else {
246247
return Ok(false);
247248
};
248-
if !same_safe_value(&serialized_value, &rebuild_value)? {
249+
if !same_safe_value(&serialized_value, &rebuild_value, &mut HashSet::new())? {
249250
return Ok(false);
250251
}
251252
}
252253

253254
Ok(true)
254255
}
255256

256-
fn same_safe_value(left: &Bound<'_, PyAny>, right: &Bound<'_, PyAny>) -> PyResult<bool> {
257+
fn same_safe_value(
258+
left: &Bound<'_, PyAny>,
259+
right: &Bound<'_, PyAny>,
260+
seen: &mut HashSet<(usize, usize)>,
261+
) -> PyResult<bool> {
257262
if left.is(right) {
258263
return Ok(true);
259264
}
@@ -262,31 +267,44 @@ fn same_safe_value(left: &Bound<'_, PyAny>, right: &Bound<'_, PyAny>) -> PyResul
262267
return Ok(left.extract::<String>()? == right.extract::<String>()?);
263268
}
264269

270+
if is_exact_safe_scalar_pair(left, right) {
271+
return left.eq(right);
272+
}
273+
265274
if let (Ok(left_list), Ok(right_list)) = (left.cast::<PyList>(), right.cast::<PyList>()) {
275+
if !seen.insert((left.as_ptr() as usize, right.as_ptr() as usize)) {
276+
return Ok(false);
277+
}
266278
if left_list.len() != right_list.len() {
267279
return Ok(false);
268280
}
269281
for (left_item, right_item) in left_list.iter().zip(right_list.iter()) {
270-
if !same_safe_value(&left_item, &right_item)? {
282+
if !same_safe_value(&left_item, &right_item, seen)? {
271283
return Ok(false);
272284
}
273285
}
274286
return Ok(true);
275287
}
276288

277289
if let (Ok(left_tuple), Ok(right_tuple)) = (left.cast::<PyTuple>(), right.cast::<PyTuple>()) {
290+
if !seen.insert((left.as_ptr() as usize, right.as_ptr() as usize)) {
291+
return Ok(false);
292+
}
278293
if left_tuple.len() != right_tuple.len() {
279294
return Ok(false);
280295
}
281296
for (left_item, right_item) in left_tuple.iter().zip(right_tuple.iter()) {
282-
if !same_safe_value(&left_item, &right_item)? {
297+
if !same_safe_value(&left_item, &right_item, seen)? {
283298
return Ok(false);
284299
}
285300
}
286301
return Ok(true);
287302
}
288303

289304
if let (Ok(left_dict), Ok(right_dict)) = (left.cast::<PyDict>(), right.cast::<PyDict>()) {
305+
if !seen.insert((left.as_ptr() as usize, right.as_ptr() as usize)) {
306+
return Ok(false);
307+
}
290308
if left_dict.len() != right_dict.len()
291309
|| !dict_has_only_exact_string_keys(left_dict)
292310
|| !dict_has_only_exact_string_keys(right_dict)
@@ -297,7 +315,7 @@ fn same_safe_value(left: &Bound<'_, PyAny>, right: &Bound<'_, PyAny>) -> PyResul
297315
let Some(right_value) = right_dict.get_item(&key)? else {
298316
return Ok(false);
299317
};
300-
if !same_safe_value(&left_value, &right_value)? {
318+
if !same_safe_value(&left_value, &right_value, seen)? {
301319
return Ok(false);
302320
}
303321
}
@@ -312,6 +330,36 @@ fn dict_has_only_exact_string_keys(dict: &Bound<'_, PyDict>) -> bool {
312330
.all(|(key, _)| key.is_exact_instance_of::<PyString>())
313331
}
314332

333+
fn is_exact_safe_scalar_pair(left: &Bound<'_, PyAny>, right: &Bound<'_, PyAny>) -> bool {
334+
let left_module = left
335+
.get_type()
336+
.module()
337+
.and_then(|module| module.extract::<String>());
338+
let left_name = left
339+
.get_type()
340+
.name()
341+
.and_then(|name| name.extract::<String>());
342+
let right_module = right
343+
.get_type()
344+
.module()
345+
.and_then(|module| module.extract::<String>());
346+
let right_name = right
347+
.get_type()
348+
.name()
349+
.and_then(|name| name.extract::<String>());
350+
match (
351+
left_module.as_deref(),
352+
left_name.as_deref(),
353+
right_module.as_deref(),
354+
right_name.as_deref(),
355+
) {
356+
(Ok("builtins"), Ok(left_name), Ok("builtins"), Ok(right_name)) => {
357+
left_name == right_name && matches!(left_name, "int" | "float" | "bool" | "bytes")
358+
}
359+
_ => false,
360+
}
361+
}
362+
315363
fn serialized_result<'py>(
316364
py: Python<'py>,
317365
container: &Bound<'py, PyAny>,
@@ -525,6 +573,70 @@ class Model:
525573
.unwrap();
526574
}
527575

576+
#[test]
577+
fn scan_container_does_not_double_count_with_copied_model_dump_scalar() {
578+
Python::initialize();
579+
Python::attach(|py| -> PyResult<()> {
580+
let code = CString::new(
581+
r#"
582+
class Model:
583+
def __init__(self):
584+
self.text = "AWS_ACCESS_KEY_ID=AKIAFAKE12345EXAMPLE"
585+
self.num = int("1000000000000000000000")
586+
587+
def model_dump(self):
588+
return {"text": self.text, "num": int(str(self.num))}
589+
"#,
590+
)
591+
.unwrap();
592+
let module =
593+
PyModule::from_code(py, code.as_c_str(), c"test_module.py", c"test_module")?;
594+
let instance = module.getattr("Model")?.call0()?;
595+
let config = SecretsDetectionConfig::default();
596+
597+
let (count, _, findings) = scan_container(py, &instance, &config)?;
598+
599+
assert_eq!(count, 1);
600+
assert_eq!(findings.len(), 1);
601+
602+
Ok(())
603+
})
604+
.unwrap();
605+
}
606+
607+
#[test]
608+
fn scan_container_scans_cyclic_model_dump_without_duplicate_gate_recursion() {
609+
Python::initialize();
610+
Python::attach(|py| -> PyResult<()> {
611+
let code = CString::new(
612+
r#"
613+
class Model:
614+
def __init__(self):
615+
self.items = []
616+
self.items.append(self.items)
617+
618+
def model_dump(self):
619+
items = []
620+
items.append(items)
621+
return {"items": items}
622+
"#,
623+
)
624+
.unwrap();
625+
let module =
626+
PyModule::from_code(py, code.as_c_str(), c"test_module.py", c"test_module")?;
627+
let instance = module.getattr("Model")?.call0()?;
628+
let config = SecretsDetectionConfig::default();
629+
630+
let (count, _, findings) = scan_container(py, &instance, &config)?;
631+
632+
assert_eq!(count, 0);
633+
assert_eq!(findings.len(), 0);
634+
635+
Ok(())
636+
})
637+
.unwrap();
638+
}
639+
528640
#[test]
529641
fn duplicate_gate_ignores_non_string_model_dump_keys_without_lookup() {
530642
Python::initialize();
@@ -533,7 +645,7 @@ class Model:
533645
r#"
534646
class BadKey:
535647
def __hash__(self):
536-
return hash("text")
648+
return hash("bad-key")
537649
538650
def __eq__(self, other):
539651
raise RuntimeError("duplicate gate should not compare custom keys")
@@ -557,4 +669,41 @@ class BadKey:
557669
})
558670
.unwrap();
559671
}
672+
673+
#[test]
674+
fn scan_container_ignores_duplicate_gate_for_non_string_model_dump_keys() {
675+
Python::initialize();
676+
Python::attach(|py| -> PyResult<()> {
677+
let code = CString::new(
678+
r#"
679+
class BadKey:
680+
def __hash__(self):
681+
return hash("bad-key")
682+
683+
def __eq__(self, other):
684+
raise RuntimeError("duplicate gate should not compare custom keys")
685+
686+
class Model:
687+
def __init__(self):
688+
self.text = "clean"
689+
690+
def model_dump(self):
691+
return {BadKey(): "clean"}
692+
"#,
693+
)
694+
.unwrap();
695+
let module =
696+
PyModule::from_code(py, code.as_c_str(), c"test_module.py", c"test_module")?;
697+
let instance = module.getattr("Model")?.call0()?;
698+
let config = SecretsDetectionConfig::default();
699+
700+
let (count, _, findings) = scan_container(py, &instance, &config)?;
701+
702+
assert_eq!(count, 0);
703+
assert_eq!(findings.len(), 0);
704+
705+
Ok(())
706+
})
707+
.unwrap();
708+
}
560709
}

0 commit comments

Comments
 (0)