Skip to content

Commit 60b3aac

Browse files
Add configuration to disable sample collection for PII tables
- Add `disable_samples_on_pii_tables` and `pii_table_tags` config vars
- Create `is_pii_table` helper macro for PII detection
- Modify test materialization to skip sampling for PII tables
- Add integration tests for PII sampling behavior
- Extend `dbt_project.test()` to support a `model_config` parameter

Fixes ELE-4833

Co-Authored-By: Yosef Arbiv <yosef.arbiv@gmail.com>
1 parent f6d557d commit 60b3aac

5 files changed

Lines changed: 138 additions & 2 deletions

File tree

integration_tests/tests/dbt_project.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def test(
109109
materialization: str = "table", # Only relevant if as_model=True
110110
test_vars: Optional[dict] = None,
111111
elementary_enabled: bool = True,
112+
model_config: Optional[Dict[str, Any]] = None,
112113
*,
113114
multiple_results: Literal[False] = False,
114115
) -> Dict[str, Any]:
@@ -128,6 +129,7 @@ def test(
128129
materialization: str = "table", # Only relevant if as_model=True
129130
test_vars: Optional[dict] = None,
130131
elementary_enabled: bool = True,
132+
model_config: Optional[Dict[str, Any]] = None,
131133
*,
132134
multiple_results: Literal[True],
133135
) -> List[Dict[str, Any]]:
@@ -146,6 +148,7 @@ def test(
146148
materialization: str = "table", # Only relevant if as_model=True
147149
test_vars: Optional[dict] = None,
148150
elementary_enabled: bool = True,
151+
model_config: Optional[Dict[str, Any]] = None,
149152
*,
150153
multiple_results: bool = False,
151154
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
@@ -161,6 +164,9 @@ def test(
161164
test_args = test_args or {}
162165
table_yaml: Dict[str, Any] = {"name": test_id}
163166

167+
if model_config:
168+
table_yaml.update(model_config)
169+
164170
if columns:
165171
table_yaml["columns"] = columns
166172

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
import json
2+
3+
import pytest
4+
from dbt_project import DbtProject
5+
# Name of the single column in the seeded test data.
COLUMN_NAME = "value"

# Fetches the stored sample rows of the most recent test result recorded for
# a given table. The quadrupled braces survive ``str.format`` as the doubled
# braces of a dbt ``ref`` expression; only ``{test_id}`` is substituted.
SAMPLES_QUERY = """
    with latest_elementary_test_result as (
        select id
        from {{{{ ref("elementary_test_results") }}}}
        where lower(table_name) = lower('{test_id}')
        order by created_at desc
        limit 1
    )

    select result_row
    from {{{{ ref("test_result_rows") }}}}
    where elementary_test_results_id in (select * from latest_elementary_test_result)
"""

# Value passed as the ``test_sample_row_count`` var, and the number of sample
# rows the tests expect to find when sampling is active.
TEST_SAMPLE_ROW_COUNT = 7
24+
25+
@pytest.mark.skip_targets(["clickhouse"])
def test_sampling_pii_disabled(test_id: str, dbt_project: DbtProject):
    """Test that PII-tagged tables don't upload samples even when tests fail.

    The model is tagged ``pii``, which is one of the configured
    ``pii_table_tags``, and ``disable_samples_on_pii_tables`` is on. The
    ``not_null`` test must still fail, but no result rows may be stored.
    """
    null_count = 50
    # Every row is null, so not_null fails with more rows than the sample limit.
    data = [{COLUMN_NAME: None} for _ in range(null_count)]

    test_result = dbt_project.test(
        test_id,
        "not_null",
        dict(column_name=COLUMN_NAME),
        data=data,
        as_model=True,
        model_config={"config": {"tags": ["pii"]}},
        test_vars={
            "enable_elementary_test_materialization": True,
            "test_sample_row_count": TEST_SAMPLE_ROW_COUNT,
            "disable_samples_on_pii_tables": True,
            "pii_table_tags": ["pii", "sensitive"],
        },
    )
    assert test_result["status"] == "fail"

    samples = [
        json.loads(row["result_row"])
        for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
    ]
    assert len(samples) == 0
53+
54+
@pytest.mark.skip_targets(["clickhouse"])
def test_sampling_non_pii_enabled(test_id: str, dbt_project: DbtProject):
    """Test that non-PII tables still collect samples normally.

    ``disable_samples_on_pii_tables`` is on, but the model's only tag
    (``normal``) is not among the configured ``pii_table_tags``, so the
    failing ``not_null`` test must store ``TEST_SAMPLE_ROW_COUNT`` rows.
    """
    null_count = 50
    # Every row is null, so not_null fails with more rows than the sample limit.
    data = [{COLUMN_NAME: None} for _ in range(null_count)]

    test_result = dbt_project.test(
        test_id,
        "not_null",
        dict(column_name=COLUMN_NAME),
        data=data,
        as_model=True,
        model_config={"config": {"tags": ["normal"]}},
        test_vars={
            "enable_elementary_test_materialization": True,
            "test_sample_row_count": TEST_SAMPLE_ROW_COUNT,
            "disable_samples_on_pii_tables": True,
            "pii_table_tags": ["pii", "sensitive"],
        },
    )
    assert test_result["status"] == "fail"

    samples = [
        json.loads(row["result_row"])
        for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
    ]
    assert len(samples) == TEST_SAMPLE_ROW_COUNT
82+
83+
@pytest.mark.skip_targets(["clickhouse"])
def test_sampling_pii_feature_disabled(test_id: str, dbt_project: DbtProject):
    """Test that when PII feature is disabled, PII tables still collect samples.

    The model is tagged ``pii``, but ``disable_samples_on_pii_tables`` is
    off, so the failing ``not_null`` test must store
    ``TEST_SAMPLE_ROW_COUNT`` rows as usual.
    """
    null_count = 50
    # Every row is null, so not_null fails with more rows than the sample limit.
    data = [{COLUMN_NAME: None} for _ in range(null_count)]

    test_result = dbt_project.test(
        test_id,
        "not_null",
        dict(column_name=COLUMN_NAME),
        data=data,
        as_model=True,
        model_config={"config": {"tags": ["pii"]}},
        test_vars={
            "enable_elementary_test_materialization": True,
            "test_sample_row_count": TEST_SAMPLE_ROW_COUNT,
            "disable_samples_on_pii_tables": False,
            "pii_table_tags": ["pii", "sensitive"],
        },
    )
    assert test_result["status"] == "fail"

    samples = [
        json.loads(row["result_row"])
        for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id))
    ]
    assert len(samples) == TEST_SAMPLE_ROW_COUNT

macros/edr/materializations/test/test.sql

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,11 @@
5050

5151
{% macro handle_dbt_test(flattened_test, materialization_macro) %}
5252
{% set result = materialization_macro() %}
53-
{% set result_rows = elementary.query_test_result_rows(sample_limit=elementary.get_config_var('test_sample_row_count'),
53+
{% set sample_limit = elementary.get_config_var('test_sample_row_count') %}
54+
{% if elementary.is_pii_table(flattened_test) %}
55+
{% set sample_limit = 0 %}
56+
{% endif %}
57+
{% set result_rows = elementary.query_test_result_rows(sample_limit=sample_limit,
5458
ignore_passed_tests=true) %}
5559
{% set elementary_test_results_row = elementary.get_dbt_test_result_row(flattened_test, result_rows) %}
5660
{% do elementary.cache_elementary_test_results_rows([elementary_test_results_row]) %}

macros/edr/system/system_utils/get_config_var.sql

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,9 @@
6464
},
6565
'include_other_warehouse_specific_columns': false,
6666
'fail_on_zero': false,
67-
'anomaly_exclude_metrics': none
67+
'anomaly_exclude_metrics': none,
68+
'disable_samples_on_pii_tables': false,
69+
'pii_table_tags': ['pii']
6870
} %}
6971
{{- return(default_config) -}}
7072
{%- endmacro -%}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
{#
  Decide whether sample collection should be suppressed for the model a test
  runs against.

  Returns false whenever the 'disable_samples_on_pii_tables' config var is
  falsy. Otherwise returns true when the 'model_tags' entry of flattened_test
  shares at least one item with the 'pii_table_tags' config var.

  NOTE(review): the comparison is delegated to elementary.lists_intersection;
  whether tag matching is case-sensitive depends on that helper - confirm.
#}
{% macro is_pii_table(flattened_test) %}
  {% set disable_samples_on_pii_tables = elementary.get_config_var('disable_samples_on_pii_tables') %}
  {% if not disable_samples_on_pii_tables %}
    {# Feature switched off: no table is treated as PII. #}
    {% do return(false) %}
  {% endif %}

  {% set pii_table_tags = elementary.get_config_var('pii_table_tags') %}
  {# Falls back to an empty list when the test carries no 'model_tags' entry. #}
  {% set model_tags = elementary.insensitive_get_dict_value(flattened_test, 'model_tags', []) %}

  {% set intersection = elementary.lists_intersection(model_tags, pii_table_tags) %}
  {% set is_pii = intersection | length > 0 %}

  {% do return(is_pii) %}
{% endmacro %}

0 commit comments

Comments
 (0)