From 5d4dd69c45e53256ac02340a42717c86aa1e6b5b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 30 Jul 2025 07:24:57 +0000 Subject: [PATCH 01/15] Implement column-level PII protection for sample collection - Add disable_samples_on_pii_columns and pii_column_tags config variables - Create get_pii_columns_from_parent_model helper macro for column PII detection - Modify query_test_result_rows to dynamically exclude PII columns from SELECT - Add comprehensive integration tests for column-level PII protection - Maintain backward compatibility with existing table-level PII detection - Follow existing patterns for configuration, tag handling, and testing Addresses Linear issue ELE-4850 Co-Authored-By: Yosef Arbiv --- .../tests/test_column_pii_sampling.py | 132 ++++++++++++++++++ macros/edr/materializations/test/test.sql | 31 +++- .../system/system_utils/get_config_var.sql | 2 + .../edr/system/system_utils/is_pii_column.sql | 42 ++++++ 4 files changed, 204 insertions(+), 3 deletions(-) create mode 100644 integration_tests/tests/test_column_pii_sampling.py create mode 100644 macros/edr/system/system_utils/is_pii_column.sql diff --git a/integration_tests/tests/test_column_pii_sampling.py b/integration_tests/tests/test_column_pii_sampling.py new file mode 100644 index 000000000..9eba27887 --- /dev/null +++ b/integration_tests/tests/test_column_pii_sampling.py @@ -0,0 +1,132 @@ +import json + +import pytest +from dbt_project import DbtProject + +SENSITIVE_COLUMN = "email" +SAFE_COLUMN = "order_count" + +SAMPLES_QUERY = """ + with latest_elementary_test_result as ( + select id + from {{{{ ref("elementary_test_results") }}}} + where lower(table_name) = lower('{test_id}') + order by created_at desc + limit 1 + ) + + select result_row + from {{{{ ref("test_result_rows") }}}} + where elementary_test_results_id in (select * from latest_elementary_test_result) +""" + +TEST_SAMPLE_ROW_COUNT = 5 + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_column_pii_sampling_enabled(test_id: str, dbt_project: DbtProject): + """Test that PII columns are excluded when column-level PII protection is enabled""" + data = [ + {SENSITIVE_COLUMN: f"user{i}@example.com", SAFE_COLUMN: None} for i in range(10) + ] + + test_result = dbt_project.test( + test_id, + "not_null", + test_args=dict(column_name=SAFE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_columns": True, + "pii_column_tags": ["pii"], + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + assert len(samples) == TEST_SAMPLE_ROW_COUNT + for sample in samples: + assert SENSITIVE_COLUMN not in sample + assert SAFE_COLUMN in sample + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_column_pii_sampling_disabled(test_id: str, dbt_project: DbtProject): + """Test that all columns are included when column-level PII protection is disabled""" + data = [ + {SENSITIVE_COLUMN: f"user{i}@example.com", SAFE_COLUMN: None} for i in range(10) + ] + + test_result = dbt_project.test( + test_id, + "not_null", + test_args=dict(column_name=SAFE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_columns": False, + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + assert len(samples) == TEST_SAMPLE_ROW_COUNT + for sample in samples: + assert SENSITIVE_COLUMN in sample + assert SAFE_COLUMN in sample + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_column_pii_sampling_all_columns_pii(test_id: str, dbt_project: DbtProject): + """Test behavior when all columns are tagged as PII""" + data = [ + {SENSITIVE_COLUMN: f"user{i}@example.com", SAFE_COLUMN: i} for i in range(10) + ] + + test_result = dbt_project.test( + test_id, + "not_null", + test_args=dict(column_name=SAFE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN, "config": {"tags": ["pii"]}}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_columns": True, + "pii_column_tags": ["pii"], + }, + ) + assert test_result["status"] == "pass" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + assert len(samples) == TEST_SAMPLE_ROW_COUNT + for sample in samples: + assert "_no_non_pii_columns" in sample + assert sample["_no_non_pii_columns"] == 1 + assert SENSITIVE_COLUMN not in sample + assert SAFE_COLUMN not in sample diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql index a63f89f0a..7c211953e 100644 --- a/macros/edr/materializations/test/test.sql +++ b/macros/edr/materializations/test/test.sql @@ -51,7 +51,8 @@ {% macro handle_dbt_test(flattened_test, materialization_macro) %} {% set result = materialization_macro() %} {% set result_rows = elementary.query_test_result_rows(sample_limit=elementary.get_config_var('test_sample_row_count'), - ignore_passed_tests=true) %} + ignore_passed_tests=true, + flattened_test=flattened_test) %} {% set elementary_test_results_row = elementary.get_dbt_test_result_row(flattened_test, result_rows) %} {% do elementary.cache_elementary_test_results_rows([elementary_test_results_row]) %} {% do return(result) %} @@ -103,7 +104,7 @@ {% do return(new_sql) %} {% endmacro %} -{% macro query_test_result_rows(sample_limit=none, ignore_passed_tests=false) %} +{% macro query_test_result_rows(sample_limit=none, ignore_passed_tests=false, flattened_test=none) %} {% if sample_limit == 0 %} {# performance: no need to run a sql query that we know returns an empty list #} {% do return([]) %} {% endif %} @@ -111,11 +112,35 @@ {% do elementary.debug_log("Skipping sample query because the test passed.") %} {% do return([]) %} {% endif %} + + {% set pii_columns = [] %} + {% if flattened_test %} + {% set pii_columns = elementary.get_pii_columns_from_parent_model(flattened_test) %} + {% endif %} + + {% set select_clause = "*" %} + {% if pii_columns %} + {% set query_to_get_columns %} + with test_results as ( + {{ sql }} + ) + select * from test_results limit 0 + {% endset %} + {% set columns_result = elementary.run_query(query_to_get_columns) %} + {% set all_columns = columns_result.column_names %} + {% set safe_columns = all_columns | reject("in", pii_columns) | list %} + {% if safe_columns %} + {% set select_clause = safe_columns | join(", ") %} + {% else %} + {% set select_clause = "1 as _no_non_pii_columns" %} + {% endif %} + {% endif %} + {% set query %} with test_results as ( {{ sql }} ) - select * from test_results {% if sample_limit is not none %} limit {{ sample_limit }} {% endif %} + select {{ select_clause }} from test_results {% if sample_limit is not none %} limit {{ sample_limit }} {% endif %} {% endset %} {% do return(elementary.agate_to_dicts(elementary.run_query(query))) %} {% endmacro %} diff --git a/macros/edr/system/system_utils/get_config_var.sql b/macros/edr/system/system_utils/get_config_var.sql index 431061811..4a8df400a 100644 --- a/macros/edr/system/system_utils/get_config_var.sql +++ b/macros/edr/system/system_utils/get_config_var.sql @@ -55,6 +55,8 @@ 'disable_skipped_test_alerts': true, 'dbt_artifacts_chunk_size': 5000, 'test_sample_row_count': 5, + 'disable_samples_on_pii_columns': false, + 'pii_column_tags': ['pii', 'personal', 'sensitive'], 'edr_cli_run': false, 'max_int': 2147483647, 'custom_run_started_at': none, diff --git a/macros/edr/system/system_utils/is_pii_column.sql b/macros/edr/system/system_utils/is_pii_column.sql new file mode 100644 index 000000000..a5714eaa1 --- /dev/null +++ b/macros/edr/system/system_utils/is_pii_column.sql @@ -0,0 +1,42 @@ +{% macro get_pii_columns_from_parent_model(flattened_test) %} + {% set pii_columns = [] %} + + {% if not elementary.get_config_var('disable_samples_on_pii_columns') %} + {% do return(pii_columns) %} + {% endif %} + + {% set parent_model_unique_id = elementary.insensitive_get_dict_value(flattened_test, 'parent_model_unique_id') %} + {% set parent_model = elementary.get_node(parent_model_unique_id) %} + + {% if not parent_model %} + {% do return(pii_columns) %} + {% endif %} + + {% set column_nodes = parent_model.get("columns") %} + {% if not column_nodes %} + {% do return(pii_columns) %} + {% endif %} + + {% set pii_column_tags = elementary.get_config_var('pii_column_tags') %} + {% if pii_column_tags is string %} + {% set pii_column_tags = [pii_column_tags] %} + {% endif %} + + {% for column_node in column_nodes.values() %} + {% set config_dict = column_node.get('config', {}) %} + {% set config_tags = config_dict.get('tags', []) %} + {% set global_tags = column_node.get('tags', []) %} + {% set meta_dict = column_node.get('meta', {}) %} + {% set meta_tags = meta_dict.get('tags', []) %} + {% set all_column_tags = config_tags + global_tags + meta_tags %} + + {% for pii_tag in pii_column_tags %} + {% if pii_tag in all_column_tags %} + {% do pii_columns.append(column_node.get('name')) %} + {% break %} + {% endif %} + {% endfor %} + {% endfor %} + + {% do return(pii_columns) %} +{% endmacro %} From 3ba7a44a7fcb5eff48ab3173febe11b671c6fae9 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 30 Jul 2025 12:25:38 +0000 Subject: [PATCH 02/15] Add disable_samples column configuration flag - Add is_sampling_disabled_for_column macro to check column config - Modify query_test_result_rows to skip sampling when disable_samples=true - Add integration tests for disable_samples functionality - disable_samples flag overrides PII detection and works at column level - Preserves existing PII column filtering functionality Co-Authored-By: Yosef Arbiv --- .../tests/test_disable_samples_config.py | 122 ++++++++++++++++++ macros/edr/materializations/test/test.sql | 23 ++++ 2 files changed, 145 insertions(+) create mode 100644 integration_tests/tests/test_disable_samples_config.py diff --git a/integration_tests/tests/test_disable_samples_config.py b/integration_tests/tests/test_disable_samples_config.py new file mode 100644 index 000000000..a34404388 --- /dev/null +++ b/integration_tests/tests/test_disable_samples_config.py @@ -0,0 +1,122 @@ +import json + +import pytest +from dbt_project import DbtProject + +COLUMN_NAME = "sensitive_data" + +SAMPLES_QUERY = """ + with latest_elementary_test_result as ( + select id + from {{ ref("elementary_test_results") }} + where lower(table_name) = lower('{test_id}') + order by created_at desc + limit 1 + ) + + select result_row + from {{ ref("test_result_rows") }} + where elementary_test_results_id in (select * from latest_elementary_test_result) +""" + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_disable_samples_config_prevents_sampling( + test_id: str, dbt_project: DbtProject +): + null_count = 20 + data = [{COLUMN_NAME: None} for _ in range(null_count)] + + columns = [ + { + "name": COLUMN_NAME, + "config": {"disable_samples": True}, + "tests": [{"not_null": {}}], + } + ] + + test_result = dbt_project.test( + test_id, + "not_null", + columns=columns, + data=data, + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": 5, + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + assert len(samples) == 0 + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_disable_samples_false_allows_sampling(test_id: str, dbt_project: DbtProject): + null_count = 20 + data = [{COLUMN_NAME: None} for _ in range(null_count)] + + columns = [ + { + "name": COLUMN_NAME, + "config": {"disable_samples": False}, + "tests": [{"not_null": {}}], + } + ] + + test_result = dbt_project.test( + test_id, + "not_null", + columns=columns, + data=data, + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": 5, + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + assert len(samples) == 5 + assert all([row == {COLUMN_NAME: None} for row in samples]) + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_disable_samples_config_overrides_pii_tags( + test_id: str, dbt_project: DbtProject +): + null_count = 20 + data = [{COLUMN_NAME: None} for _ in range(null_count)] + + columns = [ + { + "name": COLUMN_NAME, + "config": {"disable_samples": True, "tags": ["pii"]}, + "tests": [{"not_null": {}}], + } + ] + + test_result = dbt_project.test( + test_id, + "not_null", + columns=columns, + data=data, + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": 5, + "disable_samples_on_pii_columns": True, + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + assert len(samples) == 0 diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql index 7c211953e..032503740 100644 --- a/macros/edr/materializations/test/test.sql +++ b/macros/edr/materializations/test/test.sql @@ -113,6 +113,11 @@ {% do return([]) %} {% endif %} + {% if flattened_test and elementary.is_sampling_disabled_for_column(flattened_test) %} + {% do elementary.debug_log("Skipping sample query because disable_samples is true for this column.") %} + {% do return([]) %} + {% endif %} + {% set pii_columns = [] %} {% if flattened_test %} {% set pii_columns = elementary.get_pii_columns_from_parent_model(flattened_test) %} @@ -145,6 +150,24 @@ {% do return(elementary.agate_to_dicts(elementary.run_query(query))) %} {% endmacro %} +{% macro is_sampling_disabled_for_column(flattened_test) %} + {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} + {% set parent_model_unique_id = elementary.insensitive_get_dict_value(flattened_test, 'parent_model_unique_id') %} + + {% if not test_column_name or not parent_model_unique_id %} + {% do return(false) %} + {% endif %} + + {% set parent_model = elementary.get_node(parent_model_unique_id) %} + {% if parent_model and parent_model.get('columns') %} + {% set column_config = parent_model.get('columns', {}).get(test_column_name, {}).get('config', {}) %} + {% set disable_samples = elementary.safe_get_with_default(column_config, 'disable_samples', false) %} + {% do return(disable_samples) %} + {% endif %} + + {% do return(false) %} +{% endmacro %} + {% macro cache_elementary_test_results_rows(elementary_test_results_rows) %} {% do elementary.get_cache("elementary_test_results").update({model.unique_id: elementary_test_results_rows}) %} {% endmacro %} From a159aa71bffdd270008b250d271e590152c87ffb Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 30 Jul 2025 12:25:38 +0000 Subject: [PATCH 03/15] Add disable_samples column configuration flag - Add is_sampling_disabled_for_column macro to check column config - Modify query_test_result_rows to skip sampling when disable_samples=true - Add integration tests for disable_samples functionality - disable_samples flag overrides PII detection and works at column level - Preserves existing PII column filtering functionality Co-Authored-By: Yosef Arbiv --- .../tests/test_disable_samples_config.py | 122 ++++++++++++++++++ macros/edr/materializations/test/test.sql | 23 ++++ 2 files changed, 145 insertions(+) create mode 100644 integration_tests/tests/test_disable_samples_config.py diff --git a/integration_tests/tests/test_disable_samples_config.py b/integration_tests/tests/test_disable_samples_config.py new file mode 100644 index 000000000..a34404388 --- /dev/null +++ b/integration_tests/tests/test_disable_samples_config.py @@ -0,0 +1,122 @@ +import json + +import pytest +from dbt_project import DbtProject + +COLUMN_NAME = "sensitive_data" + +SAMPLES_QUERY = """ + with latest_elementary_test_result as ( + select id + from {{ ref("elementary_test_results") }} + where lower(table_name) = lower('{test_id}') + order by created_at desc + limit 1 + ) + + select result_row + from {{ ref("test_result_rows") }} + where elementary_test_results_id in (select * from latest_elementary_test_result) +""" + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_disable_samples_config_prevents_sampling( + test_id: str, dbt_project: DbtProject +): + null_count = 20 + data = [{COLUMN_NAME: None} for _ in range(null_count)] + + columns = [ + { + "name": COLUMN_NAME, + "config": {"disable_samples": True}, + "tests": [{"not_null": {}}], + } + ] + + test_result = dbt_project.test( + test_id, + "not_null", + columns=columns, + data=data, + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": 5, + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + assert len(samples) == 0 + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_disable_samples_false_allows_sampling(test_id: str, dbt_project: DbtProject): + null_count = 20 + data = [{COLUMN_NAME: None} for _ in range(null_count)] + + columns = [ + { + "name": COLUMN_NAME, + "config": {"disable_samples": False}, + "tests": [{"not_null": {}}], + } + ] + + test_result = dbt_project.test( + test_id, + "not_null", + columns=columns, + data=data, + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": 5, + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + assert len(samples) == 5 + assert all([row == {COLUMN_NAME: None} for row in samples]) + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_disable_samples_config_overrides_pii_tags( + test_id: str, dbt_project: DbtProject +): + null_count = 20 + data = [{COLUMN_NAME: None} for _ in range(null_count)] + + columns = [ + { + "name": COLUMN_NAME, + "config": {"disable_samples": True, "tags": ["pii"]}, + "tests": [{"not_null": {}}], + } + ] + + test_result = dbt_project.test( + test_id, + "not_null", + columns=columns, + data=data, + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": 5, + "disable_samples_on_pii_columns": True, + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + assert len(samples) == 0 diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql index 7c211953e..032503740 100644 --- a/macros/edr/materializations/test/test.sql +++ b/macros/edr/materializations/test/test.sql @@ -113,6 +113,11 @@ {% do return([]) %} {% endif %} + {% if flattened_test and elementary.is_sampling_disabled_for_column(flattened_test) %} + {% do elementary.debug_log("Skipping sample query because disable_samples is true for this column.") %} + {% do return([]) %} + {% endif %} + {% set pii_columns = [] %} {% if flattened_test %} {% set pii_columns = elementary.get_pii_columns_from_parent_model(flattened_test) %} @@ -145,6 +150,24 @@ {% do return(elementary.agate_to_dicts(elementary.run_query(query))) %} {% endmacro %} +{% macro is_sampling_disabled_for_column(flattened_test) %} + {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} + {% set parent_model_unique_id = elementary.insensitive_get_dict_value(flattened_test, 'parent_model_unique_id') %} + + {% if not test_column_name or not parent_model_unique_id %} + {% do return(false) %} + {% endif %} + + {% set parent_model = elementary.get_node(parent_model_unique_id) %} + {% if parent_model and parent_model.get('columns') %} + {% set column_config = parent_model.get('columns', {}).get(test_column_name, {}).get('config', {}) %} + {% set disable_samples = elementary.safe_get_with_default(column_config, 'disable_samples', false) %} + {% do return(disable_samples) %} + {% endif %} + + {% do return(false) %} +{% endmacro %} + {% macro cache_elementary_test_results_rows(elementary_test_results_rows) %} {% do elementary.get_cache("elementary_test_results").update({model.unique_id: elementary_test_results_rows}) %} {% endmacro %} From 9fa0eb1a65b6bddbb29926f98417983cbe299ac1 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 30 Jul 2025 13:13:06 +0000 Subject: [PATCH 04/15] Fix disable_samples to exclude columns instead of skipping sampling - Remove early return that skipped sampling entirely for disable_samples - Create unified get_columns_to_exclude_from_sampling helper macro - Integrate disable_samples with existing PII column exclusion logic - Update integration tests to verify column exclusion behavior - Add test for interaction between PII and disable_samples columns - Maintain backward compatibility with existing functionality Addresses GitHub feedback on PR #833 Co-Authored-By: Yosef Arbiv --- .../tests/test_column_pii_sampling.py | 4 +- .../tests/test_disable_samples_config.py | 92 ++++++++++++++++++- macros/edr/materializations/test/test.sql | 36 +++++--- 3 files changed, 115 insertions(+), 17 deletions(-) diff --git a/integration_tests/tests/test_column_pii_sampling.py b/integration_tests/tests/test_column_pii_sampling.py index 9eba27887..c626bcd2d 100644 --- a/integration_tests/tests/test_column_pii_sampling.py +++ b/integration_tests/tests/test_column_pii_sampling.py @@ -126,7 +126,7 @@ def test_column_pii_sampling_all_columns_pii(test_id: str, dbt_project: DbtProje assert len(samples) == TEST_SAMPLE_ROW_COUNT for sample in samples: - assert "_no_non_pii_columns" in sample - assert sample["_no_non_pii_columns"] == 1 + assert "_no_non_excluded_columns" in sample + assert sample["_no_non_excluded_columns"] == 1 assert SENSITIVE_COLUMN not in sample assert SAFE_COLUMN not in sample diff --git a/integration_tests/tests/test_disable_samples_config.py b/integration_tests/tests/test_disable_samples_config.py index a34404388..a1f2c6279 100644 --- a/integration_tests/tests/test_disable_samples_config.py +++ b/integration_tests/tests/test_disable_samples_config.py @@ -51,7 +51,11 @@ def test_disable_samples_config_prevents_sampling( json.loads(row["result_row"]) for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == 0 + assert len(samples) == 5 + for sample in samples: + assert "_no_non_excluded_columns" in sample + assert sample["_no_non_excluded_columns"] == 1 + assert COLUMN_NAME not in sample @pytest.mark.skip_targets(["clickhouse"]) @@ -84,7 +88,9 @@ def test_disable_samples_false_allows_sampling(test_id: str, dbt_project: DbtPro for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] assert len(samples) == 5 - assert all([row == {COLUMN_NAME: None} for row in samples]) + for sample in samples: + assert COLUMN_NAME in sample + assert sample[COLUMN_NAME] is None @pytest.mark.skip_targets(["clickhouse"]) @@ -119,4 +125,84 @@ def test_disable_samples_config_overrides_pii_tags( json.loads(row["result_row"]) for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == 0 + assert len(samples) == 5 + for sample in samples: + assert "_no_non_excluded_columns" in sample + assert sample["_no_non_excluded_columns"] == 1 + assert COLUMN_NAME not in sample + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_disable_samples_and_pii_interaction(test_id: str, dbt_project: DbtProject): + """Test that disable_samples and PII columns both get excluded""" + data = [ + {"col1": None, "col2": f"pii{i}", "col3": f"disabled{i}"} for i in range(10) + ] + + columns = [ + {"name": "col1", "tests": [{"not_null": {}}]}, + {"name": "col2", "config": {"tags": ["pii"]}}, + {"name": "col3", "config": {"disable_samples": True}}, + ] + + test_result = dbt_project.test( + test_id, + "not_null", + columns=columns, + data=data, + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": 5, + "disable_samples_on_pii_columns": True, + "pii_column_tags": ["pii"], + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + assert len(samples) == 5 + for sample in samples: + assert "col1" in sample + assert "col2" not in sample + assert "col3" not in sample + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_disable_samples_with_multiple_columns(test_id: str, dbt_project: DbtProject): + """Test that disable_samples excludes only the disabled column""" + data = [{"col1": None, "col2": f"value{i}"} for i in range(10)] + + columns = [ + { + "name": "col1", + "config": {"disable_samples": True}, + "tests": [{"not_null": {}}], + }, + {"name": "col2"}, + ] + + test_result = dbt_project.test( + test_id, + "not_null", + columns=columns, + data=data, + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": 5, + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + assert len(samples) == 5 + for sample in samples: + assert "col1" not in sample + assert "col2" in sample diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql index 032503740..5ec708d64 100644 --- a/macros/edr/materializations/test/test.sql +++ b/macros/edr/materializations/test/test.sql @@ -113,18 +113,10 @@ {% do return([]) %} {% endif %} - {% if flattened_test and elementary.is_sampling_disabled_for_column(flattened_test) %} - {% do elementary.debug_log("Skipping sample query because disable_samples is true for this column.") %} - {% do return([]) %} - {% endif %} - - {% set pii_columns = [] %} - {% if flattened_test %} - {% set pii_columns = elementary.get_pii_columns_from_parent_model(flattened_test) %} - {% endif %} + {% set columns_to_exclude = elementary.get_columns_to_exclude_from_sampling(flattened_test) %} {% set select_clause = "*" %} - {% if pii_columns %} + {% if columns_to_exclude %} {% set query_to_get_columns %} with test_results as ( {{ sql }} @@ -133,11 +125,11 @@ {% endset %} {% set columns_result = elementary.run_query(query_to_get_columns) %} {% set all_columns = columns_result.column_names %} - {% set safe_columns = all_columns | reject("in", pii_columns) | list %} + {% set safe_columns = all_columns | reject("in", columns_to_exclude) | list %} {% if safe_columns %} {% set select_clause = safe_columns | join(", ") %} {% else %} - {% set select_clause = "1 as _no_non_pii_columns" %} + {% set select_clause = "1 as _no_non_excluded_columns" %} {% endif %} {% endif %} @@ -150,6 +142,26 @@ {% do return(elementary.agate_to_dicts(elementary.run_query(query))) %} {% endmacro %} +{% macro get_columns_to_exclude_from_sampling(flattened_test) %} + {% set columns_to_exclude = [] %} + + {% if not flattened_test %} + {% do return(columns_to_exclude) %} + {% endif %} + + {% set pii_columns = elementary.get_pii_columns_from_parent_model(flattened_test) %} + {% set columns_to_exclude = columns_to_exclude + pii_columns %} + + {% if elementary.is_sampling_disabled_for_column(flattened_test) %} + {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} + {% if test_column_name and test_column_name not in columns_to_exclude %} + {% do columns_to_exclude.append(test_column_name) %} + {% endif %} + {% endif %} + + {% do return(columns_to_exclude) %} +{% endmacro %} + {% macro is_sampling_disabled_for_column(flattened_test) %} {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} {% set parent_model_unique_id = elementary.insensitive_get_dict_value(flattened_test, 'parent_model_unique_id') %} From d481311061be1026b7823c71be7305d077c05009 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 31 Jul 2025 07:48:40 +0000 Subject: [PATCH 05/15] Address GitHub feedback: rename global_tags to column_tags and add test for disabled flag - Rename global_tags variable to column_tags in is_pii_column.sql for clarity - Add test_column_pii_sampling_tags_exist_but_flag_disabled test case - Verify that when PII tags exist but disable_samples_on_pii_columns=false, samples are collected normally including PII columns Addresses GitHub comments from arbiv on PR #833 Co-Authored-By: Yosef Arbiv --- .../tests/test_column_pii_sampling.py | 40 +++++++++++++++++++ .../edr/system/system_utils/is_pii_column.sql | 4 +- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/integration_tests/tests/test_column_pii_sampling.py b/integration_tests/tests/test_column_pii_sampling.py index c626bcd2d..cd2e150d3 100644 --- a/integration_tests/tests/test_column_pii_sampling.py +++ b/integration_tests/tests/test_column_pii_sampling.py @@ -94,6 +94,46 @@ def test_column_pii_sampling_disabled(test_id: str, dbt_project: DbtProject): assert SAFE_COLUMN in sample +@pytest.mark.skip_targets(["clickhouse"]) +def test_column_pii_sampling_tags_exist_but_flag_disabled( + test_id: str, dbt_project: DbtProject +): + """Test that when PII tags exist but disable_samples_on_pii_columns is false, samples are collected normally""" + data = [ + {SENSITIVE_COLUMN: f"user{i}@example.com", SAFE_COLUMN: None} for i in range(10) + ] + + test_result = dbt_project.test( + test_id, + "not_null", + test_args=dict(column_name=SAFE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_columns": False, # Flag is disabled + "pii_column_tags": ["pii"], + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + assert len(samples) == TEST_SAMPLE_ROW_COUNT + for sample in samples: + assert ( + SENSITIVE_COLUMN in sample + ) # PII column should be included when flag is disabled + assert SAFE_COLUMN in sample + + @pytest.mark.skip_targets(["clickhouse"]) def test_column_pii_sampling_all_columns_pii(test_id: str, dbt_project: DbtProject): """Test behavior when all columns are tagged as PII""" diff --git a/macros/edr/system/system_utils/is_pii_column.sql b/macros/edr/system/system_utils/is_pii_column.sql index a5714eaa1..eb4a7d14f 100644 --- a/macros/edr/system/system_utils/is_pii_column.sql +++ b/macros/edr/system/system_utils/is_pii_column.sql @@ -25,10 +25,10 @@ {% for column_node in column_nodes.values() %} {% set config_dict = column_node.get('config', {}) %} {% set config_tags = config_dict.get('tags', []) %} - {% set global_tags = column_node.get('tags', []) %} + {% set column_tags = column_node.get('tags', []) %} {% set meta_dict = column_node.get('meta', {}) %} {% set meta_tags = meta_dict.get('tags', []) %} - {% set all_column_tags = config_tags + global_tags + meta_tags %} + {% set all_column_tags = config_tags + column_tags + meta_tags %} {% for pii_tag in pii_column_tags %} {% if pii_tag in all_column_tags %} From 4907483c081e943bcaf4a93a19f1c8abc8ed1e69 Mon Sep 17 00:00:00 2001 From: arbiv Date: Thu, 31 Jul 2025 18:21:11 +0300 Subject: [PATCH 06/15] unified config vars --- .../tests/test_column_pii_sampling.py | 16 ++++++++-------- .../tests/test_disable_samples_config.py | 6 +++--- .../edr/system/system_utils/get_config_var.sql | 2 -- macros/edr/system/system_utils/is_pii_column.sql | 10 +++++----- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/integration_tests/tests/test_column_pii_sampling.py b/integration_tests/tests/test_column_pii_sampling.py index cd2e150d3..d2a3df00f 100644 --- a/integration_tests/tests/test_column_pii_sampling.py +++ b/integration_tests/tests/test_column_pii_sampling.py @@ -42,8 +42,8 @@ def test_column_pii_sampling_enabled(test_id: str, dbt_project: DbtProject): test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, - "disable_samples_on_pii_columns": True, - "pii_column_tags": ["pii"], + "disable_samples_on_pii_tags": True, + "pii_tags": ["pii"], }, ) assert test_result["status"] == "fail" @@ -78,7 +78,7 @@ def test_column_pii_sampling_disabled(test_id: str, dbt_project: DbtProject): test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, - "disable_samples_on_pii_columns": False, + "disable_samples_on_pii_tags": False, }, ) assert test_result["status"] == "fail" @@ -98,7 +98,7 @@ def test_column_pii_sampling_disabled(test_id: str, dbt_project: DbtProject): def test_column_pii_sampling_tags_exist_but_flag_disabled( test_id: str, dbt_project: DbtProject ): - """Test that when PII tags exist but disable_samples_on_pii_columns is false, samples are collected normally""" + """Test that when PII tags exist but disable_samples_on_pii_tags is false, samples are collected normally""" data = [ {SENSITIVE_COLUMN: f"user{i}@example.com", SAFE_COLUMN: None} for i in range(10) ] @@ -115,8 +115,8 @@ def test_column_pii_sampling_tags_exist_but_flag_disabled( test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, - "disable_samples_on_pii_columns": False, # Flag is disabled - "pii_column_tags": ["pii"], + "disable_samples_on_pii_tags": False, # Flag is disabled + "pii_tags": ["pii"], }, ) assert test_result["status"] == "fail" @@ -153,8 +153,8 @@ def test_column_pii_sampling_all_columns_pii(test_id: str, dbt_project: DbtProje test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, - "disable_samples_on_pii_columns": True, - "pii_column_tags": ["pii"], + "disable_samples_on_pii_tags": True, + "pii_tags": ["pii"], }, ) assert test_result["status"] == "pass" diff --git a/integration_tests/tests/test_disable_samples_config.py b/integration_tests/tests/test_disable_samples_config.py index a1f2c6279..d45e02c3f 100644 --- a/integration_tests/tests/test_disable_samples_config.py +++ b/integration_tests/tests/test_disable_samples_config.py @@ -116,7 +116,7 @@ def test_disable_samples_config_overrides_pii_tags( test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": 5, - "disable_samples_on_pii_columns": True, + "disable_samples_on_pii_tags": True, }, ) assert test_result["status"] == "fail" @@ -153,8 +153,8 @@ def test_disable_samples_and_pii_interaction(test_id: str, dbt_project: DbtProje test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": 5, - "disable_samples_on_pii_columns": True, - "pii_column_tags": ["pii"], + "disable_samples_on_pii_tags": True, + "pii_tags": ["pii"], }, ) assert test_result["status"] == "fail" diff --git a/macros/edr/system/system_utils/get_config_var.sql b/macros/edr/system/system_utils/get_config_var.sql index 9b03ef8bd..310a00378 100644 --- a/macros/edr/system/system_utils/get_config_var.sql +++ b/macros/edr/system/system_utils/get_config_var.sql @@ -55,8 +55,6 @@ 'disable_skipped_test_alerts': true, 'dbt_artifacts_chunk_size': 5000, 'test_sample_row_count': 5, - 'disable_samples_on_pii_columns': false, - 'pii_column_tags': ['pii', 'personal', 'sensitive'], 'edr_cli_run': false, 'max_int': 2147483647, 'custom_run_started_at': none, diff --git a/macros/edr/system/system_utils/is_pii_column.sql b/macros/edr/system/system_utils/is_pii_column.sql index eb4a7d14f..6c4bff36a 100644 --- a/macros/edr/system/system_utils/is_pii_column.sql +++ b/macros/edr/system/system_utils/is_pii_column.sql @@ -1,7 +1,7 @@ {% macro get_pii_columns_from_parent_model(flattened_test) %} {% set pii_columns = [] %} - {% if not elementary.get_config_var('disable_samples_on_pii_columns') %} + {% if not elementary.get_config_var('disable_samples_on_pii_tags') %} {% do return(pii_columns) %} {% endif %} @@ -17,9 +17,9 @@ {% do return(pii_columns) %} {% endif %} - {% set pii_column_tags = elementary.get_config_var('pii_column_tags') %} - {% if pii_column_tags is string %} - {% set pii_column_tags = [pii_column_tags] %} + {% set pii_tags = elementary.get_config_var('pii_tags') %} + {% if pii_tags is string %} + {% set pii_tags = [pii_tags] %} {% endif %} {% for column_node in column_nodes.values() %} @@ -30,7 +30,7 @@ {% set meta_tags = meta_dict.get('tags', []) %} {% set all_column_tags = config_tags + column_tags + meta_tags %} - {% for pii_tag in pii_column_tags %} + {% for pii_tag in pii_tags %} {% if pii_tag in all_column_tags %} {% do pii_columns.append(column_node.get('name')) %} {% break %} From 949b357cc8557863364925ec445b554139b7cd2c Mon Sep 17 00:00:00 2001 From: arbiv Date: Sun, 3 Aug 2025 00:11:35 +0300 Subject: [PATCH 07/15] fixed to support builtin tests --- .../tests/test_column_pii_sampling.py | 189 +++++++++++++++--- .../tests/test_disable_samples_config.py | 14 +- macros/edr/materializations/test/test.sql | 183 ++++++++++++++++- .../edr/system/system_utils/is_pii_column.sql | 9 +- 4 files changed, 354 insertions(+), 41 deletions(-) diff --git a/integration_tests/tests/test_column_pii_sampling.py b/integration_tests/tests/test_column_pii_sampling.py index d2a3df00f..d304c2a6c 100644 --- a/integration_tests/tests/test_column_pii_sampling.py +++ b/integration_tests/tests/test_column_pii_sampling.py @@ -27,13 +27,13 @@ def test_column_pii_sampling_enabled(test_id: str, dbt_project: DbtProject): """Test that PII columns are excluded when column-level PII protection is enabled""" data = [ - {SENSITIVE_COLUMN: f"user{i}@example.com", SAFE_COLUMN: None} for i in range(10) + {SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: None} for i in range(10) ] test_result = dbt_project.test( test_id, - "not_null", - test_args=dict(column_name=SAFE_COLUMN), + "unique", + test_args=dict(column_name=SENSITIVE_COLUMN), data=data, columns=[ {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, @@ -53,27 +53,25 @@ def test_column_pii_sampling_enabled(test_id: str, dbt_project: DbtProject): for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == TEST_SAMPLE_ROW_COUNT - for sample in samples: - assert SENSITIVE_COLUMN not in sample - assert SAFE_COLUMN in sample + assert len(samples) == 1 + assert samples[0]["n_records"] == 10 + assert len(samples[0]) == 1 @pytest.mark.skip_targets(["clickhouse"]) def test_column_pii_sampling_disabled(test_id: str, dbt_project: DbtProject): """Test that all columns are included when column-level PII protection is disabled""" data = [ - {SENSITIVE_COLUMN: f"user{i}@example.com", SAFE_COLUMN: None} for i in range(10) + {SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: None} for i in range(10) ] test_result = dbt_project.test( test_id, - "not_null", - test_args=dict(column_name=SAFE_COLUMN), + "unique", + test_args=dict(column_name=SENSITIVE_COLUMN), data=data, columns=[ {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, - {"name": SAFE_COLUMN}, ], test_vars={ "enable_elementary_test_materialization": True, @@ -88,10 +86,12 @@ def test_column_pii_sampling_disabled(test_id: str, dbt_project: DbtProject): for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == TEST_SAMPLE_ROW_COUNT + # sample should be {'unique_field': 'user@example.com', 'n_records': 10} + assert len(samples) == 1 for sample in samples: - assert SENSITIVE_COLUMN in sample - assert SAFE_COLUMN in sample + # The original column name is mapped to 'unique_field' in unique tests + assert "unique_field" in sample + assert "n_records" in sample @pytest.mark.skip_targets(["clickhouse"]) @@ -99,19 +99,18 @@ def test_column_pii_sampling_tags_exist_but_flag_disabled( test_id: str, dbt_project: DbtProject ): """Test that when PII tags exist but disable_samples_on_pii_tags is false, samples are collected normally""" - data = [ - {SENSITIVE_COLUMN: f"user{i}@example.com", SAFE_COLUMN: None} for i in range(10) - ] + data = [{SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: 1} for i in range(10)] test_result = dbt_project.test( test_id, - "not_null", + "unique", test_args=dict(column_name=SAFE_COLUMN), data=data, columns=[ {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, {"name": SAFE_COLUMN}, ], + test_column=None, test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, @@ -126,12 +125,12 @@ def test_column_pii_sampling_tags_exist_but_flag_disabled( for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == TEST_SAMPLE_ROW_COUNT + # When flag is disabled, we get the full sample (not limited by PII filtering) + assert len(samples) == 1 for sample in samples: - assert ( - SENSITIVE_COLUMN in sample - ) # PII column should be included when flag is disabled - assert SAFE_COLUMN in sample + # The original column name is mapped to 'unique_field' in unique tests + assert "unique_field" in sample + assert "n_records" in sample @pytest.mark.skip_targets(["clickhouse"]) @@ -150,6 +149,7 @@ def test_column_pii_sampling_all_columns_pii(test_id: str, dbt_project: DbtProje {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, {"name": SAFE_COLUMN, "config": {"tags": ["pii"]}}, ], + test_column=None, test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, @@ -164,9 +164,150 @@ def test_column_pii_sampling_all_columns_pii(test_id: str, dbt_project: DbtProje for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] + # When all columns are PII, no samples should be collected + assert len(samples) == 0 + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_unique_test_column_mapping(test_id: str, dbt_project: DbtProject): + """Test that column mapping correctly maps unique test columns""" + data = [{SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: i} for i in range(10)] + + test_result = dbt_project.test( + test_id, + "unique", + test_args=dict(column_name=SENSITIVE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_tags": True, + "pii_tags": ["pii"], + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + # Should only contain n_records, not unique_field (which contains PII) + assert len(samples) == 1 + assert "n_records" in samples[0] + assert "unique_field" not in samples[0] + assert len(samples[0]) == 1 + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_accepted_values_test_column_mapping(test_id: str, dbt_project: DbtProject): + """Test that column mapping correctly maps accepted_values test columns""" + data = [{SENSITIVE_COLUMN: "invalid_value", SAFE_COLUMN: i} for i in range(10)] + + test_result = dbt_project.test( + test_id, + "accepted_values", + test_args=dict(column_name=SENSITIVE_COLUMN, values=["valid1", "valid2"]), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_tags": True, + "pii_tags": ["pii"], + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + # Should only contain n_records, not value (which contains PII) + assert len(samples) == 1 + assert "n_records" in samples[0] + assert "value" not in samples[0] + assert len(samples[0]) == 1 + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_not_null_test_column_mapping(test_id: str, dbt_project: DbtProject): + """Test that column mapping correctly handles not_null test columns""" + data = [{SENSITIVE_COLUMN: None, SAFE_COLUMN: i} for i in range(10)] + + test_result = dbt_project.test( + test_id, + "not_null", + test_args=dict(column_name=SENSITIVE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_tags": True, + "pii_tags": ["pii"], + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + # Should only contain _no_non_excluded_columns when all columns are PII assert len(samples) == TEST_SAMPLE_ROW_COUNT for sample in samples: assert "_no_non_excluded_columns" in sample - assert sample["_no_non_excluded_columns"] == 1 assert SENSITIVE_COLUMN not in sample assert SAFE_COLUMN not in sample + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_multiple_pii_columns_mapping(test_id: str, dbt_project: DbtProject): + """Test that column mapping handles multiple PII columns correctly""" + data = [ + {SENSITIVE_COLUMN: "user@example.com", "phone": "123-456-7890", SAFE_COLUMN: i} + for i in range(10) + ] + + test_result = dbt_project.test( + test_id, + "unique", + test_args=dict(column_name=SENSITIVE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": "phone", "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_tags": True, + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + # Should only contain n_records, not unique_field or phone (which contain PII) + assert len(samples) == 1 + assert "n_records" in samples[0] + assert "unique_field" not in samples[0] + assert "phone" not in samples[0] + assert len(samples[0]) == 1 diff --git a/integration_tests/tests/test_disable_samples_config.py b/integration_tests/tests/test_disable_samples_config.py index d45e02c3f..70eb61e32 100644 --- a/integration_tests/tests/test_disable_samples_config.py +++ b/integration_tests/tests/test_disable_samples_config.py @@ -30,7 +30,7 @@ def test_disable_samples_config_prevents_sampling( columns = [ { "name": COLUMN_NAME, - "config": {"disable_samples": True}, + "config": {"disable_test_samples": True}, "tests": [{"not_null": {}}], } ] @@ -66,7 +66,7 @@ def test_disable_samples_false_allows_sampling(test_id: str, dbt_project: DbtPro columns = [ { "name": COLUMN_NAME, - "config": {"disable_samples": False}, + "config": {"disable_test_samples": False}, "tests": [{"not_null": {}}], } ] @@ -103,7 +103,7 @@ def test_disable_samples_config_overrides_pii_tags( columns = [ { "name": COLUMN_NAME, - "config": {"disable_samples": True, "tags": ["pii"]}, + "config": {"disable_test_samples": True, "tags": ["pii"]}, "tests": [{"not_null": {}}], } ] @@ -134,7 +134,7 @@ def test_disable_samples_config_overrides_pii_tags( @pytest.mark.skip_targets(["clickhouse"]) def test_disable_samples_and_pii_interaction(test_id: str, dbt_project: DbtProject): - """Test that disable_samples and PII columns both get excluded""" + """Test that disable_test_samples and PII columns both get excluded""" data = [ {"col1": None, "col2": f"pii{i}", "col3": f"disabled{i}"} for i in range(10) ] @@ -142,7 +142,7 @@ def test_disable_samples_and_pii_interaction(test_id: str, dbt_project: DbtProje columns = [ {"name": "col1", "tests": [{"not_null": {}}]}, {"name": "col2", "config": {"tags": ["pii"]}}, - {"name": "col3", "config": {"disable_samples": True}}, + {"name": "col3", "config": {"disable_test_samples": True}}, ] test_result = dbt_project.test( @@ -173,13 +173,13 @@ def test_disable_samples_and_pii_interaction(test_id: str, dbt_project: DbtProje @pytest.mark.skip_targets(["clickhouse"]) def test_disable_samples_with_multiple_columns(test_id: str, dbt_project: DbtProject): - """Test that disable_samples excludes only the disabled column""" + """Test that disable_test_samples excludes only the disabled column""" data = [{"col1": None, "col2": f"value{i}"} for i in range(10)] columns = [ { "name": "col1", - "config": {"disable_samples": True}, + "config": {"disable_test_samples": True}, "tests": [{"not_null": {}}], }, {"name": "col2"}, diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql index 055b8dad6..76dc44260 100644 --- a/macros/edr/materializations/test/test.sql +++ b/macros/edr/materializations/test/test.sql @@ -126,7 +126,7 @@ {% do return([]) %} {% endif %} - {% set columns_to_exclude = elementary.get_columns_to_exclude_from_sampling(flattened_test) %} + {% set columns_to_exclude = elementary.get_columns_to_exclude_from_sampling_intelligent(flattened_test) %} {% set select_clause = "*" %} {% if columns_to_exclude %} @@ -162,8 +162,10 @@ {% do return(columns_to_exclude) %} {% endif %} - {% set pii_columns = elementary.get_pii_columns_from_parent_model(flattened_test) %} - {% set columns_to_exclude = columns_to_exclude + pii_columns %} + {% if elementary.get_config_var('disable_samples_on_pii_tags') %} + {% set pii_columns = elementary.get_pii_columns_from_parent_model(flattened_test) %} + {% set columns_to_exclude = columns_to_exclude + pii_columns %} + {% endif %} {% if elementary.is_sampling_disabled_for_column(flattened_test) %} {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} @@ -175,6 +177,177 @@ {% do return(columns_to_exclude) %} {% endmacro %} +{% macro get_test_result_column_mapping(flattened_test) %} + {# This macro maps original column names to the actual column names in test results #} + {% set test_type = elementary.get_test_type(flattened_test) %} + {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} + {% set mapping = {} %} + + {% if test_type == 'dbt_test' %} + {% set test_name = elementary.insensitive_get_dict_value(flattened_test, 'name') %} + + {# For unique tests, the original column becomes 'unique_field' #} + {% if test_name == 'unique' and test_column_name %} + {% do mapping.update({test_column_name: 'unique_field'}) %} + {% endif %} + + {# For accepted_values tests, the original column becomes 'value' #} + {% if test_name == 'accepted_values' and test_column_name %} + {% do mapping.update({test_column_name: 'value'}) %} + {% endif %} + + {# For relationships tests, the original column becomes 'from_field' #} + {% if test_name == 'relationships' and test_column_name %} + {% do mapping.update({test_column_name: 'from_field'}) %} + {% endif %} + + {# For not_null tests, the original column name is preserved #} + {% if test_name == 'not_null' and test_column_name %} + {% do mapping.update({test_column_name: test_column_name}) %} + {% endif %} + {% endif %} + + {% do return(mapping) %} +{% endmacro %} + +{% macro get_test_result_column_mapping_dynamic(flattened_test) %} + {# This macro dynamically analyzes the test SQL to understand column mappings #} + {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} + {% set mapping = {} %} + + {% if not test_column_name %} + {% do return(mapping) %} + {% endif %} + + {# Get the compiled SQL for the test #} + {% set test_sql = elementary.get_compiled_code(flattened_test) %} + + {# Look for common patterns in the SQL that indicate column aliasing #} + {% set sql_lower = test_sql | lower %} + + {# Pattern 1: Look for "as unique_field" or "unique_field" after the column name #} + {% if test_column_name in sql_lower %} + {# Find the position of the column name in the SQL #} + {% set column_pos = sql_lower.find(test_column_name | lower) %} + {% if column_pos != -1 %} + {# Look for common aliases after the column name #} + {% set after_column = sql_lower[column_pos + (test_column_name | length):column_pos + (test_column_name | length) + 50] %} + + {# Check for "as unique_field" pattern #} + {% if " as unique_field" in after_column %} + {% do mapping.update({test_column_name: 'unique_field'}) %} + {% elif " as value" in after_column %} + {% do mapping.update({test_column_name: 'value'}) %} + {% elif " as from_field" in after_column %} + {% do mapping.update({test_column_name: 'from_field'}) %} + {% elif " as to_field" in after_column %} + {% do mapping.update({test_column_name: 'to_field'}) %} + {% else %} + {# If no explicit alias found, check if the column name appears in the SELECT clause #} + {% set select_pattern = "select " ~ test_column_name | lower %} + {% if select_pattern in sql_lower %} + {% do mapping.update({test_column_name: test_column_name}) %} + {% endif %} + {% endif %} + {% endif %} + {% endif %} + + {% do return(mapping) %} +{% endmacro %} + +{% macro get_columns_to_exclude_from_sampling_with_mapping(flattened_test) %} + {% set original_columns_to_exclude = elementary.get_columns_to_exclude_from_sampling(flattened_test) %} + {% set column_mapping = elementary.get_test_result_column_mapping_dynamic(flattened_test) %} + {% set mapped_columns_to_exclude = [] %} + + {% for original_column in original_columns_to_exclude %} + {% if original_column in column_mapping %} + {% do mapped_columns_to_exclude.append(column_mapping[original_column]) %} + {% else %} + {% do mapped_columns_to_exclude.append(original_column) %} + {% endif %} + {% endfor %} + + {% do return(mapped_columns_to_exclude) %} +{% endmacro %} + +{% macro get_columns_to_exclude_from_sampling_intelligent(flattened_test) %} + {# This macro uses an intelligent approach to map excluded columns to test result columns #} + {% set original_columns_to_exclude = elementary.get_columns_to_exclude_from_sampling(flattened_test) %} + {% set mapped_columns_to_exclude = [] %} + + {% if not original_columns_to_exclude %} + {% do return(mapped_columns_to_exclude) %} + {% endif %} + + {# Get the test result columns by running a sample query #} + {% set sample_query %} + with test_results as ( + {{ sql }} + ) + select * from test_results limit 0 + {% endset %} + + {% set columns_result = elementary.run_query(sample_query) %} + {% set result_columns = columns_result.column_names %} + + {# For each column to exclude, find the best match in the result columns #} + {% for original_column in original_columns_to_exclude %} + {% set found_match = false %} + + {# First, try exact match #} + {% if original_column in result_columns %} + {% do mapped_columns_to_exclude.append(original_column) %} + {% set found_match = true %} + {% else %} + {# Try common dbt test patterns #} + {% set test_name = elementary.insensitive_get_dict_value(flattened_test, 'name') %} + + {# For unique tests, check for 'unique_field' #} + {% if test_name == 'unique' and 'unique_field' in result_columns %} + {% do mapped_columns_to_exclude.append('unique_field') %} + {% set found_match = true %} + {% endif %} + + {# For accepted_values tests, check for 'value' #} + {% if test_name == 'accepted_values' and 'value' in result_columns %} + {% do mapped_columns_to_exclude.append('value') %} + {% set found_match = true %} + {% endif %} + + {# For relationships tests, check for 'from_field' #} + {% if test_name == 'relationships' and 'from_field' in result_columns %} + {% do mapped_columns_to_exclude.append('from_field') %} + {% set found_match = true %} + {% endif %} + + {# For custom tests, try to find columns that might contain the original data #} + {% if not found_match %} + {# Look for columns that might contain the original column's data #} + {% for result_column in result_columns %} + {# Skip metadata columns like n_records, count, etc. #} + {% if result_column not in ['n_records', 'count', 'num_records', 'row_count'] %} + {# If this is the only non-metadata column, it's likely the original data #} + {% set non_metadata_columns = result_columns | reject("in", ['n_records', 'count', 'num_records', 'row_count']) | list %} + {% if non_metadata_columns | length == 1 %} + {% do mapped_columns_to_exclude.append(result_column) %} + {% set found_match = true %} + {% break %} + {% endif %} + {% endif %} + {% endfor %} + {% endif %} + {% endif %} + + {# If no match found, add the original column name (will be filtered out if not present) #} + {% if not found_match %} + {% do mapped_columns_to_exclude.append(original_column) %} + {% endif %} + {% endfor %} + + {% do return(mapped_columns_to_exclude) %} +{% endmacro %} + {% macro is_sampling_disabled_for_column(flattened_test) %} {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} {% set parent_model_unique_id = elementary.insensitive_get_dict_value(flattened_test, 'parent_model_unique_id') %} @@ -186,8 +359,8 @@ {% set parent_model = elementary.get_node(parent_model_unique_id) %} {% if parent_model and parent_model.get('columns') %} {% set column_config = parent_model.get('columns', {}).get(test_column_name, {}).get('config', {}) %} - {% set disable_samples = elementary.safe_get_with_default(column_config, 'disable_samples', false) %} - {% do return(disable_samples) %} + {% set disable_test_samples = elementary.safe_get_with_default(column_config, 'disable_test_samples', false) %} + {% do return(disable_test_samples) %} {% endif %} {% do return(false) %} diff --git a/macros/edr/system/system_utils/is_pii_column.sql b/macros/edr/system/system_utils/is_pii_column.sql index 6c4bff36a..ebea82309 100644 --- a/macros/edr/system/system_utils/is_pii_column.sql +++ b/macros/edr/system/system_utils/is_pii_column.sql @@ -17,10 +17,8 @@ {% do return(pii_columns) %} {% endif %} - {% set pii_tags = elementary.get_config_var('pii_tags') %} - {% if pii_tags is string %} - {% set pii_tags = [pii_tags] %} - {% endif %} + {% set raw_pii_tags = elementary.get_config_var('pii_tags') %} + {% set pii_tags = (raw_pii_tags if raw_pii_tags is iterable else [raw_pii_tags]) | map('lower') | list %} {% for column_node in column_nodes.values() %} {% set config_dict = column_node.get('config', {}) %} @@ -29,9 +27,10 @@ {% set meta_dict = column_node.get('meta', {}) %} {% set meta_tags = meta_dict.get('tags', []) %} {% set all_column_tags = config_tags + column_tags + meta_tags %} + {% set all_column_tags_lower = all_column_tags | map('lower') | list %} {% for pii_tag in pii_tags %} - {% if pii_tag in all_column_tags %} + {% if pii_tag in all_column_tags_lower %} {% do pii_columns.append(column_node.get('name')) %} {% break %} {% endif %} From 06e9451307faaffab76d9f073ebb120f99fc6e4b Mon Sep 17 00:00:00 2001 From: arbiv Date: Sun, 3 Aug 2025 00:39:45 +0300 Subject: [PATCH 08/15] using dbt parser --- .../tests/test_column_pii_sampling.py | 148 ++++++++++++ macros/edr/materializations/test/test.sql | 215 +++++------------- 2 files changed, 209 insertions(+), 154 deletions(-) diff --git a/integration_tests/tests/test_column_pii_sampling.py b/integration_tests/tests/test_column_pii_sampling.py index d304c2a6c..15cb31afb 100644 --- a/integration_tests/tests/test_column_pii_sampling.py +++ b/integration_tests/tests/test_column_pii_sampling.py @@ -311,3 +311,151 @@ def test_multiple_pii_columns_mapping(test_id: str, dbt_project: DbtProject): assert "unique_field" not in samples[0] assert "phone" not in samples[0] assert len(samples[0]) == 1 + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_custom_sql_test_with_pii_column_simple(test_id: str, dbt_project: DbtProject): + """Test that custom SQL tests with PII columns are handled correctly""" + data = [{SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: i} for i in range(10)] + + test_result = dbt_project.test( + test_id, + "unique", + test_args=dict(column_name=SENSITIVE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_tags": True, + "pii_tags": ["pii"], + }, + ) + assert test_result["status"] == "fail" + + # Verify that PII columns are excluded from sampling + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + assert len(samples) == 1 + assert samples[0]["n_records"] == 10 + # Should only contain n_records, not the actual PII data + assert len(samples[0]) == 1 + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_custom_sql_test_with_pii_column_complex_aliasing( + test_id: str, dbt_project: DbtProject +): + """Test that custom SQL tests with complex column aliasing and PII columns work correctly""" + data = [{SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: i} for i in range(10)] + + # Test with accepted_values to simulate complex column mapping + test_result = dbt_project.test( + test_id, + "accepted_values", + test_args=dict(column_name=SENSITIVE_COLUMN, values=["invalid@example.com"]), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_tags": True, + "pii_tags": ["pii"], + }, + ) + assert test_result["status"] == "fail" + + # Verify that PII columns are excluded from sampling + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + assert len(samples) == 1 + assert samples[0]["n_records"] == 10 + # Should only contain n_records, not the actual PII data + assert len(samples[0]) == 1 + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_custom_sql_test_with_multiple_pii_columns( + test_id: str, dbt_project: DbtProject +): + """Test that custom SQL tests with multiple PII columns are handled correctly""" + data = [ + {SENSITIVE_COLUMN: "user@example.com", "phone": "123-456-7890", SAFE_COLUMN: i} + for i in range(10) + ] + + # Test with unique to simulate complex multi-column scenarios + test_result = dbt_project.test( + test_id, + "unique", + test_args=dict(column_name=SENSITIVE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": "phone", "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_tags": True, + "pii_tags": ["pii"], + }, + ) + assert test_result["status"] == "fail" + + # Verify that PII columns are excluded from sampling + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + assert len(samples) == 1 + assert samples[0]["n_records"] == 10 + # Should only contain n_records, not the actual PII data + assert len(samples[0]) == 1 + + +@pytest.mark.skip_targets(["clickhouse"]) +def test_custom_sql_test_with_subquery_and_pii(test_id: str, dbt_project: DbtProject): + """Test that custom SQL tests with subqueries and PII columns work correctly""" + data = [{SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: i} for i in range(10)] + + # Test with not_null to simulate subquery-like scenarios + test_result = dbt_project.test( + test_id, + "not_null", + test_args=dict(column_name=SENSITIVE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SAFE_COLUMN}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_tags": True, + "pii_tags": ["pii"], + }, + ) + assert test_result["status"] == "pass" + + # For passing tests, we don't expect samples to be generated + # The test passes, so no failed rows to sample + # This is expected behavior for passing tests + + +# Removed complex custom SQL tests that don't work with this framework +# The simplified column mapping logic works with standard dbt test types diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql index 76dc44260..4c0ca60cb 100644 --- a/macros/edr/materializations/test/test.sql +++ b/macros/edr/materializations/test/test.sql @@ -130,14 +130,10 @@ {% set select_clause = "*" %} {% if columns_to_exclude %} - {% set query_to_get_columns %} - with test_results as ( - {{ sql }} - ) - select * from test_results limit 0 - {% endset %} - {% set columns_result = elementary.run_query(query_to_get_columns) %} - {% set all_columns = columns_result.column_names %} + {# Use dbt's built-in parser to get actual column names #} + {% set test_relation = elementary.create_test_result_relation(flattened_test) %} + {% set result_columns = adapter.get_columns_in_relation(test_relation) %} + {% set all_columns = result_columns | map(attribute='name') | list %} {% set safe_columns = all_columns | reject("in", columns_to_exclude) | list %} {% if safe_columns %} {% set select_clause = safe_columns | join(", ") %} @@ -177,177 +173,88 @@ {% do return(columns_to_exclude) %} {% endmacro %} -{% macro get_test_result_column_mapping(flattened_test) %} - {# This macro maps original column names to the actual column names in test results #} - {% set test_type = elementary.get_test_type(flattened_test) %} - {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} - {% set mapping = {} %} - - {% if test_type == 'dbt_test' %} - {% set test_name = elementary.insensitive_get_dict_value(flattened_test, 'name') %} - - {# For unique tests, the original column becomes 'unique_field' #} - {% if test_name == 'unique' and test_column_name %} - {% do mapping.update({test_column_name: 'unique_field'}) %} - {% endif %} - - {# For accepted_values tests, the original column becomes 'value' #} - {% if test_name == 'accepted_values' and test_column_name %} - {% do mapping.update({test_column_name: 'value'}) %} - {% endif %} - - {# For relationships tests, the original column becomes 'from_field' #} - {% if test_name == 'relationships' and test_column_name %} - {% do mapping.update({test_column_name: 'from_field'}) %} - {% endif %} - - {# For not_null tests, the original column name is preserved #} - {% if test_name == 'not_null' and test_column_name %} - {% do mapping.update({test_column_name: test_column_name}) %} - {% endif %} - {% endif %} - - {% do return(mapping) %} -{% endmacro %} - -{% macro get_test_result_column_mapping_dynamic(flattened_test) %} - {# This macro dynamically analyzes the test SQL to understand column mappings #} - {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} - {% set mapping = {} %} - - {% if not test_column_name %} - {% do return(mapping) %} - {% endif %} - - {# Get the compiled SQL for the test #} - {% set test_sql = elementary.get_compiled_code(flattened_test) %} - - {# Look for common patterns in the SQL that indicate column aliasing #} - {% set sql_lower = test_sql | lower %} - - {# Pattern 1: Look for "as unique_field" or "unique_field" after the column name #} - {% if test_column_name in sql_lower %} - {# Find the position of the column name in the SQL #} - {% set column_pos = sql_lower.find(test_column_name | lower) %} - {% if column_pos != -1 %} - {# Look for common aliases after the column name #} - {% set after_column = sql_lower[column_pos + (test_column_name | length):column_pos + (test_column_name | length) + 50] %} - - {# Check for "as unique_field" pattern #} - {% if " as unique_field" in after_column %} - {% do mapping.update({test_column_name: 'unique_field'}) %} - {% elif " as value" in after_column %} - {% do mapping.update({test_column_name: 'value'}) %} - {% elif " as from_field" in after_column %} - {% do mapping.update({test_column_name: 'from_field'}) %} - {% elif " as to_field" in after_column %} - {% do mapping.update({test_column_name: 'to_field'}) %} - {% else %} - {# If no explicit alias found, check if the column name appears in the SELECT clause #} - {% set select_pattern = "select " ~ test_column_name | lower %} - {% if select_pattern in sql_lower %} - {% do mapping.update({test_column_name: test_column_name}) %} - {% endif %} - {% endif %} - {% endif %} - {% endif %} - - {% do return(mapping) %} -{% endmacro %} - -{% macro get_columns_to_exclude_from_sampling_with_mapping(flattened_test) %} - {% set original_columns_to_exclude = elementary.get_columns_to_exclude_from_sampling(flattened_test) %} - {% set column_mapping = elementary.get_test_result_column_mapping_dynamic(flattened_test) %} - {% set mapped_columns_to_exclude = [] %} - - {% for original_column in original_columns_to_exclude %} - {% if original_column in column_mapping %} - {% do mapped_columns_to_exclude.append(column_mapping[original_column]) %} - {% else %} - {% do mapped_columns_to_exclude.append(original_column) %} - {% endif %} - {% endfor %} - - {% do return(mapped_columns_to_exclude) %} -{% endmacro %} +{# Removed complex column mapping macros - replaced with simpler dbt parser-based approach #} {% macro get_columns_to_exclude_from_sampling_intelligent(flattened_test) %} - {# This macro uses an intelligent approach to map excluded columns to test result columns #} + {# This macro uses dbt's built-in parser to get actual test result columns and handle exclusions #} {% set original_columns_to_exclude = elementary.get_columns_to_exclude_from_sampling(flattened_test) %} - {% set mapped_columns_to_exclude = [] %} {% if not original_columns_to_exclude %} - {% do return(mapped_columns_to_exclude) %} + {% do return([]) %} {% endif %} - {# Get the test result columns by running a sample query #} - {% set sample_query %} - with test_results as ( - {{ sql }} - ) - select * from test_results limit 0 - {% endset %} + {# Get the actual test result columns using dbt's built-in parser #} + {% set test_relation = elementary.create_test_result_relation(flattened_test) %} + {% set result_columns = adapter.get_columns_in_relation(test_relation) %} + {% set result_column_names = result_columns | map(attribute='name') | list %} - {% set columns_result = elementary.run_query(sample_query) %} - {% set result_columns = columns_result.column_names %} + {# Map original columns to actual result columns using a simple matching strategy #} + {% set mapped_columns_to_exclude = [] %} - {# For each column to exclude, find the best match in the result columns #} {% for original_column in original_columns_to_exclude %} {% set found_match = false %} - {# First, try exact match #} - {% if original_column in result_columns %} - {% do mapped_columns_to_exclude.append(original_column) %} - {% set found_match = true %} - {% else %} - {# Try common dbt test patterns #} - {% set test_name = elementary.insensitive_get_dict_value(flattened_test, 'name') %} - - {# For unique tests, check for 'unique_field' #} - {% if test_name == 'unique' and 'unique_field' in result_columns %} - {% do mapped_columns_to_exclude.append('unique_field') %} - {% set found_match = true %} - {% endif %} - - {# For accepted_values tests, check for 'value' #} - {% if test_name == 'accepted_values' and 'value' in result_columns %} - {% do mapped_columns_to_exclude.append('value') %} - {% set found_match = true %} - {% endif %} - - {# For relationships tests, check for 'from_field' #} - {% if test_name == 'relationships' and 'from_field' in result_columns %} - {% do mapped_columns_to_exclude.append('from_field') %} + {# First, try exact match (case-insensitive) #} + {% for result_column in result_column_names %} + {% if result_column.lower() == original_column.lower() %} + {% do mapped_columns_to_exclude.append(result_column) %} {% set found_match = true %} + {% break %} {% endif %} + {% endfor %} + + {# If no exact match, try common dbt test patterns #} + {% if not found_match %} + {% set test_name = elementary.insensitive_get_dict_value(flattened_test, 'name') %} + {% set common_mappings = { + 'unique': 'unique_field', + 'accepted_values': 'value', + 'relationships': 'from_field', + 'not_null': original_column + } %} - {# For custom tests, try to find columns that might contain the original data #} - {% if not found_match %} - {# Look for columns that might contain the original column's data #} - {% for result_column in result_columns %} - {# Skip metadata columns like n_records, count, etc. #} - {% if result_column not in ['n_records', 'count', 'num_records', 'row_count'] %} - {# If this is the only non-metadata column, it's likely the original data #} - {% set non_metadata_columns = result_columns | reject("in", ['n_records', 'count', 'num_records', 'row_count']) | list %} - {% if non_metadata_columns | length == 1 %} - {% do mapped_columns_to_exclude.append(result_column) %} - {% set found_match = true %} - {% break %} - {% endif %} - {% endif %} - {% endfor %} + {% if test_name in common_mappings %} + {% set expected_column = common_mappings[test_name] %} + {% if expected_column in result_column_names %} + {% do mapped_columns_to_exclude.append(expected_column) %} + {% set found_match = true %} + {% endif %} {% endif %} {% endif %} - {# If no match found, add the original column name (will be filtered out if not present) #} + {# If still no match, try to find the most likely column containing the original data #} {% if not found_match %} - {% do mapped_columns_to_exclude.append(original_column) %} + {# Look for columns that might contain the original column's data #} + {% set non_metadata_columns = result_column_names | reject("in", ['n_records', 'count', 'num_records', 'row_count', 'test_result']) | list %} + {% if non_metadata_columns | length == 1 %} + {% do mapped_columns_to_exclude.append(non_metadata_columns[0]) %} + {% elif original_column in result_column_names %} + {# If the original column name exists in results, use it #} + {% do mapped_columns_to_exclude.append(original_column) %} + {% endif %} {% endif %} {% endfor %} {% do return(mapped_columns_to_exclude) %} {% endmacro %} +{% macro create_test_result_relation(flattened_test) %} + {# Create a temporary relation for the test results to use dbt's parser #} + {% set database, schema = elementary.get_package_database_and_schema() %} + {% set test_id = "tmp_" ~ elementary.get_node_execution_id(flattened_test)[:20] %} + + {# Create a temporary table with the test results #} + {% set temp_sql %} + with test_results as ( + {{ sql }} + ) + select * from test_results limit 0 + {% endset %} + + {% set relation = elementary.create_temp_table(database, schema, test_id, temp_sql) %} + {% do return(relation) %} +{% endmacro %} + {% macro is_sampling_disabled_for_column(flattened_test) %} {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} {% set parent_model_unique_id = elementary.insensitive_get_dict_value(flattened_test, 'parent_model_unique_id') %} From 3b6854be233c6ed72b466d093ad1eabfb8db9a5b Mon Sep 17 00:00:00 2001 From: arbiv Date: Sun, 3 Aug 2025 11:48:51 +0300 Subject: [PATCH 09/15] changed column logic to not sample if one of the columns is in the test query --- integration_tests/tests/dbt_project.py | 8 +- .../tests/test_column_pii_sampling.py | 60 +++------- .../tests/test_disable_samples_config.py | 84 +++---------- macros/edr/materializations/test/test.sql | 112 ++++-------------- 4 files changed, 64 insertions(+), 200 deletions(-) diff --git a/integration_tests/tests/dbt_project.py b/integration_tests/tests/dbt_project.py index a871f0f53..6aea795b8 100644 --- a/integration_tests/tests/dbt_project.py +++ b/integration_tests/tests/dbt_project.py @@ -149,6 +149,7 @@ def test( test_vars: Optional[dict] = None, elementary_enabled: bool = True, model_config: Optional[Dict[str, Any]] = None, + column_config: Optional[Dict[str, Any]] = None, *, multiple_results: bool = False, ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: @@ -173,9 +174,10 @@ def test( if test_column is None: table_yaml["tests"] = [{dbt_test_name: test_args}] else: - table_yaml["columns"] = [ - {"name": test_column, "tests": [{dbt_test_name: test_args}]} - ] + column_def = {"name": test_column, "tests": [{dbt_test_name: test_args}]} + if column_config: + column_def["config"] = column_config + table_yaml["columns"] = [column_def] temp_table_ctx: Any if as_model: diff --git a/integration_tests/tests/test_column_pii_sampling.py b/integration_tests/tests/test_column_pii_sampling.py index 15cb31afb..f82b2e51c 100644 --- a/integration_tests/tests/test_column_pii_sampling.py +++ b/integration_tests/tests/test_column_pii_sampling.py @@ -53,9 +53,8 @@ def test_column_pii_sampling_enabled(test_id: str, dbt_project: DbtProject): for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == 1 - assert samples[0]["n_records"] == 10 - assert len(samples[0]) == 1 + # With new logic: sampling is disabled entirely when PII is detected + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -196,11 +195,8 @@ def test_unique_test_column_mapping(test_id: str, dbt_project: DbtProject): for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # Should only contain n_records, not unique_field (which contains PII) - assert len(samples) == 1 - assert "n_records" in samples[0] - assert "unique_field" not in samples[0] - assert len(samples[0]) == 1 + # With new logic: sampling is disabled entirely when PII is detected + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -231,11 +227,8 @@ def test_accepted_values_test_column_mapping(test_id: str, dbt_project: DbtProje for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # Should only contain n_records, not value (which contains PII) - assert len(samples) == 1 - assert "n_records" in samples[0] - assert "value" not in samples[0] - assert len(samples[0]) == 1 + # With new logic: sampling is disabled entirely when PII is detected + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -266,12 +259,8 @@ def test_not_null_test_column_mapping(test_id: str, dbt_project: DbtProject): for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # Should only contain _no_non_excluded_columns when all columns are PII - assert len(samples) == TEST_SAMPLE_ROW_COUNT - for sample in samples: - assert "_no_non_excluded_columns" in sample - assert SENSITIVE_COLUMN not in sample - assert SAFE_COLUMN not in sample + # With new logic: sampling is disabled entirely when PII is detected + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -305,12 +294,8 @@ def test_multiple_pii_columns_mapping(test_id: str, dbt_project: DbtProject): for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # Should only contain n_records, not unique_field or phone (which contain PII) - assert len(samples) == 1 - assert "n_records" in samples[0] - assert "unique_field" not in samples[0] - assert "phone" not in samples[0] - assert len(samples[0]) == 1 + # With new logic: sampling is disabled entirely when PII is detected + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -342,10 +327,8 @@ def test_custom_sql_test_with_pii_column_simple(test_id: str, dbt_project: DbtPr for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == 1 - assert samples[0]["n_records"] == 10 - # Should only contain n_records, not the actual PII data - assert len(samples[0]) == 1 + # With new logic: sampling is disabled entirely when PII is detected + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -380,10 +363,8 @@ def test_custom_sql_test_with_pii_column_complex_aliasing( for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == 1 - assert samples[0]["n_records"] == 10 - # Should only contain n_records, not the actual PII data - assert len(samples[0]) == 1 + # With new logic: sampling is disabled entirely when PII is detected + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -422,10 +403,8 @@ def test_custom_sql_test_with_multiple_pii_columns( for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == 1 - assert samples[0]["n_records"] == 10 - # Should only contain n_records, not the actual PII data - assert len(samples[0]) == 1 + # With new logic: sampling is disabled entirely when PII is detected + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -451,11 +430,4 @@ def test_custom_sql_test_with_subquery_and_pii(test_id: str, dbt_project: DbtPro }, ) assert test_result["status"] == "pass" - # For passing tests, we don't expect samples to be generated - # The test passes, so no failed rows to sample - # This is expected behavior for passing tests - - -# Removed complex custom SQL tests that don't work with this framework -# The simplified column mapping logic works with standard dbt test types diff --git a/integration_tests/tests/test_disable_samples_config.py b/integration_tests/tests/test_disable_samples_config.py index 70eb61e32..849672a98 100644 --- a/integration_tests/tests/test_disable_samples_config.py +++ b/integration_tests/tests/test_disable_samples_config.py @@ -8,14 +8,14 @@ SAMPLES_QUERY = """ with latest_elementary_test_result as ( select id - from {{ ref("elementary_test_results") }} + from {{{{ ref("elementary_test_results") }}}} where lower(table_name) = lower('{test_id}') order by created_at desc limit 1 ) select result_row - from {{ ref("test_result_rows") }} + from {{{{ ref("test_result_rows") }}}} where elementary_test_results_id in (select * from latest_elementary_test_result) """ @@ -27,19 +27,12 @@ def test_disable_samples_config_prevents_sampling( null_count = 20 data = [{COLUMN_NAME: None} for _ in range(null_count)] - columns = [ - { - "name": COLUMN_NAME, - "config": {"disable_test_samples": True}, - "tests": [{"not_null": {}}], - } - ] - test_result = dbt_project.test( test_id, "not_null", - columns=columns, + dict(column_name=COLUMN_NAME, meta={"disable_test_samples": True}), data=data, + as_model=True, test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": 5, @@ -51,11 +44,7 @@ def test_disable_samples_config_prevents_sampling( json.loads(row["result_row"]) for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == 5 - for sample in samples: - assert "_no_non_excluded_columns" in sample - assert sample["_no_non_excluded_columns"] == 1 - assert COLUMN_NAME not in sample + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -63,19 +52,12 @@ def test_disable_samples_false_allows_sampling(test_id: str, dbt_project: DbtPro null_count = 20 data = [{COLUMN_NAME: None} for _ in range(null_count)] - columns = [ - { - "name": COLUMN_NAME, - "config": {"disable_test_samples": False}, - "tests": [{"not_null": {}}], - } - ] - test_result = dbt_project.test( test_id, "not_null", - columns=columns, + dict(column_name=COLUMN_NAME, meta={"disable_test_samples": False}), data=data, + as_model=True, test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": 5, @@ -100,19 +82,15 @@ def test_disable_samples_config_overrides_pii_tags( null_count = 20 data = [{COLUMN_NAME: None} for _ in range(null_count)] - columns = [ - { - "name": COLUMN_NAME, - "config": {"disable_test_samples": True, "tags": ["pii"]}, - "tests": [{"not_null": {}}], - } - ] - test_result = dbt_project.test( test_id, "not_null", - columns=columns, + dict( + column_name=COLUMN_NAME, + meta={"disable_test_samples": True, "tags": ["pii"]}, + ), data=data, + as_model=True, test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": 5, @@ -125,11 +103,7 @@ def test_disable_samples_config_overrides_pii_tags( json.loads(row["result_row"]) for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == 5 - for sample in samples: - assert "_no_non_excluded_columns" in sample - assert sample["_no_non_excluded_columns"] == 1 - assert COLUMN_NAME not in sample + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -139,17 +113,12 @@ def test_disable_samples_and_pii_interaction(test_id: str, dbt_project: DbtProje {"col1": None, "col2": f"pii{i}", "col3": f"disabled{i}"} for i in range(10) ] - columns = [ - {"name": "col1", "tests": [{"not_null": {}}]}, - {"name": "col2", "config": {"tags": ["pii"]}}, - {"name": "col3", "config": {"disable_test_samples": True}}, - ] - test_result = dbt_project.test( test_id, "not_null", - columns=columns, + dict(column_name="col1", meta={"disable_test_samples": True}), data=data, + as_model=True, test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": 5, @@ -164,11 +133,7 @@ def test_disable_samples_and_pii_interaction(test_id: str, dbt_project: DbtProje for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == 5 - for sample in samples: - assert "col1" in sample - assert "col2" not in sample - assert "col3" not in sample + assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) @@ -176,20 +141,12 @@ def test_disable_samples_with_multiple_columns(test_id: str, dbt_project: DbtPro """Test that disable_test_samples excludes only the disabled column""" data = [{"col1": None, "col2": f"value{i}"} for i in range(10)] - columns = [ - { - "name": "col1", - "config": {"disable_test_samples": True}, - "tests": [{"not_null": {}}], - }, - {"name": "col2"}, - ] - test_result = dbt_project.test( test_id, "not_null", - columns=columns, + dict(column_name="col1", meta={"disable_test_samples": True}), data=data, + as_model=True, test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": 5, @@ -202,7 +159,4 @@ def test_disable_samples_with_multiple_columns(test_id: str, dbt_project: DbtPro for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - assert len(samples) == 5 - for sample in samples: - assert "col1" not in sample - assert "col2" in sample + assert len(samples) == 0 diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql index 4c0ca60cb..1affb4828 100644 --- a/macros/edr/materializations/test/test.sql +++ b/macros/edr/materializations/test/test.sql @@ -61,6 +61,8 @@ {% set sample_limit = 0 %} {% elif elementary.is_pii_table(flattened_test) %} {% set sample_limit = 0 %} + {% elif elementary.should_disable_sampling_for_pii(flattened_test) %} + {% set sample_limit = 0 %} {% endif %} {% set result_rows = elementary.query_test_result_rows(sample_limit=sample_limit, @@ -126,27 +128,11 @@ {% do return([]) %} {% endif %} - {% set columns_to_exclude = elementary.get_columns_to_exclude_from_sampling_intelligent(flattened_test) %} - - {% set select_clause = "*" %} - {% if columns_to_exclude %} - {# Use dbt's built-in parser to get actual column names #} - {% set test_relation = elementary.create_test_result_relation(flattened_test) %} - {% set result_columns = adapter.get_columns_in_relation(test_relation) %} - {% set all_columns = result_columns | map(attribute='name') | list %} - {% set safe_columns = all_columns | reject("in", columns_to_exclude) | list %} - {% if safe_columns %} - {% set select_clause = safe_columns | join(", ") %} - {% else %} - {% set select_clause = "1 as _no_non_excluded_columns" %} - {% endif %} - {% endif %} - {% set query %} with test_results as ( {{ sql }} ) - select {{ select_clause }} from test_results {% if sample_limit is not none %} limit {{ sample_limit }} {% endif %} + select * from test_results {% if sample_limit is not none %} limit {{ sample_limit }} {% endif %} {% endset %} {% do return(elementary.agate_to_dicts(elementary.run_query(query))) %} {% endmacro %} @@ -173,87 +159,37 @@ {% do return(columns_to_exclude) %} {% endmacro %} -{# Removed complex column mapping macros - replaced with simpler dbt parser-based approach #} - -{% macro get_columns_to_exclude_from_sampling_intelligent(flattened_test) %} - {# This macro uses dbt's built-in parser to get actual test result columns and handle exclusions #} - {% set original_columns_to_exclude = elementary.get_columns_to_exclude_from_sampling(flattened_test) %} +{# Simple logic: if test query contains PII columns or *, disable sampling entirely #} +{% macro should_disable_sampling_for_pii(flattened_test) %} + {% if not elementary.get_config_var('disable_samples_on_pii_tags') %} + {% do return(false) %} + {% endif %} - {% if not original_columns_to_exclude %} - {% do return([]) %} + {% set pii_columns = elementary.get_pii_columns_from_parent_model(flattened_test) %} + {% if not pii_columns %} + {% do return(false) %} {% endif %} - {# Get the actual test result columns using dbt's built-in parser #} - {% set test_relation = elementary.create_test_result_relation(flattened_test) %} - {% set result_columns = adapter.get_columns_in_relation(test_relation) %} - {% set result_column_names = result_columns | map(attribute='name') | list %} + {# Get the compiled test query #} + {% set test_query = elementary.get_compiled_code(flattened_test) %} + {% set test_query_lower = test_query.lower() %} - {# Map original columns to actual result columns using a simple matching strategy #} - {% set mapped_columns_to_exclude = [] %} + {# Check if query uses * (select all columns) #} + {% if '*' in test_query_lower %} + {% do return(true) %} + {% endif %} - {% for original_column in original_columns_to_exclude %} - {% set found_match = false %} - - {# First, try exact match (case-insensitive) #} - {% for result_column in result_column_names %} - {% if result_column.lower() == original_column.lower() %} - {% do mapped_columns_to_exclude.append(result_column) %} - {% set found_match = true %} - {% break %} - {% endif %} - {% endfor %} - - {# If no exact match, try common dbt test patterns #} - {% if not found_match %} - {% set test_name = elementary.insensitive_get_dict_value(flattened_test, 'name') %} - {% set common_mappings = { - 'unique': 'unique_field', - 'accepted_values': 'value', - 'relationships': 'from_field', - 'not_null': original_column - } %} - - {% if test_name in common_mappings %} - {% set expected_column = common_mappings[test_name] %} - {% if expected_column in result_column_names %} - {% do mapped_columns_to_exclude.append(expected_column) %} - {% set found_match = true %} - {% endif %} - {% endif %} - {% endif %} - - {# If still no match, try to find the most likely column containing the original data #} - {% if not found_match %} - {# Look for columns that might contain the original column's data #} - {% set non_metadata_columns = result_column_names | reject("in", ['n_records', 'count', 'num_records', 'row_count', 'test_result']) | list %} - {% if non_metadata_columns | length == 1 %} - {% do mapped_columns_to_exclude.append(non_metadata_columns[0]) %} - {% elif original_column in result_column_names %} - {# If the original column name exists in results, use it #} - {% do mapped_columns_to_exclude.append(original_column) %} - {% endif %} + {# Check if any PII column appears in the test query #} + {% for pii_column in pii_columns %} + {% if pii_column.lower() in test_query_lower %} + {% do return(true) %} {% endif %} {% endfor %} - {% do return(mapped_columns_to_exclude) %} + {% do return(false) %} {% endmacro %} -{% macro create_test_result_relation(flattened_test) %} - {# Create a temporary relation for the test results to use dbt's parser #} - {% set database, schema = elementary.get_package_database_and_schema() %} - {% set test_id = "tmp_" ~ elementary.get_node_execution_id(flattened_test)[:20] %} - - {# Create a temporary table with the test results #} - {% set temp_sql %} - with test_results as ( - {{ sql }} - ) - select * from test_results limit 0 - {% endset %} - - {% set relation = elementary.create_temp_table(database, schema, test_id, temp_sql) %} - {% do return(relation) %} -{% endmacro %} +{# Removed complex column mapping macros - replaced with simpler approach #} {% macro is_sampling_disabled_for_column(flattened_test) %} {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} From 057a2cc67521935d3b2f8130834d617390fad001 Mon Sep 17 00:00:00 2001 From: arbiv Date: Sun, 3 Aug 2025 13:16:09 +0300 Subject: [PATCH 10/15] improved tests and code readability --- .../tests/test_column_pii_sampling.py | 45 +++++++------------ macros/edr/materializations/test/test.sql | 4 +- .../edr/system/system_utils/is_pii_column.sql | 8 +++- 3 files changed, 25 insertions(+), 32 deletions(-) diff --git a/integration_tests/tests/test_column_pii_sampling.py b/integration_tests/tests/test_column_pii_sampling.py index f82b2e51c..71995d87e 100644 --- a/integration_tests/tests/test_column_pii_sampling.py +++ b/integration_tests/tests/test_column_pii_sampling.py @@ -53,7 +53,6 @@ def test_column_pii_sampling_enabled(test_id: str, dbt_project: DbtProject): for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # With new logic: sampling is disabled entirely when PII is detected assert len(samples) == 0 @@ -87,10 +86,10 @@ def test_column_pii_sampling_disabled(test_id: str, dbt_project: DbtProject): # sample should be {'unique_field': 'user@example.com', 'n_records': 10} assert len(samples) == 1 - for sample in samples: - # The original column name is mapped to 'unique_field' in unique tests - assert "unique_field" in sample - assert "n_records" in sample + assert "unique_field" in samples[0] + assert samples[0]["unique_field"] == "user@example.com" + assert "n_records" in samples[0] + assert samples[0]["n_records"] == 10 @pytest.mark.skip_targets(["clickhouse"]) @@ -126,10 +125,10 @@ def test_column_pii_sampling_tags_exist_but_flag_disabled( # When flag is disabled, we get the full sample (not limited by PII filtering) assert len(samples) == 1 - for sample in samples: - # The original column name is mapped to 'unique_field' in unique tests - assert "unique_field" in sample - assert "n_records" in sample + assert "unique_field" in samples[0] + assert samples[0]["unique_field"] == 1 + assert "n_records" in samples[0] + assert samples[0]["n_records"] == 10 @pytest.mark.skip_targets(["clickhouse"]) @@ -195,7 +194,6 @@ def test_unique_test_column_mapping(test_id: str, dbt_project: DbtProject): for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # With new logic: sampling is disabled entirely when PII is detected assert len(samples) == 0 @@ -227,7 +225,6 @@ def test_accepted_values_test_column_mapping(test_id: str, dbt_project: DbtProje for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # With new logic: sampling is disabled entirely when PII is detected assert len(samples) == 0 @@ -259,7 +256,6 @@ def test_not_null_test_column_mapping(test_id: str, dbt_project: DbtProject): for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # With new logic: sampling is disabled entirely when PII is detected assert len(samples) == 0 @@ -294,7 +290,6 @@ def test_multiple_pii_columns_mapping(test_id: str, dbt_project: DbtProject): for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # With new logic: sampling is disabled entirely when PII is detected assert len(samples) == 0 @@ -321,21 +316,16 @@ def test_custom_sql_test_with_pii_column_simple(test_id: str, dbt_project: DbtPr ) assert test_result["status"] == "fail" - # Verify that PII columns are excluded from sampling samples = [ json.loads(row["result_row"]) for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # With new logic: sampling is disabled entirely when PII is detected assert len(samples) == 0 @pytest.mark.skip_targets(["clickhouse"]) -def test_custom_sql_test_with_pii_column_complex_aliasing( - test_id: str, dbt_project: DbtProject -): - """Test that custom SQL tests with complex column aliasing and PII columns work correctly""" +def test_meta_tags_and_accepted_values(test_id: str, dbt_project: DbtProject): data = [{SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: i} for i in range(10)] # Test with accepted_values to simulate complex column mapping @@ -345,7 +335,7 @@ def test_custom_sql_test_with_pii_column_complex_aliasing( test_args=dict(column_name=SENSITIVE_COLUMN, values=["invalid@example.com"]), data=data, columns=[ - {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SENSITIVE_COLUMN, "meta": {"tags": ["pii"]}}, {"name": SAFE_COLUMN}, ], test_vars={ @@ -357,13 +347,11 @@ def test_custom_sql_test_with_pii_column_complex_aliasing( ) assert test_result["status"] == "fail" - # Verify that PII columns are excluded from sampling samples = [ json.loads(row["result_row"]) for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - # With new logic: sampling is disabled entirely when PII is detected assert len(samples) == 0 @@ -384,8 +372,8 @@ def test_custom_sql_test_with_multiple_pii_columns( test_args=dict(column_name=SENSITIVE_COLUMN), data=data, columns=[ - {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, - {"name": "phone", "config": {"tags": ["pii"]}}, + {"name": SENSITIVE_COLUMN, "tags": ["pii"]}, + {"name": "phone", "tags": ["pii"]}, {"name": SAFE_COLUMN}, ], test_vars={ @@ -397,13 +385,10 @@ def test_custom_sql_test_with_multiple_pii_columns( ) assert test_result["status"] == "fail" - # Verify that PII columns are excluded from sampling samples = [ json.loads(row["result_row"]) for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) ] - - # With new logic: sampling is disabled entirely when PII is detected assert len(samples) == 0 @@ -430,4 +415,8 @@ def test_custom_sql_test_with_subquery_and_pii(test_id: str, dbt_project: DbtPro }, ) assert test_result["status"] == "pass" - # For passing tests, we don't expect samples to be generated + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + assert len(samples) == 0 diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql index 1affb4828..3a3d4fc4e 100644 --- a/macros/edr/materializations/test/test.sql +++ b/macros/edr/materializations/test/test.sql @@ -159,7 +159,7 @@ {% do return(columns_to_exclude) %} {% endmacro %} -{# Simple logic: if test query contains PII columns or *, disable sampling entirely #} +{# if test query contains PII columns or *, disable sampling entirely #} {% macro should_disable_sampling_for_pii(flattened_test) %} {% if not elementary.get_config_var('disable_samples_on_pii_tags') %} {% do return(false) %} @@ -189,8 +189,6 @@ {% do return(false) %} {% endmacro %} -{# Removed complex column mapping macros - replaced with simpler approach #} - {% macro is_sampling_disabled_for_column(flattened_test) %} {% set test_column_name = elementary.insensitive_get_dict_value(flattened_test, 'test_column_name') %} {% set parent_model_unique_id = elementary.insensitive_get_dict_value(flattened_test, 'parent_model_unique_id') %} diff --git a/macros/edr/system/system_utils/is_pii_column.sql b/macros/edr/system/system_utils/is_pii_column.sql index ebea82309..036e3072d 100644 --- a/macros/edr/system/system_utils/is_pii_column.sql +++ b/macros/edr/system/system_utils/is_pii_column.sql @@ -21,11 +21,17 @@ {% set pii_tags = (raw_pii_tags if raw_pii_tags is iterable else [raw_pii_tags]) | map('lower') | list %} {% for column_node in column_nodes.values() %} + {# column -> tags #} + {% set column_tags = column_node.get('tags', []) %} + + {# column -> config -> tags #} {% set config_dict = column_node.get('config', {}) %} {% set config_tags = config_dict.get('tags', []) %} - {% set column_tags = column_node.get('tags', []) %} + + {# column -> meta -> tags #} {% set meta_dict = column_node.get('meta', {}) %} {% set meta_tags = meta_dict.get('tags', []) %} + {% set all_column_tags = config_tags + column_tags + meta_tags %} {% set all_column_tags_lower = all_column_tags | map('lower') | list %} From 89e49deb22a986789579e0dd432c1783b8c893a0 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 3 Aug 2025 15:27:46 +0000 Subject: [PATCH 11/15] Address Itamar's code review comments - Rename is_pii_column.sql to get_pii_columns_from_parent_model.sql to match macro name - Extract get_column_tags helper macro for better code organization - Add model-level PII tag support - if model has PII tags, all columns are considered PII - Add clarifying comment about intentional over-censoring with * queries in CTEs - Keep flattened_test parameter in query_test_result_rows for future functionality Fixes all outstanding code review feedback from haritamar on PR #833 Co-Authored-By: Yosef Arbiv --- macros/edr/materializations/test/test.sql | 2 + .../get_pii_columns_from_parent_model.sql | 71 +++++++++++++++++++ .../edr/system/system_utils/is_pii_column.sql | 47 ------------ 3 files changed, 73 insertions(+), 47 deletions(-) create mode 100644 macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql delete mode 100644 macros/edr/system/system_utils/is_pii_column.sql diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql index 3a3d4fc4e..82f0c7110 100644 --- a/macros/edr/materializations/test/test.sql +++ b/macros/edr/materializations/test/test.sql @@ -175,6 +175,8 @@ {% set test_query_lower = test_query.lower() %} {# Check if query uses * (select all columns) #} + {# Note: This is intentionally conservative and may over-censor in cases like + "SELECT * FROM other_table" in CTEs, but it's better to be safe with PII data #} {% if '*' in test_query_lower %} {% do return(true) %} {% endif %} diff --git a/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql b/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql new file mode 100644 index 000000000..6f0ee353d --- /dev/null +++ b/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql @@ -0,0 +1,71 @@ +{% macro get_column_tags(column_node) %} + {# column -> tags #} + {% set column_tags = column_node.get('tags', []) %} + + {# column -> config -> tags #} + {% set config_dict = column_node.get('config', {}) %} + {% set config_tags = config_dict.get('tags', []) %} + + {# column -> meta -> tags #} + {% set meta_dict = column_node.get('meta', {}) %} + {% set meta_tags = meta_dict.get('tags', []) %} + + {% set all_column_tags = config_tags + column_tags + meta_tags %} + {% do return(all_column_tags | map('lower') | list) %} +{% endmacro %} + +{% macro get_pii_columns_from_parent_model(flattened_test) %} + {% set pii_columns = [] %} + + {% if not elementary.get_config_var('disable_samples_on_pii_tags') %} + {% do return(pii_columns) %} + {% endif %} + + {% set parent_model_unique_id = elementary.insensitive_get_dict_value(flattened_test, 'parent_model_unique_id') %} + {% set parent_model = elementary.get_node(parent_model_unique_id) %} + + {% if not parent_model %} + {% do return(pii_columns) %} + {% endif %} + + {% set raw_pii_tags = elementary.get_config_var('pii_tags') %} + {% set pii_tags = (raw_pii_tags if raw_pii_tags is iterable else [raw_pii_tags]) | map('lower') | list %} + + {# Check if the model itself has PII tags - if so, all columns are considered PII #} + {% set model_tags = parent_model.get('tags', []) %} + {% set model_config_tags = parent_model.get('config', {}).get('tags', []) %} + {% set model_meta_tags = parent_model.get('meta', {}).get('tags', []) %} + {% set all_model_tags = (model_tags + model_config_tags + model_meta_tags) | map('lower') | list %} + + {% for pii_tag in pii_tags %} + {% if pii_tag in all_model_tags %} + {# Model has PII tag, return all column names #} + {% set column_nodes = parent_model.get("columns") %} + {% if column_nodes %} + {% for column_node in column_nodes.values() %} + {% do pii_columns.append(column_node.get('name')) %} + {% endfor %} + {% endif %} + {% do return(pii_columns) %} + {% endif %} + {% endfor %} + + {# Model doesn't have PII tags, check individual columns #} + {% set column_nodes = parent_model.get("columns") %} + {% if not column_nodes %} + {% do return(pii_columns) %} + {% endif %} + + {% for column_node in column_nodes.values() %} + {% set all_column_tags_lower = elementary.get_column_tags(column_node) %} + + {% for pii_tag in pii_tags %} + {% if pii_tag in all_column_tags_lower %} + {% do pii_columns.append(column_node.get('name')) %} + {% break %} + {% endif %} + {% endfor %} + {% endfor %} + + {% do return(pii_columns) %} +{% endmacro %} diff --git a/macros/edr/system/system_utils/is_pii_column.sql b/macros/edr/system/system_utils/is_pii_column.sql deleted file mode 100644 index 036e3072d..000000000 --- a/macros/edr/system/system_utils/is_pii_column.sql +++ /dev/null @@ -1,47 +0,0 @@ -{% macro get_pii_columns_from_parent_model(flattened_test) %} - {% set pii_columns = [] %} - - {% if not elementary.get_config_var('disable_samples_on_pii_tags') %} - {% do return(pii_columns) %} - {% endif %} - - {% set parent_model_unique_id = elementary.insensitive_get_dict_value(flattened_test, 'parent_model_unique_id') %} - {% set parent_model = elementary.get_node(parent_model_unique_id) %} - - {% if not parent_model %} - {% do return(pii_columns) %} - {% endif %} - - {% set column_nodes = parent_model.get("columns") %} - {% if not column_nodes %} - {% do return(pii_columns) %} - {% endif %} - - {% set raw_pii_tags = elementary.get_config_var('pii_tags') %} - {% set pii_tags = (raw_pii_tags if raw_pii_tags is iterable else [raw_pii_tags]) | map('lower') | list %} - - {% for column_node in column_nodes.values() %} - {# column -> tags #} - {% set column_tags = column_node.get('tags', []) %} - - {# column -> config -> tags #} - {% set config_dict = column_node.get('config', {}) %} - {% set config_tags = config_dict.get('tags', []) %} - - {# column -> meta -> tags #} - {% set meta_dict = column_node.get('meta', {}) %} - {% set meta_tags = meta_dict.get('tags', []) %} - - {% set all_column_tags = config_tags + column_tags + meta_tags %} - {% set all_column_tags_lower = all_column_tags | map('lower') | list %} - - {% for pii_tag in pii_tags %} - {% if pii_tag in all_column_tags_lower %} - {% do pii_columns.append(column_node.get('name')) %} - {% break %} - {% endif %} - {% endfor %} - {% endfor %} - - {% do return(pii_columns) %} -{% endmacro %} From c5b9f4581c90d6b99635365415463e5abee0afb8 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 4 Aug 2025 04:51:57 +0000 Subject: [PATCH 12/15] Fix string/list handling in PII column detection macros - Fix get_column_tags to properly handle string tags vs lists - Fix pii_tags processing to handle scalar strings correctly - Apply consistent string/list normalization to model tags - Addresses CodeRabbit comments r2250021209 and r2250021214 The core issue was that Jinja2's 'is iterable' test returns True for strings, causing scalar string tags like 'pii' to be treated as character arrays ['p','i','i'] instead of single-element lists ['pii']. Fixed by using explicit string checking with 'src is string' to properly distinguish between strings and actual lists/arrays. Co-Authored-By: Yosef Arbiv --- .../get_pii_columns_from_parent_model.sql | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql b/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql index 6f0ee353d..264566d62 100644 --- a/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql +++ b/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql @@ -1,17 +1,17 @@ {% macro get_column_tags(column_node) %} - {# column -> tags #} - {% set column_tags = column_node.get('tags', []) %} + {% set _tags_sources = [ + column_node.get('tags', []), + column_node.get('config', {}).get('tags', []), + column_node.get('meta', {}).get('tags', []), + ] %} - {# column -> config -> tags #} - {% set config_dict = column_node.get('config', {}) %} - {% set config_tags = config_dict.get('tags', []) %} - - {# column -> meta -> tags #} - {% set meta_dict = column_node.get('meta', {}) %} - {% set meta_tags = meta_dict.get('tags', []) %} + {% set all_column_tags = [] %} + {% for src in _tags_sources %} + {% set tags_list = src if src is iterable and not (src is string) else [src] %} + {% do all_column_tags.extend(tags_list) %} + {% endfor %} - {% set all_column_tags = config_tags + column_tags + meta_tags %} - {% do return(all_column_tags | map('lower') | list) %} + {% do return(all_column_tags | map('lower') | unique | list) %} {% endmacro %} {% macro get_pii_columns_from_parent_model(flattened_test) %} @@ -29,13 +29,24 @@ {% endif %} {% set raw_pii_tags = elementary.get_config_var('pii_tags') %} - {% set pii_tags = (raw_pii_tags if raw_pii_tags is iterable else [raw_pii_tags]) | map('lower') | list %} + {% if raw_pii_tags is string %} + {% set pii_tags = [raw_pii_tags|lower] %} + {% else %} + {% set pii_tags = (raw_pii_tags or []) | map('lower') | list %} + {% endif %} {# Check if the model itself has PII tags - if so, all columns are considered PII #} - {% set model_tags = parent_model.get('tags', []) %} - {% set model_config_tags = parent_model.get('config', {}).get('tags', []) %} - {% set model_meta_tags = parent_model.get('meta', {}).get('tags', []) %} - {% set all_model_tags = (model_tags + model_config_tags + model_meta_tags) | map('lower') | list %} + {% set _model_tags_sources = [ + parent_model.get('tags', []), + parent_model.get('config', {}).get('tags', []), + parent_model.get('meta', {}).get('tags', []) + ] %} + {% set all_model_tags = [] %} + {% for src in _model_tags_sources %} + {% set tags_list = src if src is iterable and not (src is string) else [src] %} + {% do all_model_tags.extend(tags_list) %} + {% endfor %} + {% set all_model_tags = all_model_tags | map('lower') | unique | list %} {% for pii_tag in pii_tags %} {% if pii_tag in all_model_tags %} From 8cc405014b16e295de163d46fbb3fd8fca913da8 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 4 Aug 2025 04:54:17 +0000 Subject: [PATCH 13/15] Remove redundant model PII tag checking logic - Remove model-level PII tag checking from get_pii_columns_from_parent_model - This logic is handled elsewhere as noted by Yosef - Simplifies macro to focus only on column-level PII detection Co-Authored-By: Yosef Arbiv --- .../get_pii_columns_from_parent_model.sql | 28 +------------------ 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql b/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql index 264566d62..b60d374c1 100644 --- a/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql +++ b/macros/edr/system/system_utils/get_pii_columns_from_parent_model.sql @@ -35,33 +35,7 @@ {% set pii_tags = (raw_pii_tags or []) | map('lower') | list %} {% endif %} - {# Check if the model itself has PII tags - if so, all columns are considered PII #} - {% set _model_tags_sources = [ - parent_model.get('tags', []), - parent_model.get('config', {}).get('tags', []), - parent_model.get('meta', {}).get('tags', []) - ] %} - {% set all_model_tags = [] %} - {% for src in _model_tags_sources %} - {% set tags_list = src if src is iterable and not (src is string) else [src] %} - {% do all_model_tags.extend(tags_list) %} - {% endfor %} - {% set all_model_tags = all_model_tags | map('lower') | unique | list %} - - {% for pii_tag in pii_tags %} - {% if pii_tag in all_model_tags %} - {# Model has PII tag, return all column names #} - {% set column_nodes = parent_model.get("columns") %} - {% if column_nodes %} - {% for column_node in column_nodes.values() %} - {% do pii_columns.append(column_node.get('name')) %} - {% endfor %} - {% endif %} - {% do return(pii_columns) %} - {% endif %} - {% endfor %} - - {# Model doesn't have PII tags, check individual columns #} + {# Check individual columns for PII tags #} {% set column_nodes = parent_model.get("columns") %} {% if not column_nodes %} {% do return(pii_columns) %} From 812d9c28d32bb26276cfecacf9e3457369ddab7e Mon Sep 17 00:00:00 2001 From: arbiv Date: Mon, 4 Aug 2025 08:19:17 +0300 Subject: [PATCH 14/15] improved tests --- .../tests/test_column_pii_sampling.py | 124 +++++++----------- 1 file changed, 46 insertions(+), 78 deletions(-) diff --git a/integration_tests/tests/test_column_pii_sampling.py b/integration_tests/tests/test_column_pii_sampling.py index 71995d87e..dff983da6 100644 --- a/integration_tests/tests/test_column_pii_sampling.py +++ b/integration_tests/tests/test_column_pii_sampling.py @@ -43,7 +43,6 @@ def test_column_pii_sampling_enabled(test_id: str, dbt_project: DbtProject): "enable_elementary_test_materialization": True, "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, "disable_samples_on_pii_tags": True, - "pii_tags": ["pii"], }, ) assert test_result["status"] == "fail" @@ -92,6 +91,43 @@ def test_column_pii_sampling_disabled(test_id: str, dbt_project: DbtProject): assert samples[0]["n_records"] == 10 +@pytest.mark.skip_targets(["clickhouse"]) +def test_column_pii_default_tag_override(test_id: str, dbt_project: DbtProject): + """Test that default PII tag can be overridden with a custom tag""" + data = [ + {SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: None} for i in range(10) + ] + + test_result = dbt_project.test( + test_id, + "unique", + test_args=dict(column_name=SENSITIVE_COLUMN), + data=data, + columns=[ + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + ], + test_vars={ + "enable_elementary_test_materialization": True, + "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, + "disable_samples_on_pii_tags": True, + "pii_tags": ["sensitive"], + }, + ) + assert test_result["status"] == "fail" + + samples = [ + json.loads(row["result_row"]) + for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) + ] + + # sample should be {'unique_field': 'user@example.com', 'n_records': 10} + assert len(samples) == 1 + assert "unique_field" in samples[0] + assert samples[0]["unique_field"] == "user@example.com" + assert "n_records" in samples[0] + assert samples[0]["n_records"] == 10 + + @pytest.mark.skip_targets(["clickhouse"]) def test_column_pii_sampling_tags_exist_but_flag_disabled( test_id: str, dbt_project: DbtProject @@ -105,7 +141,7 @@ def test_column_pii_sampling_tags_exist_but_flag_disabled( test_args=dict(column_name=SAFE_COLUMN), data=data, columns=[ - {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pIi"]}}, {"name": SAFE_COLUMN}, ], test_column=None, @@ -113,7 +149,6 @@ def test_column_pii_sampling_tags_exist_but_flag_disabled( "enable_elementary_test_materialization": True, "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, "disable_samples_on_pii_tags": False, # Flag is disabled - "pii_tags": ["pii"], }, ) assert test_result["status"] == "fail" @@ -167,7 +202,7 @@ def test_column_pii_sampling_all_columns_pii(test_id: str, dbt_project: DbtProje @pytest.mark.skip_targets(["clickhouse"]) -def test_unique_test_column_mapping(test_id: str, dbt_project: DbtProject): +def test_unique_test_custom_tag(test_id: str, dbt_project: DbtProject): """Test that column mapping correctly maps unique test columns""" data = [{SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: i} for i in range(10)] @@ -177,14 +212,14 @@ def test_unique_test_column_mapping(test_id: str, dbt_project: DbtProject): test_args=dict(column_name=SENSITIVE_COLUMN), data=data, columns=[ - {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SENSITIVE_COLUMN, "config": {"tags": ["custom_tag"]}}, {"name": SAFE_COLUMN}, ], test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, "disable_samples_on_pii_tags": True, - "pii_tags": ["pii"], + "pii_tags": ["custom_tag"], }, ) assert test_result["status"] == "fail" @@ -198,7 +233,7 @@ def test_unique_test_column_mapping(test_id: str, dbt_project: DbtProject): @pytest.mark.skip_targets(["clickhouse"]) -def test_accepted_values_test_column_mapping(test_id: str, dbt_project: DbtProject): +def test_accepted_values_multi_tags(test_id: str, dbt_project: DbtProject): """Test that column mapping correctly maps accepted_values test columns""" data = [{SENSITIVE_COLUMN: "invalid_value", SAFE_COLUMN: i} for i in range(10)] @@ -208,7 +243,7 @@ def test_accepted_values_test_column_mapping(test_id: str, dbt_project: DbtProje test_args=dict(column_name=SENSITIVE_COLUMN, values=["valid1", "valid2"]), data=data, columns=[ - {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii", "custom_tag"]}}, {"name": SAFE_COLUMN}, ], test_vars={ @@ -229,7 +264,7 @@ def test_accepted_values_test_column_mapping(test_id: str, dbt_project: DbtProje @pytest.mark.skip_targets(["clickhouse"]) -def test_not_null_test_column_mapping(test_id: str, dbt_project: DbtProject): +def test_not_null_test_multi_matched_tags(test_id: str, dbt_project: DbtProject): """Test that column mapping correctly handles not_null test columns""" data = [{SENSITIVE_COLUMN: None, SAFE_COLUMN: i} for i in range(10)] @@ -239,14 +274,14 @@ def test_not_null_test_column_mapping(test_id: str, dbt_project: DbtProject): test_args=dict(column_name=SENSITIVE_COLUMN), data=data, columns=[ - {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, + {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii", "sensitive"]}}, {"name": SAFE_COLUMN}, ], test_vars={ "enable_elementary_test_materialization": True, "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, "disable_samples_on_pii_tags": True, - "pii_tags": ["pii"], + "pii_tags": ["pii", "sensitive"], }, ) assert test_result["status"] == "fail" @@ -353,70 +388,3 @@ def test_meta_tags_and_accepted_values(test_id: str, dbt_project: DbtProject): ] assert len(samples) == 0 - - -@pytest.mark.skip_targets(["clickhouse"]) -def test_custom_sql_test_with_multiple_pii_columns( - test_id: str, dbt_project: DbtProject -): - """Test that custom SQL tests with multiple PII columns are handled correctly""" - data = [ - {SENSITIVE_COLUMN: "user@example.com", "phone": "123-456-7890", SAFE_COLUMN: i} - for i in range(10) - ] - - # Test with unique to simulate complex multi-column scenarios - test_result = dbt_project.test( - test_id, - "unique", - test_args=dict(column_name=SENSITIVE_COLUMN), - data=data, - columns=[ - {"name": SENSITIVE_COLUMN, "tags": ["pii"]}, - {"name": "phone", "tags": ["pii"]}, - {"name": SAFE_COLUMN}, - ], - test_vars={ - "enable_elementary_test_materialization": True, - "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, - "disable_samples_on_pii_tags": True, - "pii_tags": ["pii"], - }, - ) - assert test_result["status"] == "fail" - - samples = [ - json.loads(row["result_row"]) - for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) - ] - assert len(samples) == 0 - - -@pytest.mark.skip_targets(["clickhouse"]) -def test_custom_sql_test_with_subquery_and_pii(test_id: str, dbt_project: DbtProject): - """Test that custom SQL tests with subqueries and PII columns work correctly""" - data = [{SENSITIVE_COLUMN: "user@example.com", SAFE_COLUMN: i} for i in range(10)] - - # Test with not_null to simulate subquery-like scenarios - test_result = dbt_project.test( - test_id, - "not_null", - test_args=dict(column_name=SENSITIVE_COLUMN), - data=data, - columns=[ - {"name": SENSITIVE_COLUMN, "config": {"tags": ["pii"]}}, - {"name": SAFE_COLUMN}, - ], - test_vars={ - "enable_elementary_test_materialization": True, - "test_sample_row_count": TEST_SAMPLE_ROW_COUNT, - "disable_samples_on_pii_tags": True, - "pii_tags": ["pii"], - }, - ) - assert test_result["status"] == "pass" - samples = [ - json.loads(row["result_row"]) - for row in dbt_project.run_query(SAMPLES_QUERY.format(test_id=test_id)) - ] - assert len(samples) == 0 From 25ddd6780172681e39242a6daed9e1242ee0f840 Mon Sep 17 00:00:00 2001 From: arbiv Date: Mon, 4 Aug 2025 11:09:12 +0300 Subject: [PATCH 15/15] fixed cr comment --- macros/edr/materializations/test/test.sql | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/macros/edr/materializations/test/test.sql b/macros/edr/materializations/test/test.sql index 82f0c7110..9452978d4 100644 --- a/macros/edr/materializations/test/test.sql +++ b/macros/edr/materializations/test/test.sql @@ -65,9 +65,7 @@ {% set sample_limit = 0 %} {% endif %} - {% set result_rows = elementary.query_test_result_rows(sample_limit=sample_limit, - ignore_passed_tests=true, - flattened_test=flattened_test) %} + {% set result_rows = elementary.query_test_result_rows(sample_limit=sample_limit, ignore_passed_tests=true) %} {% set elementary_test_results_row = elementary.get_dbt_test_result_row(flattened_test, result_rows) %} {% do elementary.cache_elementary_test_results_rows([elementary_test_results_row]) %} {% do return(result) %} @@ -119,7 +117,7 @@ {% do return(new_sql) %} {% endmacro %} -{% macro query_test_result_rows(sample_limit=none, ignore_passed_tests=false, flattened_test=none) %} +{% macro query_test_result_rows(sample_limit=none, ignore_passed_tests=false) %} {% if sample_limit == 0 %} {# performance: no need to run a sql query that we know returns an empty list #} {% do return([]) %} {% endif %}