Skip to content

Commit b4de512

Browse files
Merge remote-tracking branch 'origin/master' into devin/CORE-93-1763039329
2 parents 36878be + cccc439 commit b4de512

11 files changed

Lines changed: 320 additions & 10 deletions

dbt_project.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name: "elementary"
22
version: "0.20.1"
33

4-
require-dbt-version: [">=1.0.0", "<2.0.0"]
4+
require-dbt-version: [">=1.0.0", "<3.0.0"]
55

66
config-version: 2
77
profile: "elementary"

integration_tests/requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,6 @@ pytest-xdist
33
pytest-parametrization
44
pytest-html
55
filelock
6-
urllib3==2.0.6
6+
# urllib3>=2.2.2 fixes CVE-2023-45803 and CVE-2024-37891
7+
# Upper bound <3.0.0 prevents breaking changes from future major versions
8+
urllib3>=2.2.2,<3.0.0

integration_tests/tests/test_column_anomalies.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,3 +476,105 @@ def test_anomalous_boolean_column_anomalies(test_id: str, dbt_project: DbtProjec
476476
"count_true",
477477
"count_false",
478478
}
479+
480+
481+
# Anomalies currently not supported on ClickHouse
482+
@pytest.mark.skip_targets(["clickhouse"])
def test_col_anom_excl_detect_train(test_id: str, dbt_project: DbtProject):
    """
    Test the exclude_detection_period_from_training flag for column anomalies.

    Scenario:
    - 30 days of normal data (days -37 to -8 relative to today) whose daily
      null count cycles through 8, 10, 12
    - 7 days of anomalous data (days -7 to -1) with 20 nulls per day, covering
      the detection period
    - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
    - With exclusion: anomaly excluded from training, test fails (detects anomaly)
    """
    utc_today = datetime.utcnow().date()

    # Every day carries the same 40 non-null values; only the null count varies.
    superheroes = ["Superman", "Batman", "Wonder Woman", "Flash"] * 10

    def rows_for_day(day, null_count):
        """Return one day's rows: the non-null values followed by `null_count` nulls."""
        timestamp = day.strftime(DATE_FORMAT)
        rows = [
            {TIMESTAMP_COLUMN: timestamp, "superhero": superhero}
            for superhero in superheroes
        ]
        rows.extend(
            {TIMESTAMP_COLUMN: timestamp, "superhero": None}
            for _ in range(null_count)
        )
        return rows

    # Generate 30 days of normal data with variance in null count (8, 10, 12 pattern)
    normal_pattern = [8, 10, 12]
    normal_data = []
    for i in range(30):
        normal_data.extend(
            rows_for_day(utc_today - timedelta(days=37 - i), normal_pattern[i % 3])
        )

    # Generate 7 days of anomalous data (20 nulls per day) - 100% increase from mean
    anomalous_data = []
    for i in range(7):
        anomalous_data.extend(rows_for_day(utc_today - timedelta(days=7 - i), 20))

    all_data = normal_data + anomalous_data

    def run_test(suffix, test_args):
        """Run the column anomalies test on the shared data set under a distinct test id."""
        return dbt_project.test(
            test_id + suffix,
            DBT_TEST_NAME,
            test_args,
            data=all_data,
            test_column="superhero",
            test_vars={"force_metrics_backfill": True},
        )

    # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
    test_args_without_exclusion = {
        "timestamp_column": TIMESTAMP_COLUMN,
        "column_anomalies": ["null_count"],
        "time_bucket": {"period": "day", "count": 1},
        "training_period": {"period": "day", "count": 30},
        "detection_period": {"period": "day", "count": 7},
        "min_training_set_size": 5,
        "anomaly_sensitivity": 5,
        "anomaly_direction": "spike",
        "exclude_detection_period_from_training": False,
    }

    test_result_without_exclusion = run_test("_f", test_args_without_exclusion)

    # This should PASS because the anomaly is included in training, making it part of the baseline
    assert test_result_without_exclusion["status"] == "pass", (
        "Expected PASS when exclude_detection_period_from_training=False "
        "(detection data included in training baseline)"
    )

    # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training)
    test_args_with_exclusion = {
        **test_args_without_exclusion,
        "exclude_detection_period_from_training": True,
    }

    test_result_with_exclusion = run_test("_t", test_args_with_exclusion)

    # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous
    assert test_result_with_exclusion["status"] == "fail", (
        "Expected FAIL when exclude_detection_period_from_training=True "
        "(detection data excluded from training baseline, anomaly detected)"
    )

integration_tests/tests/test_event_freshness_anomalies.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,105 @@ def test_slower_rate_event_freshness(test_id: str, dbt_project: DbtProject):
8888
test_vars={"custom_run_started_at": test_started_at.isoformat()},
8989
)
9090
assert result["status"] == "fail"
91+
92+
93+
# Anomalies currently not supported on ClickHouse
94+
@pytest.mark.skip_targets(["clickhouse"])
def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
    """
    Test the exclude_detection_period_from_training flag for event freshness anomalies.

    Scenario:
    - 7 days of normal data (2-8 minute lag between event and update) - training period
    - 7 days of anomalous data (5 hour lag) - detection period
    - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
    - With exclusion: anomaly excluded from training, test fails (detects anomaly)
    """
    utc_now = datetime.utcnow()
    # The simulated run starts at midnight of the following day.
    test_started_at = (utc_now + timedelta(days=1)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )

    def event_row(days_before_run, lag):
        """Return a row whose event occurs at noon `days_before_run` days before the run and is updated `lag` later."""
        event_date = test_started_at - timedelta(days=days_before_run)
        event_time = event_date.replace(hour=12, minute=0, second=0, microsecond=0)
        update_time = event_time + lag
        return {
            EVENT_TIMESTAMP_COLUMN: event_time.strftime(DATE_FORMAT),
            UPDATE_TIMESTAMP_COLUMN: update_time.strftime(DATE_FORMAT),
        }

    # Generate 7 days of normal data with varying lag (2-8 minutes) to ensure training_stddev > 0
    training_lags_minutes = [2, 3, 4, 5, 6, 7, 8]
    normal_data = [
        event_row(14 - i, timedelta(minutes=lag_minutes))
        for i, lag_minutes in enumerate(training_lags_minutes)
    ]

    # Generate 7 days of anomalous data with 5-hour lag (detection period)
    anomalous_data = [event_row(7 - i, timedelta(hours=5)) for i in range(7)]

    all_data = normal_data + anomalous_data

    def run_test(suffix, test_args):
        """Run the event freshness test on the shared data set under a distinct test id."""
        return dbt_project.test(
            test_id + suffix,
            TEST_NAME,
            test_args,
            data=all_data,
            test_vars={
                "custom_run_started_at": test_started_at.isoformat(),
                "force_metrics_backfill": True,
            },
        )

    # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
    test_args_without_exclusion = {
        "event_timestamp_column": EVENT_TIMESTAMP_COLUMN,
        "update_timestamp_column": UPDATE_TIMESTAMP_COLUMN,
        "days_back": 14,  # Scoring window: 14 days to include both training and detection
        "backfill_days": 7,  # Detection period: the 7 days preceding test_started_at
        "time_bucket": {
            "period": "day",
            "count": 1,
        },  # Daily buckets to avoid boundary issues
        "sensitivity": 3,
        "anomaly_direction": "spike",  # Explicit direction since we're testing increased lag
        "min_training_set_size": 5,  # Explicit minimum to avoid threshold issues
        # exclude_detection_period_from_training is deliberately left unset here,
        # so the test macro's default (false) applies
    }

    test_result_without_exclusion = run_test(
        "_without_exclusion", test_args_without_exclusion
    )

    # This should PASS because the anomaly is included in training, making it part of the baseline
    assert (
        test_result_without_exclusion["status"] == "pass"
    ), "Test should pass when anomaly is included in training"

    # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training)
    test_args_with_exclusion = {
        **test_args_without_exclusion,
        "exclude_detection_period_from_training": True,
    }

    test_result_with_exclusion = run_test("_with_exclusion", test_args_with_exclusion)

    # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous
    assert (
        test_result_with_exclusion["status"] == "fail"
    ), "Test should fail when anomaly is excluded from training"

integration_tests/tests/test_freshness_anomalies.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,3 +233,88 @@ def test_first_metric_null(test_id, dbt_project: DbtProject):
233233
materialization="incremental",
234234
)
235235
assert result["status"] == "pass"
236+
237+
238+
@pytest.mark.skip_targets(["clickhouse"])
def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
    """
    Verify exclude_detection_period_from_training for freshness anomalies.

    The table holds a stretch of frequent updates (one every 2 hours) followed by
    a stretch with a single update per day at noon. The same data is tested twice:
    - flag unset: the sparse stretch is part of the training baseline -> expected "pass"
    - flag set:   the sparse stretch is excluded from training        -> expected "fail"
    """
    today_midnight = datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0
    )

    # Normal stretch: an update every 2 hours, generated back from 8 days ago.
    frequent_updates = generate_dates(
        base_date=today_midnight - timedelta(days=8),
        step=timedelta(hours=2),
        days_back=7,
    )
    normal_data = [
        {TIMESTAMP_COLUMN: moment.strftime(DATE_FORMAT)} for moment in frequent_updates
    ]

    # Anomalous stretch: one update per day at noon, generated back from yesterday.
    yesterday_noon = (today_midnight - timedelta(days=1)).replace(hour=12, minute=0)
    sparse_updates = generate_dates(
        base_date=yesterday_noon,
        step=timedelta(hours=24),
        days_back=7,
    )
    anomalous_data = [
        {TIMESTAMP_COLUMN: moment.strftime(DATE_FORMAT)} for moment in sparse_updates
    ]

    all_data = normal_data + anomalous_data

    # The simulated run start is pinned to today's midnight for both runs.
    run_started_at = today_midnight

    def run_freshness_test(id_suffix, args):
        """Execute the freshness test against the combined data set."""
        return dbt_project.test(
            test_id + id_suffix,
            TEST_NAME,
            args,
            data=all_data,
            test_vars={"custom_run_started_at": run_started_at.isoformat()},
        )

    # Shared configuration; the first run leaves the exclusion flag unset.
    base_args = {
        "timestamp_column": TIMESTAMP_COLUMN,
        "training_period": {"period": "day", "count": 14},
        "detection_period": {"period": "day", "count": 7},
        "time_bucket": {"period": "day", "count": 1},
        "days_back": 20,
        "backfill_days": 0,
        "sensitivity": 3,
        "min_training_set_size": 3,
        "anomaly_direction": "spike",
        "ignore_small_changes": {
            "spike_failure_percent_threshold": 0,
            "drop_failure_percent_threshold": 0,
        },
    }

    # Run 1: no exclusion, so the anomalous pattern is part of the baseline.
    result_without_exclusion = run_freshness_test("_without_exclusion", base_args)
    assert (
        result_without_exclusion["status"] == "pass"
    ), "Test should pass when anomaly is included in training"

    # Run 2: identical configuration plus the exclusion flag.
    args_with_exclusion = dict(base_args)
    args_with_exclusion["exclude_detection_period_from_training"] = True

    result_with_exclusion = run_freshness_test("_with_exclusion", args_with_exclusion)
    assert (
        result_with_exclusion["status"] == "fail"
    ), "Test should fail when anomaly is excluded from training"

macros/edr/tests/test_event_freshness_anomalies.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{% test event_freshness_anomalies(model, event_timestamp_column, update_timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity, ignore_small_changes, detection_delay, anomaly_exclude_metrics, detection_period, training_period) %}
1+
{% test event_freshness_anomalies(model, event_timestamp_column, update_timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity, ignore_small_changes, detection_delay, anomaly_exclude_metrics, detection_period, training_period, exclude_detection_period_from_training=false) %}
22
{{ config(tags = ['elementary-tests']) }}
33
{% if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %}
44
{% set model_relation = elementary.get_model_relation_for_test(model, elementary.get_test_model()) %}
@@ -32,7 +32,8 @@
3232
detection_delay=detection_delay,
3333
anomaly_exclude_metrics=anomaly_exclude_metrics,
3434
detection_period=detection_period,
35-
training_period=training_period
35+
training_period=training_period,
36+
exclude_detection_period_from_training=exclude_detection_period_from_training
3637
)
3738
}}
3839
{% endtest %}

macros/edr/tests/test_freshness_anomalies.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{% test freshness_anomalies(model, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity, ignore_small_changes, detection_delay, anomaly_exclude_metrics, detection_period, training_period) %}
1+
{% test freshness_anomalies(model, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity, ignore_small_changes, detection_delay, anomaly_exclude_metrics, detection_period, training_period, exclude_detection_period_from_training=false) %}
22
{{ config(tags = ['elementary-tests']) }}
33
{{ elementary.test_table_anomalies(
44
model=model,
@@ -18,7 +18,8 @@
1818
detection_delay=detection_delay,
1919
anomaly_exclude_metrics=anomaly_exclude_metrics,
2020
detection_period=detection_period,
21-
training_period=training_period
21+
training_period=training_period,
22+
exclude_detection_period_from_training=exclude_detection_period_from_training
2223
)
2324
}}
2425
{% endtest %}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{# Entry point: dispatches to the adapter-specific `get_query_settings` implementation in the elementary namespace and returns its result. #}
{% macro get_query_settings() %}
    {{ return(adapter.dispatch("get_query_settings", "elementary")()) }}
{% endmacro %}
4+
5+
{# Default implementation: returns an empty string, so nothing is added to the query. #}
{% macro default__get_query_settings() %}
    {{ return("") }}
{% endmacro %}
8+
9+
{#
  ClickHouse implementation: returns whatever the adapter's `get_model_query_settings` yields for `model`.
  NOTE(review): `model` is not a macro argument; presumably it is resolved from the surrounding Jinja
  context - confirm it is defined in every context this macro is called from.
#}
{% macro clickhouse__get_query_settings() %}
    {{ return(adapter.get_model_query_settings(model)) }}
{% endmacro %}
12+

macros/utils/table_operations/delete_and_insert.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,14 +61,15 @@
6161
{% set delete_query %}
6262
alter table {{ relation }} delete where
6363
{{ delete_column_key }} is null
64-
or {{ delete_column_key }} in (select {{ delete_column_key }} from {{ delete_relation }});
64+
or {{ delete_column_key }} in (select {{ delete_column_key }} from {{ delete_relation }})
65+
{{ adapter.get_model_query_settings(model) }};
6566
{% endset %}
6667
{% do queries.append(delete_query) %}
6768
{% endif %}
6869

6970
{% if insert_relation %}
7071
{% set insert_query %}
71-
insert into {{ relation }} select * from {{ insert_relation }};
72+
insert into {{ relation }} {{ adapter.get_model_query_settings(model) }} select * from {{ insert_relation }};
7273
{% endset %}
7374
{% do queries.append(insert_query) %}
7475
{% endif %}

macros/utils/table_operations/insert_rows.sql

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@
6363
insert into {{ table_relation }}
6464
({%- for column in columns -%}
6565
{{- elementary.escape_reserved_keywords(column.name) -}} {{- "," if not loop.last else "" -}}
66-
{%- endfor -%}) values
66+
{%- endfor -%})
67+
{{ elementary.get_query_settings() }}
68+
values
6769
{% endset %}
6870
{% do elementary.end_duration_measure_context('base_query_calc') %}
6971

0 commit comments

Comments
 (0)