Skip to content

Commit 52961f0

Browse files
authored
Fix: Dont backfill seeds if the seed data didnt actually change (#5398)
1 parent df7d61b commit 52961f0

File tree

3 files changed

+179
-1
lines changed

3 files changed

+179
-1
lines changed

sqlmesh/core/plan/common.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ def should_force_rebuild(old: Snapshot, new: Snapshot) -> bool:
1616
if new.is_view and new.is_indirect_non_breaking and not new.is_forward_only:
1717
# View models always need to be rebuilt to reflect updated upstream dependencies
1818
return True
19-
if new.is_seed:
19+
if new.is_seed and not new.is_metadata:
2020
# Seed models always need to be rebuilt to reflect changes in the seed file
21+
# Unless only their metadata has been updated (eg description added) and the seed file has not been touched
2122
return True
2223
return is_breaking_kind_change(old, new)
2324

tests/core/test_integration.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10662,3 +10662,121 @@ def _run_restatement_plan(tmp_path: Path, config: Config, q: queue.Queue):
1066210662
assert len(model_a.intervals)
1066310663

1066410664
set_console(orig_console)
10665+
10666+
10667+
def test_seed_model_metadata_update_does_not_trigger_backfill(tmp_path: Path):
10668+
"""
10669+
Scenario:
10670+
- Create a seed model; perform initial population
10671+
- Modify the model with a metadata-only change and trigger a plan
10672+
10673+
Outcome:
10674+
- The seed model is modified (metadata-only) but this should NOT trigger backfill
10675+
- There should be no missing_intervals on the plan to backfill
10676+
"""
10677+
10678+
models_path = tmp_path / "models"
10679+
seeds_path = tmp_path / "seeds"
10680+
models_path.mkdir()
10681+
seeds_path.mkdir()
10682+
10683+
seed_model_path = models_path / "seed.sql"
10684+
seed_path = seeds_path / "seed_data.csv"
10685+
10686+
seed_path.write_text("\n".join(["id,name", "1,test"]))
10687+
10688+
seed_model_path.write_text("""
10689+
MODEL (
10690+
name test.source_data,
10691+
kind SEED (
10692+
path '../seeds/seed_data.csv'
10693+
)
10694+
);
10695+
""")
10696+
10697+
config = Config(
10698+
gateways={"": GatewayConfig(connection=DuckDBConnectionConfig())},
10699+
model_defaults=ModelDefaultsConfig(dialect="duckdb", start="2024-01-01"),
10700+
)
10701+
ctx = Context(paths=tmp_path, config=config)
10702+
10703+
plan = ctx.plan(auto_apply=True)
10704+
10705+
original_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"']
10706+
assert plan.directly_modified == {original_seed_snapshot.snapshot_id}
10707+
assert plan.metadata_updated == set()
10708+
assert plan.missing_intervals
10709+
10710+
# prove data loaded
10711+
assert ctx.engine_adapter.fetchall("select id, name from memory.test.source_data") == [
10712+
(1, "test")
10713+
]
10714+
10715+
# prove no diff
10716+
ctx.load()
10717+
plan = ctx.plan(auto_apply=True)
10718+
assert not plan.has_changes
10719+
assert not plan.missing_intervals
10720+
10721+
# make metadata-only change
10722+
seed_model_path.write_text("""
10723+
MODEL (
10724+
name test.source_data,
10725+
kind SEED (
10726+
path '../seeds/seed_data.csv'
10727+
),
10728+
description 'updated by test'
10729+
);
10730+
""")
10731+
10732+
ctx.load()
10733+
plan = ctx.plan(auto_apply=True)
10734+
assert plan.has_changes
10735+
10736+
new_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"']
10737+
assert (
10738+
new_seed_snapshot.version == original_seed_snapshot.version
10739+
) # should be using the same physical table
10740+
assert (
10741+
new_seed_snapshot.snapshot_id != original_seed_snapshot.snapshot_id
10742+
) # but still be different due to the metadata change
10743+
assert plan.directly_modified == set()
10744+
assert plan.metadata_updated == {new_seed_snapshot.snapshot_id}
10745+
10746+
# there should be no missing intervals to backfill since all we did is update a description
10747+
assert not plan.missing_intervals
10748+
10749+
# there should still be no diff or missing intervals in 3 days time
10750+
assert new_seed_snapshot.model.interval_unit.is_day
10751+
with time_machine.travel(timedelta(days=3)):
10752+
ctx.clear_caches()
10753+
ctx.load()
10754+
plan = ctx.plan(auto_apply=True)
10755+
assert not plan.has_changes
10756+
assert not plan.missing_intervals
10757+
10758+
# change seed data
10759+
seed_path.write_text("\n".join(["id,name", "1,test", "2,updated"]))
10760+
10761+
# new plan - NOW we should backfill because data changed
10762+
ctx.load()
10763+
plan = ctx.plan(auto_apply=True)
10764+
assert plan.has_changes
10765+
10766+
updated_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"']
10767+
10768+
assert (
10769+
updated_seed_snapshot.snapshot_id
10770+
!= new_seed_snapshot.snapshot_id
10771+
!= original_seed_snapshot.snapshot_id
10772+
)
10773+
assert not updated_seed_snapshot.forward_only
10774+
assert plan.directly_modified == {updated_seed_snapshot.snapshot_id}
10775+
assert plan.metadata_updated == set()
10776+
assert plan.missing_intervals
10777+
10778+
# prove backfilled data loaded
10779+
assert ctx.engine_adapter.fetchall("select id, name from memory.test.source_data") == [
10780+
(1, "test"),
10781+
(2, "updated"),
10782+
]

tests/core/test_plan.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,6 +1189,65 @@ def test_forward_only_plan_seed_models(make_snapshot, mocker: MockerFixture):
11891189
assert not snapshot_a_updated.is_forward_only
11901190

11911191

1192+
def test_seed_model_metadata_change_no_missing_intervals(
1193+
make_snapshot: t.Callable[..., Snapshot],
1194+
):
1195+
snapshot_a = make_snapshot(
1196+
SeedModel(
1197+
name="a",
1198+
kind=SeedKind(path="./path/to/seed"),
1199+
seed=Seed(content="content"),
1200+
column_hashes={"col": "hash1"},
1201+
depends_on=set(),
1202+
)
1203+
)
1204+
snapshot_a.categorize_as(SnapshotChangeCategory.BREAKING)
1205+
snapshot_a.add_interval("2022-01-01", now())
1206+
1207+
snapshot_a_metadata_updated = make_snapshot(
1208+
SeedModel(
1209+
name="a",
1210+
kind=SeedKind(path="./path/to/seed"),
1211+
seed=Seed(content="content"),
1212+
column_hashes={"col": "hash1"},
1213+
depends_on=set(),
1214+
description="foo",
1215+
)
1216+
)
1217+
assert snapshot_a_metadata_updated.version is None
1218+
assert snapshot_a_metadata_updated.change_category is None
1219+
1220+
context_diff = ContextDiff(
1221+
environment="prod",
1222+
is_new_environment=True,
1223+
is_unfinalized_environment=False,
1224+
normalize_environment_name=True,
1225+
create_from="prod",
1226+
create_from_env_exists=True,
1227+
added=set(),
1228+
removed_snapshots={},
1229+
modified_snapshots={
1230+
snapshot_a_metadata_updated.name: (snapshot_a_metadata_updated, snapshot_a)
1231+
},
1232+
snapshots={snapshot_a_metadata_updated.snapshot_id: snapshot_a_metadata_updated},
1233+
new_snapshots={snapshot_a_metadata_updated.snapshot_id: snapshot_a},
1234+
previous_plan_id=None,
1235+
previously_promoted_snapshot_ids=set(),
1236+
previous_finalized_snapshots=None,
1237+
previous_gateway_managed_virtual_layer=False,
1238+
gateway_managed_virtual_layer=False,
1239+
environment_statements=[],
1240+
)
1241+
1242+
plan = PlanBuilder(context_diff).build()
1243+
assert snapshot_a_metadata_updated.change_category == SnapshotChangeCategory.METADATA
1244+
assert not snapshot_a_metadata_updated.is_forward_only
1245+
assert not plan.missing_intervals # plan should have no missing intervals
1246+
assert (
1247+
snapshot_a_metadata_updated.intervals == snapshot_a.intervals
1248+
) # intervals should have been copied
1249+
1250+
11921251
def test_start_inference(make_snapshot, mocker: MockerFixture):
11931252
snapshot_a = make_snapshot(
11941253
SqlModel(name="a", query=parse_one("select 1, ds"), start="2022-01-01")

0 commit comments

Comments
 (0)