Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions sqlmesh/core/plan/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,12 @@ def _categorize_snapshot(
snapshot.categorize_as(SnapshotChangeCategory.INDIRECT_NON_BREAKING, forward_only)
else:
# Metadata updated.
if not forward_only and snapshot.is_seed:
# seeds with metadata updates should be categorized as forward_only to prevent unnecessary backfill
# backfill should only happen if the actual seed data changes, in which case it will be classed as
# directly modified / breaking and not forward only as this code path will not be hit
forward_only = True
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I've realized this might not be necessary. METADATA category already reuses the version

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, you're right. I've simplified this PR


snapshot.categorize_as(SnapshotChangeCategory.METADATA, forward_only)

def _get_orphaned_indirect_change_category(
Expand Down
3 changes: 2 additions & 1 deletion sqlmesh/core/plan/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ def should_force_rebuild(old: Snapshot, new: Snapshot) -> bool:
if new.is_view and new.is_indirect_non_breaking and not new.is_forward_only:
# View models always need to be rebuilt to reflect updated upstream dependencies
return True
if new.is_seed:
if new.is_seed and not new.is_metadata:
# Seed models always need to be rebuilt to reflect changes in the seed file
# Unless only their metadata has been updated (eg description added) and the seed file has not been touched
return True
return is_breaking_kind_change(old, new)

Expand Down
122 changes: 122 additions & 0 deletions tests/core/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10662,3 +10662,125 @@ def _run_restatement_plan(tmp_path: Path, config: Config, q: queue.Queue):
assert len(model_a.intervals)

set_console(orig_console)


def test_seed_model_metadata_update_sets_forward_only(tmp_path: Path):
"""
Scenario:
- Create a seed model; perform initial population
- Modify the model with a metadata-only change and trigger a plan

Outcome:
- The seed model is modified (metadata-only) but this should NOT trigger backfill
- To prevent backfill, the categorizer needs to set forward_only on the seed
"""

models_path = tmp_path / "models"
seeds_path = tmp_path / "seeds"
models_path.mkdir()
seeds_path.mkdir()

seed_model_path = models_path / "seed.sql"
seed_path = seeds_path / "seed_data.csv"

seed_path.write_text("\n".join(["id,name", "1,test"]))

seed_model_path.write_text("""
MODEL (
name test.source_data,
kind SEED (
path '../seeds/seed_data.csv'
)
);
""")

config = Config(
gateways={"": GatewayConfig(connection=DuckDBConnectionConfig())},
model_defaults=ModelDefaultsConfig(dialect="duckdb", start="2024-01-01"),
)
ctx = Context(paths=tmp_path, config=config)

plan = ctx.plan(auto_apply=True)

original_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"']
assert not original_seed_snapshot.forward_only
assert plan.directly_modified == {original_seed_snapshot.snapshot_id}
assert plan.metadata_updated == set()
assert plan.missing_intervals

# prove data loaded
assert ctx.engine_adapter.fetchall("select id, name from memory.test.source_data") == [
(1, "test")
]

# prove no diff
ctx.load()
plan = ctx.plan(auto_apply=True)
assert not plan.has_changes
assert not plan.missing_intervals

# make metadata-only change
seed_model_path.write_text("""
MODEL (
name test.source_data,
kind SEED (
path '../seeds/seed_data.csv'
),
description 'updated by test'
);
""")

ctx.load()
plan = ctx.plan(auto_apply=True)
assert plan.has_changes

new_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"']
assert (
new_seed_snapshot.forward_only
) # change needs to be applied as forward-only to prevent backfill
assert (
new_seed_snapshot.version == original_seed_snapshot.version
) # should be using the same physical table
assert (
new_seed_snapshot.snapshot_id != original_seed_snapshot.snapshot_id
) # but still be different due to the metadata change
assert plan.directly_modified == set()
assert plan.metadata_updated == {new_seed_snapshot.snapshot_id}

# there should be no missing intervals to backfill since all we did is update a description
assert not plan.missing_intervals

# there should still be no diff or missing intervals in 3 days time
assert new_seed_snapshot.model.interval_unit.is_day
with time_machine.travel(timedelta(days=3)):
ctx.clear_caches()
ctx.load()
plan = ctx.plan(auto_apply=True)
assert not plan.has_changes
assert not plan.missing_intervals

# change seed data
seed_path.write_text("\n".join(["id,name", "1,test", "2,updated"]))

# new plan - NOW we should backfill because data changed
ctx.load()
plan = ctx.plan(auto_apply=True)
assert plan.has_changes

updated_seed_snapshot = ctx.snapshots['"memory"."test"."source_data"']

assert (
updated_seed_snapshot.snapshot_id
!= new_seed_snapshot.snapshot_id
!= original_seed_snapshot.snapshot_id
)
assert not updated_seed_snapshot.forward_only
assert plan.directly_modified == {updated_seed_snapshot.snapshot_id}
assert plan.metadata_updated == set()
assert plan.missing_intervals

# prove backfilled data loaded
assert ctx.engine_adapter.fetchall("select id, name from memory.test.source_data") == [
(1, "test"),
(2, "updated"),
]
61 changes: 61 additions & 0 deletions tests/core/test_plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -1189,6 +1189,67 @@ def test_forward_only_plan_seed_models(make_snapshot, mocker: MockerFixture):
assert not snapshot_a_updated.is_forward_only


def test_seed_model_metadata_change_categorized_forward_only(
make_snapshot: t.Callable[..., Snapshot],
):
snapshot_a = make_snapshot(
SeedModel(
name="a",
kind=SeedKind(path="./path/to/seed"),
seed=Seed(content="content"),
column_hashes={"col": "hash1"},
depends_on=set(),
)
)
snapshot_a.categorize_as(SnapshotChangeCategory.BREAKING)
snapshot_a.add_interval("2022-01-01", now())

snapshot_a_metadata_updated = make_snapshot(
SeedModel(
name="a",
kind=SeedKind(path="./path/to/seed"),
seed=Seed(content="content"),
column_hashes={"col": "hash1"},
depends_on=set(),
description="foo",
)
)
assert snapshot_a_metadata_updated.version is None
assert snapshot_a_metadata_updated.change_category is None

context_diff = ContextDiff(
environment="prod",
is_new_environment=True,
is_unfinalized_environment=False,
normalize_environment_name=True,
create_from="prod",
create_from_env_exists=True,
added=set(),
removed_snapshots={},
modified_snapshots={
snapshot_a_metadata_updated.name: (snapshot_a_metadata_updated, snapshot_a)
},
snapshots={snapshot_a_metadata_updated.snapshot_id: snapshot_a_metadata_updated},
new_snapshots={snapshot_a_metadata_updated.snapshot_id: snapshot_a},
previous_plan_id=None,
previously_promoted_snapshot_ids=set(),
previous_finalized_snapshots=None,
previous_gateway_managed_virtual_layer=False,
gateway_managed_virtual_layer=False,
environment_statements=[],
)

plan = PlanBuilder(context_diff).build()
assert snapshot_a_metadata_updated.change_category == SnapshotChangeCategory.METADATA
assert (
snapshot_a_metadata_updated.is_forward_only
) # needs to be forward_only to prevent backfill on a metadata change
assert (
snapshot_a_metadata_updated.intervals == snapshot_a.intervals
) # intervals should have been copied
assert not plan.missing_intervals # plan should have no missing intervals


def test_start_inference(make_snapshot, mocker: MockerFixture):
snapshot_a = make_snapshot(
SqlModel(name="a", query=parse_one("select 1, ds"), start="2022-01-01")
Expand Down