|
| 1 | +"""Tests for invalidating revisions and refetching.""" |
| 2 | +from datetime import datetime, timezone |
| 3 | + |
| 4 | +from ingestify import Source, DatasetResource |
| 5 | +from ingestify.domain import DataSpecVersionCollection, DraftFile, Selector |
| 6 | +from ingestify.domain.models.dataset.collection_metadata import ( |
| 7 | + DatasetCollectionMetadata, |
| 8 | +) |
| 9 | +from ingestify.domain.models.dataset.revision import RevisionState |
| 10 | +from ingestify.domain.models.fetch_policy import FetchPolicy |
| 11 | +from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan |
| 12 | + |
# Pin last_modified to one constant, timezone-aware instant so the file looks
# unchanged from one run to the next.
FIXED_TIME = datetime(year=2026, month=1, day=1, tzinfo=timezone.utc)

# Number of times counting_loader has been called; each test resets it to 0.
call_count = 0
| 17 | + |
| 18 | + |
def counting_loader(file_resource, current_file, **kwargs):
    """Return a draft file and record the call in the module-level counter.

    ``file_resource``, ``current_file`` and any extra keyword arguments are
    accepted but not used. Every call increments the global ``call_count`` by
    one and returns a ``DraftFile`` built from the text ``data-<call_count>``
    with ``data_feed_key="f1"``.
    """
    global call_count
    call_count = call_count + 1
    payload = f"data-{call_count}"
    return DraftFile.from_input(payload, data_feed_key="f1")
| 23 | + |
| 24 | + |
class SimpleSource(Source):
    """Source stub that always reports the same single dataset."""

    provider = "test_provider"

    def find_datasets(
        self, dataset_type, data_spec_versions, dataset_collection_metadata, **kwargs
    ):
        """Yield one ``DatasetResource`` carrying a single file.

        The arguments are ignored: the resource is always ``item-1`` of type
        ``"test"``, and its only file uses ``FIXED_TIME`` as ``last_modified``
        and ``counting_loader`` as its loader.
        """
        resource = DatasetResource(
            name="item-1",
            dataset_type="test",
            provider=self.provider,
            dataset_resource_id={"item_id": 1},
        )
        resource.add_file(
            file_loader=counting_loader,
            data_spec_version="v1",
            data_feed_key="f1",
            last_modified=FIXED_TIME,
        )
        yield resource
| 44 | + |
| 45 | + |
def _setup(engine):
    """Register one ingestion plan backed by ``SimpleSource`` on ``engine``.

    The plan targets dataset type ``"test"`` with a single empty selector and
    the data spec versions ``{"default": {"v1"}}``.
    """
    spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})

    # Build the parts in the same order the plan's arguments list them.
    source = SimpleSource("s")
    policy = FetchPolicy()
    selector = Selector.build({}, data_spec_versions=spec_versions)

    plan = IngestionPlan(
        source=source,
        fetch_policy=policy,
        dataset_type="test",
        selectors=[selector],
        data_spec_versions=spec_versions,
    )
    engine.add_ingestion_plan(plan)
| 57 | + |
| 58 | + |
def test_normal_second_run_skips(engine):
    """A second run with an unchanged last_modified must not call the loader again."""
    global call_count
    call_count = 0
    _setup(engine)

    # Initial ingestion: the loader is invoked exactly once.
    engine.run()
    loads_after_first_run = call_count
    assert loads_after_first_run == 1

    # Nothing changed upstream, so the counter has to stay where it was.
    engine.run()
    assert (
        call_count == loads_after_first_run
    ), "Should NOT refetch when nothing changed"
| 70 | + |
| 71 | + |
def test_invalidate_revision_triggers_refetch(engine):
    """Invalidating a revision causes ingestify to refetch on next run.

    Flow: ingest once, invalidate the stored dataset's current revision,
    check the revision state is ``VALIDATION_FAILED``, then run again and
    expect the loader to be called a second time.

    ``engine`` is presumably a pytest fixture defined outside this module
    (TODO confirm); it must expose ``run()``, ``add_ingestion_plan()`` and a
    ``store``.
    """
    global call_count
    call_count = 0
    _setup(engine)

    def load_datasets():
        # Query the store afresh on every call instead of reusing objects
        # fetched before the invalidation.
        return list(
            engine.store.get_dataset_collection(
                provider="test_provider", dataset_type="test"
            )
        )

    # First run: creates the dataset
    engine.run()
    assert call_count == 1

    # Invalidate the current revision. Check the collection size first so an
    # empty or unexpectedly large store fails with a clear message instead of
    # an IndexError or a silently ignored extra dataset.
    datasets = load_datasets()
    assert len(datasets) == 1, "Expected exactly one dataset after the first run"
    engine.store.invalidate_revision(datasets[0], reason="Data quality check failed")

    # Verify state
    datasets = load_datasets()
    assert len(datasets) == 1, "Invalidation must not add or remove datasets"
    assert datasets[0].current_revision.state == RevisionState.VALIDATION_FAILED

    # Second run: should refetch
    engine.run()
    assert call_count == 2, "Dataset with invalidated revision should be refetched"
0 commit comments