Skip to content

Commit cd01bdc

Browse files
authored
feat: add datasets MD5 hash (#1665)
1 parent a0ba672 commit cd01bdc

13 files changed

Lines changed: 665 additions & 14 deletions

File tree

api/src/shared/db_models/gtfs_dataset_impl.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def from_orm(cls, gtfs_dataset: Gtfsdataset | None) -> GtfsDataset | None:
4747
note=gtfs_dataset.note,
4848
downloaded_at=gtfs_dataset.downloaded_at,
4949
hash=gtfs_dataset.hash,
50+
hash_md5=gtfs_dataset.hash_md5,
5051
bounding_box=BoundingBoxImpl.from_orm(gtfs_dataset.bounding_box),
5152
validation_report=cls.from_orm_latest_validation_report(gtfs_dataset.validation_reports),
5253
service_date_range_start=gtfs_dataset.service_date_range_start,

api/src/shared/db_models/latest_dataset_impl.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ def from_orm(cls, dataset: Gtfsdataset | None) -> LatestDataset | None:
4848
service_date_range_end=dataset.service_date_range_end,
4949
agency_timezone=dataset.agency_timezone,
5050
hash=dataset.hash,
51+
hash_md5=dataset.hash_md5,
5152
validation_report=validation_report,
5253
unzipped_folder_size_mb=round(dataset.unzipped_size_bytes / 1024**2, 2)
5354
if dataset.unzipped_size_bytes

docs/DatabaseCatalogAPI.yaml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -922,9 +922,13 @@ components:
922922
example: 2023-07-10T22:06:00Z
923923
format: date-time
924924
hash:
925-
description: A hash of the dataset.
925+
description: SHA-256 hash of the dataset.
926926
type: string
927927
example: ad3805c4941cd37881ff40c342e831b5f5224f3d8a9a2ec3ac197d3652c78e42
928+
hash_md5:
929+
description: MD5 hash of the dataset.
930+
type: string
931+
example: 098f6bcd4621d373cade4e832627b4f6
928932
service_date_range_start:
929933
description: The start date of the service date range for the dataset in UTC. Timing starts at 00:00:00 of the day.
930934
type: string
@@ -1173,9 +1177,13 @@ components:
11731177
example: 2023-07-10T22:06:00Z
11741178
format: date-time
11751179
hash:
1176-
description: A hash of the dataset.
1180+
description: SHA-256 hash of the dataset.
11771181
type: string
11781182
example: 6497e85e34390b8b377130881f2f10ec29c18a80dd6005d504a2038cdd00aa71
1183+
hash_md5:
1184+
description: MD5 hash of the dataset.
1185+
type: string
1186+
example: 098f6bcd4621d373cade4e832627b4f6
11791187
bounding_box:
11801188
$ref: "#/components/schemas/BoundingBox"
11811189
validation_report:

docs/OperationsAPI.yaml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -859,9 +859,13 @@ components:
859859
example: 2023-07-10T22:06:00Z
860860
format: date-time
861861
hash:
862-
description: A hash of the dataset.
862+
description: SHA-256 hash of the dataset.
863863
type: string
864864
example: ad3805c4941cd37881ff40c342e831b5f5224f3d8a9a2ec3ac197d3652c78e42
865+
hash_md5:
866+
description: MD5 hash of the dataset.
867+
type: string
868+
example: 098f6bcd4621d373cade4e832627b4f6
865869
service_date_range_start:
866870
description: The start date of the service date range for the dataset in UTC. Timing starts at 00:00:00 of the day.
867871
type: string
@@ -1103,9 +1107,13 @@ components:
11031107
example: 2023-07-10T22:06:00Z
11041108
format: date-time
11051109
hash:
1106-
description: A hash of the dataset.
1110+
description: SHA-256 hash of the dataset.
11071111
type: string
11081112
example: 6497e85e34390b8b377130881f2f10ec29c18a80dd6005d504a2038cdd00aa71
1113+
hash_md5:
1114+
description: MD5 hash of the dataset.
1115+
type: string
1116+
example: 098f6bcd4621d373cade4e832627b4f6
11091117
bounding_box:
11101118
$ref: "#/components/schemas/BoundingBox"
11111119
validation_report:
@@ -1495,6 +1503,7 @@ components:
14951503
14961504
14971505
1506+
14981507
* vp - vehicle positions
14991508
* tu - trip updates
15001509
* sa - service alerts
@@ -1617,6 +1626,7 @@ components:
16171626
16181627
16191628
1629+
16201630
* vp - vehicle positions
16211631
* tu - trip updates
16221632
* sa - service alerts
@@ -1696,6 +1706,7 @@ components:
16961706
16971707
16981708
1709+
16991710
* `active` Feed should be used in public trip planners.
17001711
* `deprecated` Feed is explicitly deprecated and should not be used in public trip planners.
17011712
* `inactive` Feed hasn't been recently updated and should be used at risk of providing outdated information.
@@ -1717,6 +1728,7 @@ components:
17171728
17181729
17191730
1731+
17201732
* `gtfs` GTFS feed.
17211733
* `gtfs_rt` GTFS-RT feed.
17221734
* `gbfs` GBFS feed.

functions-python/batch_process_dataset/src/main.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ class DatasetFile:
6565
stable_id: str
6666
extracted_files: List[Gtfsfile] = None
6767
file_sha256_hash: Optional[str] = None
68+
file_md5_hash: Optional[str] = None
6869
hosted_url: Optional[str] = None
6970
zipped_size: Optional[int] = None
7071

@@ -156,22 +157,27 @@ def upload_dataset_zip_to_storage(
156157
source_file_path,
157158
dataset_stable_id,
158159
public=True,
159-
):
160+
) -> Optional[str]:
160161
"""
161162
Uploads the dataset zip file to GCP storage as latest.zip and versioned zip.
163+
Returns the MD5 hash (hex) of the uploaded file as provided by GCS.
162164
"""
163165
bucket = storage.Client().get_bucket(self.bucket_name)
164166
target_paths = [
165167
f"{self.feed_stable_id}/latest.zip",
166168
f"{self.feed_stable_id}/{dataset_stable_id}/{dataset_stable_id}.zip",
167169
]
168170

171+
md5_hash_hex = None
169172
for target_path in target_paths:
170173
blob = bucket.blob(target_path)
171174
blob.upload_from_filename(source_file_path)
172175
if public:
173176
blob.make_public()
174177
self.logger.info(f"Uploaded {blob.public_url}")
178+
if blob.md5_hash:
179+
md5_hash_hex = base64.b64decode(blob.md5_hash).hex()
180+
return md5_hash_hex
175181

176182
def _extract_and_upload_single_file(
177183
self,
@@ -320,7 +326,7 @@ def transfer_dataset(self, feed_id, public=True) -> DatasetFile or None:
320326
self.logger.info(
321327
f"Creating file {dataset_full_path} in bucket {self.bucket_name}"
322328
)
323-
self.upload_dataset_zip_to_storage(
329+
file_md5_hash = self.upload_dataset_zip_to_storage(
324330
temp_file_path,
325331
dataset_stable_id,
326332
public=public,
@@ -336,6 +342,7 @@ def transfer_dataset(self, feed_id, public=True) -> DatasetFile or None:
336342
return DatasetFile(
337343
stable_id=dataset_stable_id,
338344
file_sha256_hash=file_sha256_hash,
345+
file_md5_hash=file_md5_hash,
339346
hosted_url=f"{self.public_hosted_datasets_url}/{dataset_full_path}",
340347
extracted_files=extracted_files,
341348
zipped_size=(
@@ -459,6 +466,7 @@ def create_dataset_entities(
459466
bounding_box=None,
460467
note=None,
461468
hash=dataset_file.file_sha256_hash,
469+
hash_md5=dataset_file.file_md5_hash,
462470
downloaded_at=func.now(),
463471
hosted_url=dataset_file.hosted_url,
464472
gtfsfiles=(

functions-python/batch_process_dataset/tests/test_batch_process_dataset_main.py

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,12 +72,8 @@ def test_upload_dataset_diff_hash(
7272
"""
7373
Test upload_dataset method of DatasetProcessor class with different hash from the latest one
7474
"""
75-
mock_blob = MagicMock()
76-
mock_blob.public_url = public_url
77-
mock_blob.path = public_url
78-
79-
# Mock the new methods used in transfer_dataset
80-
mock_upload_dataset_zip.return_value = mock_blob
75+
mock_md5_hex = "098f6bcd4621d373cade4e832627b4f6"
76+
mock_upload_dataset_zip.return_value = mock_md5_hex
8177
mock_extracted_files = [] # Empty list of extracted files
8278
mock_extract_and_upload.return_value = mock_extracted_files
8379
mock_download_url_content.return_value = file_hash, True
@@ -105,6 +101,7 @@ def test_upload_dataset_diff_hash(
105101
f"/feed_stable_id-mocked_timestamp.zip",
106102
)
107103
self.assertEqual(result.file_sha256_hash, file_hash)
104+
self.assertEqual(result.file_md5_hash, mock_md5_hex)
108105
# Verify the new methods were called
109106
self.assertEqual(mock_upload_dataset_zip.call_count, 1)
110107
self.assertEqual(mock_extract_and_upload.call_count, 1)
@@ -905,3 +902,77 @@ def test_create_dataset_entities_update_existing_no_files(self, db_session):
905902
self.assertIsNone(result_dataset.unzipped_size_bytes) # None when no files
906903

907904
mock_refresh_task.assert_called_once()
905+
906+
907+
class TestUploadDatasetZipToStorage(unittest.TestCase):
908+
@patch("main.storage.Client")
909+
def test_md5_hash_read_from_gcs_blob_after_upload(self, mock_storage_client):
910+
"""
911+
upload_dataset_zip_to_storage should return the hex MD5 hash read from the GCS
912+
blob's md5_hash attribute (base64-encoded) after upload.
913+
"""
914+
import base64
915+
import tempfile
916+
917+
raw_md5 = b"\x09\x8f\x6b\xcd\x46\x21\xd3\x73\xca\xde\x4e\x83\x26\x27\xb4\xf6"
918+
b64_md5 = base64.b64encode(raw_md5).decode()
919+
expected_hex = raw_md5.hex()
920+
921+
mock_blob = MagicMock()
922+
mock_blob.md5_hash = b64_md5
923+
mock_blob.public_url = "https://storage.googleapis.com/bucket/path.zip"
924+
mock_bucket = MagicMock()
925+
mock_bucket.blob.return_value = mock_blob
926+
mock_storage_client.return_value.get_bucket.return_value = mock_bucket
927+
928+
processor = DatasetProcessor(
929+
producer_url="https://example.com/feed.zip",
930+
feed_id="feed_id",
931+
feed_stable_id="feed_stable",
932+
execution_id="exec_id",
933+
latest_hash="hash",
934+
bucket_name="test-bucket",
935+
authentication_type=0,
936+
api_key_parameter_name=None,
937+
public_hosted_datasets_url="https://public.example.com",
938+
)
939+
940+
with tempfile.NamedTemporaryFile(suffix=".zip") as tmp:
941+
result = processor.upload_dataset_zip_to_storage(
942+
tmp.name, "dataset_stable_id"
943+
)
944+
945+
self.assertEqual(result, expected_hex)
946+
947+
@patch("main.storage.Client")
948+
def test_md5_hash_none_when_blob_has_no_md5(self, mock_storage_client):
949+
"""
950+
upload_dataset_zip_to_storage should return None if the GCS blob provides no md5_hash.
951+
"""
952+
import tempfile
953+
954+
mock_blob = MagicMock()
955+
mock_blob.md5_hash = None
956+
mock_blob.public_url = "https://storage.googleapis.com/bucket/path.zip"
957+
mock_bucket = MagicMock()
958+
mock_bucket.blob.return_value = mock_blob
959+
mock_storage_client.return_value.get_bucket.return_value = mock_bucket
960+
961+
processor = DatasetProcessor(
962+
producer_url="https://example.com/feed.zip",
963+
feed_id="feed_id",
964+
feed_stable_id="feed_stable",
965+
execution_id="exec_id",
966+
latest_hash="hash",
967+
bucket_name="test-bucket",
968+
authentication_type=0,
969+
api_key_parameter_name=None,
970+
public_hosted_datasets_url="https://public.example.com",
971+
)
972+
973+
with tempfile.NamedTemporaryFile(suffix=".zip") as tmp:
974+
result = processor.upload_dataset_zip_to_storage(
975+
tmp.name, "dataset_stable_id"
976+
)
977+
978+
self.assertIsNone(result)

functions-python/tasks_executor/README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,27 @@ To populate licenses:
9393
}
9494
```
9595

96+
To backfill MD5 hashes for existing GTFS datasets (reads the MD5 from the GCS object metadata):
97+
98+
```json
99+
{
100+
"task": "backfill_dataset_hash_md5",
101+
"payload": {
102+
"dry_run": true,
103+
"only_latest": true,
104+
"only_missing_hashes": true,
105+
"limit": 10
106+
}
107+
}
108+
```
109+
110+
| Parameter | Type | Default | Description |
111+
|---|---|---|---|
112+
| `dry_run` | bool | `true` | Log changes without writing to the database |
113+
| `only_latest` | bool | `true` | Process only datasets that are the current latest for their feed |
114+
| `only_missing_hashes` | bool | `true` | Skip datasets that already have `hash_md5` set |
115+
| `limit` | int \| null | `10` | Maximum number of datasets to process; omit or pass `null` for no limit |
116+
96117
## Response Content Type
97118

98119
When the request includes the header `Accept: text/csv`, the server returns the response as a CSV file generated from the handler’s output.

functions-python/tasks_executor/src/main.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
from tasks.dataset_files.rebuild_missing_dataset_files import (
3030
rebuild_missing_dataset_files_handler,
3131
)
32+
from tasks.dataset_files.backfill_dataset_hash_md5 import (
33+
backfill_dataset_hash_md5_handler,
34+
)
3235
from tasks.licenses.license_matcher import match_license_handler
3336
from tasks.missing_bounding_boxes.rebuild_missing_bounding_boxes import (
3437
rebuild_missing_bounding_boxes_handler,
@@ -109,6 +112,14 @@
109112
"description": "Rebuilds missing dataset files for GTFS datasets.",
110113
"handler": rebuild_missing_dataset_files_handler,
111114
},
115+
"backfill_dataset_hash_md5": {
116+
"description": (
117+
"Backfills the MD5 hash for existing GTFS datasets by reading it from GCS blob metadata. "
118+
"Parameters: dry_run (default true), only_latest (default true), "
119+
"only_missing_hashes (default true), limit (default 10)."
120+
),
121+
"handler": backfill_dataset_hash_md5_handler,
122+
},
112123
"update_geojson_files": {
113124
"description": "Iterate over bucket looking for {feed_stable_id}/geolocation.geojson and update precision.",
114125
"handler": update_geojson_files_precision_handler,

0 commit comments

Comments
 (0)