@@ -72,12 +72,8 @@ def test_upload_dataset_diff_hash(
7272 """
7373 Test upload_dataset method of DatasetProcessor class with different hash from the latest one
7474 """
75- mock_blob = MagicMock ()
76- mock_blob .public_url = public_url
77- mock_blob .path = public_url
78-
79- # Mock the new methods used in transfer_dataset
80- mock_upload_dataset_zip .return_value = mock_blob
75+ mock_md5_hex = "098f6bcd4621d373cade4e832627b4f6"
76+ mock_upload_dataset_zip .return_value = mock_md5_hex
8177 mock_extracted_files = [] # Empty list of extracted files
8278 mock_extract_and_upload .return_value = mock_extracted_files
8379 mock_download_url_content .return_value = file_hash , True
@@ -105,6 +101,7 @@ def test_upload_dataset_diff_hash(
105101 f"/feed_stable_id-mocked_timestamp.zip" ,
106102 )
107103 self .assertEqual (result .file_sha256_hash , file_hash )
104+ self .assertEqual (result .file_md5_hash , mock_md5_hex )
108105 # Verify the new methods were called
109106 self .assertEqual (mock_upload_dataset_zip .call_count , 1 )
110107 self .assertEqual (mock_extract_and_upload .call_count , 1 )
@@ -905,3 +902,77 @@ def test_create_dataset_entities_update_existing_no_files(self, db_session):
905902 self .assertIsNone (result_dataset .unzipped_size_bytes ) # None when no files
906903
907904 mock_refresh_task .assert_called_once ()
905+
906+
class TestUploadDatasetZipToStorage(unittest.TestCase):
    """
    Tests for the value returned by DatasetProcessor.upload_dataset_zip_to_storage,
    based on the md5_hash attribute of the (mocked) GCS blob.
    """

    @staticmethod
    def _make_processor():
        """
        Build a DatasetProcessor with fixed placeholder arguments shared by every test
        in this class.
        """
        return DatasetProcessor(
            producer_url="https://example.com/feed.zip",
            feed_id="feed_id",
            feed_stable_id="feed_stable",
            execution_id="exec_id",
            latest_hash="hash",
            bucket_name="test-bucket",
            authentication_type=0,
            api_key_parameter_name=None,
            public_hosted_datasets_url="https://public.example.com",
        )

    def _upload_with_blob_md5(self, mock_storage_client, blob_md5_hash):
        """
        Call upload_dataset_zip_to_storage against a mocked storage client.

        :param mock_storage_client: the patched ``main.storage.Client`` mock
        :param blob_md5_hash: value exposed as ``md5_hash`` on the mocked blob
        :return: whatever upload_dataset_zip_to_storage returns
        """
        import tempfile

        mock_blob = MagicMock()
        mock_blob.md5_hash = blob_md5_hash
        mock_blob.public_url = "https://storage.googleapis.com/bucket/path.zip"
        mock_bucket = MagicMock()
        mock_bucket.blob.return_value = mock_blob
        # The mocks are wired before the processor is built, as in the original tests.
        mock_storage_client.return_value.get_bucket.return_value = mock_bucket

        processor = self._make_processor()

        # An empty temporary file stands in for the dataset zip; its content is irrelevant
        # because the hash is taken from the mocked blob.
        with tempfile.NamedTemporaryFile(suffix=".zip") as tmp:
            return processor.upload_dataset_zip_to_storage(
                tmp.name, "dataset_stable_id"
            )

    @patch("main.storage.Client")
    def test_md5_hash_read_from_gcs_blob_after_upload(self, mock_storage_client):
        """
        upload_dataset_zip_to_storage should return the hex MD5 hash read from the GCS
        blob's md5_hash attribute (base64-encoded) after upload.
        """
        import base64

        # A raw 16-byte MD5 digest; its hex form is 098f6bcd4621d373cade4e832627b4f6.
        raw_md5 = b"\x09\x8f\x6b\xcd\x46\x21\xd3\x73\xca\xde\x4e\x83\x26\x27\xb4\xf6"
        b64_md5 = base64.b64encode(raw_md5).decode()
        expected_hex = raw_md5.hex()

        result = self._upload_with_blob_md5(mock_storage_client, b64_md5)

        self.assertEqual(result, expected_hex)

    @patch("main.storage.Client")
    def test_md5_hash_none_when_blob_has_no_md5(self, mock_storage_client):
        """
        upload_dataset_zip_to_storage should return None if the GCS blob provides no md5_hash.
        """
        result = self._upload_with_blob_md5(mock_storage_client, None)

        self.assertIsNone(result)
0 commit comments