1+ import json
12import logging
23import os
3- import shutil
4- import ssl
54import tempfile
6- import urllib . request
5+ import traceback
76import uuid
87import zipfile
98
109from google .cloud import storage
10+ from sqlalchemy .orm import joinedload
1111
1212from shared .database .database import with_db_session
1313from shared .database_gen .sqlacodegen_models import Gtfsdataset , Gtfsfile
14- from shared .helpers .utils import get_hash_from_file
15-
16- # Disable SSL verification — trusted internal sources only
17- ssl ._create_default_https_context = ssl ._create_unverified_context
14+ from shared .helpers .utils import get_hash_from_file , download_and_get_hash
1815
1916
2017def rebuild_missing_dataset_files_handler (payload ) -> dict :
@@ -56,6 +53,7 @@ def get_datasets_with_missing_files_query(db_session, after_date, latest_only):
5653 | Gtfsdataset .unzipped_size_bytes .is_ (None )
5754 | ~ Gtfsdataset .gtfsfiles .any ()
5855 )
56+ .options (joinedload (Gtfsdataset .feed ))
5957 )
6058
6159 if after_date :
@@ -67,29 +65,29 @@ def get_datasets_with_missing_files_query(db_session, after_date, latest_only):
6765 return query
6866
6967
70- def download_to_file (url : str , local_path : str ):
71- """
72- Downloads a file from URL and writes it to local disk.
73- """
74- with urllib .request .urlopen (url ) as response , open (local_path , "wb" ) as out_file :
75- shutil .copyfileobj (response , out_file )
76-
77-
78- def process_dataset (dataset : Gtfsdataset ):
68+ def process_dataset (dataset : Gtfsdataset , credentials = None ):
7969 """
8070 Downloads, extracts, uploads, and indexes files for a GTFS dataset.
8171
8272 Args:
8373 dataset (Gtfsdataset): The dataset to process.
74+ credentials (str): Optional credentials for authentication.
8475 """
8576 hosted_url = dataset .hosted_url
8677 stable_id = dataset .stable_id
8778 logging .info ("Processing dataset %s with URL %s" , stable_id , hosted_url )
88- bucket_name = os .environ [ "DATASETS_BUCKET_NAME" ]
79+ bucket_name = os .getenv ( "DATASETS_BUCKET_NAME" )
8980
9081 with tempfile .TemporaryDirectory () as tmp_dir :
9182 zip_path = os .path .join (tmp_dir , "dataset.zip" )
92- download_to_file (hosted_url , zip_path )
83+ download_and_get_hash (
84+ hosted_url ,
85+ zip_path ,
86+ authentication_type = dataset .feed .authentication_type ,
87+ api_key_parameter_name = dataset .feed .api_key_parameter_name ,
88+ credentials = credentials ,
89+ trusted_certs = True ,
90+ )
9391 dataset .zipped_size_bytes = os .path .getsize (zip_path )
9492
9593 with zipfile .ZipFile (zip_path , "r" ) as zip_ref :
@@ -110,7 +108,7 @@ def process_dataset(dataset: Gtfsdataset):
110108 file_path = os .path .join (root , file_name )
111109 # Only store files in GCS for latest datasets
112110 if dataset .latest :
113- logging .info ("Storing latest dataset files" )
111+ logging .info ("Storing latest dataset file %s" , file_name )
114112 blob_path = f"{ '-' .join (stable_id .split ('-' )[:2 ])} /{ stable_id } /extracted/{ file_name } "
115113 blob = bucket .blob (blob_path )
116114 blob .upload_from_filename (file_path )
@@ -182,10 +180,18 @@ def rebuild_missing_dataset_files(
182180 total_processed = 0
183181 count = 0
184182 batch_count = 5
183+ credentials = json .loads (os .getenv ("FEEDS_CREDENTIALS" , "{}" ))
185184 logging .info ("Starting to process datasets with missing files..." )
186185
187186 for dataset in datasets .all ():
188- process_dataset (dataset )
187+ try :
188+ process_dataset (
189+ dataset , credentials = credentials .get (dataset .feed .stable_id )
190+ )
191+ except Exception :
192+ logging .error ("Error processing dataset %s:" , dataset .stable_id )
193+ traceback .print_exc ()
194+ continue
189195 count += 1
190196 total_processed += 1
191197
0 commit comments