Skip to content

Commit 73ae432

Browse files
committed
initial file integrity check
1 parent cd86dbd commit 73ae432

1 file changed

Lines changed: 49 additions & 12 deletions

File tree

src/ibc_api/utils.py

Lines changed: 49 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
"""API to fetch IBC data from EBRAINS via Human Data Gateway using siibra.
2-
"""
1+
"""API to fetch IBC data from EBRAINS via Human Data Gateway using siibra."""
32

43
# %$
54
import json
@@ -165,6 +164,10 @@ def download_gm_mask(resolution=1.5, save_to=None):
165164
return save_as
166165

167166

167+
def _is_empty_db(db):
    """Tell whether a loaded database contains no usable data.

    Parameters
    ----------
    db : pandas.DataFrame or None
        database to inspect

    Returns
    -------
    bool
        True if ``db`` is None, reports itself as empty, or has zero length
    """
    # nothing was loaded at all
    if db is None:
        return True
    # the ``empty`` flag is consulted before the explicit length test
    if db.empty:
        return True
    return len(db) == 0
169+
170+
168171
def get_info(data_type="volume_maps", save_to=None, metadata=METADATA):
169172
"""Fetch a csv file describing each file in a given IBC dataset on EBRAINS.
170173
@@ -182,14 +185,48 @@ def get_info(data_type="volume_maps", save_to=None, metadata=METADATA):
182185
pandas.DataFrame
183186
dataframe with information about each file in the dataset
184187
"""
185-
# file with all information about the dataset
186-
db_file = md.fetch_dataset_db(data_type, metadata)
187-
# load the file as dataframe
188-
# convert subject, session and run to string to avoid losing leading zeros
189-
db = pd.read_csv(
190-
db_file, converters={"subject": str, "session": str, "run": str}
191-
)
192-
db.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")
188+
189+
datasets = metadata[data_type]
190+
latest_idx = md._find_latest_version(datasets)
191+
192+
last_exception = None
193+
194+
# Try from latest version → older versions
195+
for version_idx in range(latest_idx, -1, -1):
196+
# fetch the information corresponding to this version
197+
dataset = datasets[version_idx]
198+
db_file = md.fetch_remote_file(dataset["db_file"])
199+
# load the file as dataframe
200+
# convert subject, session and run to string to avoid losing
201+
# leading zeros
202+
db = pd.read_csv(
203+
db_file,
204+
converters={"subject": str, "session": str, "run": str},
205+
)
206+
db.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")
207+
208+
if not _is_empty_db(db):
209+
print(
210+
f"Fetched database for {data_type}, version {dataset['version']}."
211+
)
212+
break
213+
else:
214+
last_exception = ValueError(
215+
f"No versions found for dataset {data_type}, version {dataset['version']}."
216+
)
217+
print(
218+
f"Failed to fetch database for {data_type}, version {dataset['version']}."
219+
"Trying older version..."
220+
)
221+
222+
# If all versions failed, raise the last exception
223+
if _is_empty_db(db):
224+
raise (
225+
last_exception
226+
if last_exception
227+
else ValueError(f"No versions found for dataset {data_type}.")
228+
)
229+
193230
# save the database file
194231
save_to = _create_root_dir(save_to)
195232
save_as = os.path.join(save_to, f"available_{data_type}.csv")
@@ -411,7 +448,7 @@ def download_data(db, n_jobs=2, save_to=None):
411448
dataframe with information about files in the dataset, ideally a subset
412449
of the full dataset
413450
n_jobs : int, optional
414-
number of parallel jobs to run, by default 2. -1 would use all the CPUs.
451+
number of parallel jobs to run, by default 2. -1 would use all the CPUs.
415452
See: https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html
416453
save_to : str, optional
417454
where to save the data, by default None, in which case the data is
@@ -457,7 +494,7 @@ def _download_and_update_progress(src_file, dst_file, connector):
457494
CACHE.run_maintenance() # keep cache < 2GB
458495
return file_name, file_time
459496
except Exception as e:
460-
raise(f"Error downloading {src_file}. Error: {e}")
497+
raise (f"Error downloading {src_file}. Error: {e}")
461498

462499
# download finally
463500
print(f"\n...Starting download of {len(src_file_names)} files...")

0 commit comments

Comments
 (0)