Skip to content

Commit ebb44be

Browse files
author
Yasser Alemán Gómez
committed
Refactoring the method to compress the folders with DICOM files
1 parent f02fdaa commit ebb44be

1 file changed

Lines changed: 169 additions & 69 deletions

File tree

clabtoolkit/dicomtools.py

Lines changed: 169 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from glob import glob
33
import subprocess
44
import tarfile
5+
import shutil
56
import pandas as pd
67
import sys
78
import pydicom
@@ -538,7 +539,7 @@ def create_session_series_names(dataset):
538539
ser_id = ser_id.replace(cad, "")
539540

540541
# Removing the dupplicated _ characters and replacing the remaining by -
541-
ser_id = cltmisc.rem_dupplicate_char(ser_id, "_")
542+
ser_id = cltmisc.rem_duplicate_char(ser_id, "_")
542543
ser_id = ser_id.replace("_", "-")
543544

544545
if any("SeriesNumber" in s for s in attributes):
@@ -613,6 +614,7 @@ def uncompress_dicom_session(
613614
... boolrmtar=True
614615
... )
615616
"""
617+
616618
# Validate input directory
617619
dic_path = Path(dic_dir)
618620
if not dic_path.exists():
@@ -711,89 +713,187 @@ def uncompress_dicom_session(
711713
return failed_sessions
712714

713715

714-
def _describe_compression_failure(ses_dir: Path, err: Exception) -> str:
    """Build the user-facing message for a session that could not be processed."""
    if isinstance(err, tarfile.TarError):
        return f"Error compressing {ses_dir}: {err}"
    if isinstance(err, PermissionError):
        return f"Permission error with {ses_dir}: {err}"
    return f"Unexpected error with {ses_dir}: {err}"


def _archive_session(
    ses_dir: Path, tar_file_path: Path, remove_original: bool
) -> Optional[str]:
    """Archive one session folder; return an error message, or None on success."""
    try:
        with tarfile.open(tar_file_path, "w:gz") as tar:
            # arcname keeps only the session folder name inside the archive,
            # not the full path leading to it.
            tar.add(ses_dir, arcname=ses_dir.name)
    except Exception as err:
        # Whatever went wrong while writing, a truncated archive must not
        # survive: a later run would see it and skip the session as
        # "already exists".
        try:
            if tar_file_path.exists():
                tar_file_path.unlink()
        except OSError:
            pass  # best-effort cleanup; the failure itself is reported below
        return _describe_compression_failure(ses_dir, err)

    if remove_original:
        try:
            shutil.rmtree(ses_dir)
        except Exception as err:
            # The archive is complete at this point, so it is kept; only the
            # removal of the original folder is reported as failed.
            return _describe_compression_failure(ses_dir, err)

    return None


def compress_dicom_session(
    dic_dir: str,
    subj_ids: Optional[Union[str, List[str]]] = None,
    remove_original: bool = True,
) -> List[str]:
    """
    Compress session folders containing DICOM files into tar.gz archives.

    Each ``<subj_id>/ses-*`` folder is archived next to itself as
    ``<subj_id>/<session_id>.tar.gz``. The archive name is built by appending
    ``.tar.gz`` to the full folder name, so session names that contain dots
    are preserved.

    Parameters
    ----------
    dic_dir : str
        Directory containing the subjects. It assumes an organization in:
        <subj_id>/<session_id>/<series_id>
    subj_ids : str, list of str, or None, optional
        Subject IDs to be considered. Can be:
        - None: consider all 'sub-*' folders in the directory (default)
        - str: path to text file containing subject IDs (one per line)
        - list of str: explicit list of subject IDs
    remove_original : bool, optional, default=True
        Whether to remove the original session directories after successful
        compression.

    Returns
    -------
    list of str
        Session directories that could not be compressed (or whose original
        folder could not be removed). Empty list if all were successful.

    Raises
    ------
    FileNotFoundError
        If ``dic_dir`` or the subject IDs file does not exist.
    ValueError
        If ``dic_dir`` is not a directory, if ``subj_ids`` is not None, str,
        or list of str, or if the subject IDs file cannot be read.

    Notes
    -----
    Errors raised while processing an individual session (tar, permission or
    other filesystem errors) are caught, printed, and reported through the
    returned list instead of being propagated.

    Examples
    --------
    >>> # Compress all sessions in the directory
    >>> failed = compress_dicom_session('/path/to/dicom/directory')

    >>> # Keep the original directories
    >>> failed = compress_dicom_session(
    ...     dic_dir='/path/to/dicom/directory',
    ...     remove_original=False
    ... )

    >>> # Restrict to specific subjects (list or text file)
    >>> failed = compress_dicom_session(
    ...     dic_dir='/path/to/dicom/directory',
    ...     subj_ids=['sub-001', 'sub-002']
    ... )
    """
    # Validate input directory
    dic_path = Path(dic_dir)
    if not dic_path.exists():
        raise FileNotFoundError(f"Directory {dic_dir} does not exist")
    if not dic_path.is_dir():
        raise ValueError(f"{dic_dir} is not a directory")

    # Resolve the list of subject IDs
    if subj_ids is None:
        subj_ids = sorted(
            item.name
            for item in dic_path.iterdir()
            if item.is_dir() and item.name.startswith("sub-")
        )
    elif isinstance(subj_ids, str):
        ids_file = subj_ids
        try:
            with open(ids_file, "r", encoding="utf-8") as file:
                subj_ids = [line.strip() for line in file if line.strip()]
        except FileNotFoundError as err:
            raise FileNotFoundError(f"Subject IDs file {ids_file} not found") from err
        except Exception as err:
            raise ValueError(f"Error reading subject IDs file: {err}") from err
    elif isinstance(subj_ids, list):
        if not all(isinstance(subj_id, str) for subj_id in subj_ids):
            raise ValueError("All subject IDs must be strings")
    else:
        raise ValueError("subj_ids must be None, str (file path), or list of str")

    if not subj_ids:
        print("No subjects found to process")
        return []

    n_subj = len(subj_ids)
    failed_sessions = []
    total_sessions = 0
    compressed_sessions = 0

    with Progress() as pb:
        task = pb.add_task("[green]Compressing sessions...", total=n_subj)

        for i, subj_id in enumerate(subj_ids):
            subj_dir = dic_path / subj_id

            # completed=i: the subject named in the description is still in progress
            pb.update(
                task_id=task,
                description=f"[green]Processing {subj_id} ({i+1}/{n_subj})",
                completed=i,
            )

            # Skip missing subjects, and entries that are not directories
            # (iterdir() below would raise on a regular file).
            if not subj_dir.is_dir():
                print(f"Warning: Subject directory {subj_dir} not found, skipping...")
                continue

            # Session folders are the sub-directories starting with 'ses-';
            # sorted so the processing order is deterministic.
            session_dirs = sorted(
                item
                for item in subj_dir.iterdir()
                if item.is_dir() and item.name.startswith("ses-")
            )
            total_sessions += len(session_dirs)

            for ses_dir in session_dirs:
                # Append the extension to the full folder name. with_suffix()
                # would replace anything after the last dot in the session
                # name (e.g. 'ses-2024.01' -> 'ses-2024.tar.gz').
                tar_file_path = ses_dir.with_name(ses_dir.name + ".tar.gz")

                # Never overwrite an existing archive
                if tar_file_path.exists():
                    print(f"Warning: {tar_file_path} already exists, skipping...")
                    continue

                error = _archive_session(ses_dir, tar_file_path, remove_original)
                if error is None:
                    compressed_sessions += 1
                else:
                    print(error)
                    failed_sessions.append(str(ses_dir))

        pb.update(
            task_id=task, description="[green]Completed compression", completed=n_subj
        )

    # Report results
    if failed_sessions:
        print("\nTHE PROCESS FAILED TO COMPRESS THE FOLLOWING SESSIONS:")
        for failed_session in failed_sessions:
            print(f"  - {failed_session}")
    else:
        print("\nAll sessions compressed successfully!")

    print(
        f"\nProcessed {n_subj} subjects, {compressed_sessions}/{total_sessions} sessions compressed successfully."
    )
    return failed_sessions

0 commit comments

Comments
 (0)