|
2 | 2 | from glob import glob |
3 | 3 | import subprocess |
4 | 4 | import tarfile |
| 5 | +import shutil |
5 | 6 | import pandas as pd |
6 | 7 | import sys |
7 | 8 | import pydicom |
@@ -538,7 +539,7 @@ def create_session_series_names(dataset): |
538 | 539 | ser_id = ser_id.replace(cad, "") |
539 | 540 |
|
540 | 541 | # Removing the duplicated _ characters and replacing the remaining ones with - |
541 | | - ser_id = cltmisc.rem_dupplicate_char(ser_id, "_") |
| 542 | + ser_id = cltmisc.rem_duplicate_char(ser_id, "_") |
542 | 543 | ser_id = ser_id.replace("_", "-") |
543 | 544 |
|
544 | 545 | if any("SeriesNumber" in s for s in attributes): |
@@ -613,6 +614,7 @@ def uncompress_dicom_session( |
613 | 614 | ... boolrmtar=True |
614 | 615 | ... ) |
615 | 616 | """ |
| 617 | + |
616 | 618 | # Validate input directory |
617 | 619 | dic_path = Path(dic_dir) |
618 | 620 | if not dic_path.exists(): |
@@ -711,89 +713,187 @@ def uncompress_dicom_session( |
711 | 713 | return failed_sessions |
712 | 714 |
|
713 | 715 |
|
def _compression_error_label(err: Exception) -> str:
    """Return the message prefix used when reporting a per-session failure."""
    if isinstance(err, tarfile.TarError):
        return "Error compressing"
    if isinstance(err, PermissionError):
        return "Permission error with"
    return "Unexpected error with"


def compress_dicom_session(
    dic_dir: str,
    subj_ids: Optional[Union[str, List[str]]] = None,
    remove_original: bool = True,
) -> List[str]:
    """
    Compress session folders containing DICOM files into tar.gz archives.

    Every ``ses-*`` directory found under each selected subject directory is
    archived as ``<session_name>.tar.gz`` next to the session directory.

    Parameters
    ----------
    dic_dir : str
        Directory containing the subjects. It assumes an organization in:
        <subj_id>/<session_id>/<series_id>
    subj_ids : str, list of str, or None, optional
        Subject IDs to be considered. Can be:
        - None: consider all ``sub-*`` directories in ``dic_dir`` (default)
        - str: path to text file containing subject IDs (one per line)
        - list of str: explicit list of subject IDs
    remove_original : bool, optional, default=True
        Whether to remove the original session directories after successful compression.

    Returns
    -------
    list of str
        List of session directories that failed to be compressed (or whose
        original directory could not be removed). Empty list if all successful.

    Raises
    ------
    FileNotFoundError
        If ``dic_dir`` or the subject IDs file does not exist.
    ValueError
        If ``dic_dir`` is not a directory, if subj_ids is not None, str, or a
        list of str, or if the subject IDs file cannot be read.

    Notes
    -----
    Errors that occur while processing an individual session (tar errors,
    permission errors, other filesystem errors) are not raised: they are
    printed, the session is added to the returned list, and processing
    continues with the next session.

    Sessions whose archive already exists are skipped and left untouched.

    Examples
    --------
    >>> # Basic usage - compress all sessions in directory
    >>> failed = compress_dicom_session('/path/to/dicom/directory')
    >>> if not failed:
    ...     print("All sessions compressed successfully")

    >>> # Compress sessions but keep original directories
    >>> failed = compress_dicom_session(
    ...     dic_dir='/path/to/dicom/directory',
    ...     remove_original=False
    ... )

    >>> # Compress sessions for specific subjects only
    >>> failed = compress_dicom_session(
    ...     dic_dir='/path/to/dicom/directory',
    ...     subj_ids=['sub-001', 'sub-002', 'sub-003']
    ... )

    >>> # Use subject IDs from file
    >>> failed = compress_dicom_session(
    ...     dic_dir='/path/to/dicom/directory',
    ...     subj_ids='/path/to/subject_ids.txt'
    ... )
    """
    # Validate input directory
    dic_path = Path(dic_dir)
    if not dic_path.exists():
        raise FileNotFoundError(f"Directory {dic_dir} does not exist")
    if not dic_path.is_dir():
        raise ValueError(f"{dic_dir} is not a directory")

    # Process subject IDs
    if subj_ids is None:
        # Get all subjects with 'sub-' prefix
        subj_ids = sorted(
            item.name
            for item in dic_path.iterdir()
            if item.is_dir() and item.name.startswith("sub-")
        )
    elif isinstance(subj_ids, str):
        # Read subject IDs from file (one per line, blank lines ignored)
        ids_file = subj_ids
        try:
            with open(ids_file, "r", encoding="utf-8") as file:
                subj_ids = [line.strip() for line in file if line.strip()]
        except FileNotFoundError as err:
            raise FileNotFoundError(f"Subject IDs file {ids_file} not found") from err
        except (OSError, UnicodeError) as err:
            raise ValueError(f"Error reading subject IDs file: {err}") from err
    elif isinstance(subj_ids, list):
        # Validate list elements
        if not all(isinstance(subj_id, str) for subj_id in subj_ids):
            raise ValueError("All subject IDs must be strings")
    else:
        raise ValueError("subj_ids must be None, str (file path), or list of str")

    if not subj_ids:
        print("No subjects found to process")
        return []

    n_subj = len(subj_ids)
    failed_sessions = []
    total_sessions = 0
    compressed_sessions = 0
    skipped_sessions = 0

    with Progress() as pb:
        task = pb.add_task("[green]Compressing sessions...", total=n_subj)

        for i, subj_id in enumerate(subj_ids):
            subj_dir = dic_path / subj_id

            pb.update(
                task_id=task,
                description=f"[green]Processing {subj_id} ({i+1}/{n_subj})",
                completed=i,
            )

            # Skip missing subjects; is_dir() also rejects a regular file with
            # the subject's name, which would make iterdir() raise below.
            if not subj_dir.is_dir():
                print(f"Warning: Subject directory {subj_dir} not found, skipping...")
                continue

            # Find all session directories (starting with 'ses-'); sorted so the
            # processing and reporting order is deterministic.
            session_dirs = sorted(
                item
                for item in subj_dir.iterdir()
                if item.is_dir() and item.name.startswith("ses-")
            )
            total_sessions += len(session_dirs)

            for ses_dir in session_dirs:
                # Append the extension to the full name. with_suffix() would
                # replace everything after the last dot of a dotted session
                # name (e.g. 'ses-2021.05.03' -> 'ses-2021.05.tar.gz').
                tar_file_path = ses_dir.with_name(ses_dir.name + ".tar.gz")

                # Skip if tar file already exists
                if tar_file_path.exists():
                    print(f"Warning: {tar_file_path} already exists, skipping...")
                    skipped_sessions += 1
                    continue

                try:
                    with tarfile.open(tar_file_path, "w:gz") as tar:
                        # arcname keeps only the session folder name inside the
                        # archive instead of the full path.
                        tar.add(ses_dir, arcname=ses_dir.name)
                except Exception as err:
                    # Batch boundary: one bad session must not abort the rest.
                    print(f"{_compression_error_label(err)} {ses_dir}: {err}")
                    failed_sessions.append(str(ses_dir))
                    # Remove the truncated archive, otherwise a later run would
                    # treat it as an already-compressed session and skip it.
                    try:
                        tar_file_path.unlink()
                    except OSError:
                        pass  # best effort: nothing was created or it cannot be removed
                    continue

                # The archive is complete at this point; a failure while
                # deleting the original must not discard it.
                if remove_original:
                    try:
                        shutil.rmtree(ses_dir)
                    except Exception as err:
                        print(f"{_compression_error_label(err)} {ses_dir}: {err}")
                        failed_sessions.append(str(ses_dir))
                        continue

                compressed_sessions += 1

        pb.update(
            task_id=task, description="[green]Completed compression", completed=n_subj
        )

    # Report results
    if failed_sessions:
        print("\nTHE PROCESS FAILED TO COMPRESS THE FOLLOWING SESSIONS:")
        for failed_session in failed_sessions:
            print(f"  - {failed_session}")
    else:
        print("\nAll sessions compressed successfully!")

    if skipped_sessions:
        print(f"Skipped {skipped_sessions} sessions whose archive already existed.")

    print(
        f"\nProcessed {n_subj} subjects, {compressed_sessions}/{total_sessions} sessions compressed successfully."
    )
    return failed_sessions
0 commit comments