Skip to content

Commit 1c5939f

Browse files
Add STARC-9 dataset (#10)
Co-authored-by: PierreMarza <pierre.marza@gmail.com>
1 parent 4694631 commit 1c5939f

File tree

7 files changed

+221
-0
lines changed

7 files changed

+221
-0
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# STARC-9 dataset configuration: 9-class colorectal tissue-patch classification.
dataset_name: starc9
nb_classes: 9
# Resolved via OmegaConf env resolver; datasets live under $THUNDER_BASE_DATA_FOLDER/datasets/.
base_data_folder: ${oc.env:THUNDER_BASE_DATA_FOLDER}/datasets/
compatible_tasks: ["adversarial_attack", "alignment_scoring", "image_retrieval", "knn", "linear_probing", "pre_computing_embeddings", "simple_shot", "transformation_invariance", "zero_shot_vlm"]
# Expected number of samples per split (checked against the generated splits).
nb_train_samples: 630000
nb_val_samples: 18000
nb_test_samples: 54000
# md5 checksum used for integrity verification — target artifact not shown here; verify against the splits-checking code.
md5sum: "3010519777b46827fdb16e656ed74975"
image_sizes: [[256, 256]]
# mpp: presumably microns-per-pixel (standard pathology abbreviation) — confirm.
mpp: 0.5
cancer_type: colorectal
# Class abbreviations; full names under id_to_classname below.
classes: ["ADI", "LYM", "MUC", "MUS", "NCS", "NOR", "BLD", "FCT", "TUM"]
class_to_id:
  ADI: 0
  LYM: 1
  MUC: 2
  MUS: 3
  NCS: 4
  NOR: 5
  BLD: 6
  FCT: 7
  TUM: 8
id_to_class:
  0: ADI
  1: LYM
  2: MUC
  3: MUS
  4: NCS
  5: NOR
  6: BLD
  7: FCT
  8: TUM
id_to_classname:
  0: adipose tissue
  1: lymphoid tissue
  2: mucin
  3: muscle
  4: necrosis
  5: normal mucosa
  6: blood
  7: fibroconnective tissue
  8: tumor

src/thunder/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
spider_colorectal,
1717
spider_skin,
1818
spider_thorax,
19+
starc9,
1920
tcga_crc_msi,
2021
tcga_tils,
2122
tcga_uniform,

src/thunder/datasets/data_splits.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def generate_splits(datasets: Union[List[str], str]) -> None:
3939
"spider_colorectal",
4040
"spider_skin",
4141
"spider_thorax",
42+
"starc9",
4243
]
4344
elif datasets[0] == "classification":
4445
datasets = [
@@ -58,6 +59,7 @@ def generate_splits(datasets: Union[List[str], str]) -> None:
5859
"spider_colorectal",
5960
"spider_skin",
6061
"spider_thorax",
62+
"starc9",
6163
]
6264
elif datasets[0] == "segmentation":
6365
datasets = [
@@ -104,6 +106,7 @@ def generate_splits_for_dataset(dataset_name: str) -> None:
104106
create_splits_spider_colorectal,
105107
create_splits_spider_skin,
106108
create_splits_spider_thorax,
109+
create_splits_starc9,
107110
create_splits_tcga_crc_msi,
108111
create_splits_tcga_tils,
109112
create_splits_tcga_uniform,
@@ -128,6 +131,7 @@ def generate_splits_for_dataset(dataset_name: str) -> None:
128131
"spider_colorectal": create_splits_spider_colorectal,
129132
"spider_skin": create_splits_spider_skin,
130133
"spider_thorax": create_splits_spider_thorax,
134+
"starc9": create_splits_starc9,
131135
# Segmentation
132136
"ocelot": create_splits_ocelot,
133137
"pannuke": create_splits_pannuke,

src/thunder/datasets/dataset/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
)
2626
from .spider_skin import create_splits_spider_skin, download_spider_skin
2727
from .spider_thorax import create_splits_spider_thorax, download_spider_thorax
28+
from .starc9 import create_splits_starc9, download_starc9
2829
from .tcga_crc_msi import create_splits_tcga_crc_msi, download_tcga_crc_msi
2930
from .tcga_tils import create_splits_tcga_tils, download_tcga_tils
3031
from .tcga_uniform import create_splits_tcga_uniform, download_tcga_uniform
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
from typing import Dict, List, Optional, Set, Tuple
2+
3+
# Mapping from STARC-9 class abbreviation to integer label id.
# Insertion order defines both the label ids (0-8) and the deterministic
# per-class iteration order used when collecting images.
CLASS_TO_ID = {
    "ADI": 0,  # adipose tissue
    "LYM": 1,  # lymphoid tissue
    "MUC": 2,  # mucin
    "MUS": 3,  # muscle
    "NCS": 4,  # necrosis
    "NOR": 5,  # normal mucosa
    "BLD": 6,  # blood
    "FCT": 7,  # fibroconnective tissue
    "TUM": 8,  # tumor
}

# Accepted image file extensions (compared against lower-cased suffixes).
VALID_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}
16+
17+
18+
def download_starc9(root_folder: str) -> None:
    """
    Download the STARC-9 dataset from Hugging Face and extract all zip files.

    Final split mapping:
        - train: Training_data_normalized
        - val: Validation_data/STANFORD-CRC-HE-VAL-SMALL
        - test: Validation_data/STANFORD-CRC-HE-VAL-LARGE

    CURATED-TCGA is intentionally ignored here.

    :param root_folder: destination folder for the dataset snapshot.
    """
    # Imported lazily so huggingface_hub is only needed when downloading.
    from huggingface_hub import snapshot_download

    # NOTE(review): local_dir_use_symlinks is deprecated (ignored) in recent
    # huggingface_hub releases — confirm against the pinned version.
    snapshot_download(
        repo_id="Path2AI/STARC-9",
        repo_type="dataset",
        local_dir=root_folder,
        local_dir_use_symlinks=False,
    )

    # The snapshot ships split archives; unpack them in place.
    extract_all_zips(root_folder)
39+
40+
41+
def extract_all_zips(root_dir: str) -> None:
    """
    Extract every .zip found while walking root_dir, each into its own
    containing folder.

    NOTE: directories created by an extraction are not re-scanned —
    os.walk has already listed the current folder's sub-directories by
    the time the archive is unpacked — so zips nested inside freshly
    extracted folders are left untouched.

    :param root_dir: folder tree to scan for .zip archives.
    """
    import os

    from ..utils import unzip_file

    for current_root, _, files in os.walk(root_dir):
        for file_name in files:
            if not file_name.lower().endswith(".zip"):
                continue

            unzip_file(
                os.path.join(current_root, file_name),
                current_root,
            )

            # The LARGE validation archive extracts to a generically named
            # "NORMALIZED" folder; rename it to match the split folder name
            # expected by create_splits_starc9.
            if file_name == "STANFORD-CRC-HE-VAL-LARGE-NORMALIZED.zip":
                extracted = os.path.join(current_root, "NORMALIZED")
                renamed = os.path.join(current_root, "STANFORD-CRC-HE-VAL-LARGE")
                # Guard so a re-run (target already renamed) does not crash
                # with FileNotFoundError from os.rename.
                if os.path.isdir(extracted) and not os.path.exists(renamed):
                    os.rename(extracted, renamed)
66+
67+
68+
def collect_images_from_class_root(
    class_root: str,
    class_to_id: Optional[Dict[str, int]] = None,
    valid_exts: Optional[Set[str]] = None,
) -> Tuple[List[str], List[int]]:
    """
    Read all images from a directory structured like:
        class_root/
            ADI/
            LYM/
            ...

    Files are gathered per class in sorted path order, so the output is
    deterministic across runs. (Fixes the previous return annotation,
    which wrongly claimed a third Dict element.)

    :param class_root: folder containing one sub-folder per class.
    :param class_to_id: mapping from class folder name to integer label;
        defaults to the module-level STARC-9 ``CLASS_TO_ID``.
    :param valid_exts: accepted lower-case file extensions; defaults to
        the module-level ``VALID_EXTS``.
    :return: (absolute image paths, matching integer labels).
    :raises FileNotFoundError: if class_root or any expected class folder
        is missing.
    """
    from pathlib import Path

    if class_to_id is None:
        class_to_id = CLASS_TO_ID
    if valid_exts is None:
        valid_exts = VALID_EXTS

    images: List[str] = []
    labels: List[int] = []

    class_root_path = Path(class_root)
    if not class_root_path.exists():
        raise FileNotFoundError(f"Class root does not exist: {class_root}")

    # Fail loudly if any expected class folder is absent, rather than
    # silently producing an incomplete split.
    missing_classes = [c for c in class_to_id if not (class_root_path / c).exists()]
    if missing_classes:
        raise FileNotFoundError(
            f"Missing expected class folders under {class_root}: {missing_classes}"
        )

    for class_name, class_id in class_to_id.items():
        class_dir = class_root_path / class_name
        for img_path in sorted(class_dir.rglob("*")):
            # Extension check is case-insensitive (suffix is lower-cased).
            if img_path.is_file() and img_path.suffix.lower() in valid_exts:
                images.append(str(img_path.resolve()))
                labels.append(class_id)

    return images, labels
101+
102+
103+
def create_splits_starc9(base_folder: str, dataset_cfg: dict) -> None:
    """
    Generating data splits for the STARC-9 dataset.

    :param base_folder: path to the main folder storing datasets.
    :param dataset_cfg: dataset-specific config.
    """
    import os

    from ...utils.constants import UtilsConstants
    from ...utils.utils import set_seed
    from ..data_splits import (
        check_dataset,
        create_few_shot_training_data,
        init_dict,
        save_dict,
    )

    # Fix the random seed so few-shot sampling is reproducible.
    set_seed(UtilsConstants.DEFAULT_SEED.value)

    # Fresh splits dict to populate.
    splits = init_dict()

    # Each split maps to a fixed folder layout produced by download_starc9.
    dataset_root = os.path.join(base_folder, "starc9")
    split_roots = {
        "train": os.path.join(dataset_root, "Training_data_normalized"),
        "val": os.path.join(
            dataset_root, "Validation_data", "STANFORD-CRC-HE-VAL-SMALL"
        ),
        "test": os.path.join(
            dataset_root, "Validation_data", "STANFORD-CRC-HE-VAL-LARGE"
        ),
    }

    # Collect image paths and labels for every split.
    for split_name, split_root in split_roots.items():
        image_paths, image_labels = collect_images_from_class_root(split_root)
        splits[split_name]["images"] = image_paths
        splits[split_name]["labels"] = image_labels

    # Derive the few-shot training subsets.
    splits = create_few_shot_training_data(splits)

    # Validate against the dataset config (counts, checksum, ...).
    check_dataset(
        splits,
        dataset_cfg,
        base_folder,
    )

    # Persist the splits to disk.
    save_dict(splits, os.path.join(base_folder, "data_splits", "starc9.json"))

src/thunder/datasets/download.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False
2727
* spider_colorectal
2828
* spider_skin
2929
* spider_thorax
30+
* starc9
3031
* tcga_crc_msi
3132
* tcga_tils
3233
* tcga_uniform
@@ -65,6 +66,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False
6566
"spider_colorectal",
6667
"spider_skin",
6768
"spider_thorax",
69+
"starc9",
6870
"tcga_crc_msi",
6971
"tcga_tils",
7072
"tcga_uniform",
@@ -84,6 +86,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False
8486
"spider_colorectal",
8587
"spider_skin",
8688
"spider_thorax",
89+
"starc9",
8790
"tcga_crc_msi",
8891
"tcga_tils",
8992
"tcga_uniform",
@@ -160,5 +163,7 @@ def download_dataset(dataset: str):
160163
download_spider_skin(root_folder)
161164
elif dataset == "spider_thorax":
162165
download_spider_thorax(root_folder)
166+
elif dataset == "starc9":
167+
download_starc9(root_folder)
163168
else:
164169
raise ValueError(f"Dataset {dataset} is not supported.")

src/thunder/utils/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ class DatasetConstants(Enum):
6262
"spider_colorectal",
6363
"spider_skin",
6464
"spider_thorax",
65+
"starc9",
6566
"tcga_crc_msi",
6667
"tcga_tils",
6768
"tcga_uniform",

0 commit comments

Comments
 (0)