|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# Made by Paul Kiessling pakiessling@ukaachen.de |
| 4 | + |
| 5 | +import argparse |
| 6 | +import json |
| 7 | +import os |
| 8 | +import shutil |
| 9 | +import tempfile |
| 10 | + |
| 11 | +import pandas as pd |
| 12 | +import scipy |
| 13 | +from pypdl import Downloader |
| 14 | +from spatialdata_io import xenium |
| 15 | + |
# Files to download, keyed by URL. Each value is the MD5 hex digest that
# download_links() validates the downloaded file against.
# The two *_outs.zip archives are the Xenium output bundles of the IDC and ILC
# breast samples; idc.csv / ilc.csv are the annotation tables that main()
# pairs with those archives by file name.
LINKS = {
    "https://s3-us-west-2.amazonaws.com/10x.files/samples/xenium/1.0.2/Xenium_V1_FFPE_Human_Breast_IDC_With_Addon/Xenium_V1_FFPE_Human_Breast_IDC_With_Addon_outs.zip": "7d3374472092b320ee9b876cb56c520b",
    "https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FFPE_Human_Breast_ILC_With_Addon/Xenium_V1_FFPE_Human_Breast_ILC_With_Addon_outs.zip": "cf779754817893dc98ff4311df2db61e",
    "https://zenodo.org/records/15411357/files/idc.csv": "219392e7c41587efaeb315d172fba7b0",
    "https://zenodo.org/records/15411357/files/ilc.csv": "d1b150c706539f0f20d9df5496a08984",
}
| 22 | + |
| 23 | + |
def download_links(links, temp_dir):
    """Download every URL in *links* into *temp_dir* and check its MD5 digest.

    Args:
        links: Mapping of download URL to the expected MD5 hex digest.
        temp_dir: Path handed to the downloader as the download target.

    Raises:
        ValueError: If a downloaded file does not match its expected digest.
    """
    # Every request is sent with a browser-like User-Agent header.
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
    }
    downloader = Downloader(headers=browser_headers)

    # Options shared by all downloads; block=True makes each call wait until
    # the transfer has finished before the checksum is verified.
    shared_options = dict(
        file_path=temp_dir,
        segments=10,
        display=True,
        multithread=True,
        block=True,
        retries=3,
    )

    for url, expected_md5 in links.items():
        print(f"Downloading {url}")
        downloaded = downloader.start(url=url, **shared_options)
        if downloaded.validate_hash(expected_md5, "md5"):
            continue
        raise ValueError(f"File {downloaded} is corrupted")
| 42 | + |
| 43 | + |
def process_files(temp_folder, out_path, annotation_path, sample_name):
    """Load one Xenium run, attach the domain annotation and export it.

    Only the table is read from the Xenium output folder; every boundary,
    label, transcript and morphology element is switched off. The annotation
    CSV is left-joined onto the table's observations (``cell_id`` on the table
    side, ``cell`` on the annotation side). Cells without a ``domain`` value
    are dropped and the remainder is passed to ``process_adata``.

    Args:
        temp_folder: Folder containing the unpacked Xenium output.
        out_path: Output directory forwarded to ``process_adata``.
        annotation_path: CSV file providing the ``cell`` and ``domain`` columns.
        sample_name: Sample name forwarded to ``process_adata``.

    Raises:
        ValueError: If the join changes the number of observations, i.e. a
            cell matches more than one annotation row.
    """
    print(f"Transferring {temp_folder} to {out_path}")
    sdata = xenium(
        temp_folder,
        cells_boundaries=False,
        nucleus_boundaries=False,
        cells_as_circles=False,
        cells_labels=False,
        nucleus_labels=False,
        transcripts=False,
        morphology_mip=False,
        morphology_focus=False,
    )
    adata = sdata["table"]
    annotation = pd.read_csv(annotation_path)

    merged_obs = adata.obs.merge(
        annotation, how="left", left_on="cell_id", right_on="cell"
    )
    if len(merged_obs) != len(adata.obs):
        raise ValueError(
            f"Annotation {annotation_path} matches some cells more than once: "
            f"{len(adata.obs)} observations became {len(merged_obs)} rows"
        )
    # A column-on-column merge returns a fresh 0..n-1 index. A left join keeps
    # the row order of the left frame, so the original observation names can
    # be restored one-to-one instead of being silently replaced.
    merged_obs.index = adata.obs.index
    adata.obs = merged_obs

    # Keep only the cells that received a domain annotation.
    has_domain = adata.obs["domain"].notna().to_numpy()
    adata = adata[has_domain].copy()
    process_adata(adata, out_path, sample_name)
| 64 | + |
| 65 | + |
def process_adata(adata, out_path, sample_name):
    """Write one annotated sample to ``<out_path>/<sample_name>/``.

    The directory is created if needed and receives five files:

    * ``observations.tsv`` - ``adata.obs`` plus a ``selected`` column,
    * ``features.tsv``     - ``adata.var`` plus a ``selected`` column,
    * ``coordinates.tsv``  - ``adata.obsm["spatial"]`` as ``x`` / ``y``,
    * ``counts.mtx``       - ``adata.X`` in Matrix Market format,
    * ``labels.tsv``       - the ``domain`` column of ``adata.obs``.

    All TSV files are written with an empty header field for the index column.

    Args:
        adata: Object exposing ``obs``, ``var``, ``obsm["spatial"]`` and ``X``
            (an AnnData table in this script). It is not modified.
        out_path: Parent output directory.
        sample_name: Name of the sample sub-directory.
    """
    # Import the submodule explicitly: older SciPy releases do not load
    # `scipy.io` on a bare `import scipy`, so `scipy.io.mmwrite` could fail.
    from scipy.io import mmwrite

    sample_dir = os.path.join(out_path, sample_name)
    os.makedirs(sample_dir, exist_ok=True)

    def target(file_name):
        """Return the path of *file_name* inside the sample directory."""
        return os.path.join(sample_dir, file_name)

    # Observations: work on a copy so the caller's table stays untouched.
    observations = adata.obs.copy()
    observations["selected"] = "true"
    observations.to_csv(target("observations.tsv"), sep="\t", index_label="")

    # Features (named `features`, not `vars`, to avoid shadowing the builtin).
    features = adata.var.copy()
    features["selected"] = "true"
    features.to_csv(target("features.tsv"), sep="\t", index_label="")

    # Coordinates, indexed like the observations.
    coordinates = pd.DataFrame(adata.obsm["spatial"], columns=["x", "y"])
    coordinates.index = adata.obs.index
    coordinates.to_csv(target("coordinates.tsv"), sep="\t", index_label="")

    # Count matrix.
    mmwrite(target("counts.mtx"), adata.X)

    # Labels.
    adata.obs["domain"].to_csv(target("labels.tsv"), sep="\t", index_label="")
| 91 | + |
| 92 | + |
def write_json(out_path):
    """Write ``experiment.json`` describing the dataset into *out_path*.

    The file records the technology (``Xenium``), the species (``human``) and
    that the data is not 3D. *out_path* is created when it does not exist yet,
    so the function also works before any sample has been written.

    Args:
        out_path: Directory that receives ``experiment.json``.
    """
    experiment = {
        "technology": "Xenium",
        "species": "human",
        "is_3D": False,
    }
    os.makedirs(out_path, exist_ok=True)
    # Explicit encoding keeps the output independent of the platform default.
    target = os.path.join(out_path, "experiment.json")
    with open(target, "w", encoding="utf-8") as f:
        json.dump(experiment, f)
| 101 | + |
| 102 | + |
def write_table(out_path):
    """Write ``samples.tsv`` listing the ILC and IDC samples into *out_path*.

    Each row gives patient, sample, position, replicate, the sample's
    directory name and its number of clusters. The index column is written
    with an empty header field, as in the other TSV files of this script, so
    the header has exactly as many fields as every data row.

    Args:
        out_path: Directory that receives ``samples.tsv``; created if missing.
    """
    samples = pd.DataFrame(
        {
            "patient": ["ILC", "IDC"],
            "sample": ["ILC", "IDC"],
            "position": [0, 0],
            "replicate": [0, 0],
            "directory": ["ILC", "IDC"],
            "n_clusters": [6, 8],
        }
    )
    os.makedirs(out_path, exist_ok=True)
    # index_label="" (not False): False drops the index field from the header
    # only, leaving the header one column short of the data rows.
    samples.to_csv(os.path.join(out_path, "samples.tsv"), sep="\t", index_label="")
| 114 | + |
| 115 | + |
def main():
    """Download the Xenium breast samples and convert them to Spacehack format.

    Command line: ``-o/--out_dir`` (required) is the output directory.

    Everything in ``LINKS`` is downloaded into a temporary directory. Each
    downloaded ``.zip`` archive is unpacked, paired with its annotation CSV
    and exported to ``<out_dir>/<sample>/`` (``IDC`` or ``ILC``), which is the
    directory name that ``write_table`` records in ``samples.tsv``. Finally
    ``experiment.json`` and ``samples.tsv`` are written to ``<out_dir>``.

    Raises:
        ValueError: If a downloaded archive is not one of the known samples.
    """
    parser = argparse.ArgumentParser(
        description="Convert Xenium data to Spacehack format."
    )
    parser.add_argument(
        "-o", "--out_dir", help="Output directory to write files to.", required=True
    )
    args = parser.parse_args()

    # Archive name (without ".zip") -> (annotation file, sample directory).
    known_samples = {
        "Xenium_V1_FFPE_Human_Breast_IDC_With_Addon_outs": ("idc.csv", "IDC"),
        "Xenium_V1_FFPE_Human_Breast_ILC_With_Addon_outs": ("ilc.csv", "ILC"),
    }

    with tempfile.TemporaryDirectory() as temp_dir:
        download_links(LINKS, temp_dir)
        for file_name in os.listdir(temp_dir):
            if not file_name.endswith(".zip"):
                continue

            sample_name = os.path.splitext(file_name)[0]
            # Reject unexpected archives before spending time unpacking them.
            if sample_name not in known_samples:
                raise ValueError("Unknown sample found in downloaded files")
            annotation_file, sample_name_short = known_samples[sample_name]

            sample_path = os.path.join(temp_dir, sample_name)
            shutil.unpack_archive(os.path.join(temp_dir, file_name), sample_path)

            # Pass the dataset root, not <out_dir>/<sample>: process_adata
            # appends the sample name itself, so joining it here as well would
            # nest the files in <out_dir>/<sample>/<sample>, which does not
            # match the "directory" column of samples.tsv.
            process_files(
                sample_path,
                args.out_dir,
                os.path.join(temp_dir, annotation_file),
                sample_name_short,
            )

    write_json(args.out_dir)
    write_table(args.out_dir)
| 156 | + |
| 157 | + |
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
0 commit comments