From 4a6f1f3e665afb49d7465dd0800a8fe9eb280a98 Mon Sep 17 00:00:00 2001
From: Friedrich Preusser
Date: Wed, 20 May 2026 16:52:54 +0200
Subject: [PATCH] Add MERSCOPE data conversion script from raw to spatialdata
 (in spatial imaging dataset openproblems convention) for Vizgen datasets

---
 .../vzg_merscope/convert_to_spatialdata.py    | 121 ++++++++++++++++++
 .../vzg_merscope/metadata_breastcancer.yml    |  18 +++
 .../vzg_merscope/metadata_humanbrain.yml      |  14 ++
 3 files changed, 153 insertions(+)
 create mode 100644 src/datasets/loaders/vzg_merscope/convert_to_spatialdata.py
 create mode 100644 src/datasets/loaders/vzg_merscope/metadata_breastcancer.yml
 create mode 100644 src/datasets/loaders/vzg_merscope/metadata_humanbrain.yml

diff --git a/src/datasets/loaders/vzg_merscope/convert_to_spatialdata.py b/src/datasets/loaders/vzg_merscope/convert_to_spatialdata.py
new file mode 100644
index 0000000..3f8a4a8
--- /dev/null
+++ b/src/datasets/loaders/vzg_merscope/convert_to_spatialdata.py
@@ -0,0 +1,121 @@
+"""Convert MERSCOPE regions to SpatialData .zarr files.
+
+Usage:
+    python convert_to_spatialdata.py [CONFIG_YML]
+
+CONFIG_YML defaults to 'metadata.yml' in the current working directory. Pass
+e.g. 'metadata_breastcancer.yml' or 'metadata_humanbrain.yml' to pick a dataset.
+
+Produces one .zarr per region with:
+  images:  'morphology_mip'
+  points:  'transcripts'
+  shapes:  'cell_boundaries'  (from cell_boundaries.parquet in region folder)
+  tables:  'table'            (obs × var counts, linked to cell_boundaries)
+  coordinate_systems: 'global'
+"""
+
+import shutil
+import sys
+from pathlib import Path
+
+import spatialdata as sd
+import yaml
+from spatialdata_io import merscope
+
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+# The config path comes from the first command-line argument so the per-dataset
+# files shipped next to this script can be used without renaming them.
+CONFIG_PATH = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("metadata.yml")
+
+with open(CONFIG_PATH, encoding="utf-8") as f:
+    cfg = yaml.safe_load(f)
+
+BASE_DIR = Path(cfg["base_dir"])
+OUTPUT_DIR = Path(cfg["output_dir"])
+REGIONS = cfg["regions"]
+SLIDE_NAME = BASE_DIR.name
+Z_LAYER = cfg["z_layer"]  # embedded in the image key below, so a single value
+DATASET_METADATA = cfg["dataset"]
+
+# Keys under which spatialdata stores the table -> element link in table.uns.
+ATTRS_KEY = sd.models.TableModel.ATTRS_KEY
+REGION_KEY_KEY = sd.models.TableModel.REGION_KEY_KEY
+
+
+def rename_element(elements, old_key, new_key):
+    """Move ``elements[old_key]`` to ``elements[new_key]``.
+
+    Returns True when the element was renamed. A missing ``old_key`` is
+    reported and left alone so the remaining elements are still converted.
+    """
+    if old_key not in elements:
+        print(f"[warn] '{old_key}' not found; available: {list(elements)}")
+        return False
+    elements[new_key] = elements.pop(old_key)
+    return True
+
+
+# ---------------------------------------------------------------------------
+# Load & convert each region
+# ---------------------------------------------------------------------------
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+for region in REGIONS:
+    region_dir = BASE_DIR / region
+    dataset_id = f"{SLIDE_NAME}_{region}"
+    print(f"\n{'='*60}")
+    print(f"Processing {region} (dataset_id={dataset_id})")
+    print(f"{'='*60}")
+
+    # --- Load with spatialdata-io merscope reader ---
+    sdata = merscope(
+        path=region_dir,
+        z_layers=Z_LAYER,
+        region_name=region,
+        slide_name=SLIDE_NAME,
+        backend=None,
+        transcripts=True,
+        cells_boundaries=True,
+        cells_table=True,
+        mosaic_images=True,
+    )
+    print(f"[load] {sdata}")
+
+    # --- Rename elements to canonical names ---
+    rename_element(sdata.images, f"{dataset_id}_z{Z_LAYER}", "morphology_mip")
+    rename_element(sdata.points, f"{dataset_id}_transcripts", "transcripts")
+    shapes_renamed = rename_element(
+        sdata.shapes, f"{dataset_id}_polygons", "cell_boundaries"
+    )
+
+    table = sdata.tables["table"]
+
+    # --- Re-link table to the renamed shapes ---
+    # The table's annotation target is stored by element name, so it must be
+    # updated to 'cell_boundaries' or the table <-> shapes link is lost.
+    table_attrs = table.uns.get(ATTRS_KEY)
+    if shapes_renamed and table_attrs:
+        region_key = table_attrs[REGION_KEY_KEY]
+        # The region column must agree with the new target before re-linking.
+        table.obs[region_key] = "cell_boundaries"
+        table.obs[region_key] = table.obs[region_key].astype("category")
+        sdata.set_table_annotates_spatialelement(
+            "table", region="cell_boundaries", region_key=region_key
+        )
+
+    # --- Update uns metadata on table ---
+    # Build a per-region mapping instead of mutating the shared config dict.
+    table.uns.update({**DATASET_METADATA, "dataset_id": dataset_id})
+
+    print(f"[done] {sdata}")
+
+    # --- Write output ---
+    out_path = OUTPUT_DIR / f"{region}.zarr"
+    if out_path.exists():
+        shutil.rmtree(out_path)
+    sdata.write(out_path)
+    print(f"[write] {out_path}")
+
+print("\nDone.")
diff --git a/src/datasets/loaders/vzg_merscope/metadata_breastcancer.yml b/src/datasets/loaders/vzg_merscope/metadata_breastcancer.yml
new file mode 100644
index 0000000..3e1d754
--- /dev/null
+++ b/src/datasets/loaders/vzg_merscope/metadata_breastcancer.yml
@@ -0,0 +1,18 @@
+base_dir: "data/BreastCancerTMA/202409242358_240916JHHUBC0005XQ-V2V-HubcTMA-V2-BY_VMSC02511"
+output_dir: "output/BreastCancerTMA"
+regions:
+  - region_R1
+  - region_R2
+  - region_R3
+  - region_R4
+  - region_R5
+z_layer: 3
+
+dataset:
+  dataset_name: "240916JHHUBC0005XQ-V2V-HubcTMA-V2-BY"
+  dataset_url: "https://info.vizgen.com/merfish-2.0-data-release-form"
+  dataset_reference: ""
+  dataset_summary: "MERSCOPE V2 data release."
+  dataset_description: "MERSCOPE V2 data release. Human Breast Cancer TMA FFPE, D2M1774, V2 SOP, cell bond stain"
+  dataset_organism: "Homo sapiens"
+  segmentation_id: "cellpose"
diff --git a/src/datasets/loaders/vzg_merscope/metadata_humanbrain.yml b/src/datasets/loaders/vzg_merscope/metadata_humanbrain.yml
new file mode 100644
index 0000000..1b11993
--- /dev/null
+++ b/src/datasets/loaders/vzg_merscope/metadata_humanbrain.yml
@@ -0,0 +1,14 @@
+base_dir: "data/HumanBrain/202503211645_HuBrain-FF-22676A1-D2M1769-RC_VMSC16710"
+output_dir: "output/HumanBrain"
+regions:
+  - region_R1
+z_layer: 3
+
+dataset:
+  dataset_name: "HuBrain-FF-22676A1-D2M1769-RC"
+  dataset_url: "https://info.vizgen.com/access-to-merfish-2.0-data-release"
+  dataset_reference: ""
+  dataset_summary: "MERSCOPE V2 Human Brain fresh frozen sample."
+  dataset_description: "MERSCOPE V2 Human Brain fresh frozen, sample 22676A1, panel DM1769, Cellpose segmentation."
+  dataset_organism: "Homo sapiens"
+  segmentation_id: "cellpose"