From f45dcff701c214bc8f960650983c5b5447245103 Mon Sep 17 00:00:00 2001
From: f641l
Date: Wed, 13 May 2026 17:34:33 +0200
Subject: [PATCH 1/3] adding script for h and e data loader

---
Review notes: the file contents below were amended during review, so the blob
ids on the `index` lines are stale; regenerate the series with
`git format-patch` after applying.

 scripts/create_resources/process_hae.sh       | 38 +++++++++++
 src/datasets/loaders/hae/config.vsh.yaml      | 65 ++++++++++++++++
 src/datasets/loaders/hae/script.py            | 88 ++++++++++++++++++++++
 .../workflows/process_hae/config.vsh.yaml     | 60 +++++++++++++++
 src/datasets/workflows/process_hae/main.nf    | 39 +++++++++++
 5 files changed, 290 insertions(+)
 create mode 100644 scripts/create_resources/process_hae.sh
 create mode 100644 src/datasets/loaders/hae/config.vsh.yaml
 create mode 100644 src/datasets/loaders/hae/script.py
 create mode 100644 src/datasets/workflows/process_hae/config.vsh.yaml
 create mode 100644 src/datasets/workflows/process_hae/main.nf

diff --git a/scripts/create_resources/process_hae.sh b/scripts/create_resources/process_hae.sh
new file mode 100644
index 0000000..b51e807
--- /dev/null
+++ b/scripts/create_resources/process_hae.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -e
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+publish_dir="s3://openproblems-data/resources/datasets"
+
+cat > /tmp/params.yaml << HERE
+param_list:
+
+  - id: "10x_xenium/10x_human_breast_cancer_xenium/rep1"
+    input: https://cf.10xgenomics.com/samples/xenium/1.0.1/Xenium_FFPE_Human_Breast_Cancer_Rep1/Xenium_FFPE_Human_Breast_Cancer_Rep1_he_image.tif
+    dataset_name: "Xenium FFPE Human Breast Cancer Replicate 1"
+    dataset_url: "https://www.10xgenomics.com/products/xenium-in-situ/preview-dataset-human-breast"
+    dataset_summary: "The Xenium data was registered with post-Xenium IF / H&E images (workflow is non-destructive to the tissue) and integrated with Chromium and Visium data."
+    dataset_description: "Two formalin-fixed & paraffin-embedded (FFPE) breast cancer tissue blocks were obtained from Discovery Life Sciences. Sample #1 was annotated by a pathologist to be T2N1M0, Stage II-B, ER+/HER2+/PR−. Sample #2 was characterized as stage pT2 pN1a pMX, ER−/HER2+/PR−. Corresponding dissociated tumor cells for Sample #1, fresh frozen (FF) in liquid nitrogen, were also sampled from the same 2.5 cm biopsy. For the Chromium Flex workflow, two 25 μm curls were pooled as a single replicate. 5 μm sections from Sample #1 were taken from the FFPE tissue using a microtome. Two replicate 5 μm sections were taken each for Visium CytAssist and Xenium. A 5 μm section was also taken from Sample #2 for Xenium."
+    dataset_organism: "homo_sapiens"
+
+output_dataset: "\$id/dataset.tiff"
+output_state: "\$id/state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+tw launch https://github.com/openproblems-bio/task_spatial_segmentation.git \
+  --revision build/main \
+  --pull-latest \
+  --main-script target/nextflow/datasets/workflows/process_hae/main.nf \
+  --workspace 53907369739130 \
+  --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
+  --params-file /tmp/params.yaml \
+  --entry-name auto \
+  --config common/nextflow_helpers/labels_tw.config \
+  --labels task_template,process_datasets
diff --git a/src/datasets/loaders/hae/config.vsh.yaml b/src/datasets/loaders/hae/config.vsh.yaml
new file mode 100644
index 0000000..8e89399
--- /dev/null
+++ b/src/datasets/loaders/hae/config.vsh.yaml
@@ -0,0 +1,65 @@
+name: hae
+namespace: datasets/loaders
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: string
+        name: --input
+        required: true
+        description: A H&E image file
+  - name: Metadata
+    arguments:
+      - type: string
+        name: --dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        type: file
+        direction: output
+        required: true
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: python
+        pypi:
+          - spatialdata-io
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midmem, midcpu, midtime]
\ No newline at end of file
diff --git a/src/datasets/loaders/hae/script.py b/src/datasets/loaders/hae/script.py
new file mode 100644
index 0000000..ed0ff63
--- /dev/null
+++ b/src/datasets/loaders/hae/script.py
@@ -0,0 +1,88 @@
+import spatialdata as sd
+import anndata as ad
+from spatialdata_io import xenium
+import shutil
+import os
+import subprocess
+import zipfile
+import tempfile
+import tifffile as tiff
+import json
+
+## VIASH START
+par = {
+    "input": "https://cf.10xgenomics.com/samples/xenium/1.0.1/Xenium_FFPE_Human_Breast_Cancer_Rep1/Xenium_FFPE_Human_Breast_Cancer_Rep1_he_image.tif",
+    "dataset_id": "value",
+    "dataset_name": "value",
+    "dataset_url": "value",
+    "dataset_reference": "value",
+    "dataset_summary": "value",
+    "dataset_description": "value",
+    "dataset_organism": "value",
+    "output": "temp/datasets/hae/breast/breast.tiff"
+}
+meta = {
+    "cpus": 1,
+}
+
+## VIASH END
+
+# Fetch the H&E image into a temporary directory (downloading and unzipping
+# when needed) and load it into memory before the directory is cleaned up.
+par_input = par["input"]
+with tempfile.TemporaryDirectory() as tmpdirname:
+    if par_input.startswith(("http://", "https://")):
+        print(f"Downloading data to {tmpdirname}", flush=True)
+        file_name = par_input.split("/")[-1]
+        download_path = os.path.join(tmpdirname, file_name)
+        # Run wget without a shell so the URL cannot be interpreted as shell
+        # syntax, and abort on a failed download instead of reading a bad file.
+        subprocess.run(["wget", par_input, "-O", download_path], check=True)
+        par_input = download_path
+
+    if zipfile.is_zipfile(par_input):
+        print(f"Extracting input zip to {tmpdirname}", flush=True)
+        with zipfile.ZipFile(par_input, "r") as zip_ref:
+            zip_ref.extractall(tmpdirname)
+        # tifffile reads a file, not a directory: locate the extracted image.
+        tiff_paths = sorted(
+            os.path.join(root, name)
+            for root, _, names in os.walk(tmpdirname)
+            for name in names
+            if name.lower().endswith((".tif", ".tiff"))
+        )
+        if len(tiff_paths) != 1:
+            raise ValueError(
+                f"Expected exactly one TIFF file in the zip, found {len(tiff_paths)}"
+            )
+        par_input = tiff_paths[0]
+
+    # read the data
+    img = tiff.imread(par_input)
+
+# Only keys declared in config.vsh.yaml are available in `par`.
+metadata = {
+    "dataset_id": par["dataset_id"],
+    "dataset_name": par["dataset_name"],
+    "dataset_url": par["dataset_url"],
+    "dataset_reference": par["dataset_reference"],
+    "dataset_summary": par["dataset_summary"],
+    "dataset_description": par["dataset_description"],
+    "dataset_organism": par["dataset_organism"],
+}
+
+print(f"Writing to '{par['output']}'", flush=True)
+# The output is a single TIFF file; remove whatever is left from an earlier run.
+if os.path.isdir(par["output"]):
+    shutil.rmtree(par["output"])
+elif os.path.exists(par["output"]):
+    os.remove(par["output"])
+os.makedirs(os.path.dirname(par["output"]) or ".", exist_ok=True)
+
+# Store the dataset metadata as JSON in the TIFF description.
+tiff.imwrite(
+    par["output"],
+    img,
+    description=json.dumps(metadata),
+    metadata=metadata,
+)
\ No newline at end of file
diff --git a/src/datasets/workflows/process_hae/config.vsh.yaml b/src/datasets/workflows/process_hae/config.vsh.yaml
new file mode 100644
index 0000000..a5b1792
--- /dev/null
+++ b/src/datasets/workflows/process_hae/config.vsh.yaml
@@ -0,0 +1,60 @@
+name: process_hae
+namespace: datasets/workflows
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: string
+        name: --input
+        required: true
+        description: A H&E image file, a zip file containing one, or a download url
+  - name: Metadata
+    arguments:
+      - type: string
+        name: --id
+        description: "A unique identifier for the dataset"
+        required: true
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+  - name: Outputs
+    arguments:
+      - name: "--output_dataset"
+        type: file
+        direction: output
+        required: true
+        default: "$id/dataset.tiff"
+
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+
+dependencies:
+  - name: datasets/loaders/hae
+
+runners:
+  - type: nextflow
+    directives:
+      label: [highcpu, midmem, hightime]
\ No newline at end of file
diff --git a/src/datasets/workflows/process_hae/main.nf b/src/datasets/workflows/process_hae/main.nf
new file mode 100644
index 0000000..4baffb1
--- /dev/null
+++ b/src/datasets/workflows/process_hae/main.nf
@@ -0,0 +1,39 @@
+workflow auto {
+  findStates(params, meta.config)
+    | meta.workflow.run(
+      auto: [publish: "state"]
+    )
+}
+
+workflow run_wf {
+  take:
+  input_ch
+
+  main:
+  output_ch = input_ch
+
+    // copy id to the state
+    | map{ id, state ->
+      def new_state = state + [dataset_id: id]
+      [id, new_state]
+    }
+
+    | hae.run(
+      fromState: [
+        "input",
+        "dataset_id",
+        "dataset_name",
+        "dataset_url",
+        "dataset_reference",
+        "dataset_summary",
+        "dataset_description",
+        "dataset_organism",
+      ],
+      toState: ["output"]
+    )
+
+    | setState([output_dataset: "output"])
+
+  emit:
+  output_ch
+}
\ No newline at end of file
From 0fd8d99a2a81e0c876c9d1636f9f3652bd025677 Mon Sep 17 00:00:00 2001
From: f641l
Date: Tue, 19 May 2026 11:35:23 +0200
Subject: [PATCH 2/3] bugfix in config.vsh.yaml

---
 src/datasets/loaders/hae/config.vsh.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/datasets/loaders/hae/config.vsh.yaml b/src/datasets/loaders/hae/config.vsh.yaml
index 8e89399..1f89bad 100644
--- a/src/datasets/loaders/hae/config.vsh.yaml
+++ b/src/datasets/loaders/hae/config.vsh.yaml
@@ -59,7 +59,6 @@ engines:
   - type: native
 
 runners:
-  - type: executable
   - type: nextflow
     directives:
       label: [midmem, midcpu, midtime]
\ No newline at end of file
From 85193dcdc08c2d5cc45e729768280d6201d0a58c Mon Sep 17 00:00:00 2001
From: f641l
Date: Tue, 19 May 2026 11:46:25 +0200
Subject: [PATCH 3/3] bugfix in config.vsh.yaml

---
 src/datasets/loaders/hae/config.vsh.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/datasets/loaders/hae/config.vsh.yaml b/src/datasets/loaders/hae/config.vsh.yaml
index 1f89bad..8e89399 100644
--- a/src/datasets/loaders/hae/config.vsh.yaml
+++ b/src/datasets/loaders/hae/config.vsh.yaml
@@ -59,6 +59,7 @@ engines:
   - type: native
 
 runners:
+  - type: executable
   - type: nextflow
     directives:
       label: [midmem, midcpu, midtime]
\ No newline at end of file