From f45dcff701c214bc8f960650983c5b5447245103 Mon Sep 17 00:00:00 2001
From: f641l <florian.heyl@dkfz-heidelberg.de>
Date: Wed, 13 May 2026 17:34:33 +0200
Subject: [PATCH 1/3] adding script for h and e data loader

---
 scripts/create_resources/process_hae.sh       | 38 +++++++++++
 src/datasets/loaders/hae/config.vsh.yaml      | 64 +++++++++++++++++
 src/datasets/loaders/hae/script.py            | 68 +++++++++++++++++++
 .../workflows/process_hae/config.vsh.yaml     | 59 ++++++++++++++++
 src/datasets/workflows/process_hae/main.nf    | 39 +++++++++++
 5 files changed, 268 insertions(+)
 create mode 100644 scripts/create_resources/process_hae.sh
 create mode 100644 src/datasets/loaders/hae/config.vsh.yaml
 create mode 100644 src/datasets/loaders/hae/script.py
 create mode 100644 src/datasets/workflows/process_hae/config.vsh.yaml
 create mode 100644 src/datasets/workflows/process_hae/main.nf

diff --git a/scripts/create_resources/process_hae.sh b/scripts/create_resources/process_hae.sh
new file mode 100644
index 0000000..b51e807
--- /dev/null
+++ b/scripts/create_resources/process_hae.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Launch the H&E processing workflow with `tw launch`, using the parameters
+# written to /tmp/params.yaml below.
+
+# abort as soon as a command fails (including `git rev-parse` below)
+set -e
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+publish_dir="s3://openproblems-data/resources/datasets"
+
+# NOTE(review): the id below says "mouse" but the input is the human breast
+# cancer sample -- confirm the intended id before publishing.
+cat > /tmp/params.yaml << HERE
+param_list:
+
+  - id: "10x_xenium/10x_mouse_breast_cancer_xenium/rep1"
+    input: https://cf.10xgenomics.com/samples/xenium/1.0.1/Xenium_FFPE_Human_Breast_Cancer_Rep1/Xenium_FFPE_Human_Breast_Cancer_Rep1_he_image.tif
+    dataset_name: "Xenium FFPE Human Breast Cancer Replicate 1"
+    dataset_url: "https://www.10xgenomics.com/products/xenium-in-situ/preview-dataset-human-breast"
+    dataset_summary: "The Xenium data was registered with post-Xenium IF / H&E images (workflow is non-destructive to the tissue) and integrated with Chromium and Visium data."
+    dataset_description: "Two formalin-fixed & paraffin-embedded (FFPE) breast cancer tissue blocks were obtained from Discovery Life Sciences. Sample #1 was annotated by a pathologist to be T2N1M0, Stage II-B, ER+/HER2+/PR−. Sample #2 was characterized as stage pT2 pN1a pMX, ER−/HER2+/PR−. Corresponding dissociated tumor cells for Sample #1, fresh frozen (FF) in liquid nitrogen, were also sampled from the same 2.5 cm biopsy. For the Chromium Flex workflow, two 25 μm curls were pooled as a single replicate. 5 μm sections from Sample #1 were taken from the FFPE tissue using a microtome. Two replicate 5 μm sections were taken each for Visium CytAssist and Xenium. A 5 μm section was also taken from Sample #2 for Xenium."
+    dataset_organism: "homo_sapiens"
+
+output_dataset: "\$id/dataset.tiff"
+output_state: "\$id/state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+tw launch https://github.com/openproblems-bio/task_spatial_segmentation.git \
+  --revision build/main \
+  --pull-latest \
+  --main-script target/nextflow/datasets/workflows/process_hae/main.nf \
+  --workspace 53907369739130 \
+  --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
+  --params-file /tmp/params.yaml \
+  --entry-name auto \
+  --config common/nextflow_helpers/labels_tw.config \
+  --labels task_template,process_datasets
diff --git a/src/datasets/loaders/hae/config.vsh.yaml b/src/datasets/loaders/hae/config.vsh.yaml
new file mode 100644
index 0000000..8e89399
--- /dev/null
+++ b/src/datasets/loaders/hae/config.vsh.yaml
@@ -0,0 +1,70 @@
+# Viash component that loads an H&E image and writes it as a TIFF file with
+# the dataset metadata embedded.
+name: hae
+namespace: datasets/loaders
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: string
+        name: --input
+        required: true
+        description: Path or download URL of an H&E image file (TIFF).
+  - name: Metadata
+    arguments:
+      - type: string
+        name: --dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        type: file
+        description: The H&E image as a TIFF file with the dataset metadata embedded.
+        direction: output
+        required: true
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: python
+        pypi:
+          - spatialdata-io
+          # script.py imports tifffile directly, so declare it explicitly
+          - tifffile
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midmem, midcpu, midtime]
\ No newline at end of file
diff --git a/src/datasets/loaders/hae/script.py b/src/datasets/loaders/hae/script.py
new file mode 100644
index 0000000..ed0ff63
--- /dev/null
+++ b/src/datasets/loaders/hae/script.py
@@ -0,0 +1,93 @@
+# Load an H&E image (local path, download url or zip archive) and write it to
+# par["output"] as a TIFF file with the dataset metadata embedded.
+import spatialdata as sd
+import anndata as ad
+from spatialdata_io import xenium
+import shutil
+import os
+import subprocess
+import zipfile
+import tempfile
+import tifffile as tiff
+import json
+
+## VIASH START
+par = {
+    "input": "https://cf.10xgenomics.com/samples/xenium/1.0.1/Xenium_FFPE_Human_Breast_Cancer_Rep1/Xenium_FFPE_Human_Breast_Cancer_Rep1_he_image.tif",
+    "dataset_id": "value",
+    "dataset_name": "value",
+    "dataset_url": "value",
+    "dataset_reference": "value",
+    "dataset_summary": "value",
+    "dataset_description": "value",
+    "dataset_organism": "value",
+    "output": "temp/datasets/hae/breast/breast.tiff"
+}
+meta = {
+    "cpus": 1,
+}
+
+## VIASH END
+
+# File extensions recognised as TIFF images when searching an extracted zip.
+TIFF_SUFFIXES = (".tif", ".tiff")
+
+# Download the data if it's a download url, extract the data if it's a zip file
+par_input = par["input"]
+with tempfile.TemporaryDirectory() as tmpdirname:
+    if par_input.startswith("http"):
+        print(f"Downloading data to {tmpdirname}", flush=True)
+        file_name = par_input.split("/")[-1]
+        download_path = os.path.join(tmpdirname, file_name)
+        # Run wget without a shell and raise if the download fails, so a
+        # broken or partial download is never passed on to the reader.
+        subprocess.run(["wget", par_input, "-O", download_path], check=True)
+        par_input = download_path
+
+    if zipfile.is_zipfile(par_input):
+        print(f"Extracting input zip to {tmpdirname}", flush=True)
+        with zipfile.ZipFile(par_input, "r") as zip_ref:
+            zip_ref.extractall(tmpdirname)
+        # imread needs a file rather than the extraction directory, so locate
+        # the image among the extracted files.
+        tiff_paths = sorted(
+            os.path.join(root, name)
+            for root, _, names in os.walk(tmpdirname)
+            for name in names
+            if name.lower().endswith(TIFF_SUFFIXES)
+        )
+        if len(tiff_paths) != 1:
+            raise ValueError(
+                "Expected exactly one TIFF image in the zip archive, "
+                f"found {len(tiff_paths)}"
+            )
+        par_input = tiff_paths[0]
+
+    # read the data
+    img = tiff.imread(par_input)
+
+    # Dataset metadata embedded in the output file.
+    metadata = {
+        "dataset_id": par["dataset_id"],
+        "dataset_name": par["dataset_name"],
+        "dataset_url": par["dataset_url"],
+        "dataset_reference": par["dataset_reference"],
+        "dataset_summary": par["dataset_summary"],
+        "dataset_description": par["dataset_description"],
+        "dataset_organism": par["dataset_organism"],
+    }
+
+    print(f"Writing to '{par['output']}'", flush=True)
+    # The output is normally a single file, which rmtree cannot delete; only
+    # use rmtree when a directory is in the way.
+    if os.path.isdir(par["output"]):
+        shutil.rmtree(par["output"])
+    elif os.path.exists(par["output"]):
+        os.remove(par["output"])
+
+    tiff.imwrite(
+        par["output"],
+        img,
+        description=json.dumps(metadata),
+        metadata=metadata,
+    )
\ No newline at end of file
diff --git a/src/datasets/workflows/process_hae/config.vsh.yaml b/src/datasets/workflows/process_hae/config.vsh.yaml
new file mode 100644
index 0000000..a5b1792
--- /dev/null
+++ b/src/datasets/workflows/process_hae/config.vsh.yaml
@@ -0,0 +1,62 @@
+# Viash workflow that runs the datasets/loaders/hae component (see main.nf).
+name: process_hae
+namespace: datasets/workflows
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: string
+        name: --input
+        required: true
+        description: Path or download URL of an H&E image file (TIFF).
+  - name: Metadata
+    arguments:
+      - type: string
+        name: --id
+        description: "A unique identifier for the dataset"
+        required: true
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+  - name: Outputs
+    arguments:
+      - name: "--output_dataset"
+        type: file
+        description: The processed H&E image as a TIFF file.
+        direction: output
+        required: true
+        default: "$id/dataset.tiff"
+
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+
+dependencies:
+  - name: datasets/loaders/hae
+
+runners:
+  - type: nextflow
+    directives:
+      label: [highcpu, midmem, hightime]
\ No newline at end of file
diff --git a/src/datasets/workflows/process_hae/main.nf b/src/datasets/workflows/process_hae/main.nf
new file mode 100644
index 0000000..4baffb1
--- /dev/null
+++ b/src/datasets/workflows/process_hae/main.nf
@@ -0,0 +1,40 @@
+// Entry workflow: runs this workflow on the states found from `params`.
+workflow auto {
+  findStates(params, meta.config)
+    | meta.workflow.run(
+      auto: [publish: "state"]
+    )
+}
+
+// Runs the hae loader on every [id, state] tuple of the input channel.
+workflow run_wf {
+  take:
+  input_ch
+
+  main:
+  output_ch = input_ch
+
+    // expose the tuple id to the loader as `dataset_id`
+    | map{ id, state -> [id, state + [dataset_id: id]] }
+
+    // pass each state entry to the loader argument of the same name
+    | hae.run(
+      fromState: [
+        input: "input",
+        dataset_id: "dataset_id",
+        dataset_name: "dataset_name",
+        dataset_url: "dataset_url",
+        dataset_reference: "dataset_reference",
+        dataset_summary: "dataset_summary",
+        dataset_description: "dataset_description",
+        dataset_organism: "dataset_organism"
+      ],
+      toState: [output: "output"]
+    )
+
+    // store the loader output under the workflow's `output_dataset` argument
+    | setState([output_dataset: "output"])
+
+  emit:
+  output_ch
+}
\ No newline at end of file

From 0fd8d99a2a81e0c876c9d1636f9f3652bd025677 Mon Sep 17 00:00:00 2001
From: f641l <florian.heyl@dkfz-heidelberg.de>
Date: Tue, 19 May 2026 11:35:23 +0200
Subject: [PATCH 2/3] bugfix in config.vsh.yaml

---
 src/datasets/loaders/hae/config.vsh.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/datasets/loaders/hae/config.vsh.yaml b/src/datasets/loaders/hae/config.vsh.yaml
index 8e89399..1f89bad 100644
--- a/src/datasets/loaders/hae/config.vsh.yaml
+++ b/src/datasets/loaders/hae/config.vsh.yaml
@@ -58,7 +58,6 @@ engines:
   - type: native
 
 runners:
-  - type: executable
   - type: nextflow
     directives:
       label: [midmem, midcpu, midtime]
\ No newline at end of file

From 85193dcdc08c2d5cc45e729768280d6201d0a58c Mon Sep 17 00:00:00 2001
From: f641l <florian.heyl@dkfz-heidelberg.de>
Date: Tue, 19 May 2026 11:46:25 +0200
Subject: [PATCH 3/3] bugfix in config.vsh.yaml

---
 src/datasets/loaders/hae/config.vsh.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/datasets/loaders/hae/config.vsh.yaml b/src/datasets/loaders/hae/config.vsh.yaml
index 1f89bad..8e89399 100644
--- a/src/datasets/loaders/hae/config.vsh.yaml
+++ b/src/datasets/loaders/hae/config.vsh.yaml
@@ -58,6 +58,7 @@ engines:
   - type: native
 
 runners:
+  - type: executable
   - type: nextflow
     directives:
       label: [midmem, midcpu, midtime]
\ No newline at end of file