From 35a0ef62a007e7f6ea1f514b3ce3185c9268ee5a Mon Sep 17 00:00:00 2001
From: f641l <florian.heyl@dkfz-heidelberg.de>
Date: Mon, 18 May 2026 15:14:58 +0200
Subject: [PATCH 1/2] adding atera data loader

---
 scripts/create_resources/process_10x_atera.sh | 37 +++++++++
 .../2026_10x_human_breast_atera.sh            | 52 ++++++++++++
 .../loaders/tenx_atera/config.vsh.yaml        | 70 ++++++++++++++++
 src/datasets/loaders/tenx_atera/script.py     | 82 +++++++++++++++++++
 4 files changed, 241 insertions(+)
 create mode 100755 scripts/create_resources/process_10x_atera.sh
 create mode 100755 scripts/create_test_resources/2026_10x_human_breast_atera.sh
 create mode 100644 src/datasets/loaders/tenx_atera/config.vsh.yaml
 create mode 100644 src/datasets/loaders/tenx_atera/script.py

diff --git a/scripts/create_resources/process_10x_atera.sh b/scripts/create_resources/process_10x_atera.sh
new file mode 100755
index 0000000..31f0264
--- /dev/null
+++ b/scripts/create_resources/process_10x_atera.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# Launch the processing of the 10x Atera FFPE human breast cancer dataset with
+# `tw launch`; results are published below $publish_dir.
+
+# Enable strict mode before anything else so that a failed repository lookup or
+# `cd` aborts the script instead of launching from an unexpected directory.
+set -euo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+publish_dir="s3://openproblems-data/resources/datasets"
+
+# Write the parameters to a private temporary directory rather than a fixed
+# /tmp path, so concurrent runs of resource scripts cannot overwrite each
+# other's parameters. The directory is removed when the script exits.
+params_dir=$(mktemp -d)
+trap 'rm -rf -- "$params_dir"' EXIT
+params_file="$params_dir/params.yaml"
+
+# NOTE(review): the id below uses the year 2025 while the matching test resource
+# script is named 2026_10x_human_breast_atera -- confirm which one is intended.
+# \$id is escaped so that a literal \$id ends up in the file; $publish_dir is
+# expanded by the shell while the file is written.
+cat > "$params_file" << HERE
+param_list:
+
+  - id: "10x_xenium/2025_10x_human_breast_cancer_atera"
+    input: https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip
+    dataset_name: "Atera FFPE Human Breast Cancer"
+    dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer"
+    dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells."
+    dataset_description: "This human FFPE breast cancer data showcases results using the pre-commercial version of the Atera Whole Transcriptome Assay (WTA), which is currently under development. The assay is designed to closely match the Chromium Flex Apex assay in terms of content and sensitivity, and includes 18,028 genes. A single 5µm FFPE section of breast cancer tissue (DCIS Grade 3, T1c N0 M0) was analyzed, yielding 170,057 detected cells with a median of 2,116 transcripts per cell and 624,095,990 total high-quality decoded transcripts across 58.9 million µm² of tissue area. Output files are formatted to closely resemble Xenium Onboard Analysis v4 file formats."
+    dataset_organism: "homo_sapiens"
+    segmentation_id: [cell, nucleus]
+
+output_dataset: "\$id/dataset.zarr"
+output_state: "\$id/state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
+  --revision build/main \
+  --pull-latest \
+  --main-script target/nextflow/datasets/workflows/process_tenx_xenium/main.nf \
+  --workspace 53907369739130 \
+  --params-file "$params_file" \
+  --config common/nextflow_helpers/labels_tw.config \
+  --labels datasets,atera
diff --git a/scripts/create_test_resources/2026_10x_human_breast_atera.sh b/scripts/create_test_resources/2026_10x_human_breast_atera.sh
new file mode 100755
index 0000000..4831889
--- /dev/null
+++ b/scripts/create_test_resources/2026_10x_human_breast_atera.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Create the cropped 10x Atera human breast cancer test resource.
+
+# abort on the first error, including a failed repository lookup or cd below
+set -euo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+mkdir -p temp/datasets/10x_xenium/2026_10x_human_breast_atera
+if [ ! -f temp/datasets/10x_xenium/2026_10x_human_breast_atera/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip ]; then
+  wget -O temp/datasets/10x_xenium/2026_10x_human_breast_atera/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip \
+    https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip
+fi
+
+cat > /tmp/params.yaml << HERE
+param_list:
+  - id: 2026_10x_human_breast_atera
+    input: temp/datasets/10x_xenium/2026_10x_human_breast_atera/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip
+    segmentation_id:
+      - cell
+      - nucleus
+    dataset_name: "Atera FFPE Human Breast Cancer"
+    dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer"
+    dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells."
+    dataset_description: "This human FFPE breast cancer data showcases results using the pre-commercial version of the Atera Whole Transcriptome Assay (WTA), which is currently under development. The assay is designed to closely match the Chromium Flex Apex assay in terms of content and sensitivity, and includes 18,028 genes. A single 5µm FFPE section of breast cancer tissue (DCIS Grade 3, T1c N0 M0) was analyzed, yielding 170,057 detected cells with a median of 2,116 transcripts per cell and 624,095,990 total high-quality decoded transcripts across 58.9 million µm² of tissue area. Output files are formatted to closely resemble Xenium Onboard Analysis v4 file formats."
+    dataset_organism: homo_sapiens
+    crop_region_min_x: 5000
+    crop_region_max_x: 6000
+    crop_region_min_y: 5000
+    crop_region_max_y: 6000
+
+publish_dir: resources_test/common
+output_dataset: '\$id/dataset.zarr'
+output_state: '\$id/state.yaml'
+HERE
+
+# convert to zarr
+nextflow run . \
+  -main-script target/nextflow/datasets/workflows/process_tenx_xenium/main.nf \
+  -profile docker \
+  -resume \
+  -params-file /tmp/params.yaml
+
+# sync to s3
+# NOTE: --dryrun only prints the operations that would be performed; remove it
+# to actually upload.
+aws s3 sync --profile op \
+  "resources_test/common/2026_10x_human_breast_atera" \
+  "s3://openproblems-data/resources_test/common/2026_10x_human_breast_atera" \
+  --delete --dryrun
diff --git a/src/datasets/loaders/tenx_atera/config.vsh.yaml b/src/datasets/loaders/tenx_atera/config.vsh.yaml
new file mode 100644
index 0000000..f83e0db
--- /dev/null
+++ b/src/datasets/loaders/tenx_atera/config.vsh.yaml
@@ -0,0 +1,73 @@
+name: tenx_atera
+namespace: datasets/loaders
+
+argument_groups:
+  # Source data and the segmentation identifiers to record.
+  - name: Inputs
+    arguments:
+      - name: --input
+        type: string
+        description: A 10x atera directory or zip file or download url
+        required: true
+      - name: --segmentation_id
+        type: string
+        description: The segmentation identifier
+        required: true
+        multiple: true
+  # Descriptive dataset metadata.
+  - name: Metadata
+    arguments:
+      - name: --dataset_id
+        type: string
+        description: "A unique identifier for the dataset"
+        required: true
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - name: --dataset_url
+        type: string
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+  # Output; further fields are merged in from the referenced API file.
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        __merge__: /src/api/file_common_ist.yaml
+        direction: output
+        required: true
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: python
+        pypi:
+          - spatialdata-io
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midmem, midcpu, midtime]
diff --git a/src/datasets/loaders/tenx_atera/script.py b/src/datasets/loaders/tenx_atera/script.py
new file mode 100644
index 0000000..2f4c65d
--- /dev/null
+++ b/src/datasets/loaders/tenx_atera/script.py
@@ -0,0 +1,91 @@
+import spatialdata as sd
+import anndata as ad
+from spatialdata_io import xenium
+import shutil
+import os
+import subprocess
+import zipfile
+import tempfile
+
+## VIASH START
+par = {
+    "input": "https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip",
+    "segmentation_id": [
+        "cell",
+        "nucleus",
+    ],
+    "dataset_id": "value",
+    "dataset_name": "value",
+    "dataset_url": "value",
+    "dataset_reference": "value",
+    "dataset_summary": "value",
+    "dataset_description": "value",
+    "dataset_organism": "value",
+    "output": "temp/datasets/10x_atera/breast/breast.zarr"
+}
+meta = {
+    "cpus": 1,
+}
+
+## VIASH END
+
+# Load a 10x Atera output bundle with the `xenium` reader, attach the dataset
+# metadata to the table and write the result to par["output"].
+#
+# The input may be a directory, a zip file or a download url. Downloads and
+# extracted archives live in a temporary directory that is removed when the
+# `with` block exits, so reading and writing both happen inside it.
+par_input = par["input"]
+with tempfile.TemporaryDirectory() as tmpdirname:
+    if par_input.startswith("http"):
+        print(f"Downloading data to {tmpdirname}", flush=True)
+        file_name = par_input.split("/")[-1]
+        download_path = os.path.join(tmpdirname, file_name)
+        # Invoke wget with an argument list (no shell) so that characters in the
+        # url are never interpreted by a shell, and let a failed download raise
+        # instead of continuing with a missing or partial file.
+        subprocess.run(["wget", par_input, "-O", download_path], check=True)
+        par_input = download_path
+
+    if zipfile.is_zipfile(par_input):
+        print(f"Extracting input zip to {tmpdirname}", flush=True)
+        with zipfile.ZipFile(par_input, "r") as zip_ref:
+            zip_ref.extractall(tmpdirname)
+            par_input = tmpdirname
+
+    # read the data
+    sdata = xenium(
+        path=par_input,
+        # fall back to a single job when no cpu count is provided
+        n_jobs=meta["cpus"] or 1,
+        cells_boundaries=True,
+        nucleus_boundaries=True,
+        morphology_focus=True,
+        cells_as_circles=False,
+    )
+
+    # remove morphology_focus
+    _ = sdata.images.pop("morphology_focus")
+
+    print("Add uns to table", flush=True)
+    new_uns = {
+        "dataset_id": par["dataset_id"],
+        "dataset_name": par["dataset_name"],
+        "dataset_url": par["dataset_url"],
+        "dataset_reference": par["dataset_reference"],
+        "dataset_summary": par["dataset_summary"],
+        "dataset_description": par["dataset_description"],
+        "dataset_organism": par["dataset_organism"],
+        "segmentation_id": par["segmentation_id"],
+    }
+    for key, value in new_uns.items():
+        sdata.tables["table"].uns[key] = value
+
+    print(f"Output: {sdata}", flush=True)
+
+    print(f"Writing to '{par['output']}'", flush=True)
+    # replace any existing output
+    if os.path.exists(par["output"]):
+        shutil.rmtree(par["output"])
+
+    sdata.write(par["output"])

From 08f78cc912fb941ef1206a4d6bfd2bea3725e85c Mon Sep 17 00:00:00 2001
From: f641l <florian.heyl@dkfz-heidelberg.de>
Date: Tue, 19 May 2026 10:22:41 +0200
Subject: [PATCH 2/2] adding also the cell type annotation (gt) from 10x

---
 .../create_test_resources/2026_10x_human_breast_atera.sh    | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scripts/create_test_resources/2026_10x_human_breast_atera.sh b/scripts/create_test_resources/2026_10x_human_breast_atera.sh
index 4831889..e1a1dd8 100755
--- a/scripts/create_test_resources/2026_10x_human_breast_atera.sh
+++ b/scripts/create_test_resources/2026_10x_human_breast_atera.sh
@@ -15,6 +15,18 @@ if [ ! -f temp/datasets/10x_xenium/2026_10x_human_breast_atera/WTA_Preview_FFPE_
   wget -O temp/datasets/10x_xenium/2026_10x_human_breast_atera/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip \
     https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip
 fi
+# cell type annotation (ground truth) provided by 10x for this dataset
+# NOTE(review): the file is only downloaded here; the params below do not
+# reference it -- confirm where it is meant to be consumed.
+if [ ! -f temp/datasets/10x_xenium/2026_10x_human_breast_atera/WTA_Preview_FFPE_Breast_Cancer_cell_groups.csv ]; then
+  # Download under a temporary name and rename on success: wget -O creates the
+  # target even when the transfer fails, which would make the check above skip
+  # the download on the next run. With set -e a failed wget aborts before mv.
+  wget -O temp/datasets/10x_xenium/2026_10x_human_breast_atera/WTA_Preview_FFPE_Breast_Cancer_cell_groups.csv.part \
+    https://cf.10xgenomics.com/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_cell_groups.csv
+  mv temp/datasets/10x_xenium/2026_10x_human_breast_atera/WTA_Preview_FFPE_Breast_Cancer_cell_groups.csv.part \
+    temp/datasets/10x_xenium/2026_10x_human_breast_atera/WTA_Preview_FFPE_Breast_Cancer_cell_groups.csv
+fi
 
 cat > /tmp/params.yaml << HERE
 param_list:
@@ -26,7 +30,7 @@ param_list:
     dataset_name: "Atera FFPE Human Breast Cancer"
     dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer"
     dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells."
-    dataset_description: "This human FFPE breast cancer data showcases results using the pre-commercial version of the Atera Whole Transcriptome Assay (WTA), which is currently under development. The assay is designed to closely match the Chromium Flex Apex assay in terms of content and sensitivity, and includes 18,028 genes. A single 5µm FFPE section of breast cancer tissue (DCIS Grade 3, T1c N0 M0) was analyzed, yielding 170,057 detected cells with a median of 2,116 transcripts per cell and 624,095,990 total high-quality decoded transcripts across 58.9 million µm² of tissue area. Output files are formatted to closely resemble Xenium Onboard Analysis v4 file formats."
+    dataset_description: "This human FFPE breast cancer data showcases results using the pre-commercial version of the Atera Whole Transcriptome Assay (WTA), which is currently under development. The assay is designed to closely match the Chromium Flex Apex assay in terms of content and sensitivity, and includes 18,028 genes. A single 5µm FFPE section of breast cancer tissue (DCIS Grade 3, T1c N0 M0) was analyzed, yielding 170,057 detected cells with a median of 2,116 transcripts per cell and 624,095,990 total high-quality decoded transcripts across 58.9 million µm² of tissue area. Output files are formatted to closely resemble Xenium Onboard Analysis v4 file formats. Differentially expressed genes from the graph-based clustering results were exported to annotate cell types. Major cell groups were annotated based on Kumar et al. (2023). Invasive versus DCIS (ductal carcinoma in situ) tumor cells were annotated based on molecular markers, myoepithelial cell number, and spatial localization. Similarly, DCIS-associated or invasive cancer-associated fibroblasts (CAFs) were annotated based on their spatial location. We relied partly on H&E to delineate amorphous DCIS and invasive regions. H&E proved insufficient for identifying structured basal-like DCIS which was staged as 'normal' by a pathologist, therefore, we exclusively utilized molecular markers. Cycling cells were validated using the CellCycleScoring function in Seurat. Apocrine cells were identified by histology and PIP expression (encodes prolactin-induced protein)."
     dataset_organism: homo_sapiens
     crop_region_min_x: 5000
     crop_region_max_x: 6000