Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions scripts/create_resources/process_10x_atera.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
#
# Submit the full-size 10x Atera FFPE human breast cancer preview dataset to
# the process_tenx_xenium workflow through `tw launch`.
#
# Needs `git` and `tw` on PATH. Results are published below $publish_dir.

# Enable strict mode before the first command so that a failing
# `git rev-parse` (e.g. when run outside a checkout) or `cd` stops the script
# instead of launching from whatever the current directory happens to be.
set -euo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

publish_dir="s3://openproblems-data/resources/datasets"

# Write the parameters into a private scratch directory instead of the fixed
# path /tmp/params.yaml, which any other script or concurrent run writing to
# the same path would clobber. The file keeps its .yaml name; the directory is
# removed when the script exits.
params_dir=$(mktemp -d)
trap 'rm -rf -- "$params_dir"' EXIT
params_file="$params_dir/params.yaml"

# The heredoc delimiter is unquoted on purpose: $publish_dir is expanded by the
# shell, while \$id is escaped so the literal "$id" placeholder is written to
# the file.
# NOTE(review): the id below says 2025 / "breast_cancer", whereas the matching
# test-resource script uses 2026_10x_human_breast_atera - confirm which is
# intended.
cat > "$params_file" << HERE
param_list:

  - id: "10x_xenium/2025_10x_human_breast_cancer_atera"
    input: https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip
    dataset_name: "Atera FFPE Human Breast Cancer"
    dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer"
    dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells."
    dataset_description: "This human FFPE breast cancer data showcases results using the pre-commercial version of the Atera Whole Transcriptome Assay (WTA), which is currently under development. The assay is designed to closely match the Chromium Flex Apex assay in terms of content and sensitivity, and includes 18,028 genes. A single 5µm FFPE section of breast cancer tissue (DCIS Grade 3, T1c N0 M0) was analyzed, yielding 170,057 detected cells with a median of 2,116 transcripts per cell and 624,095,990 total high-quality decoded transcripts across 58.9 million µm² of tissue area. Output files are formatted to closely resemble Xenium Onboard Analysis v4 file formats."
    dataset_organism: "homo_sapiens"
    segmentation_id: [cell, nucleus]

output_dataset: "\$id/dataset.zarr"
output_state: "\$id/state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
  --revision build/main \
  --pull-latest \
  --main-script target/nextflow/datasets/workflows/process_tenx_xenium/main.nf \
  --workspace 53907369739130 \
  --params-file "$params_file" \
  --config common/nextflow_helpers/labels_tw.config \
  --labels datasets,atera
56 changes: 56 additions & 0 deletions scripts/create_test_resources/2026_10x_human_breast_atera.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
#
# Build the test resource for the 10x Atera FFPE human breast cancer preview
# dataset:
#   1. download the raw Atera output zip and the cell-group annotations,
#   2. convert a cropped region with the process_tenx_xenium workflow,
#   3. sync the result to S3 (currently as a dry run).
#
# Needs `git`, `wget`, `nextflow` (docker profile) and `aws` on PATH.

# Enable strict mode before the first command so that a failing
# `git rev-parse` (e.g. when run outside a checkout) or `cd` aborts the script
# instead of continuing in the wrong directory.
set -euo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

dataset_id="2026_10x_human_breast_atera"
raw_dir="temp/datasets/10x_xenium/$dataset_id"
xe_outs_zip="$raw_dir/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip"
cell_groups_csv="$raw_dir/WTA_Preview_FFPE_Breast_Cancer_cell_groups.csv"
publish_dir="resources_test/common"

#######################################
# Download a file unless it is already present.
# Arguments: $1 - source URL
#            $2 - destination path
# Returns:   0 if the file exists or was downloaded; under `set -e` a failed
#            download aborts the script.
#######################################
fetch_if_missing() {
  local url=$1
  local dest=$2

  if [[ -f "$dest" ]]; then
    return 0
  fi

  # Download to a side file and rename only on success. Writing straight to
  # $dest would leave a truncated file behind after an interrupted download,
  # and the existence check above would then skip it on every later run.
  wget -O "$dest.part" "$url"
  mv -- "$dest.part" "$dest"
}

# mkdir -p is a no-op when the directory already exists
mkdir -p "$raw_dir"

fetch_if_missing \
  "https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip" \
  "$xe_outs_zip"

# The annotations are only downloaded here; this script does not pass them on
# to the workflow.
fetch_if_missing \
  "https://cf.10xgenomics.com/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_cell_groups.csv" \
  "$cell_groups_csv"

# Write the parameters into a private scratch directory instead of the fixed
# path /tmp/params.yaml, which any other script or concurrent run writing to
# the same path would clobber. The file keeps its .yaml name; the directory is
# removed when the script exits.
params_dir=$(mktemp -d)
trap 'rm -rf -- "$params_dir"' EXIT
params_file="$params_dir/params.yaml"

# The heredoc delimiter is unquoted on purpose: the shell variables are
# expanded, while \$id is escaped so the literal "$id" placeholder is written
# to the file.
cat > "$params_file" << HERE
param_list:
  - id: $dataset_id
    input: $xe_outs_zip
    segmentation_id:
      - cell
      - nucleus
    dataset_name: "Atera FFPE Human Breast Cancer"
    dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer"
    dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells."
    dataset_description: "This human FFPE breast cancer data showcases results using the pre-commercial version of the Atera Whole Transcriptome Assay (WTA), which is currently under development. The assay is designed to closely match the Chromium Flex Apex assay in terms of content and sensitivity, and includes 18,028 genes. A single 5µm FFPE section of breast cancer tissue (DCIS Grade 3, T1c N0 M0) was analyzed, yielding 170,057 detected cells with a median of 2,116 transcripts per cell and 624,095,990 total high-quality decoded transcripts across 58.9 million µm² of tissue area. Output files are formatted to closely resemble Xenium Onboard Analysis v4 file formats. Differentially expressed genes from the graph-based clustering results were exported to annotate cell types. Major cell groups were annotated based on Kumar et al. (2023). Invasive versus DCIS (ductal carcinoma in situ) tumor cells were annotated based on molecular markers, myoepithelial cell number, and spatial localization. Similarly, DCIS-associated or invasive cancer-associated fibroblasts (CAFs) were annotated based on their spatial location. We relied partly on H&E to delineate amorphous DCIS and invasive regions. H&E proved insufficient for identifying structured basal-like DCIS which was staged as 'normal' by a pathologist, therefore, we exclusively utilized molecular markers. Cycling cells were validated using the CellCycleScoring function in Seurat. Apocrine cells were identified by histology and PIP expression (encodes prolactin-induced protein)."
    dataset_organism: homo_sapiens
    crop_region_min_x: 5000
    crop_region_max_x: 6000
    crop_region_min_y: 5000
    crop_region_max_y: 6000

publish_dir: $publish_dir
output_dataset: '\$id/dataset.zarr'
output_state: '\$id/state.yaml'
HERE

# convert to zarr
nextflow run . \
  -main-script target/nextflow/datasets/workflows/process_tenx_xenium/main.nf \
  -profile docker \
  -resume \
  -params-file "$params_file"

# sync to s3
# NOTE(review): --dryrun is kept from the original, so nothing is uploaded or
# deleted by this command - presumably it is dropped for the real sync.
aws s3 sync --profile op \
  "$publish_dir/$dataset_id" \
  "s3://openproblems-data/resources_test/common/$dataset_id" \
  --delete --dryrun
70 changes: 70 additions & 0 deletions src/datasets/loaders/tenx_atera/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Viash component definition for the 10x Atera dataset loader.
name: tenx_atera
namespace: datasets/loaders

argument_groups:
  # Where the raw data comes from and which segmentations to record.
  - name: Inputs
    arguments:
      - name: --input
        type: string
        required: true
        description: A 10x atera directory or zip file or download url
      - name: --segmentation_id
        type: string
        required: true
        multiple: true
        description: The segmentation identifier

  # Descriptive metadata; script.py copies these values into the table's `uns`.
  - name: Metadata
    arguments:
      - name: --dataset_id
        type: string
        required: true
        description: "A unique identifier for the dataset"
      - name: --dataset_name
        type: string
        required: true
        description: Nicely formatted name.
      - name: --dataset_url
        type: string
        required: false
        description: Link to the original source of the dataset.
      - name: --dataset_reference
        type: string
        required: false
        description: Bibtex reference of the paper in which the dataset was published.
      - name: --dataset_summary
        type: string
        required: true
        description: Short description of the dataset.
      - name: --dataset_description
        type: string
        required: true
        description: Long description of the dataset.
      - name: --dataset_organism
        type: string
        required: false
        description: The organism of the sample in the dataset.

  # The remaining fields of --output come from the merged API file
  # (presumably the shared file specification - confirm in /src/api).
  - name: Outputs
    arguments:
      - name: "--output"
        __merge__: /src/api/file_common_ist.yaml
        direction: output
        required: true

resources:
  - type: python_script
    path: script.py

engines:
  - type: docker
    image: openproblems/base_python:1
    setup:
      # script.py imports `xenium` from spatialdata_io
      - type: python
        pypi:
          - spatialdata-io
  - type: native

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midmem, midcpu, midtime]
82 changes: 82 additions & 0 deletions src/datasets/loaders/tenx_atera/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os
import shutil
import subprocess
import tempfile
import zipfile

import anndata as ad
import spatialdata as sd
from spatialdata_io import xenium

## VIASH START
# Placeholder arguments for running the script directly during development.
# NOTE(review): presumably Viash substitutes everything between the
# VIASH START / VIASH END markers with the real argument values - confirm.
par = dict(
    input="https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip",
    segmentation_id=["cell", "nucleus"],
    dataset_id="value",
    dataset_name="value",
    dataset_url="value",
    dataset_reference="value",
    dataset_summary="value",
    dataset_description="value",
    dataset_organism="value",
    output="temp/datasets/10x_atera/breast/breast.zarr",
)
# `cpus` is read further down as the reader's n_jobs.
meta = dict(cpus=1)

## VIASH END

# Resolve the input (download it if it is a URL, unpack it if it is a zip),
# read it with the spatialdata-io reader, attach the dataset metadata and write
# the result to par["output"].
#
# Everything up to and including the final write stays inside the temporary
# directory's lifetime: the downloaded/extracted files are deleted when the
# `with` block exits, and the reader may hand back lazily loaded elements that
# still refer to them (assumption - confirm against spatialdata-io).
par_input = par["input"]
with tempfile.TemporaryDirectory() as tmpdirname:
    if par_input.startswith("http"):
        print(f"Downloading data to {tmpdirname}", flush=True)
        file_name = par_input.split("/")[-1]
        download_path = os.path.join(tmpdirname, file_name)
        # Pass the arguments as a list (no shell involved) so characters such
        # as '&' or '?' in the URL are not interpreted, and use check=True so a
        # failed download raises instead of being silently ignored.
        subprocess.run(["wget", par_input, "-O", download_path], check=True)
        par_input = download_path

    if zipfile.is_zipfile(par_input):
        print(f"Extracting input zip to {tmpdirname}", flush=True)
        with zipfile.ZipFile(par_input, "r") as zip_ref:
            zip_ref.extractall(tmpdirname)
        par_input = tmpdirname

    # read the data
    sdata = xenium(
        path=par_input,
        n_jobs=meta["cpus"] or 1,  # fall back to 1 when cpus is unset/0
        cells_boundaries=True,
        nucleus_boundaries=True,
        morphology_focus=True,
        cells_as_circles=False,
    )

    # remove morphology_focus
    _ = sdata.images.pop("morphology_focus")

    print("Add uns to table", flush=True)
    new_uns = {
        "dataset_id": par["dataset_id"],
        "dataset_name": par["dataset_name"],
        "dataset_url": par["dataset_url"],
        "dataset_reference": par["dataset_reference"],
        "dataset_summary": par["dataset_summary"],
        "dataset_description": par["dataset_description"],
        "dataset_organism": par["dataset_organism"],
        "segmentation_id": par["segmentation_id"],
    }
    for key, value in new_uns.items():
        sdata.tables["table"].uns[key] = value

    print(f"Output: {sdata}", flush=True)

    print(f"Writing to '{par['output']}'", flush=True)
    # Clear any previous output directory before writing the new one.
    if os.path.exists(par["output"]):
        shutil.rmtree(par["output"])

    sdata.write(par["output"])
Loading