diff --git a/scripts/create_resources/process_10x_atera.sh b/scripts/create_resources/process_10x_atera.sh
new file mode 100755
index 0000000..31f0264
--- /dev/null
+++ b/scripts/create_resources/process_10x_atera.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Writes the workflow parameters for the Atera FFPE human breast cancer dataset
+# to /tmp/params.yaml and submits the run with `tw launch`.
+
+# abort as soon as a command fails, including the repository lookup below
+set -e
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+publish_dir="s3://openproblems-data/resources/datasets"
+
+# NOTE(review): the id below uses the 2025_ prefix while the test resource
+# script uses 2026_10x_human_breast_atera -- confirm which year is intended.
+# The heredoc delimiter is unquoted: $publish_dir is expanded by the shell,
+# while \$id is written to the file as a literal $id.
+cat > /tmp/params.yaml << HERE
+param_list:
+
+  - id: "10x_xenium/2025_10x_human_breast_cancer_atera"
+    input: https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip
+    dataset_name: "Atera FFPE Human Breast Cancer"
+    dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer"
+    dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells."
+    dataset_description: "This human FFPE breast cancer data showcases results using the pre-commercial version of the Atera Whole Transcriptome Assay (WTA), which is currently under development. The assay is designed to closely match the Chromium Flex Apex assay in terms of content and sensitivity, and includes 18,028 genes. A single 5µm FFPE section of breast cancer tissue (DCIS Grade 3, T1c N0 M0) was analyzed, yielding 170,057 detected cells with a median of 2,116 transcripts per cell and 624,095,990 total high-quality decoded transcripts across 58.9 million µm² of tissue area. Output files are formatted to closely resemble Xenium Onboard Analysis v4 file formats."
+    dataset_organism: "homo_sapiens"
+    segmentation_id: [cell, nucleus]
+
+output_dataset: "\$id/dataset.zarr"
+output_state: "\$id/state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
+  --revision build/main \
+  --pull-latest \
+  --main-script target/nextflow/datasets/workflows/process_tenx_xenium/main.nf \
+  --workspace 53907369739130 \
+  --params-file /tmp/params.yaml \
+  --config common/nextflow_helpers/labels_tw.config \
+  --labels datasets,atera
diff --git a/scripts/create_test_resources/2026_10x_human_breast_atera.sh b/scripts/create_test_resources/2026_10x_human_breast_atera.sh
new file mode 100755
index 0000000..e1a1dd8
--- /dev/null
+++ b/scripts/create_test_resources/2026_10x_human_breast_atera.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+# Downloads the Atera FFPE human breast cancer preview data into temp/,
+# converts a cropped region to zarr with the process_tenx_xenium workflow and
+# syncs the result to S3 (as a dry run).
+
+# abort as soon as a command fails, including the repository lookup below
+set -e
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+data_dir="temp/datasets/10x_xenium/2026_10x_human_breast_atera"
+xe_outs_zip="$data_dir/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip"
+cell_groups_csv="$data_dir/WTA_Preview_FFPE_Breast_Cancer_cell_groups.csv"
+
+# mkdir -p is a no-op when the directory already exists
+mkdir -p "$data_dir"
+
+# Download to a .part file and rename on success, so that an interrupted
+# download is not mistaken for a complete file on the next run.
+if [ ! -f "$xe_outs_zip" ]; then
+  wget -O "$xe_outs_zip.part" \
+    https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip
+  mv "$xe_outs_zip.part" "$xe_outs_zip"
+fi
+if [ ! -f "$cell_groups_csv" ]; then
+  wget -O "$cell_groups_csv.part" \
+    https://cf.10xgenomics.com/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_cell_groups.csv
+  mv "$cell_groups_csv.part" "$cell_groups_csv"
+fi
+
+cat > /tmp/params.yaml << HERE
+param_list:
+  - id: 2026_10x_human_breast_atera
+    input: $xe_outs_zip
+    segmentation_id:
+      - cell
+      - nucleus
+    dataset_name: "Atera FFPE Human Breast Cancer"
+    dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer"
+    dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells."
+    dataset_description: "This human FFPE breast cancer data showcases results using the pre-commercial version of the Atera Whole Transcriptome Assay (WTA), which is currently under development. The assay is designed to closely match the Chromium Flex Apex assay in terms of content and sensitivity, and includes 18,028 genes. A single 5µm FFPE section of breast cancer tissue (DCIS Grade 3, T1c N0 M0) was analyzed, yielding 170,057 detected cells with a median of 2,116 transcripts per cell and 624,095,990 total high-quality decoded transcripts across 58.9 million µm² of tissue area. Output files are formatted to closely resemble Xenium Onboard Analysis v4 file formats. Differentially expressed genes from the graph-based clustering results were exported to annotate cell types. Major cell groups were annotated based on Kumar et al. (2023). Invasive versus DCIS (ductal carcinoma in situ) tumor cells were annotated based on molecular markers, myoepithelial cell number, and spatial localization. Similarly, DCIS-associated or invasive cancer-associated fibroblasts (CAFs) were annotated based on their spatial location. We relied partly on H&E to delineate amorphous DCIS and invasive regions. H&E proved insufficient for identifying structured basal-like DCIS which was staged as 'normal' by a pathologist, therefore, we exclusively utilized molecular markers. Cycling cells were validated using the CellCycleScoring function in Seurat. Apocrine cells were identified by histology and PIP expression (encodes prolactin-induced protein)."
+    dataset_organism: homo_sapiens
+    crop_region_min_x: 5000
+    crop_region_max_x: 6000
+    crop_region_min_y: 5000
+    crop_region_max_y: 6000
+
+publish_dir: resources_test/common
+output_dataset: '\$id/dataset.zarr'
+output_state: '\$id/state.yaml'
+HERE
+
+# convert to zarr
+nextflow run . \
+  -main-script target/nextflow/datasets/workflows/process_tenx_xenium/main.nf \
+  -profile docker \
+  -resume \
+  -params-file /tmp/params.yaml
+
+# sync to s3
+# NOTE: --dryrun only lists the operations; remove it to actually upload
+aws s3 sync --profile op \
+  "resources_test/common/2026_10x_human_breast_atera" \
+  "s3://openproblems-data/resources_test/common/2026_10x_human_breast_atera" \
+  --delete --dryrun
diff --git a/src/datasets/loaders/tenx_atera/config.vsh.yaml b/src/datasets/loaders/tenx_atera/config.vsh.yaml
new file mode 100644
index 0000000..f83e0db
--- /dev/null
+++ b/src/datasets/loaders/tenx_atera/config.vsh.yaml
@@ -0,0 +1,70 @@
+name: tenx_atera
+namespace: datasets/loaders
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: string
+        name: --input
+        required: true
+        description: A 10x atera directory or zip file or download url
+      - type: string
+        name: --segmentation_id
+        required: true
+        description: The segmentation identifier
+        multiple: true
+  - name: Metadata
+    arguments:
+      - type: string
+        name: --dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        __merge__: /src/api/file_common_ist.yaml
+        direction: output
+        required: true
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: python
+        pypi:
+          - spatialdata-io
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midmem, midcpu, midtime]
diff --git a/src/datasets/loaders/tenx_atera/script.py b/src/datasets/loaders/tenx_atera/script.py
new file mode 100644
index 0000000..2f4c65d
--- /dev/null
+++ b/src/datasets/loaders/tenx_atera/script.py
@@ -0,0 +1,88 @@
+import spatialdata as sd
+import anndata as ad
+from spatialdata_io import xenium
+import shutil
+import os
+import subprocess
+import zipfile
+import tempfile
+
+## VIASH START
+par = {
+    "input": "https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip",
+    "segmentation_id": [
+        "cell",
+        "nucleus",
+    ],
+    "dataset_id": "value",
+    "dataset_name": "value",
+    "dataset_url": "value",
+    "dataset_reference": "value",
+    "dataset_summary": "value",
+    "dataset_description": "value",
+    "dataset_organism": "value",
+    "output": "temp/datasets/10x_atera/breast/breast.zarr"
+}
+meta = {
+    "cpus": 1,
+}
+
+## VIASH END
+
+# Download the data if it's a download url, extract the data if it's a zip file.
+# Everything below, including the final write, runs while the temporary
+# directory still exists.
+par_input = par["input"]
+with tempfile.TemporaryDirectory() as tmpdirname:
+    if par_input.startswith(("http://", "https://")):
+        print(f"Downloading data to {tmpdirname}", flush=True)
+        file_name = par_input.split("/")[-1]
+        download_path = os.path.join(tmpdirname, file_name)
+        # pass the arguments as a list (no shell involved) and raise
+        # CalledProcessError when wget exits with a non-zero status
+        subprocess.run(["wget", par_input, "-O", download_path], check=True)
+        par_input = download_path
+
+    if zipfile.is_zipfile(par_input):
+        print(f"Extracting input zip to {tmpdirname}", flush=True)
+        with zipfile.ZipFile(par_input, "r") as zip_ref:
+            zip_ref.extractall(tmpdirname)
+        par_input = tmpdirname
+
+    # read the data
+    sdata = xenium(
+        path=par_input,
+        n_jobs=meta["cpus"] or 1,
+        cells_boundaries=True,
+        nucleus_boundaries=True,
+        morphology_focus=True,
+        cells_as_circles=False,
+    )
+
+    # remove morphology_focus
+    _ = sdata.images.pop("morphology_focus")
+
+    print("Add uns to table", flush=True)
+    # NOTE(review): dataset_url, dataset_reference and dataset_organism are
+    # declared `required: false` in config.vsh.yaml, so they presumably arrive
+    # as None when unset -- confirm the zarr writer accepts None in `uns`.
+    new_uns = {
+        "dataset_id": par["dataset_id"],
+        "dataset_name": par["dataset_name"],
+        "dataset_url": par["dataset_url"],
+        "dataset_reference": par["dataset_reference"],
+        "dataset_summary": par["dataset_summary"],
+        "dataset_description": par["dataset_description"],
+        "dataset_organism": par["dataset_organism"],
+        "segmentation_id": par["segmentation_id"],
+    }
+    for key, value in new_uns.items():
+        sdata.tables["table"].uns[key] = value
+
+    print(f"Output: {sdata}", flush=True)
+
+    print(f"Writing to '{par['output']}'", flush=True)
+    if os.path.exists(par["output"]):
+        shutil.rmtree(par["output"])
+
+    sdata.write(par["output"])