From 6f4feab8dc48a651656a61856d262e1e472198a2 Mon Sep 17 00:00:00 2001 From: Koen Van den Berge Date: Tue, 19 May 2026 11:10:34 +0200 Subject: [PATCH 01/11] draft dataset loader python --- .../tenx_xenium_groundtruth/config.vsh.yaml | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 src/datasets/loaders/tenx_xenium_groundtruth/config.vsh.yaml diff --git a/src/datasets/loaders/tenx_xenium_groundtruth/config.vsh.yaml b/src/datasets/loaders/tenx_xenium_groundtruth/config.vsh.yaml new file mode 100644 index 0000000..95165d4 --- /dev/null +++ b/src/datasets/loaders/tenx_xenium_groundtruth/config.vsh.yaml @@ -0,0 +1,70 @@ +name: tenx_xenium_groundtruth +namespace: datasets/loaders + +argument_groups: + - name: Inputs + arguments: + - type: string + name: --input + required: true + description: A 10x xenium directory or zip file or download url or spatialData object + - type: string + name: --segmentation_id + required: true + description: The segmentation identifier + multiple: true + - name: Metadata + arguments: + - type: string + name: --dataset_id + description: "A unique identifier for the dataset" + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: --dataset_url + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the sample in the dataset. 
+ required: false + - name: Outputs + arguments: + - name: "--output" + __merge__: /src/api/file_common_ist.yaml + direction: output + required: true + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + setup: + - type: python + pypi: + - spatialdata-io + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [midmem, midcpu, midtime] From d134ec8ade4153683b3c15e4e597a5150fc3d3fe Mon Sep 17 00:00:00 2001 From: Koen Van den Berge Date: Tue, 19 May 2026 11:11:12 +0200 Subject: [PATCH 02/11] draft dataset loader --- .../loaders/tenx_xenium_groundtruth/script.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/datasets/loaders/tenx_xenium_groundtruth/script.py diff --git a/src/datasets/loaders/tenx_xenium_groundtruth/script.py b/src/datasets/loaders/tenx_xenium_groundtruth/script.py new file mode 100644 index 0000000..e6c4a59 --- /dev/null +++ b/src/datasets/loaders/tenx_xenium_groundtruth/script.py @@ -0,0 +1,69 @@ +import spatialdata as sd +import anndata as ad +from spatialdata_io import xenium +import shutil +import os +import zipfile +import tempfile + +## VIASH START +par = { + "input": "temp/datasets/10x_xenium/cervical_cancer/spatialData.zarr", + "segmentation_id": [ + "cell", + "nucleus", + ], + "dataset_id": "value", + "dataset_name": "value", + "dataset_url": "value", + "dataset_reference": "value", + "dataset_summary": "value", + "dataset_description": "value", + "dataset_organism": "value", + "output": "temp/datasets/10x_xenium/cervical_cancer/spatialData.zarr" +} +meta = { + "cpus": 1, +} + +## VIASH END + +# Download the data if it's a download url, extract the data if it's a zip file +par_input = par["input"] +with tempfile.TemporaryDirectory() as tmpdirname: + + # read the data + sdata = sd.read_zarr( + store=par_input, + selection=None + ) + + # remove morphology_focus + _ = sdata.images.pop("morphology_focus") + + 
print("Add uns to table", flush=True) + new_uns = { + "dataset_id": par["dataset_id"], + "dataset_name": par["dataset_name"], + "dataset_url": par["dataset_url"], + "dataset_reference": par["dataset_reference"], + "dataset_summary": par["dataset_summary"], + "dataset_description": par["dataset_description"], + "dataset_organism": par["dataset_organism"], + "segmentation_id": par["segmentation_id"], + } + for key, value in new_uns.items(): + sdata.tables["table"].uns[key] = value + + # add ground truth cell labels + # ... + + print(f"Output: {sdata}", flush=True) + + print(f"Writing to '{par['output']}'", flush=True) + if os.path.exists(par["output"]): + shutil.rmtree(par["output"]) + + print(f"Output: {sdata}") + + sdata.write(par["output"]) From 9257f8f8aad0b28f59649a54b30de645045d435d Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 11:25:05 +0200 Subject: [PATCH 03/11] wip rename cell type Signed-off-by: Robrecht Cannoodt --- src/datasets/loaders/tenx_xenium_groundtruth/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/loaders/tenx_xenium_groundtruth/script.py b/src/datasets/loaders/tenx_xenium_groundtruth/script.py index e6c4a59..247cd2e 100644 --- a/src/datasets/loaders/tenx_xenium_groundtruth/script.py +++ b/src/datasets/loaders/tenx_xenium_groundtruth/script.py @@ -56,7 +56,7 @@ sdata.tables["table"].uns[key] = value # add ground truth cell labels - # ... 
+ sdata.tables["table"].obs["groundtruth_celltype"] = sdata.tables["table"].obs.pop("histoplus_cell_class") print(f"Output: {sdata}", flush=True) From 8885ed27629f96ed9d2bae14c2261702daed8607 Mon Sep 17 00:00:00 2001 From: Koen Van den Berge Date: Tue, 19 May 2026 11:40:28 +0200 Subject: [PATCH 04/11] accommodate to expected naming convention --- .../loaders/tenx_xenium_groundtruth/script.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/datasets/loaders/tenx_xenium_groundtruth/script.py b/src/datasets/loaders/tenx_xenium_groundtruth/script.py index 247cd2e..0fcc9cd 100644 --- a/src/datasets/loaders/tenx_xenium_groundtruth/script.py +++ b/src/datasets/loaders/tenx_xenium_groundtruth/script.py @@ -38,8 +38,7 @@ selection=None ) - # remove morphology_focus - _ = sdata.images.pop("morphology_focus") + print("Add uns to table", flush=True) new_uns = { @@ -58,6 +57,23 @@ # add ground truth cell labels sdata.tables["table"].obs["groundtruth_celltype"] = sdata.tables["table"].obs.pop("histoplus_cell_class") + # rename Images + ## rename raw images to accomodate format + sdata.images['image'] = sdata.images['morphology_focus'] + ## rm morphology_focus + _ = sdata.images.pop("morphology_focus") + ## rename hne image + sdata.images['he_image'] = sdata.images['hne_aligned'] + ## rm hne_aligned + _ = sdata.images.pop("hne_aligned") + + # rename Labels + ## add ground truth to cell labels + sdata.Labels['groundtruth_cell_labels'] = sdata.tables['table'].obs.pop('histoplus_cell_class') + + # rename Tables + sdata.Tables['metadata'] = sdata.Tables['table'] + print(f"Output: {sdata}", flush=True) print(f"Writing to '{par['output']}'", flush=True) From f89e185ec932d817eacb0fe759fb586f4de109a7 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 11:47:35 +0200 Subject: [PATCH 05/11] fix raw input spec Signed-off-by: Robrecht Cannoodt --- README.md | 4 ++-- src/api/file_common_ist.yaml | 2 +- 
src/datasets/loaders/tenx_xenium_groundtruth/script.py | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 189b17f..198c62d 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ Format: labels: 'cell_labels', 'nucleus_labels', 'groundtruth_cell_labels' points: 'transcripts' shapes: 'cell_boundaries', 'nucleus_boundaries' - tables: 'metadata' + tables: 'table' coordinate_systems: 'global' @@ -148,7 +148,7 @@ Data structure: *tables* -`metadata`: Metadata of spatial dataset. +`table`: Metadata of spatial dataset. | Slot | Type | Description | |:---|:---|:---| diff --git a/src/api/file_common_ist.yaml b/src/api/file_common_ist.yaml index 5ee883f..8a61976 100644 --- a/src/api/file_common_ist.yaml +++ b/src/api/file_common_ist.yaml @@ -112,7 +112,7 @@ info: description: Geometry of the nucleus boundary tables: - type: anndata - name: "metadata" + name: table description: Metadata of spatial dataset required: true uns: diff --git a/src/datasets/loaders/tenx_xenium_groundtruth/script.py b/src/datasets/loaders/tenx_xenium_groundtruth/script.py index 0fcc9cd..c0f4e99 100644 --- a/src/datasets/loaders/tenx_xenium_groundtruth/script.py +++ b/src/datasets/loaders/tenx_xenium_groundtruth/script.py @@ -71,9 +71,6 @@ ## add ground truth to cell labels sdata.Labels['groundtruth_cell_labels'] = sdata.tables['table'].obs.pop('histoplus_cell_class') - # rename Tables - sdata.Tables['metadata'] = sdata.Tables['table'] - print(f"Output: {sdata}", flush=True) print(f"Writing to '{par['output']}'", flush=True) From 4c8b086742a28e414fdaba7f1fa823f8067177c6 Mon Sep 17 00:00:00 2001 From: Koen Van den Berge Date: Tue, 19 May 2026 11:49:04 +0200 Subject: [PATCH 06/11] add that ground truth is from Caner Ercan --- src/datasets/loaders/tenx_xenium_groundtruth/script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/datasets/loaders/tenx_xenium_groundtruth/script.py b/src/datasets/loaders/tenx_xenium_groundtruth/script.py index 
0fcc9cd..ab0e9a6 100644 --- a/src/datasets/loaders/tenx_xenium_groundtruth/script.py +++ b/src/datasets/loaders/tenx_xenium_groundtruth/script.py @@ -55,6 +55,7 @@ sdata.tables["table"].uns[key] = value # add ground truth cell labels + ## these annotations were derived by Caner Ercan sdata.tables["table"].obs["groundtruth_celltype"] = sdata.tables["table"].obs.pop("histoplus_cell_class") # rename Images @@ -69,6 +70,7 @@ # rename Labels ## add ground truth to cell labels + ## these annotations were derived by Caner Ercan sdata.Labels['groundtruth_cell_labels'] = sdata.tables['table'].obs.pop('histoplus_cell_class') # rename Tables From 933082d445c373ed454e163d7353d5e027c3bee8 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 12:37:42 +0200 Subject: [PATCH 07/11] minor edits Signed-off-by: Robrecht Cannoodt --- .../loaders/tenx_xenium_groundtruth/script.py | 93 +++++++++---------- 1 file changed, 42 insertions(+), 51 deletions(-) diff --git a/src/datasets/loaders/tenx_xenium_groundtruth/script.py b/src/datasets/loaders/tenx_xenium_groundtruth/script.py index c21b36e..40ff7e5 100644 --- a/src/datasets/loaders/tenx_xenium_groundtruth/script.py +++ b/src/datasets/loaders/tenx_xenium_groundtruth/script.py @@ -1,14 +1,10 @@ import spatialdata as sd -import anndata as ad -from spatialdata_io import xenium import shutil import os -import zipfile -import tempfile ## VIASH START par = { - "input": "temp/datasets/10x_xenium/cervical_cancer/spatialData.zarr", + "input": "resources/datasets/gt_annotated_data/Xenium_Prime_Cervical_Cancer_FFPE_Aligned.zarr", "segmentation_id": [ "cell", "nucleus", @@ -25,60 +21,55 @@ meta = { "cpus": 1, } - ## VIASH END -# Download the data if it's a download url, extract the data if it's a zip file -par_input = par["input"] -with tempfile.TemporaryDirectory() as tmpdirname: - - # read the data - sdata = sd.read_zarr( - store=par_input, - selection=None - ) +# read the data +sdata = sd.read_zarr( + store=par["input"], + 
selection=None +) +print("Raw data input: ", sdata, flush=True) +print("Add uns to table", flush=True) +new_uns = { + "dataset_id": par["dataset_id"], + "dataset_name": par["dataset_name"], + "dataset_url": par["dataset_url"], + "dataset_reference": par["dataset_reference"], + "dataset_summary": par["dataset_summary"], + "dataset_description": par["dataset_description"], + "dataset_organism": par["dataset_organism"], + "segmentation_id": par["segmentation_id"], +} +for key, value in new_uns.items(): + sdata.tables["table"].uns[key] = value - print("Add uns to table", flush=True) - new_uns = { - "dataset_id": par["dataset_id"], - "dataset_name": par["dataset_name"], - "dataset_url": par["dataset_url"], - "dataset_reference": par["dataset_reference"], - "dataset_summary": par["dataset_summary"], - "dataset_description": par["dataset_description"], - "dataset_organism": par["dataset_organism"], - "segmentation_id": par["segmentation_id"], - } - for key, value in new_uns.items(): - sdata.tables["table"].uns[key] = value - - # add ground truth cell labels - ## these annotations were derived by Caner Ercan - sdata.tables["table"].obs["groundtruth_celltype"] = sdata.tables["table"].obs.pop("histoplus_cell_class") +# add ground truth cell labels +## these annotations were derived by Caner Ercan +sdata.tables["table"].obs["groundtruth_celltype"] = sdata.tables["table"].obs.pop("histoplus_cell_class") - # rename Images - ## rename raw images to accomodate format - sdata.images['image'] = sdata.images['morphology_focus'] - ## rm morphology_focus - _ = sdata.images.pop("morphology_focus") - ## rename hne image - sdata.images['he_image'] = sdata.images['hne_aligned'] - ## rm hne_aligned - _ = sdata.images.pop("hne_aligned") +# rename Images +## rename raw images to accomodate format +sdata.images['image'] = sdata.images['morphology_focus'] +## rm morphology_focus +_ = sdata.images.pop("morphology_focus") +## rename hne image +sdata.images['he_image'] = 
sdata.images['hne_aligned'] +## rm hne_aligned +_ = sdata.images.pop("hne_aligned") - # rename Labels - ## add ground truth to cell labels - ## these annotations were derived by Caner Ercan - sdata.Labels['groundtruth_cell_labels'] = sdata.tables['table'].obs.pop('histoplus_cell_class') +# rename Labels +## add ground truth to cell labels +## these annotations were derived by Caner Ercan +sdata.Labels['groundtruth_cell_labels'] = sdata.tables['table'].obs.pop('histoplus_cell_class') - print(f"Output: {sdata}", flush=True) +print(f"Output: {sdata}", flush=True) - print(f"Writing to '{par['output']}'", flush=True) - if os.path.exists(par["output"]): - shutil.rmtree(par["output"]) +print(f"Writing to '{par['output']}'", flush=True) +if os.path.exists(par["output"]): + shutil.rmtree(par["output"]) - print(f"Output: {sdata}") +print(f"Output: {sdata}") - sdata.write(par["output"]) +sdata.write(par["output"]) From 84ee1d512b52298c0e6ea1fcec9eca15b50cf41a Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 12:40:38 +0200 Subject: [PATCH 08/11] add initial script Signed-off-by: Robrecht Cannoodt --- .../xenium_gt_annotated_data.sh | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100755 scripts/create_resources/xenium_gt_annotated_data.sh diff --git a/scripts/create_resources/xenium_gt_annotated_data.sh b/scripts/create_resources/xenium_gt_annotated_data.sh new file mode 100755 index 0000000..8566173 --- /dev/null +++ b/scripts/create_resources/xenium_gt_annotated_data.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +cat > /tmp/params.yaml << HERE +param_list: + - id: ... + input: s3://hca-op-spatial/datasets/gt_annotated_data/Xenium_Prime_Cervical_Cancer_FFPE_Aligned.zarr + dataset_name: ... + dataset_url: ... + dataset_summary: ... + dataset_description: ... 
+ dataset_organism: ... + +publish_dir: temp +output_dataset: '\$id/dataset.zarr' +output_state: '\$id/state.yaml' +HERE + +# convert to zarr +nextflow run . \ + -main-script target/nextflow/datasets/loaders/tenx_xenium_groundtruth/main.nf \ + -profile docker \ + -resume \ + -params-file /tmp/params.yaml + +# sync to s3 +# aws s3 sync --profile op \ +# "resources_test/datasets/2023_10x_mouse_brain_xenium_rep1" \ +# "s3://openproblems-data/resources_test/common/2023_10x_mouse_brain_xenium_rep1" \ +# --delete --dryrun From cfbf3882d708690e4570744ab4585a2aca7104af Mon Sep 17 00:00:00 2001 From: Koen Van den Berge Date: Tue, 19 May 2026 13:11:04 +0200 Subject: [PATCH 09/11] add dataset descriptions --- scripts/create_resources/xenium_gt_annotated_data.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/create_resources/xenium_gt_annotated_data.sh b/scripts/create_resources/xenium_gt_annotated_data.sh index 8566173..0ee7c97 100755 --- a/scripts/create_resources/xenium_gt_annotated_data.sh +++ b/scripts/create_resources/xenium_gt_annotated_data.sh @@ -10,13 +10,13 @@ set -e cat > /tmp/params.yaml << HERE param_list: - - id: ... + - id: tenx_xenium_groundtruth/cervical_cancer input: s3://hca-op-spatial/datasets/gt_annotated_data/Xenium_Prime_Cervical_Cancer_FFPE_Aligned.zarr - dataset_name: ... - dataset_url: ... - dataset_summary: ... - dataset_description: ... - dataset_organism: ... + dataset_name: 10X Xenium - Cervical Cancer + dataset_url: https://www.10xgenomics.com/datasets/xenium-prime-ffpe-human-cervical-cancer + dataset_summary: Gene expression library for 5K Xenium Prime panel + 100 custom genes on cervical cancer sample + dataset_description: Xenium Prime 5K In Situ Gene Expression with Cell Segmentation data for human cervical cancer (FFPE) using the Xenium Prime 5K Human Pan Tissue and Pathways Panel plus 100 Custom Genes. 
+ dataset_organism: homo_sapiens publish_dir: temp output_dataset: '\$id/dataset.zarr' From f462de1e29c0646b511009c93ec4d2680b947017 Mon Sep 17 00:00:00 2001 From: Koen Van den Berge Date: Tue, 19 May 2026 14:23:30 +0200 Subject: [PATCH 10/11] add config file for specificity metric --- .../config.vsh.yaml | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 src/metrics/sepcificity_celltype_markers/config.vsh.yaml diff --git a/src/metrics/sepcificity_celltype_markers/config.vsh.yaml b/src/metrics/sepcificity_celltype_markers/config.vsh.yaml new file mode 100644 index 0000000..b4b00f2 --- /dev/null +++ b/src/metrics/sepcificity_celltype_markers/config.vsh.yaml @@ -0,0 +1,37 @@ +__merge__: ../../api/comp_metric.yaml + +name: specificity_celltype_marker + +info: + metrics: + - name: specificity_celltype_marker + label: "Specificity based on cell type superset marker gene expression" + summary: "Using a curated list of marker genes, exclusive marker gene expression is checked for each cell." + description: | + We start from a list of curated marker genes for each cell type superset. The number of cell + type supersets can vary. For each cell, we check if we observe at least one RNA molecule from + at least one marker gene of each superset. Biologically, a cell is expected to express marker + genes from no more than one superset. This specificity metric quantifies the fraction of cells + that express genes from at least two supersets (lower is better). Note that this metric will + favor conservative segmentation algorithms (only segmenting DAPI will have high specificity), + hence the metric will be most useful when contrasted with a sensitivity metric. 
+ references: + doi: NULL + min: 0 + max: 1 + maximize: false + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + __merge__: ../../base/setup_spatialdata_partial.yaml + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, midcpu] \ No newline at end of file From 4c3632348e8f6274f903213bf10c2823077dee45 Mon Sep 17 00:00:00 2001 From: Koen Van den Berge Date: Tue, 19 May 2026 14:30:27 +0200 Subject: [PATCH 11/11] adding very draft specificity Python script --- .../sepcificity_celltype_markers/script.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 src/metrics/sepcificity_celltype_markers/script.py diff --git a/src/metrics/sepcificity_celltype_markers/script.py b/src/metrics/sepcificity_celltype_markers/script.py new file mode 100644 index 0000000..f705c50 --- /dev/null +++ b/src/metrics/sepcificity_celltype_markers/script.py @@ -0,0 +1,45 @@ +import numpy as np +import xarray as xr +import anndata as ad +import spatialdata as sd +from sklearn.metrics import adjusted_rand_score + +## VIASH START +par = { + # TODO: add path + 'input_prediction': 'resources_test/task_spatial_segmentation/XXX', + # TODO: this solution should be a list of marker genes from each superset + 'input_solution': 'resources_test/task_spatial_segmentation/XXXX', + 'output': 'output.h5ad' +} +meta = { + 'name': 'specificity_celltype_marker' +} +## VIASH END + +print(">> Reading input files", flush=True) +sdata_pred = sd.read_zarr(par["input_prediction"]) +# TODO: this should be reading in the list, which will not be a Zarr file +sdata_sol = sd.read_zarr(par["input_solution"]) + +dataset_id = sdata_sol.tables["table"].uns["dataset_id"] +method_id = sdata_pred.tables["table"].uns["method_id"] + +print(">> Get ground truth cell IDs from cell_labels", flush=True) +gt_cell_ids = sdata.Labels['groundtruth_cell_labels'] + +# TODO: calculate expression of marker 
superset for each cell + +# TODO: calculate specificity metric + +print(">> Writing output", flush=True) +output = ad.AnnData( + uns={ + "dataset_id": dataset_id, + "normalization_id": "counts", + "method_id": method_id, + "metric_ids": ["specificity_celltype_marker"], + "metric_values": [float(specificity_score)], + } +) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file