openproblems-bio · dariarom94 · May 18, 2026
diff --git a/scripts/create_test_resources/xenium_multiome_combined.sh b/scripts/create_test_resources/xenium_multiome_combined.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# # remove this when you have implemented the script
+# echo "TODO: replace the commands in this script with the sequence of components that you need to run to generate test_resources."
+# echo "  Inside this script, you will need to place commands to generate example files for each of the 'src/api/file_*.yaml' files."
+# exit 1
+
+set -e
+
+DATASET_ID=Xenium_V1_Human_Kidney_FFPE
+
+RAW_DATA=resources_test/common
+DATASET_DIR=resources_test/task_spatial_segmentation/$DATASET_ID
+
+if [ -d "$DATASET_DIR" ]; then
+  rm -rf "$DATASET_DIR"
+fi
+mkdir -p "$DATASET_DIR"
+
+# process dataset
+viash run src/data_processors/process_dataset_multimodal/config.vsh.yaml -- \
+  --input_sp $RAW_DATA/Xenium_V1_Human_Kidney_FFPE/Xenium_V1_Human_Kidney_FFPE_crop.zarr \
+  --output_spatial_unlabelled $DATASET_DIR/spatial_unlabelled.zarr \
+  --output_spatial_solution $DATASET_DIR/spatial_solution.zarr \
+  --output_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \
+  --dataset_id $DATASET_ID \
+  --dataset_name "Test the multimodal approach from 10X" \
+  --dataset_url "https://www.10xgenomics.com/datasets/xenium-protein-ffpe-human-renal-carcinoma" \
+  --dataset_reference "10.1038/s41586-023-06812-z" \
+  --dataset_summary "Demonstration of gene expression and proteomce profiling for fresh frozen mouse brain on the Xenium platform" \
+  --dataset_description "Demonstration of gene expression profiling for fresh frozen mouse brain" \
+  --dataset_organism "homo_sapiens"
+
+# run one method
+viash run src/control_methods/random_voronoi/config.vsh.yaml -- \
+    --input $DATASET_DIR/spatial_unlabelled.zarr \
+    --input_solution $DATASET_DIR/spatial_solution.zarr \
+    --output $DATASET_DIR/prediction.zarr
+
+# run prediction processor
+viash run src/data_processors/process_prediction/config.vsh.yaml -- \
+    --input_prediction $DATASET_DIR/prediction.zarr \
+    --input_spatial_unlabelled $DATASET_DIR/spatial_unlabelled.zarr \
+    --output $DATASET_DIR/processed_prediction.zarr
+
+# run one metric
+viash run src/metrics/ari/config.vsh.yaml -- \
+    --input_prediction $DATASET_DIR/processed_prediction.zarr \
+    --input_solution $DATASET_DIR/spatial_solution.zarr \
+    --output $DATASET_DIR/score.h5ad
+
+# write manual state.yaml. this is not actually necessary but you never know it might be useful
+cat > $DATASET_DIR/state.yaml << HERE
+id: $DATASET_ID
+spatial_unlabelled: spatial_unlabelled.zarr
+spatial_solution: spatial_solution.zarr
+scrnaseq_reference: scrnaseq_reference.h5ad
+prediction: prediction.zarr
+processed_prediction: processed_prediction.zarr
+score: score.h5ad
+HERE
+
+# only run this if you have access to the openproblems-data bucket
+aws s3 sync --profile op \
+  resources_test/task_spatial_segmentation/mouse_brain_combined/ \
+  s3://openproblems-data/resources_test/task_spatial_segmentation/mouse_brain_combined/ \
+  --delete --dryrun
diff --git a/src/api/comp_data_processor_protein.yaml b/src/api/comp_data_processor_protein.yaml
@@ -0,0 +1,85 @@
+namespace: data_processors  # API spec for data processors; pulled into component configs via __merge__
+info:
+  type: data_processor
+  type_info:
+    label: Data processor
+    summary: A data processor.
+    description: |
+      A component for processing a Common Dataset into a task-specific dataset.
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input_sp"  # mandatory input; file format comes from file_common_ist.yaml
+        __merge__: file_common_ist.yaml
+        direction: input
+        required: true
+      - name: "--input_sc"  # optional input; file format comes from file_common_scrnaseq.yaml
+        __merge__: file_common_scrnaseq.yaml
+        direction: input
+        required: false
+  - name: Outputs
+    arguments:
+      - name: "--output_spatial_unlabelled"
+        __merge__: file_spatial_unlabelled.yaml
+        direction: output
+        required: true
+      - name: "--output_spatial_solution"
+        __merge__: file_spatial_solution.yaml
+        direction: output
+        required: true
+      - name: "--output_scrnaseq_reference"  # the only optional output
+        __merge__: file_scrnaseq_reference.yaml
+        direction: output
+        required: false
+  - name: Combined Dataset Metadata
+    description: Metadata for the combined dataset that will be stored.
+    arguments:  # every metadata field below is a required string
+      - name: "--dataset_id"
+        type: string
+        required: true
+        description: "A unique identifier for the dataset"
+        info:  # NOTE(review): test_default is presumably read by the component test - verify
+          test_default: "mouse_brain_combined"
+      - name: "--dataset_name"
+        type: string
+        required: true
+        description: Nicely formatted name.
+        info:
+          test_default: "Mouse brain combined dataset"
+      - name: "--dataset_url"
+        type: string
+        required: true
+        description: Link to the original source of the dataset.
+        info:
+          test_default: "https://example.com/mouse_brain_combined"
+      - name: "--dataset_reference"
+        type: string
+        required: true
+        description: Bibtex reference of the paper in which the dataset was published.
+        info:
+          test_default: "10.1234/example.doi"
+      - name: "--dataset_summary"
+        type: string
+        required: true
+        description: Short description of the dataset.
+        info:
+          test_default: "Combined dataset for mouse brain spatial transcriptomics"
+      - name: "--dataset_description"
+        type: string
+        required: true
+        description: Long description of the dataset.
+        info:
+          test_default: "This is a combined dataset for mouse brain spatial transcriptomics."
+      - name: "--dataset_organism"
+        type: string
+        required: true
+        description: The organism of the sample in the dataset.
+        info:
+          test_default: "Mus musculus"
+test_resources:  # NOTE(review): mouse brain fixtures - confirm they suit the protein/multimodal processor
+  - path: /resources_test/common/2023_10x_mouse_brain_xenium_rep1
+    dest: resources_test/common/2023_10x_mouse_brain_xenium_rep1
+  - path: /resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2
+    dest: resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py
diff --git a/src/data_processors/process_dataset_multimodal/config.vsh.yaml b/src/data_processors/process_dataset_multimodal/config.vsh.yaml
@@ -0,0 +1,35 @@
+__merge__: ../../api/comp_data_processor_protein.yaml
+
+name: process_dataset_multimodal  # matches the directory so the name stays unique in its namespace
+
+argument_groups:
+  - name: "Processing parameters"  # both parameters relate to the 'seurat_v3' flavor named below
+    arguments:
+      - name: "--span"
+        type: double
+        description: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'.
+        default: 0.3
+      - name: "--n_top_genes"
+        type: integer
+        description: Number of highly-variable genes to keep. Mandatory if flavor='seurat_v3'.
+        default: 3000
+
+resources:
+  - type: python_script
+    path: script.py  # relative path, next to this config
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: python
+        packages: [scikit-learn, scikit-misc]
+    __merge__:
+      - /src/base/setup_spatialdata_partial.yaml
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midmem, midcpu, midtime]