openproblems-bio · dariarom94 · Apr 1, 2026 · Apr 8, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/_viash.yaml b/_viash.yaml
@@ -1,4 +1,4 @@
-viash_version: 0.9.4
+viash_version: 0.9.7
 
 name: task_ist_preprocessing
 organization: openproblems-bio

diff --git a/common b/common
diff --git a/scripts/create_test_resources/2023_10x_mouse_brain_xenium_rep1.sh b/scripts/create_test_resources/2023_10x_mouse_brain_xenium_rep1.sh
@@ -11,15 +11,17 @@ set -e
 if [ ! -d temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1 ]; then
   mkdir -p temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1
 fi
-if [ ! -f temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip ]; then
+if [ ! -d temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs ]; then
   wget -O temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip \
     https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip
+  unzip temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip \
+    -d temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs
 fi
 
 cat > /tmp/params.yaml << HERE
 param_list:
   - id: 2023_10x_mouse_brain_xenium_rep1
-    input: temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip
+    input: temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs
     segmentation_id:
       - cell
       - nucleus
@@ -28,6 +30,7 @@ param_list:
     dataset_summary: Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform.
     dataset_description: Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1).
     dataset_organism: mus_musculus
+    dataset_reference: NA
     crop_region_min_x: 10000
     crop_region_max_x: 11000
     crop_region_min_y: 10000

diff --git a/scripts/create_test_resources/test_pipeline.sh b/scripts/create_test_resources/test_pipeline.sh
@@ -55,7 +55,7 @@ viash run src/methods_normalization/normalize_by_volume/config.vsh.yaml -- \
   --output $OUT_DIR/spatial_normalized_counts.h5ad
 
 # run a cell type annotation method
-viash run src/methods_cell_type_annotation/ssam/config.vsh.yaml -- \
+viash run src/methods_cell_type_annotation/tacco/config.vsh.yaml -- \
   --input_spatial_normalized_counts $OUT_DIR/spatial_normalized_counts.h5ad \
   --input_transcript_assignments $OUT_DIR/transcript_assignments.zarr \
   --input_scrnaseq_reference $OUT_DIR/scrnaseq_reference.h5ad \

diff --git a/src/base/setup_spatialdata_partial.yaml b/src/base/setup_spatialdata_partial.yaml
@@ -1,13 +1,3 @@
 setup:
   - type: python
-    pypi: ["spatialdata==0.5.0", "anndata>=0.12.0", "pyarrow<22.0.0", "zarr<3.0.0"]
-    # 1. remove pyarrow when https://github.com/scverse/spatialdata/issues/1007 is fixed.
-    #    This is actually fixed now with the spatialdata release 0.6.0. However, the new
-    #    release now comes with zarr 3.0.0. When reading a zarr file that was saved with
-    #    zarr 3.0.0 we can not load it with zarr<3.0.0. (PathNotFoundError: nothing found at path '')
-    # 2. Currently sopa enforces zarr<3.0.0. Therefore we need to save all our data with zarr<3.0.0.
-    #    As soon as this is fixed (https://github.com/gustaveroussy/sopa/issues/347):
-    #    - remove restriction on spatialdata
-    #    - remove zarr<3.0.0
-    #    - remove pyarrow<22.0.0
-    #    - Recreate all the datasets (scripts/create_resources/combine/process_datasets.sh)
+    pypi: ["spatialdata>=0.7.3", "anndata>=0.12.0", "zarr>=3.0.0"]
diff --git a/src/base/setup_txsim_partial.yaml b/src/base/setup_txsim_partial.yaml
@@ -2,3 +2,13 @@ setup:
   - type: python
     pypi: [squidpy, rasterio]
     github: [theislab/txsim@dev]
+    # 1. remove pyarrow when https://github.com/scverse/spatialdata/issues/1007 is fixed.
+    #    This is actually fixed now with the spatialdata release 0.6.0. However, the new
+    #    release now comes with zarr 3.0.0. A zarr file that was saved with zarr 3.0.0
+    #    cannot be loaded with zarr<3.0.0. (PathNotFoundError: nothing found at path '')
+    # 2. Currently sopa enforces zarr<3.0.0. Therefore we need to save all our data with zarr<3.0.0.
+    #    As soon as this is fixed (https://github.com/gustaveroussy/sopa/issues/347):
+    #    - remove restriction on spatialdata
+    #    - remove zarr<3.0.0
+    #    - remove pyarrow<22.0.0
+    #    - Recreate all the datasets (scripts/create_resources/combine/process_datasets.sh)
diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py
@@ -38,7 +38,7 @@ def get_crop_coords(sdata, max_n_pixels=20000*20000): #50000*50000):
         The crop coordinates
     """
 
-    _, h, w = sdata['morphology_mip']["scale0"].image.shape
+    _, h, w = sdata['image']["scale0"].image.shape
     #h, w = sdata
 
     # Check if the image is already below the maximum number of pixels
@@ -195,18 +195,23 @@ def subsample_adata_group_balanced(adata, group_key, n_samples, seed=0):
 adata.var.reset_index(inplace=True, drop=True)
 adata.var_names = adata.var["feature_name"].values.astype(str).tolist()
 
+# Ensure the metadata table exists in sdata (alias "table" as "metadata" if needed; "table" itself is kept)
+if "metadata" not in sdata.tables:
+    if "table" in sdata.tables:
+        sdata["metadata"] = sdata["table"]
+    else:
+        sdata["metadata"] = ad.AnnData(uns={})
+
 # store metadata to adata and sdata uns
 metadata_uns_cols = ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]
 for col in metadata_uns_cols:
     orig_col = f"orig_{col}"
     if orig_col in adata.uns:
         adata.uns[orig_col] = adata.uns[col]
     adata.uns[col] = par[col]
-    if not ("table" in sdata.tables):
-        sdata["table"] = ad.AnnData(uns={})
-    if orig_col in sdata["table"].uns:
-        sdata["table"].uns[orig_col] = sdata["table"].uns[col]
-    sdata["table"].uns[col] = par[col]
+    if orig_col in sdata["metadata"].uns:
+        sdata["metadata"].uns[orig_col] = sdata["metadata"].uns[col]
+    sdata["metadata"].uns[col] = par[col]
 
 # Correct the feature_key attribute in sdata if needed
 # NOTE: it would have been better to do this in the loader scripts, but this way the datasets don't need to be re-downloaded
@@ -215,6 +220,11 @@ def subsample_adata_group_balanced(adata, group_key, n_samples, seed=0):
     if feature_key != "feature_name":
         sdata['transcripts'].attrs["spatialdata_attrs"]["feature_key"] = "feature_name"
 
+# Rename image key to match API spec (file_common_ist.yaml expects "image")
+if "morphology_mip" in sdata.images:
+    sdata["image"] = sdata["morphology_mip"]
+    del sdata.images["morphology_mip"]
+
 # Crop datasets that are too large
 crop_coords = get_crop_coords(sdata)
 if crop_coords is not None:

diff --git a/src/datasets/loaders/allen_brain_cell_atlas/config.vsh.yaml b/src/datasets/loaders/allen_brain_cell_atlas/config.vsh.yaml
@@ -52,7 +52,7 @@ argument_groups:
     arguments:
       - type: boolean
         name: --keep_files
-        required: true
+        default: true
         description: Whether to remove the downloaded files after processing.
   - name: Metadata
     arguments:

diff --git a/src/datasets/loaders/tenx_xenium/config.vsh.yaml b/src/datasets/loaders/tenx_xenium/config.vsh.yaml
@@ -4,7 +4,7 @@ namespace: datasets/loaders
 argument_groups:
   - name: Inputs
     arguments:
-      - type: string
+      - type: file
         name: --input
         required: true
         description: A 10x xenium directory or zip file or download url

diff --git a/src/datasets/workflows/process_allen_brain_cell_atlas/config.vsh.yaml b/src/datasets/workflows/process_allen_brain_cell_atlas/config.vsh.yaml
@@ -29,7 +29,7 @@ argument_groups:
     arguments:
       - type: boolean
         name: --keep_files
-        required: true
+        default: true
         description: Whether to remove the downloaded files after processing.
   - name: Metadata
     arguments:

diff --git a/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml b/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml
@@ -4,7 +4,7 @@ namespace: datasets/workflows
 argument_groups:
   - name: Inputs
     arguments:
-      - type: string
+      - type: file
         name: --input
         required: true
         description: A 10x xenium directory or zip file or download url

diff --git a/src/methods_cell_type_annotation/rctd/config.vsh.yaml b/src/methods_cell_type_annotation/rctd/config.vsh.yaml
@@ -22,14 +22,13 @@ engines:
       #  run: |
       #    apt-get update && apt-get install -y wget
       - type: r
-        bioc: [anndataR, rhdf5, devtools]
-      #- type: r
-      #  bioc: [SummarizedExperiment,SingleCellExperiment,SpatialExperiment]
-      #  bioc_force_install: true
+        bioc: [SingleCellExperiment, anndataR, rhdf5, devtools]
+        # bioc_force_install: true
       - type: docker
         run: |
-          Rscript -e "BiocManager::install('SingleCellExperiment', type = 'source', force = TRUE, ask = FALSE); devtools::install_github('dmcable/spacexr', build_vignettes = FALSE)"
-
+          Rscript -e "options(timeout = 600000000); devtools::install_github('dmcable/spacexr', build_vignettes = FALSE)"
+# Rscript -e "BiocManager::install('SingleCellExperiment', type = 'source', force = TRUE, ask = FALSE); devtools::install_github('dmcable/spacexr', build_vignettes = FALSE)"
+
       # This can probably be left out again in the future. It currently fixes a bug described in these issues:
       # https://github.com/drighelli/SpatialExperiment/issues/171
       # https://github.com/satijalab/seurat/issues/9889

diff --git a/src/methods_cell_type_annotation/rctd/script.R b/src/methods_cell_type_annotation/rctd/script.R
@@ -1,6 +1,7 @@
 library(spacexr)
 library(Matrix)
 library(SingleCellExperiment)
+# library(SpatialExperiment)
 library(anndataR)
 
 ## VIASH START

diff --git a/src/methods_data_aggregation/aggregate_spatial_data/config.vsh.yaml b/src/methods_data_aggregation/aggregate_spatial_data/config.vsh.yaml
@@ -20,8 +20,11 @@ resources:
 engines:
   - type: docker
     image: openproblems/base_python:1
-    __merge__: 
+    __merge__:
       - /src/base/setup_spatialdata_partial.yaml
+    setup:
+      - type: python
+        pypi: [sopa]
   - type: native
 
 runners:

diff --git a/src/methods_data_aggregation/aggregate_spatial_data/script.py b/src/methods_data_aggregation/aggregate_spatial_data/script.py
@@ -1,5 +1,10 @@
 import anndata as ad
+import geopandas as gpd
+import sopa
 import spatialdata as sd
+from shapely.geometry import MultiPoint
+from spatialdata.models import ShapesModel
+from sopa.utils import copy_transformations
 
 ## VIASH START
 par = {
@@ -36,9 +41,14 @@
   del sdata.points[key]
 
 for key in list(sdata.tables.keys()):
-  if key != 'metadata':
+  if key not in ['metadata', 'table']:
     del sdata.tables[key]
 
+# raw_ist.zarr stores the metadata table as 'table'; rename to match the output spec
+if 'table' in sdata.tables and 'metadata' not in sdata.tables:
+  sdata['metadata'] = sdata.tables['table']
+  del sdata.tables['table']
+
 # sdata_transcripts
 for col in list(sdata_transcripts["transcripts"].columns):
   if col not in ['x', 'y', 'z', 'feature_name', 'cell_id', 'transcript_id']:
@@ -69,6 +79,20 @@
 adata.obs['passed_QC'] = adata_qc_col.obs['passed_QC']
 sdata['counts'] = adata
 
+#######################
+# Compute cell shapes #
+#######################
+print('Computing cell boundaries from transcripts using convex hulls', flush=True)
+transcripts_df = sdata_transcripts["transcripts"].compute()
+transcripts_assigned = transcripts_df[transcripts_df["cell_id"] != 0]
+cell_shapes = transcripts_assigned.groupby("cell_id")[["x", "y"]].apply(
+  lambda g: MultiPoint(list(zip(g["x"], g["y"]))).convex_hull
+)
+geo_df = gpd.GeoDataFrame(geometry=cell_shapes)
+geo_df = sopa.shapes.to_valid_polygons(geo_df)
+transformations = copy_transformations(sdata_transcripts["transcripts"])
+sdata["cell_boundaries"] = ShapesModel.parse(geo_df, transformations=transformations)
+
 #################
 # Write output #
 #################

diff --git a/src/methods_expression_correction/split/config.vsh.yaml b/src/methods_expression_correction/split/config.vsh.yaml
@@ -29,18 +29,20 @@ engines:
       - type: docker
         run: |
           apt-get update
+      # - type: r
+      #   packages: [fs, rlang, lifecycle]
       - type: r
-        bioc: [anndataR, rhdf5, devtools, scater]
-      - type: docker
-        run: |
-          Rscript -e "BiocManager::install('SingleCellExperiment', type = 'source', force = TRUE, ask = FALSE); options(timeout = 600000000); devtools::install_github('dmcable/spacexr', build_vignettes = FALSE); devtools::install_github('bdsc-tds/SPLIT')"
-
-      # SingleCellExperiment part can probably be left out again in the future. It currently fixes a bug described in these issues:
-      # https://github.com/drighelli/SpatialExperiment/issues/171
-      # https://github.com/satijalab/seurat/issues/9889
-      # The reinstall of SingleCellExperiment triggers the correct re-install of SpatialExperiment.
+        bioc: [SingleCellExperiment, anndataR, rhdf5, devtools, scater]
+        # bioc: [SpatialExperiment, anndataR, rhdf5, devtools, scater]
 
-      # Using a large timeout here to reduce failures during GitHub package installation.
+        # SingleCellExperiment part can probably be left out again in the future. It currently fixes a bug described in these issues:
+        # https://github.com/drighelli/SpatialExperiment/issues/171
+        # https://github.com/satijalab/seurat/issues/9889
+        # The reinstall of SingleCellExperiment triggers the correct re-install of SpatialExperiment.
+      - type: r
+        github: dmcable/spacexr
+      - type: r
+        github: bdsc-tds/SPLIT
 
   - type: native
 

diff --git a/src/methods_segmentation/binning/script.py b/src/methods_segmentation/binning/script.py
@@ -39,17 +39,19 @@ def convert_to_lower_dtype(arr):
 del hyperparameters['output']
 
 sdata = sd.read_zarr(par["input"])
-image = sdata['morphology_mip']['scale0'].image.compute().to_numpy()
-transformation = sdata['morphology_mip']['scale0'].image.transform.copy()
 
 sd_output = sd.SpatialData()
-image = sdata['morphology_mip']['scale0'].image.compute().to_numpy()
-transformation = sdata['morphology_mip']['scale0'].image.transform.copy()
+image = sdata['image']['scale0'].image.compute().to_numpy()
+transformation = sdata['image']['scale0'].image.transform.copy()
 img_arr = tx.preprocessing.segment_binning(image[0], hyperparameters['bin_size'])   ### TOdo find the optimal bin_size
 image = convert_to_lower_dtype(img_arr)
 data_array = xr.DataArray(image, name=f'segmentation', dims=('y', 'x'))
 parsed_data = Labels2DModel.parse(data_array, transformations=transformation)
 sd_output.labels['segmentation'] = parsed_data
+sd_output.tables['table'] = ad.AnnData(
+      obs=sdata.tables["table"].obs[["cell_id", "region"]],
+      var=sdata.tables["table"].var[[]]
+    )
 
 print("Writing output", flush=True)
 if os.path.exists(par["output"]):

diff --git a/src/methods_segmentation/cellpose/script.py b/src/methods_segmentation/cellpose/script.py
@@ -38,18 +38,20 @@ def convert_to_lower_dtype(arr):
 del hyperparameters['output']
 
 sdata = sd.read_zarr(par["input"])
-image = sdata['morphology_mip']['scale0'].image.compute().to_numpy()
-transformation = sdata['morphology_mip']['scale0'].image.transform.copy()
-
 sd_output = sd.SpatialData()
-image = sdata['morphology_mip']['scale0'].image.compute().to_numpy()
-transformation = sdata['morphology_mip']['scale0'].image.transform.copy()
+image = sdata['image']['scale0'].image.compute().to_numpy()
+transformation = sdata['image']['scale0'].image.transform.copy()
 img_arr = tx.preprocessing.segment_cellpose(image[0], hyperparameters) 
 image = convert_to_lower_dtype(img_arr)
 data_array = xr.DataArray(image, name=f'segmentation', dims=('y', 'x'))
 parsed_data = Labels2DModel.parse(data_array, transformations=transformation)
 sd_output.labels['segmentation'] = parsed_data
 
+sd_output.tables['table'] = ad.AnnData(
+      obs=sdata.tables["table"].obs[["cell_id", "region"]],
+      var=sdata.tables["table"].var[[]]
+    )
+
 print("Writing output", flush=True)
 if os.path.exists(par["output"]):
   shutil.rmtree(par["output"])

diff --git a/src/methods_segmentation/custom_segmentation/script.py b/src/methods_segmentation/custom_segmentation/script.py
@@ -23,13 +23,13 @@
 sdata_segmentation_only = sd.SpatialData(
   labels={
     "segmentation": sdata[par["labels_key"]]
-  }#,
-  #tables={
-  #  "table": ad.AnnData(
-  #    obs=sdata.tables["table"].obs[["cell_id", "region"]],
-  #    var=sdata.tables["table"].var[[]]
-  #  )
-  #}
+  },
+  tables={
+    "table": ad.AnnData(
+      obs=sdata.tables["table"].obs[["cell_id", "region"]],
+      var=sdata.tables["table"].var[[]]
+    )
+  }
 )
 
 print("Writing output", flush=True)

diff --git a/src/methods_segmentation/stardist/script.py b/src/methods_segmentation/stardist/script.py
@@ -4,6 +4,7 @@
 import numpy as np
 import xarray as xr
 import spatialdata as sd
+import anndata as ad
 #from csbdeep.utils import normalize
 from csbdeep.data import Normalizer, normalize_mi_ma
 from stardist.models import StarDist2D
@@ -35,8 +36,8 @@ def convert_to_lower_dtype(arr):
 
 # Read image and its transformation
 sdata = sd.read_zarr(par["input"])
-image = sdata['morphology_mip']['scale0'].image.compute().to_numpy()
-transformation = sdata['morphology_mip']['scale0'].image.transform.copy()
+image = sdata['image']['scale0'].image.compute().to_numpy()
+transformation = sdata['image']['scale0'].image.transform.copy()
 
 # Segment image
 
@@ -76,6 +77,11 @@ def do_after(self):
 parsed_labels = sd.models.Labels2DModel.parse(labels_array, transformations=transformation)
 sd_output.labels['segmentation'] = parsed_labels
 
+sd_output.tables['table'] = ad.AnnData(
+      obs=sdata.tables["table"].obs[["cell_id", "region"]],
+      var=sdata.tables["table"].var[[]]
+    )
+
 print("Writing output", flush=True)
 Path(par["output"]).parent.mkdir(parents=True, exist_ok=True)
 if os.path.exists(par["output"]):
+7 −160		component_tests/check_config.py
+6 −174		component_tests/run_and_check_output.py
+21 −0		nextflow_helpers/README.md
+232 −0		nextflow_helpers/benchmarkHelper.nf
+58 −9		nextflow_helpers/labels_tw.config
+2,786 −0		nextflow_helpers/workflowHelper.nf
+35 −0		schemas/results_v4/combined_output.json
+63 −0		schemas/results_v4/core.json
+90 −0		schemas/results_v4/dataset_info.json
+84 −0		schemas/results_v4/method_info.json
+77 −0		schemas/results_v4/metric_info.json
+50 −0		schemas/results_v4/quality_control.json
+183 −0		schemas/results_v4/results.json
+64 −0		schemas/results_v4/task_info.json
+3 −3		scripts/create_component
+4 −4		scripts/create_task_readme
+1 −1		scripts/fetch_task_run
+418 −0		scripts/render_results_report
+3 −3		scripts/sync_resources
+1 −1		scripts/upgrade_config