From eebfa18fb25c2a7d0bd81f605484c9478b5f265c Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Tue, 19 May 2026 11:12:54 +0200
Subject: [PATCH 1/4] update scripts

Signed-off-by: Robrecht Cannoodt
---
Review notes (commentary only; text between '---' and the diff is not
part of the commit):
- Adds a launcher for the eu-central-1 compute environment and points the
  eu-west-2 launcher at its own labels config.
- NOTE(review): all three launchers write the fixed path /tmp/params.yaml,
  so two launches started at the same time on one machine would overwrite
  each other's parameters. Consider mktemp in a follow-up.

 .../run_test_aws_eu-central-1.sh              | 32 +++++++++++++++++++
 .../run_benchmark/run_test_aws_eu-west-2.sh   |  2 +-
 scripts/run_benchmark/run_test_nebius.sh      |  2 +-
 3 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100755 scripts/run_benchmark/run_test_aws_eu-central-1.sh

diff --git a/scripts/run_benchmark/run_test_aws_eu-central-1.sh b/scripts/run_benchmark/run_test_aws_eu-central-1.sh
new file mode 100755
index 0000000..d93918c
--- /dev/null
+++ b/scripts/run_benchmark/run_test_aws_eu-central-1.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+set -e
+
+resources_test_s3=s3://openproblems-data/resources_test/task_spatial_segmentation
+publish_dir_s3="s3://hca-op-spatial/temp/results/$(date +%Y-%m-%d_%H-%M-%S)"
+
+# write the parameters to file
+cat > /tmp/params.yaml << HERE
+id: mouse_brain_combined
+input_spatial_unlabelled: $resources_test_s3/mouse_brain_combined/spatial_unlabelled.zarr
+input_spatial_solution: $resources_test_s3/mouse_brain_combined/spatial_solution.zarr
+input_scrnaseq_reference: $resources_test_s3/mouse_brain_combined/scrnaseq_reference.h5ad
+output_state: "state.yaml"
+publish_dir: $publish_dir_s3
+HERE
+
+tw launch https://github.com/openproblems-bio/task_spatial_segmentation.git \
+  --revision build/main \
+  --pull-latest \
+  --main-script target/nextflow/workflows/run_benchmark/main.nf \
+  --workspace 8386213183400 \
+  --compute-env 6pOLSmxyAEvsCYsNbfrCSK \
+  --params-file /tmp/params.yaml \
+  --config src/base/labels_aws_eu-central-1.config \
+  --labels task_spatial_segmentation,test
diff --git a/scripts/run_benchmark/run_test_aws_eu-west-2.sh b/scripts/run_benchmark/run_test_aws_eu-west-2.sh
index 44f14c5..d1e09e2 100755
--- a/scripts/run_benchmark/run_test_aws_eu-west-2.sh
+++ b/scripts/run_benchmark/run_test_aws_eu-west-2.sh
@@ -28,5 +28,5 @@ tw launch https://github.com/openproblems-bio/task_spatial_segmentation.git \
   --workspace 8386213183400 \
   --compute-env 7Odt43ln9XureGja6Frdm7 \
   --params-file /tmp/params.yaml \
-  --config src/base/labels_tw.config \
+  --config src/base/labels_aws_eu-west-2.config \
   --labels task_spatial_segmentation,test
diff --git a/scripts/run_benchmark/run_test_nebius.sh b/scripts/run_benchmark/run_test_nebius.sh
index 5cd46fe..c491051 100755
--- a/scripts/run_benchmark/run_test_nebius.sh
+++ b/scripts/run_benchmark/run_test_nebius.sh
@@ -9,7 +9,7 @@ cd "$REPO_ROOT"
 set -e
 
 resources_test_s3=s3://openproblems-data/resources_test/task_spatial_segmentation
-publish_dir_s3="/scratch/temp/results/$(date +%Y-%m-%d_%H-%M-%S)"
+publish_dir_s3="/scratch/results/runs/$(date +%Y-%m-%d_%H-%M-%S)"
 
 # write the parameters to file
 cat > /tmp/params.yaml << HERE

From 556b8af2c9dbf278313b6f3ba2d3ed21df3f851d Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Tue, 19 May 2026 11:57:22 +0200
Subject: [PATCH 2/4] add eu central config

Signed-off-by: Robrecht Cannoodt
---
Review notes (commentary only; text between '---' and the diff is not
part of the commit):
- Fixed in this revision of the patch, so the post-image blob id on the
  'index' line below is stale:
  * exitStrat: the first check compared task.attempt against a literal 3
    while the rest of the function uses max_attempts; it now uses
    max_attempts (default is still 3, so current behaviour is unchanged).
  * get_memory: the clamp branch returned 'max_memory', which is not
    defined anywhere in this file; it now returns process.maxMemory.
- NOTE(review): aws.batch.maxSpotAttempts is set to 8 inside the aws
  block and assigned 5 again at the end of the file. Confirm which value
  is intended and drop the other.
- NOTE(review): the biggpu label uses the placeholder queue name
  "TowerForge-...-work"; it needs a real queue before the label is used.
- NOTE(review): get_memory reads task.attempt but does not receive task
  as an argument. That branch is only reached when process.maxMemory is
  set (it is null here) -- verify before enabling maxMemory.

 src/base/labels_aws_eu-central-1.config | 152 ++++++++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 src/base/labels_aws_eu-central-1.config

diff --git a/src/base/labels_aws_eu-central-1.config b/src/base/labels_aws_eu-central-1.config
new file mode 100644
index 0000000..3c448b1
--- /dev/null
+++ b/src/base/labels_aws_eu-central-1.config
@@ -0,0 +1,152 @@
+// copied from 'common/nextflow_helpers/labels_tw.config', but the queues in the gpu labels have been updated
+
+def exitStrat(task, max_attempts = 3) {
+  println "Determining exit strategy for task (attempt '${task.attempt}', exit status '${task.exitStatus}')"
+
+  // if the component failed max_attempts times, ignore the error so the workflow can continue
+  // it's important 'ignore' is returned even if maxRetries is set to 3,
+  // otherwise the workflow will stop
+  if (task.attempt >= max_attempts) {
+    return 'ignore'
+  }
+  // when an aws spot instance is reclaimed, nextflow seems to use exit code 2147483647
+  // throwing in some extra conditions just in case
+  if (task.exitStatus == null || task.exitStatus <= -1 || task.exitStatus > 2100000000 || !(task.exitStatus.toString().isNumber())) {
+    return 'retry'
+  }
+  // if component failed, retry once
+  if (task.exitStatus == 1 && task.attempt < 2) {
+    return 'retry'
+  }
+  // if component ran out of memory, retry with more memory and disk
+  if (task.exitStatus in [137, 139] && task.attempt < max_attempts) {
+    return 'retry'
+  }
+  // return 'ignore' for all other cases to ignore the error,
+  // otherwise the workflow will stop
+  return 'ignore'
+}
+
+aws {
+  batch {
+    maxTransferAttempts = 3
+    delayBetweenAttempts = '5 sec'
+    maxSpotAttempts = 8
+  }
+}
+
+process {
+  executor = 'awsbatch'
+
+  // Default disk space
+  disk = 50.GB
+
+  // Retry for exit codes that have something to do with memory issues
+  // always retry once
+  errorStrategy = { exitStrat(task) }
+  maxRetries = 3
+  maxMemory = null
+
+  // Resource labels
+  withLabel: lowcpu { cpus = 5 }
+  withLabel: midcpu { cpus = 15 }
+  withLabel: highcpu { cpus = 30 }
+  withLabel: lowmem {
+    memory = { get_memory( 20.GB * task.attempt ) }
+    disk = { 50.GB * task.attempt }
+  }
+  withLabel: midmem {
+    memory = { get_memory( 50.GB * task.attempt ) }
+    disk = { 100.GB * task.attempt }
+  }
+  withLabel: highmem {
+    memory = { get_memory( 100.GB * task.attempt ) }
+    disk = { 200.GB * task.attempt }
+  }
+  withLabel: veryhighmem {
+    memory = { get_memory( 200.GB * task.attempt ) }
+    disk = { 400.GB * task.attempt }
+  }
+  withLabel: lowsharedmem {
+    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" : ""}
+  }
+  withLabel: midsharedmem {
+    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" : ""}
+  }
+  withLabel: highsharedmem {
+    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" : ""}
+  }
+  withLabel: gpu {
+    // assuming g6.8xlarge
+    cpus = 32
+    accelerator = 1
+    memory = 100.GB
+    queue = "TowerForge-9YTjlzYCo5nGhuhJw2daF-work"
+    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
+  }
+  withLabel: midgpu {
+    // assuming g6.8xlarge
+    cpus = 32
+    accelerator = 4
+    memory = 100.GB
+    queue = "TowerForge-9YTjlzYCo5nGhuhJw2daF-work"
+    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
+  }
+  withLabel: highgpu {
+    // assuming g6.16xlarge
+    cpus = 64
+    accelerator = 8
+    memory = 200.GB
+    queue = "TowerForge-9YTjlzYCo5nGhuhJw2daF-work"
+    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
+  }
+  withLabel: biggpu {
+    // assuming p5.4xlarge
+    cpus = 16
+    accelerator = 1
+    memory = 200.GB
+    queue = "TowerForge-...-work"
+    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
+  }
+
+  // make sure publishstates gets enough disk space and memory
+  withName:'.*publishStatesProc' {
+    memory = '16GB'
+    disk = '100GB'
+  }
+}
+
+def get_memory(to_compare) {
+  if (!process.containsKey("maxMemory") || !process.maxMemory) {
+    return to_compare
+  }
+
+  try {
+    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
+      return process.maxMemory
+    }
+    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) {
+      return process.maxMemory as nextflow.util.MemoryUnit
+    }
+    else {
+      return to_compare
+    }
+  } catch (all) {
+    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
+    System.exit(1)
+  }
+}
+
+// set tracing file
+trace {
+  enabled = true
+  overwrite = true
+  file = "${params.publish_dir}/trace.txt"
+}
+
+aws.batch.maxSpotAttempts = 5
+google.batch.maxSpotAttempts = 5

From e1907966949761ff066f60127c3738eab893a72c Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Tue, 19 May 2026 12:25:52 +0200
Subject: [PATCH 3/4] update publish dir

Signed-off-by: Robrecht Cannoodt
---
 scripts/run_benchmark/run_test_nebius.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/run_benchmark/run_test_nebius.sh b/scripts/run_benchmark/run_test_nebius.sh
index c491051..415b02d 100755
--- a/scripts/run_benchmark/run_test_nebius.sh
+++ b/scripts/run_benchmark/run_test_nebius.sh
@@ -9,7 +9,7 @@ cd "$REPO_ROOT"
 set -e
 
 resources_test_s3=s3://openproblems-data/resources_test/task_spatial_segmentation
-publish_dir_s3="/scratch/results/runs/$(date +%Y-%m-%d_%H-%M-%S)"
+publish_dir_s3="/mnt/data/results/runs/$(date +%Y-%m-%d_%H-%M-%S)"
 
 # write the parameters to file
 cat > /tmp/params.yaml << HERE

From 95de0bd08fdec0abcca4831fbfb0f632e370dfd7 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Tue, 19 May 2026 14:34:42 +0200
Subject: [PATCH 4/4] Change S3 publish directory path in run_test_nebius.sh

---
Review notes (commentary only; text between '---' and the diff is not
part of the commit):
- The 'index' line shows this commit restores run_test_nebius.sh to blob
  c491051, the state after patch 1/4, so it is an exact revert of patch
  3/4. The two could be dropped together when the series is squashed.
- NOTE(review): the subject says "S3", but the value changed here is a
  filesystem-style path, not an s3:// URI.

 scripts/run_benchmark/run_test_nebius.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/run_benchmark/run_test_nebius.sh b/scripts/run_benchmark/run_test_nebius.sh
index 415b02d..c491051 100755
--- a/scripts/run_benchmark/run_test_nebius.sh
+++ b/scripts/run_benchmark/run_test_nebius.sh
@@ -9,7 +9,7 @@ cd "$REPO_ROOT"
 set -e
 
 resources_test_s3=s3://openproblems-data/resources_test/task_spatial_segmentation
-publish_dir_s3="/mnt/data/results/runs/$(date +%Y-%m-%d_%H-%M-%S)"
+publish_dir_s3="/scratch/results/runs/$(date +%Y-%m-%d_%H-%M-%S)"
 
 # write the parameters to file
 cat > /tmp/params.yaml << HERE