diff --git a/src/base/labels_aws_eu-central-1.config b/src/base/labels_aws_eu-central-1.config
new file mode 100644
index 0000000..3c448b1
--- /dev/null
+++ b/src/base/labels_aws_eu-central-1.config
@@ -0,0 +1,173 @@
+// copied from 'common/nextflow_helpers/labels_tw.config', but the queues in the gpu labels have been updated
+
+// Pick the Nextflow error strategy for a failed task.
+// Reads task.attempt and task.exitStatus and returns either 'retry' or 'ignore'.
+// NOTE: the attempt cut-off below is the literal 3; max_attempts only bounds the
+// out-of-memory retry.
+def exitStrat(task, max_attempts = 3) {
+  println "Determining exit strategy for task (attempt '${task.attempt}', exit status '${task.exitStatus}')"
+
+  // if the component failed 3 times, ignore the error so the workflow can continue
+  // it's important 'ignore' is returned even if maxRetries is set to 3,
+  // otherwise the workflow will stop
+  if (task.attempt >= 3) {
+    return 'ignore'
+  }
+  // when an aws spot instance is reclaimed, nextflow seems to use exit code 2147483647
+  // throwing in some extra conditions just in case
+  if (task.exitStatus == null || task.exitStatus <= -1 || task.exitStatus > 2100000000 || !(task.exitStatus.toString().isNumber())) {
+    return 'retry'
+  }
+  // if component failed, retry once
+  if (task.exitStatus == 1 && task.attempt < 2) {
+    return 'retry'
+  }
+  // if component ran out of memory, retry with more memory and disk
+  if (task.exitStatus in [137, 139] && task.attempt < max_attempts) {
+    return 'retry'
+  }
+  // return 'ignore' for all other cases to ignore the error,
+  // otherwise the workflow will stop
+  return 'ignore'
+}
+
+// AWS Batch transfer and spot-retry settings.
+// NOTE(review): aws.batch.maxSpotAttempts is assigned again (to 5) at the end of this
+// file; confirm which of the two values is intended.
+aws {
+  batch {
+    maxTransferAttempts = 3
+    delayBetweenAttempts = '5 sec'
+    maxSpotAttempts = 8
+  }
+}
+
+process {
+  executor = 'awsbatch'
+
+  // Default disk space
+  disk = 50.GB
+
+  // Retry for exit codes that have something to do with memory issues
+  // always retry once
+  errorStrategy = { exitStrat(task) }
+  maxRetries = 3
+  // no memory ceiling: get_memory() returns the requested amount while this is unset
+  maxMemory = null
+
+  // Resource labels
+  withLabel: lowcpu { cpus = 5 }
+  withLabel: midcpu { cpus = 15 }
+  withLabel: highcpu { cpus = 30 }
+  // memory and disk requests are multiplied by the attempt number
+  withLabel: lowmem {
+    memory = { get_memory( 20.GB * task.attempt ) }
+    disk = { 50.GB * task.attempt }
+  }
+  withLabel: midmem {
+    memory = { get_memory( 50.GB * task.attempt ) }
+    disk = { 100.GB * task.attempt }
+  }
+  withLabel: highmem {
+    memory = { get_memory( 100.GB * task.attempt ) }
+    disk = { 200.GB * task.attempt }
+  }
+  withLabel: veryhighmem {
+    memory = { get_memory( 200.GB * task.attempt ) }
+    disk = { 400.GB * task.attempt }
+  }
+  // shared-memory size is a fraction of task.memory.mega; no option is passed for singularity
+  withLabel: lowsharedmem {
+    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" : ""}
+  }
+  withLabel: midsharedmem {
+    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" : ""}
+  }
+  withLabel: highsharedmem {
+    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" : ""}
+  }
+  // gpu labels: '--nv' for singularity, '--gpus all' for docker, null for any other engine
+  withLabel: gpu {
+    // assuming g6.8xlarge
+    cpus = 32
+    accelerator = 1
+    memory = 100.GB
+    queue = "TowerForge-9YTjlzYCo5nGhuhJw2daF-work"
+    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
+  }
+  withLabel: midgpu {
+    // assuming g6.8xlarge
+    // NOTE(review): 4 accelerators does not look like a g6.8xlarge; confirm the instance type
+    cpus = 32
+    accelerator = 4
+    memory = 100.GB
+    queue = "TowerForge-9YTjlzYCo5nGhuhJw2daF-work"
+    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
+  }
+  withLabel: highgpu {
+    // assuming g6.16xlarge
+    // NOTE(review): 8 accelerators does not look like a g6.16xlarge; confirm the instance type
+    cpus = 64
+    accelerator = 8
+    memory = 200.GB
+    queue = "TowerForge-9YTjlzYCo5nGhuhJw2daF-work"
+    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
+  }
+  withLabel: biggpu {
+    // assuming p5.4xlarge
+    cpus = 16
+    accelerator = 1
+    memory = 200.GB
+    // NOTE(review): this queue name looks like a placeholder; confirm the real queue
+    queue = "TowerForge-...-work"
+    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
+      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
+  }
+
+  // make sure publishstates gets enough disk space and memory
+  withName:'.*publishStatesProc' {
+    memory = '16GB'
+    disk = '100GB'
+  }
+}
+
+// Cap a requested memory amount at process.maxMemory.
+// Returns to_compare unchanged when process.maxMemory is unset; otherwise returns
+// process.maxMemory on the final retry or when to_compare exceeds it.
+// Prints an error and exits the JVM if the comparison throws.
+def get_memory(to_compare) {
+  if (!process.containsKey("maxMemory") || !process.maxMemory) {
+    return to_compare
+  }
+
+  try {
+    // NOTE(review): 'task' is not a parameter of this function; confirm it resolves here
+    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
+      return process.maxMemory
+    }
+    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) {
+      // request exceeds the ceiling: hand back the ceiling itself
+      return process.maxMemory as nextflow.util.MemoryUnit
+    }
+    else {
+      return to_compare
+    }
+  } catch (all) {
+    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
+    System.exit(1)
+  }
+}
+
+// set tracing file
+trace {
+  enabled = true
+  overwrite = true
+  file = "${params.publish_dir}/trace.txt"
+}
+
+// NOTE(review): assigned after the aws { batch { ... } } block above, which sets 8; confirm which value is intended
+aws.batch.maxSpotAttempts = 5
+google.batch.maxSpotAttempts = 5