Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions src/base/labels_aws_eu-central-1.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// copied from 'common/nextflow_helpers/labels_tw.config', but the queues in the gpu labels have been updated

// Decide how Nextflow should react to a failed task.
//
// task:         the failed task; only `task.attempt` and `task.exitStatus` are read
// max_attempts: attempt number from which errors are ignored instead of retried (default 3)
// returns:      'retry' to resubmit the task, or 'ignore' to drop the failure so the
//               rest of the workflow can continue
def exitStrat(task, max_attempts = 3) {
  println "Determining exit strategy for task (attempt '${task.attempt}', exit status '${task.exitStatus}')"

  def attempt = task.attempt
  def status = task.exitStatus

  // once the attempt budget is used up, ignore the error so the workflow can continue.
  // it's important 'ignore' is returned even if maxRetries is set,
  // otherwise the workflow will stop
  if (attempt >= max_attempts) {
    return 'ignore'
  }

  // when an aws spot instance is reclaimed, nextflow seems to use exit code 2147483647
  // throwing in some extra conditions just in case.
  // the null / non-numeric checks come first so that the numeric comparisons
  // below are never evaluated against a value that cannot be compared to a number
  if (status == null || !(status.toString().isNumber()) || status <= -1 || status > 2100000000) {
    return 'retry'
  }

  // if component failed, retry once
  if (status == 1 && attempt < 2) {
    return 'retry'
  }

  // if component ran out of memory, retry with more memory and disk
  if (status in [137, 139] && attempt < max_attempts) {
    return 'retry'
  }

  // return 'ignore' for all other cases to ignore the error,
  // otherwise the workflow will stop
  return 'ignore'
}

// Retry-related settings for the AWS Batch executor.
aws {
batch {
maxTransferAttempts = 3
delayBetweenAttempts = '5 sec'
// NOTE(review): `aws.batch.maxSpotAttempts = 5` is assigned again at the very end of this
// file and disagrees with the 8 set here; presumably the later assignment takes effect —
// confirm which value is intended and keep only one.
maxSpotAttempts = 8
}
}

// Default process settings and resource labels for running on AWS Batch.
process {
  executor = 'awsbatch'

  // Default disk space
  disk = 50.GB

  // Error handling is delegated to exitStrat(), which returns 'retry' or 'ignore'
  // depending on the task's attempt number and exit status
  errorStrategy = { exitStrat(task) }
  maxRetries = 3
  // read by get_memory(); a null value means requested memory is never capped
  maxMemory = null

  // Resource labels: cpu
  withLabel: lowcpu { cpus = 5 }
  withLabel: midcpu { cpus = 15 }
  withLabel: highcpu { cpus = 30 }

  // Resource labels: memory and disk, both scaled by the attempt number so that
  // a retried task asks for more resources
  withLabel: lowmem {
    memory = { get_memory( 20.GB * task.attempt ) }
    disk = { 50.GB * task.attempt }
  }
  withLabel: midmem {
    memory = { get_memory( 50.GB * task.attempt ) }
    disk = { 100.GB * task.attempt }
  }
  withLabel: highmem {
    memory = { get_memory( 100.GB * task.attempt ) }
    disk = { 200.GB * task.attempt }
  }
  withLabel: veryhighmem {
    memory = { get_memory( 200.GB * task.attempt ) }
    disk = { 400.GB * task.attempt }
  }

  // Resource labels: shared memory (/dev/shm) as a fraction of the task memory.
  // The value is computed in megabytes, so it carries an explicit 'm' unit;
  // without a unit docker interprets the number as bytes.
  withLabel: lowsharedmem {
    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}m" : ""}
  }
  withLabel: midsharedmem {
    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}m" : ""}
  }
  withLabel: highsharedmem {
    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}m" : ""}
  }

  // Resource labels: gpu. Each label pins a fixed instance shape and queue, and passes
  // the container-engine specific flag that exposes the GPUs inside the container.
  withLabel: gpu {
    // assuming g6.8xlarge
    cpus = 32
    accelerator = 1
    memory = 100.GB
    queue = "TowerForge-9YTjlzYCo5nGhuhJw2daF-work"
    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
  }
  withLabel: midgpu {
    // assuming g6.8xlarge
    // NOTE(review): same assumed instance type as the 'gpu' label but accelerator = 4;
    // confirm the instance type actually offers 4 GPUs
    cpus = 32
    accelerator = 4
    memory = 100.GB
    queue = "TowerForge-9YTjlzYCo5nGhuhJw2daF-work"
    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
  }
  withLabel: highgpu {
    // assuming g6.16xlarge
    // NOTE(review): confirm the assumed instance type actually offers 8 GPUs
    cpus = 64
    accelerator = 8
    memory = 200.GB
    queue = "TowerForge-9YTjlzYCo5nGhuhJw2daF-work"
    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
  }
  withLabel: biggpu {
    // assuming p5.4xlarge
    cpus = 16
    accelerator = 1
    memory = 200.GB
    // NOTE(review): this queue name looks like a placeholder — confirm the real queue id
    queue = "TowerForge-...-work"
    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
  }

  // make sure publishstates gets enough disk space and memory
  withName:'.*publishStatesProc' {
    memory = '16GB'
    disk = '100GB'
  }
}

// Cap a requested amount of memory at `process.maxMemory`, when one is configured.
//
// to_compare: the requested memory; it is compared against process.maxMemory with compareTo()
// returns:    - to_compare unchanged when process.maxMemory is missing or falsy
//             - process.maxMemory when the current attempt equals process.maxRetries
//             - process.maxMemory (as a MemoryUnit) when to_compare exceeds it
//             - to_compare otherwise
// Any exception raised while evaluating the cap prints an error message and
// terminates the JVM with exit code 1.
def get_memory(to_compare) {
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function — confirm it resolves
    // to the current task in this scope
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }

    def memory_cap = process.maxMemory as nextflow.util.MemoryUnit
    // compareTo() only guarantees the sign of its result, so test for '> 0' rather than '== 1'
    if (to_compare.compareTo(memory_cap) > 0) {
      return memory_cap
    }
    return to_compare
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

// Execution trace: switched on, allowed to overwrite an existing file,
// and written to trace.txt inside the publish directory.
trace.enabled = true
trace.overwrite = true
trace.file = "${params.publish_dir}/trace.txt"

// Spot-instance retry limits, set with dotted assignments for both cloud batch scopes.
// NOTE(review): the `aws { batch { ... } }` block earlier in this file sets
// maxSpotAttempts = 8; the assignment below disagrees with it and, being later in the
// file, presumably takes effect — confirm which value is intended and keep only one.
aws.batch.maxSpotAttempts = 5
google.batch.maxSpotAttempts = 5
Loading