Skip to content

Commit 12f7b22

Browse files
ewels and claude committed
Improve AWS Batch spot reclamation error messages
- Add clearer error messages when spot instances are reclaimed
- Provide guidance on configuring maxSpotAttempts for automatic retries
- Add logging for retry attempts and spot reclamation configuration
- Include helpful link to AWS documentation on spot interruptions

This addresses issues #5240 and #6225 by making spot reclamation failures more user-friendly and providing actionable guidance.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 700fadd commit 12f7b22

File tree

1 file changed

+43
-1
lines changed

1 file changed

+43
-1
lines changed

plugins/nf-amazon/src/main/nextflow/cloud/aws/batch/AwsBatchTaskHandler.groovy

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,38 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
284284
return result.join(' - ')
285285
}
286286

287+
/**
 * Tell whether a Batch job description carries a spot reclamation status.
 *
 * The decision is based only on the job's {@code statusReason}: it must be
 * non-empty and begin with {@code Host EC2}.
 *
 * @param job The job description to inspect; may be {@code null}
 * @return {@code true} when the status reason starts with {@code Host EC2},
 *      {@code false} otherwise (including a missing job or a missing reason)
 */
protected boolean isSpotReclamationError(JobDetail job) {
    // a missing job, or a null/empty status reason, can never match
    final reason = job?.statusReason
    if( !reason )
        return false
    // AWS Batch uses the "Host EC2*" pattern for spot reclamation events
    return reason.startsWith('Host EC2')
}
295+
296+
/**
 * Compose the multi-paragraph error message reported for a job that failed
 * because of spot instance reclamation.
 *
 * The message is made of: a fixed headline, the original error as returned by
 * {@code errReason(job)}, a paragraph that depends on {@code maxSpotAttempts()}
 * (a configuration hint when it is zero, otherwise a note that the configured
 * retries were exhausted), and a link to the AWS documentation.
 *
 * @param job The failed job description, forwarded to {@code errReason}
 * @return The formatted message, paragraphs separated by blank lines
 */
protected String formatSpotReclamationError(JobDetail job) {
    final originalError = errReason(job)
    final spotAttempts = maxSpotAttempts()

    // the advice differs depending on whether spot retries were enabled
    String retryAdvice
    if( spotAttempts == 0 ) {
        retryAdvice = "\n\nTo automatically retry jobs when spot instances are reclaimed, " +
            "set 'aws.batch.maxSpotAttempts' to a value greater than 0 in your configuration. " +
            "For example: aws.batch.maxSpotAttempts = 5"
    }
    else {
        retryAdvice = "\n\nThis job was configured to retry up to " + spotAttempts +
            " times on spot reclamation, but all attempts failed."
    }

    return "AWS Batch job failed due to EC2 spot instance reclamation." +
        "\n\nOriginal error: " + originalError +
        retryAdvice +
        "\n\nFor more information about spot instance interruptions, see: " +
        "https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html"
}
318+
287319
/**
288320
* {@inheritDoc}
289321
*/
@@ -299,6 +331,11 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
299331
final job = describeJob(jobId)
300332
final done = job?.status in ['SUCCEEDED', 'FAILED']
301333
if( done ) {
334+
// Log retry attempts for spot reclamation visibility
335+
final attemptCount = job?.attempts?.size() ?: 0
336+
if( attemptCount > 1 ) {
337+
log.info "[AWS BATCH] Process `${task.lazyName()}` completed after ${attemptCount} attempts (job=${jobId})"
338+
}
302339
// take the exit code of the container, if 0 (successful) or missing
303340
// take the exit code from the `.exitcode` file create by nextflow
304341
// the rationale of this is that, in case of error, the exit code return
@@ -310,7 +347,11 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
310347
final reason = errReason(job)
311348
// retry all CannotPullContainer errors apart when it does not exist or cannot be accessed
312349
final unrecoverable = reason.contains('CannotPullContainer') && reason.contains('unauthorized')
313-
task.error = unrecoverable ? new ProcessUnrecoverableException(reason) : new ProcessException(reason)
350+
351+
// Check for spot reclamation errors and provide clearer error messages
352+
final errorMessage = isSpotReclamationError(job) ? formatSpotReclamationError(job) : reason
353+
354+
task.error = unrecoverable ? new ProcessUnrecoverableException(errorMessage) : new ProcessException(errorMessage)
314355
task.stderr = executor.getJobOutputStream(jobId) ?: errorFile
315356
}
316357
else {
@@ -789,6 +830,7 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
789830
.withAttempts( attempts )
790831
.withEvaluateOnExit(cond1, cond2)
791832
result.setRetryStrategy(retry)
833+
log.debug "[AWS BATCH] Process `${task.lazyName()}` configured for spot reclamation retry (maxSpotAttempts=${attempts})"
792834
}
793835

794836
// set task timeout

0 commit comments

Comments
 (0)