@@ -284,6 +284,38 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
284
284
return result. join(' - ' )
285
285
}
286
286
287
/**
 * Check whether a job failure was caused by an EC2 spot instance reclamation.
 *
 * @param job
 *      The AWS Batch job description; may be {@code null}
 * @return
 *      {@code true} when the job status reason starts with {@code Host EC2},
 *      {@code false} otherwise, including when the job or its status reason is missing
 */
protected boolean isSpotReclamationError(JobDetail job) {
    // AWS Batch uses the "Host EC2*" pattern for spot reclamation events.
    // The prefix must not carry any leading whitespace, otherwise it can never match.
    final statusReason = job?.statusReason
    return statusReason && statusReason.startsWith('Host EC2')
}
295
+
296
/**
 * Compose the error message reported when a job fails because of an EC2 spot
 * instance reclamation.
 *
 * The message contains the original failure reason returned by {@code errReason(job)},
 * a hint that depends on the value returned by {@code maxSpotAttempts()} and a link
 * to the AWS documentation on spot instance interruptions.
 *
 * @param job
 *      The AWS Batch job description of the failed job
 * @return
 *      The multi-paragraph error message
 */
protected String formatSpotReclamationError(JobDetail job) {
    final baseReason = errReason(job)
    final maxAttempts = maxSpotAttempts()
    final StringBuilder message = new StringBuilder()

    message.append("AWS Batch job failed due to EC2 spot instance reclamation.")
    message.append("\n\nOriginal error: ").append(baseReason)

    if( maxAttempts == 0 ) {
        // automatic retry is not enabled: tell the user how to turn it on
        message.append("\n\nTo automatically retry jobs when spot instances are reclaimed, "
                + "set 'aws.batch.maxSpotAttempts' to a value greater than 0 in your configuration. "
                + "For example: aws.batch.maxSpotAttempts = 5")
    }
    else {
        // retries were configured: report the configured attempts limit
        message.append("\n\nThis job was configured to retry up to ").append(maxAttempts)
                .append(" times on spot reclamation, but all attempts failed.")
    }

    message.append("\n\nFor more information about spot instance interruptions, see: "
            + "https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html")

    return message.toString()
}
318
+
287
319
/**
288
320
* {@inheritDoc }
289
321
*/
@@ -299,6 +331,11 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
299
331
final job = describeJob(jobId)
300
332
final done = job?. status in [' SUCCEEDED' , ' FAILED' ]
301
333
if ( done ) {
334
+ // Log retry attempts for spot reclamation visibility
335
+ final attemptCount = job?. attempts?. size() ?: 0
336
+ if ( attemptCount > 1 ) {
337
+ log. info " [AWS BATCH] Process `${ task.lazyName()} ` completed after ${ attemptCount} attempts (job=${ jobId} )"
338
+ }
302
339
// take the exit code of the container, if 0 (successful) or missing
303
340
// take the exit code from the `.exitcode` file created by nextflow
304
341
// the rationale of this is that, in case of error, the exit code return
@@ -310,7 +347,11 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
310
347
final reason = errReason(job)
311
348
// retry all CannotPullContainer errors except when the image does not exist or cannot be accessed
312
349
final unrecoverable = reason. contains(' CannotPullContainer' ) && reason. contains(' unauthorized' )
313
- task. error = unrecoverable ? new ProcessUnrecoverableException (reason) : new ProcessException (reason)
350
+
351
+ // Check for spot reclamation errors and provide clearer error messages
352
+ final errorMessage = isSpotReclamationError(job) ? formatSpotReclamationError(job) : reason
353
+
354
+ task. error = unrecoverable ? new ProcessUnrecoverableException (errorMessage) : new ProcessException (errorMessage)
314
355
task. stderr = executor. getJobOutputStream(jobId) ?: errorFile
315
356
}
316
357
else {
@@ -789,6 +830,7 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
789
830
.withAttempts( attempts )
790
831
.withEvaluateOnExit(cond1, cond2)
791
832
result. setRetryStrategy(retry)
833
+ log. debug " [AWS BATCH] Process `${ task.lazyName()} ` configured for spot reclamation retry (maxSpotAttempts=${ attempts} )"
792
834
}
793
835
794
836
// set task timeout
0 commit comments