Skip to content

Commit 1a41bbd

Browse files
committed
Add option to retry only known errors.
1 parent 54e4308 commit 1a41bbd

File tree

11 files changed

+136
-5
lines changed

11 files changed

+136
-5
lines changed

backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import cromwell.backend.validation._
2929
import cromwell.core._
3030
import cromwell.core.io.{AsyncIoActorClient, DefaultIoCommandBuilder, IoCommandBuilder}
3131
import cromwell.core.path.Path
32+
import cromwell.core.retry._
3233
import cromwell.services.keyvalue.KeyValueServiceActor._
3334
import cromwell.services.keyvalue.KvClient
3435
import cromwell.services.metadata.CallMetadataKeys
@@ -268,6 +269,25 @@ trait StandardAsyncExecutionActor
268269
}
269270
}
270271

272+
lazy val maxRetriesMode: MaxRetriesMode = {
273+
val maxRetriesModeOption: Option[MaxRetriesMode] =
274+
jobDescriptor.workflowDescriptor.getWorkflowOption(WorkflowOptions.MaxRetriesMode) flatMap { value: String =>
275+
MaxRetriesMode.tryParse(value) match {
276+
case Success(v) => Option(v)
277+
case Failure(e) =>
278+
// should not happen, this case should have been screened for and fast-failed during workflow materialization.
279+
log.error(
280+
e,
281+
s"Programmer error: unexpected failure attempting to convert value for workflow option " +
282+
s"'${WorkflowOptions.MaxRetriesMode.name}' to MaxRetriesMode."
283+
)
284+
Option(MaxRetriesMode.DefaultMode)
285+
}
286+
}
287+
288+
maxRetriesModeOption.getOrElse(MaxRetriesMode.DefaultMode)
289+
}
290+
271291
lazy val memoryRetryRequested: Boolean = memoryRetryFactor.nonEmpty
272292

273293
/**
@@ -1101,8 +1121,17 @@ trait StandardAsyncExecutionActor
11011121

11021122
failedRetryableOrNonRetryable match {
11031123
case failedNonRetryable: FailedNonRetryableExecutionHandle if previousFailedRetries < maxRetries =>
1104-
// The user asked us to retry finitely for them, possibly with a memory modification
1105-
evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry)
1124+
maxRetriesMode match {
1125+
case AllErrors =>
1126+
// The user asked us to retry finitely for them, possibly with a memory modification
1127+
evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry)
1128+
case KnownErrors if memoryRetry.oomDetected =>
1129+
// The user asked us to retry finitely for them, with a memory modification
1130+
evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry)
1131+
case _ =>
1132+
// No reason to retry
1133+
Future.successful(failedNonRetryable)
1134+
}
11061135
case failedNonRetryable: FailedNonRetryableExecutionHandle =>
11071136
// No reason to retry
11081137
Future.successful(failedNonRetryable)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"max_retries_mode" : "AllErrors"
3+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"max_retries_mode" : "KnownErrors"
3+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
name: max_retries_mode_allerrors
2+
testFormat: workflowfailure
3+
4+
files {
5+
workflow: max_retries/max_retries.wdl
6+
options: max_retries/max_retries_mode_allerrors.options
7+
}
8+
9+
metadata {
10+
"failures.0.causedBy.0.message": "Job retry_for_me.broken_task:NA:2 exited with return code 1 which has not been declared as a valid return code. See 'continueOnReturnCode' runtime attribute for more details."
11+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
name: max_retries_mode_knownerrors
2+
testFormat: workflowfailure
3+
4+
files {
5+
workflow: max_retries/max_retries.wdl
6+
options: max_retries/max_retries_mode_knownerrors.options
7+
}
8+
9+
metadata {
10+
"failures.0.causedBy.0.message": "Job retry_for_me.broken_task:NA:1 exited with return code 1 which has not been declared as a valid return code. See 'continueOnReturnCode' runtime attribute for more details."
11+
}

core/src/main/scala/cromwell/core/WorkflowOptions.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ object WorkflowOptions {
7777
case object WorkflowFailureMode extends WorkflowOption("workflow_failure_mode")
7878
case object UseReferenceDisks extends WorkflowOption("use_reference_disks")
7979
case object MemoryRetryMultiplier extends WorkflowOption("memory_retry_multiplier")
80+
case object MaxRetriesMode extends WorkflowOption("max_retries_mode")
8081
case object WorkflowCallbackUri extends WorkflowOption("workflow_callback_uri")
8182

8283
private lazy val WorkflowOptionsConf = ConfigFactory.load.getConfig("workflow-options")
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package cromwell.core.retry
2+
3+
import scala.util.{Failure, Success, Try}
4+
5+
sealed trait MaxRetriesMode
6+
case object AllErrors extends MaxRetriesMode
7+
case object KnownErrors extends MaxRetriesMode
8+
9+
object MaxRetriesMode {
10+
val DefaultMode = AllErrors
11+
private val AllModes = Seq(AllErrors, KnownErrors)
12+
13+
def tryParse(mode: String): Try[MaxRetriesMode] =
14+
AllModes find { _.toString.equalsIgnoreCase(mode) } map { Success(_) } getOrElse Failure(
15+
new Exception(s"Invalid max retries mode: '$mode', supported modes are: ${AllModes.mkString("'", "', '", "'")}")
16+
)
17+
}

docs/RuntimeAttributes.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ runtime {
247247

248248
*Default: _0_*
249249

250-
This retry option is introduced to provide a method for tackling transient job failures. For example, if a task fails due to a timeout from accessing an external service, then this option helps re-run the failed the task without having to re-run the entire workflow. It takes an Int as a value that indicates the maximum number of times Cromwell should retry a failed task. This retry is applied towards jobs that fail while executing the task command. This method only applies to transient job failures and is a feeble attempt to retry a job, that is it cannot be used to increase memory in out-of-memory situations.
250+
This retry option is introduced to provide a method for tackling transient job failures. For example, if a task fails due to a timeout from accessing an external service, then this option helps re-run the failed the task without having to re-run the entire workflow. It takes an Int as a value that indicates the maximum number of times Cromwell should retry a failed task. This retry is applied towards jobs that fail while executing the task command. This method only applies to transient job failures and is a feeble attempt to retry a job.
251251

252252
If using the Google backend, it's important to note that The `maxRetries` count is independent from the [preemptible](#preemptible) count. For example, the task below can be retried up to 6 times if it's preempted 3 times AND the command execution fails 3 times.
253253

docs/wf_options/Overview.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,12 @@ Example `options.json`:
146146
"memory_retry_multiplier" : 1.1
147147
}
148148
```
149+
150+
## Max Retries Mode
151+
152+
The `max_retries_mode` workflow options sets the behavior of retrying failed jobs when the [`maxRetries` runtime
153+
attribute](../RuntimeAttributes.md#maxretries) is specified.
154+
155+
The possible values are `AllErrors` or `KnownErrors`. If set to `AllErrors`, the job will be retried for any error. If
156+
set to `KnownErrors`, the job will only be retried for errors that are known to be retryable, such as increasing memory
157+
in out-of-memory situations. The default value is `AllErrors`.

engine/src/main/scala/cromwell/engine/workflow/lifecycle/materialization/MaterializeWorkflowDescriptorActor.scala

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import cromwell.core.io.AsyncIo
2727
import cromwell.core.labels.{Label, Labels}
2828
import cromwell.core.logging.WorkflowLogging
2929
import cromwell.core.path.{PathBuilder, PathBuilderFactory}
30+
import cromwell.core.retry._
3031
import cromwell.engine._
3132
import cromwell.engine.backend.CromwellBackends
3233
import cromwell.engine.workflow.WorkflowProcessingEventPublishing._
@@ -182,6 +183,19 @@ object MaterializeWorkflowDescriptorActor {
182183
s"'$optionName' is specified in workflow options but value is not of expected Double type: ${e.getMessage}".invalidNel
183184
}
184185
}
186+
187+
def validateMaxRetriesMode(workflowOptions: WorkflowOptions): ErrorOr[MaxRetriesMode] = {
188+
val modeString: Try[String] = workflowOptions.get(WorkflowOptions.MaxRetriesMode) match {
189+
case Success(value) => Success(value)
190+
case Failure(_: OptionNotFoundException) => Success(MaxRetriesMode.DefaultMode.toString)
191+
case Failure(e) => Failure(e)
192+
}
193+
194+
modeString flatMap MaxRetriesMode.tryParse match {
195+
case Success(mode) => mode.validNel
196+
case Failure(t) => t.getMessage.invalidNel
197+
}
198+
}
185199
}
186200

187201
// TODO WOM: need to decide where to draw the line between language specific initialization and WOM
@@ -495,12 +509,15 @@ class MaterializeWorkflowDescriptorActor(override val serviceRegistryActor: Acto
495509

496510
val memoryRetryMultiplierValidation: ErrorOr[Unit] = validateMemoryRetryMultiplier(workflowOptions)
497511

512+
val maxRetriesModeValidation: ErrorOr[MaxRetriesMode] = validateMaxRetriesMode(workflowOptions)
513+
498514
(failureModeValidation,
499515
backendAssignmentsValidation,
500516
callCachingModeValidation,
501517
useReferenceDisksValidation,
502-
memoryRetryMultiplierValidation
503-
) mapN { case (failureMode, backendAssignments, callCachingMode, _, _) =>
518+
memoryRetryMultiplierValidation,
519+
maxRetriesModeValidation
520+
) mapN { case (failureMode, backendAssignments, callCachingMode, _, _, _) =>
504521
val callable = womNamespace.executable.entryPoint
505522
val backendDescriptor = BackendWorkflowDescriptor(id,
506523
callable,

0 commit comments

Comments
 (0)