@@ -1416,7 +1416,7 @@ trait StandardAsyncExecutionActor
1416
1416
): Future [ExecutionHandle ] = {
1417
1417
1418
1418
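// Returns whether OOM was detected (via the memory retry RC file or, failing that, the memory retry error file), paired with the optional path to report: stderr when the RC file indicated OOM, the memory retry error file when the error keys were found there, None otherwise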
1419
- def memoryRetryRC : Future [Boolean ] = {
1419
+ def memoryRetryRC : Future [( Boolean , Option [ Path ]) ] = {
1420
1420
1421
1421
def readFile (path : Path , maxBytes : Option [Int ]): Future [String ] =
1422
1422
asyncIo.contentAsStringAsync(path, maxBytes, failOnOverflow = false )
@@ -1438,23 +1438,37 @@ trait StandardAsyncExecutionActor
1438
1438
}
1439
1439
}
1440
1440
1441
- def checkMemoryRetryStderr (errorKeys : List [String ], maxBytes : Int ): Future [Boolean ] =
1442
- readFile(jobPaths.standardPaths.error , Option (maxBytes)) map { errorContent =>
1441
+ def checkMemoryRetryStderr (memoryRetryError : Path , errorKeys : List [String ], maxBytes : Int ): Future [Boolean ] =
1442
+ readFile(memoryRetryError , Option (maxBytes)) map { errorContent =>
1443
1443
errorKeys.exists(errorContent.contains)
1444
1444
}
1445
1445
1446
- asyncIo.existsAsync(jobPaths.memoryRetryRC) flatMap {
1447
- case true => checkMemoryRetryRC()
1448
- case false =>
1449
- (memoryRetryErrorKeys, memoryRetryStderrLimit) match {
1450
- case (Some (keys), Some (limit)) =>
1451
- asyncIo.existsAsync(jobPaths.standardPaths.error) flatMap {
1452
- case true => checkMemoryRetryStderr(keys, limit)
1453
- case false => Future .successful(false )
1454
- }
1455
- case _ => Future .successful(false )
1456
- }
1457
- }
1446
+ def checkMemoryRetryError (): Future [Boolean ] =
1447
+ (memoryRetryErrorKeys, memoryRetryStderrLimit, jobPaths.memoryRetryError) match {
1448
+ case (Some (keys), Some (limit), Some (memoryRetryError)) =>
1449
+ for {
1450
+ memoryRetryErrorExists <- asyncIo.existsAsync(memoryRetryError)
1451
+ memoryRetryErrorFound <-
1452
+ if (memoryRetryErrorExists)
1453
+ checkMemoryRetryStderr(memoryRetryError, keys, limit)
1454
+ else
1455
+ Future .successful(false )
1456
+ } yield memoryRetryErrorFound
1457
+ case _ => Future .successful(false )
1458
+ }
1459
+
1460
+ // For backwards behavioral compatibility, check for the old memory retry RC file first. That file used to catch
1461
+ // the errors from the standard error file, but now sometimes the error is written to a separate log file.
1462
+ // If it exists, check its contents. If no OOM code is found there, fall back to checking the new memory retry error file.
1463
+ for {
1464
+ memoryRetryRCExists <- asyncIo.existsAsync(jobPaths.memoryRetryRC)
1465
+ memoryRetryRCErrorFound <- if (memoryRetryRCExists) checkMemoryRetryRC() else Future .successful(false )
1466
+ memoryRetryErrorFound <- if (memoryRetryRCErrorFound) Future .successful(true ) else checkMemoryRetryError()
1467
+ memoryErrorPathOption =
1468
+ if (memoryRetryRCErrorFound) Option (jobPaths.standardPaths.error)
1469
+ else if (memoryRetryErrorFound) jobPaths.memoryRetryError
1470
+ else None
1471
+ } yield (memoryRetryErrorFound, memoryErrorPathOption)
1458
1472
}
1459
1473
1460
1474
val stderr = jobPaths.standardPaths.error
@@ -1465,74 +1479,76 @@ trait StandardAsyncExecutionActor
1465
1479
// Only check stderr size if we need to, otherwise this results in a lot of unnecessary I/O that
1466
1480
// may fail due to race conditions on quickly-executing jobs.
1467
1481
stderrSize <- if (failOnStdErr) asyncIo.sizeAsync(stderr) else Future .successful(0L )
1468
- outOfMemoryDetected <- memoryRetryRC
1469
- } yield (stderrSize, returnCodeAsString, outOfMemoryDetected)
1470
-
1471
- stderrSizeAndReturnCodeAndMemoryRetry flatMap { case (stderrSize, returnCodeAsString, outOfMemoryDetected) =>
1472
- val tryReturnCodeAsInt = Try (returnCodeAsString.trim.toInt)
1473
-
1474
- if (isDone(status)) {
1475
- tryReturnCodeAsInt match {
1476
- case Success (returnCodeAsInt) if failOnStdErr && stderrSize.intValue > 0 =>
1477
- val executionHandle = Future .successful(
1478
- FailedNonRetryableExecutionHandle (StderrNonEmpty (jobDescriptor.key.tag, stderrSize, stderrAsOption),
1479
- Option (returnCodeAsInt),
1480
- None
1482
+ (outOfMemoryDetected, outOfMemoryPathOption) <- memoryRetryRC
1483
+ } yield (stderrSize, returnCodeAsString, outOfMemoryDetected, outOfMemoryPathOption)
1484
+
1485
+ stderrSizeAndReturnCodeAndMemoryRetry flatMap {
1486
+ case (stderrSize, returnCodeAsString, outOfMemoryDetected, outOfMemoryPathOption) =>
1487
+ val tryReturnCodeAsInt = Try (returnCodeAsString.trim.toInt)
1488
+
1489
+ if (isDone(status)) {
1490
+ tryReturnCodeAsInt match {
1491
+ case Success (returnCodeAsInt) if failOnStdErr && stderrSize.intValue > 0 =>
1492
+ val executionHandle = Future .successful(
1493
+ FailedNonRetryableExecutionHandle (StderrNonEmpty (jobDescriptor.key.tag, stderrSize, stderrAsOption),
1494
+ Option (returnCodeAsInt),
1495
+ None
1496
+ )
1481
1497
)
1482
- )
1483
- retryElseFail(executionHandle)
1484
- case Success (returnCodeAsInt) if continueOnReturnCode.continueFor( returnCodeAsInt) =>
1485
- handleExecutionSuccess(status, oldHandle, returnCodeAsInt)
1486
- // It's important that we check retryWithMoreMemory case before isAbort. RC could be 137 in either case;
1487
- // if it was caused by OOM killer, want to handle as OOM and not job abort.
1488
- case Success (returnCodeAsInt) if outOfMemoryDetected && memoryRetryRequested =>
1489
- val executionHandle = Future .successful (
1490
- FailedNonRetryableExecutionHandle (
1491
- RetryWithMoreMemory (jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log ),
1492
- Option (returnCodeAsInt),
1493
- None
1498
+ retryElseFail(executionHandle )
1499
+ case Success (returnCodeAsInt) if continueOnReturnCode.continueFor(returnCodeAsInt) =>
1500
+ handleExecutionSuccess(status, oldHandle, returnCodeAsInt)
1501
+ // It's important that we check retryWithMoreMemory case before isAbort. RC could be 137 in either case;
1502
+ // if it was caused by the OOM killer, we want to handle it as OOM and not as a job abort.
1503
+ case Success (returnCodeAsInt) if outOfMemoryDetected && memoryRetryRequested =>
1504
+ val executionHandle = Future .successful(
1505
+ FailedNonRetryableExecutionHandle (
1506
+ RetryWithMoreMemory (jobDescriptor.key.tag, outOfMemoryPathOption, memoryRetryErrorKeys, log),
1507
+ Option (returnCodeAsInt ),
1508
+ None
1509
+ )
1494
1510
)
1495
- )
1496
- retryElseFail(executionHandle,
1497
- MemoryRetryResult (outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier)
1498
- )
1499
- case Success (returnCodeAsInt) if isAbort(returnCodeAsInt) =>
1500
- Future .successful(AbortedExecutionHandle )
1501
- case Success (returnCodeAsInt) =>
1502
- val executionHandle = Future .successful(
1503
- FailedNonRetryableExecutionHandle (WrongReturnCode (jobDescriptor.key.tag, returnCodeAsInt, stderrAsOption),
1504
- Option (returnCodeAsInt),
1505
- None
1511
+ retryElseFail(executionHandle,
1512
+ MemoryRetryResult (outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier)
1506
1513
)
1507
- )
1508
- retryElseFail(executionHandle)
1509
- case Failure (_) =>
1510
- Future .successful(
1511
- FailedNonRetryableExecutionHandle (
1512
- ReturnCodeIsNotAnInt (jobDescriptor.key.tag, returnCodeAsString, stderrAsOption),
1513
- kvPairsToSave = None
1514
+ case Success (returnCodeAsInt) if isAbort(returnCodeAsInt) =>
1515
+ Future .successful(AbortedExecutionHandle )
1516
+ case Success (returnCodeAsInt) =>
1517
+ val executionHandle = Future .successful(
1518
+ FailedNonRetryableExecutionHandle (
1519
+ WrongReturnCode (jobDescriptor.key.tag, returnCodeAsInt, stderrAsOption),
1520
+ Option (returnCodeAsInt),
1521
+ None
1522
+ )
1514
1523
)
1515
- )
1516
- }
1517
- } else {
1518
- tryReturnCodeAsInt match {
1519
- case Success (returnCodeAsInt)
1520
- if outOfMemoryDetected && memoryRetryRequested && ! continueOnReturnCode.continueFor(returnCodeAsInt) =>
1521
- val executionHandle = Future .successful(
1522
- FailedNonRetryableExecutionHandle (
1523
- RetryWithMoreMemory (jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log),
1524
- Option (returnCodeAsInt),
1525
- None
1524
+ retryElseFail(executionHandle)
1525
+ case Failure (_) =>
1526
+ Future .successful(
1527
+ FailedNonRetryableExecutionHandle (
1528
+ ReturnCodeIsNotAnInt (jobDescriptor.key.tag, returnCodeAsString, stderrAsOption),
1529
+ kvPairsToSave = None
1530
+ )
1526
1531
)
1527
- )
1528
- retryElseFail(executionHandle,
1529
- MemoryRetryResult (outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier)
1530
- )
1531
- case _ =>
1532
- val failureStatus = handleExecutionFailure(status, tryReturnCodeAsInt.toOption)
1533
- retryElseFail(failureStatus)
1532
+ }
1533
+ } else {
1534
+ tryReturnCodeAsInt match {
1535
+ case Success (returnCodeAsInt)
1536
+ if outOfMemoryDetected && memoryRetryRequested && ! continueOnReturnCode.continueFor(returnCodeAsInt) =>
1537
+ val executionHandle = Future .successful(
1538
+ FailedNonRetryableExecutionHandle (
1539
+ RetryWithMoreMemory (jobDescriptor.key.tag, outOfMemoryPathOption, memoryRetryErrorKeys, log),
1540
+ Option (returnCodeAsInt),
1541
+ None
1542
+ )
1543
+ )
1544
+ retryElseFail(executionHandle,
1545
+ MemoryRetryResult (outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier)
1546
+ )
1547
+ case _ =>
1548
+ val failureStatus = handleExecutionFailure(status, tryReturnCodeAsInt.toOption)
1549
+ retryElseFail(failureStatus)
1550
+ }
1534
1551
}
1535
- }
1536
1552
} recoverWith { case exception =>
1537
1553
if (isDone(status)) Future .successful(FailedNonRetryableExecutionHandle (exception, kvPairsToSave = None ))
1538
1554
else {
0 commit comments