-
Notifications
You must be signed in to change notification settings - Fork 0
Update pauser to handle exception and run unpause as long as possible #33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
92ebb3a
07d8325
08312b2
34a03fc
25c10f6
4d22a34
42f1a30
1d17af9
f087077
d3b54c0
55ba29c
891beb8
c233242
744da27
c413386
600e7b1
9e9132d
909e596
f9dc445
5cca44c
639e132
b93c054
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
package com.scalar.admin.kubernetes; | ||
|
||
public class PauseFailedException extends PauserException { | ||
public PauseFailedException(String message) { | ||
super(message); | ||
} | ||
|
||
public PauseFailedException(String message, Throwable cause) { | ||
super(message, cause); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,5 +1,6 @@ | ||||||
package com.scalar.admin.kubernetes; | ||||||
|
||||||
import com.google.common.annotations.VisibleForTesting; | ||||||
import com.google.common.util.concurrent.Uninterruptibles; | ||||||
import com.scalar.admin.RequestCoordinator; | ||||||
import io.kubernetes.client.openapi.Configuration; | ||||||
|
@@ -13,8 +14,6 @@ | |||||
import java.util.stream.Collectors; | ||||||
import javax.annotation.Nullable; | ||||||
import javax.annotation.concurrent.NotThreadSafe; | ||||||
import org.slf4j.Logger; | ||||||
import org.slf4j.LoggerFactory; | ||||||
|
||||||
/** | ||||||
* This class implements a pause operation for Scalar product pods in a Kubernetes cluster. The | ||||||
|
@@ -34,11 +33,29 @@ | |||||
@NotThreadSafe | ||||||
public class Pauser { | ||||||
|
||||||
private static final int MAX_UNPAUSE_RETRY_COUNT = 3; | ||||||
@VisibleForTesting static final int MAX_UNPAUSE_RETRY_COUNT = 3; | ||||||
|
||||||
private final Logger logger = LoggerFactory.getLogger(Pauser.class); | ||||||
private final TargetSelector targetSelector; | ||||||
|
||||||
private static final String UNPAUSE_ERROR_MESSAGE = | ||||||
"Unpause operation failed. Scalar products might still be in a paused state. You" | ||||||
+ " must restart related pods by using the `kubectl rollout restart deployment" | ||||||
+ " <DEPLOYMENT_NAME>` command to unpause all pods. "; | ||||||
private static final String PAUSE_ERROR_MESSAGE = | ||||||
"Pause operation failed. You cannot use the backup that was taken during this pause" | ||||||
+ " duration. You need to retry the pause operation from the beginning to" | ||||||
+ " take a backup. "; | ||||||
private static final String GET_TARGET_AFTER_PAUSE_ERROR_MESSAGE = | ||||||
"Failed to find the target pods to examine if the targets pods were updated during" | ||||||
+ " paused. "; | ||||||
private static final String STATUS_CHECK_ERROR_MESSAGE = | ||||||
"Status check failed. You cannot use the backup that was taken during this pause" | ||||||
+ " duration. You need to retry the pause operation from the beginning to" | ||||||
+ " take a backup. "; | ||||||
private static final String STATUS_DIFFERENT_ERROR_MESSAGE = | ||||||
"The target pods were updated during the pause duration. You cannot use the backup that" | ||||||
+ " was taken during this pause duration. "; | ||||||
|
||||||
/** | ||||||
* @param namespace The namespace where the pods are deployed. | ||||||
* @param helmReleaseName The Helm release name used to deploy the pods. | ||||||
|
@@ -77,67 +94,155 @@ public PausedDuration pause(int pauseDuration, @Nullable Long maxPauseWaitTime) | |||||
"pauseDuration is required to be greater than 0 millisecond."); | ||||||
} | ||||||
|
||||||
TargetSnapshot target; | ||||||
// Get pods and deployment information before pause. | ||||||
TargetSnapshot targetBeforePause; | ||||||
try { | ||||||
target = targetSelector.select(); | ||||||
targetBeforePause = getTarget(); | ||||||
} catch (Exception e) { | ||||||
throw new PauserException("Failed to find the target pods to pause.", e); | ||||||
} | ||||||
|
||||||
RequestCoordinator coordinator = getRequestCoordinator(target); | ||||||
|
||||||
coordinator.pause(true, maxPauseWaitTime); | ||||||
|
||||||
Instant startTime = Instant.now(); | ||||||
// Get RequestCoordinator of Scalar Admin to pause. | ||||||
RequestCoordinator requestCoordinator; | ||||||
try { | ||||||
requestCoordinator = getRequestCoordinator(targetBeforePause); | ||||||
} catch (Exception e) { | ||||||
throw new PauserException("Failed to initialize the request coordinator.", e); | ||||||
} | ||||||
|
||||||
Uninterruptibles.sleepUninterruptibly(pauseDuration, TimeUnit.MILLISECONDS); | ||||||
// From here, we cannot throw exceptions right after they occur because we need to take care of | ||||||
// the unpause operation failure. We will throw the exception after the unpause operation or at | ||||||
// the end of this method. | ||||||
|
||||||
Instant endTime = Instant.now(); | ||||||
// Run a pause operation. | ||||||
PausedDuration pausedDuration = null; | ||||||
PauseFailedException pauseFailedException = null; | ||||||
try { | ||||||
pausedDuration = pauseInternal(requestCoordinator, pauseDuration, maxPauseWaitTime); | ||||||
} catch (Exception e) { | ||||||
pauseFailedException = new PauseFailedException("Pause operation failed.", e); | ||||||
} | ||||||
|
||||||
unpauseWithRetry(coordinator, MAX_UNPAUSE_RETRY_COUNT, target); | ||||||
// Run an unpause operation. | ||||||
UnpauseFailedException unpauseFailedException = null; | ||||||
try { | ||||||
unpauseWithRetry(requestCoordinator, MAX_UNPAUSE_RETRY_COUNT); | ||||||
} catch (Exception e) { | ||||||
kota2and3kan marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
unpauseFailedException = new UnpauseFailedException("Unpause operation failed.", e); | ||||||
} | ||||||
|
||||||
// Get pods and deployment information after pause. | ||||||
TargetSnapshot targetAfterPause; | ||||||
try { | ||||||
targetAfterPause = targetSelector.select(); | ||||||
targetAfterPause = getTarget(); | ||||||
} catch (Exception e) { | ||||||
throw new PauserException( | ||||||
"Failed to find the target pods to examine if the targets pods were updated during" | ||||||
+ " paused.", | ||||||
e); | ||||||
PauserException pauserException = | ||||||
new PauserException(GET_TARGET_AFTER_PAUSE_ERROR_MESSAGE, e); | ||||||
if (unpauseFailedException == null) { | ||||||
throw pauserException; | ||||||
} else { | ||||||
throw new UnpauseFailedException(UNPAUSE_ERROR_MESSAGE, pauserException); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code seems to drop the cause of // Run an unpause operation.
UnpauseFailedException unpauseFailedException = null;
try {
unpauseWithRetry(requestCoordinator, MAX_UNPAUSE_RETRY_COUNT);
} catch (Exception e) {
unpauseFailedException = new UnpauseFailedException(UNPAUSE_ERROR_MESSAGE, e);
}
// Get pods and deployment information after pause.
TargetSnapshot targetAfterPause;
try {
targetAfterPause = getTarget();
} catch (Exception e) {
PauserException pauserException =
new PauserException(GET_TARGET_AFTER_PAUSE_ERROR_MESSAGE, pauserException);
if (unpauseFailedException == null) {
throw pauserException;
} else {
unpauseFailedException.addSuppressed(e);
throw unpauseFailedException;
}
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @komamitsu I think this issue should be resolved on the Thank you for pointing it out! |
||||||
} | ||||||
} | ||||||
|
||||||
if (!target.getStatus().equals(targetAfterPause.getStatus())) { | ||||||
throw new PauserException("The target pods were updated during paused. Please retry."); | ||||||
// Check if pods and deployment information are the same between before pause and after pause. | ||||||
boolean isTargetStatusEqual; | ||||||
try { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about executing this block only when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, yes. That's right. I will fix it. Thank you for the suggestion! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 5cca44c. |
||||||
isTargetStatusEqual = targetStatusEquals(targetBeforePause, targetAfterPause); | ||||||
} catch (Exception e) { | ||||||
StatusCheckFailedException statusCheckFailedException = | ||||||
new StatusCheckFailedException(STATUS_CHECK_ERROR_MESSAGE, e); | ||||||
if (unpauseFailedException == null) { | ||||||
throw statusCheckFailedException; | ||||||
} else { | ||||||
throw new UnpauseFailedException(UNPAUSE_ERROR_MESSAGE, statusCheckFailedException); | ||||||
} | ||||||
} | ||||||
|
||||||
return new PausedDuration(startTime, endTime); | ||||||
// Create an error message if any of the operations failed. | ||||||
String errorMessage = | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [just an idea] Sorry for repeated minor suggestions. But, I just noticed adding additional exceptions to the primary exception might be an option so that all the detailed informations are preserved, although it would drop the benefit of outputting a fully concatenated error message in the first exception. Like this: ...
Exception exception = buildException(pauseFailedException, unpauseFailedException,
statusCheckException, statusUnmatchedException);
if (exception != null) {
throw exception;
}
return pausedDuration;
}
@Nullable
private Exception buildException(
@Nullable PauseFailedException pauseFailedException,
@Nullable UnpauseFailedException unpauseFailedException,
@Nullable StatusCheckFailedException statusCheckFailedException,
@Nullable StatusUnmatchedException statusUnmatchedException) {
Exception exception = null;
if (unpauseFailedException != null) {
exception = unpauseFailedException;
}
if (pauseFailedException != null) {
if (exception == null) {
exception = pauseFailedException;
} else {
exception.addSuppressed(pauseFailedException);
}
}
if (statusCheckFailedException != null) {
if (exception == null) {
exception = statusCheckFailedException;
} else {
exception.addSuppressed(statusCheckFailedException);
}
}
if (statusUnmatchedException != null) {
if (exception == null) {
exception = statusUnmatchedException;
} else {
exception.addSuppressed(statusUnmatchedException);
}
}
return exception;
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 to the idea ;) With this, API clients can decide what to do based on the primary + secondary(suppressed) exceptions, e.g., primary = @komamitsu Just wanted to confirm, you'd like to throw There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @monkey-mas Good point. Agreed, although There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @komamitsu @monkey-mas (I didn't know |
||||||
createErrorMessage(unpauseFailedException, pauseFailedException, isTargetStatusEqual); | ||||||
|
||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's more readable if we identify the cases here like the following and create error messages for each case.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I see. Indeed, your suggestion is more readable. However, in the previous meeting, we (@komamitsu and I) discussed using boolean flags, and the result was Honestly, it's a bit difficult for me to decide which is better because I think both (more readable and simpler) make sense to me... @komamitsu There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This may improve the readability. But it'd also introduce many states. So, I think it's a trade-off (or maybe matter of taste?). The current code uses @kota2and3kan I think either is fine with us. I'm not insisting on my suggestion. You can go with any option you feel most confident in and comfortable being responsible for. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. I got it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||
// We use the exceptions as conditions instead of using boolean flags like `isPauseOk`, etc. If | ||||||
// we use boolean flags, it might cause a bit large number of combinations. For example, if we | ||||||
// have three flags, they generate 2^3 = 8 combinations. It also might make the nested if | ||||||
// statement or a lot of branches of the switch statement. That's why we don't use status flags | ||||||
// for now. | ||||||
|
||||||
// Return the final result based on each process. | ||||||
if (unpauseFailedException != null) { // Unpause Failed. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
// Unpause issue is the most critical because it might cause system down. | ||||||
throw new UnpauseFailedException(errorMessage, unpauseFailedException); | ||||||
} else if (pauseFailedException != null) { // Pause Failed. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
// Pause Failed is second priority because pause issue might be caused by configuration error. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
throw new PauseFailedException(errorMessage, pauseFailedException); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The first There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @komamitsu I will fix it. I think we can fix this issue on the Thank you for pointing it out! |
||||||
} else if (!isTargetStatusEqual) { // Target status changed. | ||||||
// Target status changed is third priority because this issue might be caused by temporary | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
// issue, for example, pod restarts. | ||||||
throw new PauseFailedException(errorMessage); | ||||||
} else { | ||||||
// All operations succeeded. | ||||||
return pausedDuration; | ||||||
} | ||||||
} | ||||||
|
||||||
private void unpauseWithRetry( | ||||||
RequestCoordinator coordinator, int maxRetryCount, TargetSnapshot target) { | ||||||
@VisibleForTesting | ||||||
void unpauseWithRetry(RequestCoordinator coordinator, int maxRetryCount) { | ||||||
int retryCounter = 0; | ||||||
|
||||||
while (true) { | ||||||
try { | ||||||
coordinator.unpause(); | ||||||
return; | ||||||
} catch (Exception e) { | ||||||
if (++retryCounter >= maxRetryCount) { | ||||||
logger.warn( | ||||||
"Failed to unpause Scalar product. They are still in paused. You must restart related" | ||||||
+ " pods by using the `kubectl rollout restart deployment {}`" | ||||||
+ " command to unpase all pods.", | ||||||
target.getDeployment().getMetadata().getName()); | ||||||
return; | ||||||
throw e; | ||||||
} | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
@VisibleForTesting | ||||||
TargetSnapshot getTarget() throws PauserException { | ||||||
return targetSelector.select(); | ||||||
} | ||||||
Comment on lines
+202
to
+205
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I mentioned in another comment, I separated this |
||||||
|
||||||
@VisibleForTesting | ||||||
RequestCoordinator getRequestCoordinator(TargetSnapshot target) { | ||||||
return new RequestCoordinator( | ||||||
target.getPods().stream() | ||||||
.map(p -> new InetSocketAddress(p.getStatus().getPodIP(), target.getAdminPort())) | ||||||
.collect(Collectors.toList())); | ||||||
} | ||||||
|
||||||
@VisibleForTesting | ||||||
PausedDuration pauseInternal( | ||||||
RequestCoordinator requestCoordinator, int pauseDuration, @Nullable Long maxPauseWaitTime) { | ||||||
requestCoordinator.pause(true, maxPauseWaitTime); | ||||||
Instant startTime = Instant.now(); | ||||||
Uninterruptibles.sleepUninterruptibly(pauseDuration, TimeUnit.MILLISECONDS); | ||||||
Instant endTime = Instant.now(); | ||||||
return new PausedDuration(startTime, endTime); | ||||||
} | ||||||
|
||||||
@VisibleForTesting | ||||||
boolean targetStatusEquals(TargetSnapshot before, TargetSnapshot after) { | ||||||
return before.getStatus().equals(after.getStatus()); | ||||||
} | ||||||
|
||||||
private String createErrorMessage( | ||||||
UnpauseFailedException unpauseFailedException, | ||||||
PauseFailedException pauseFailedException, | ||||||
boolean isTargetStatusEqual) { | ||||||
StringBuilder errorMessageBuilder = new StringBuilder(); | ||||||
if (unpauseFailedException != null) { | ||||||
errorMessageBuilder.append(UNPAUSE_ERROR_MESSAGE); | ||||||
} | ||||||
if (pauseFailedException != null) { | ||||||
errorMessageBuilder.append(PAUSE_ERROR_MESSAGE); | ||||||
} | ||||||
if (!isTargetStatusEqual) { | ||||||
errorMessageBuilder.append(STATUS_DIFFERENT_ERROR_MESSAGE); | ||||||
} | ||||||
return errorMessageBuilder.toString(); | ||||||
} | ||||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
package com.scalar.admin.kubernetes; | ||
|
||
public class StatusCheckFailedException extends PauserException { | ||
public StatusCheckFailedException(String message) { | ||
super(message); | ||
} | ||
|
||
public StatusCheckFailedException(String message, Throwable cause) { | ||
super(message, cause); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
package com.scalar.admin.kubernetes; | ||
|
||
public class UnpauseFailedException extends PauserException { | ||
public UnpauseFailedException(String message) { | ||
super(message); | ||
} | ||
|
||
public UnpauseFailedException(String message, Throwable cause) { | ||
super(message, cause); | ||
} | ||
} |
Uh oh!
There was an error while loading. Please reload this page.