Skip to content

Commit e54aa85

Browse files
committed
enable task monitor to update fatal errors
1 parent e0c8fe2 commit e54aa85

File tree

4 files changed

+20
-9
lines changed

4 files changed

+20
-9
lines changed

apps/kubernetes-provider/src/taskMonitor.ts

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -140,7 +140,7 @@ export class TaskMonitor {
140140
const containerState = this.#getContainerStateSummary(containerStatus.state);
141141
const exitCode = containerState.exitCode ?? -1;
142142

143-
if (exitCode === EXIT_CODE_ALREADY_HANDLED || exitCode === EXIT_CODE_CHILD_NONZERO) {
143+
if (exitCode === EXIT_CODE_ALREADY_HANDLED) {
144144
this.#logger.debug("Ignoring pod failure, already handled by worker", {
145145
podName,
146146
});
@@ -160,7 +160,10 @@ export class TaskMonitor {
160160

161161
let reason = rawReason || "Unknown error";
162162
let logs = rawLogs || "";
163-
let overrideCompletion = false;
163+
164+
/** This will only override existing task errors. It will not crash the run. */
165+
let onlyOverrideExistingError = exitCode === EXIT_CODE_CHILD_NONZERO;
166+
164167
let errorCode: TaskRunInternalError["code"] = TaskRunErrorCodes.POD_UNKNOWN_ERROR;
165168

166169
switch (rawReason) {
@@ -185,7 +188,6 @@ export class TaskMonitor {
185188
}
186189
break;
187190
case "OOMKilled":
188-
overrideCompletion = true;
189191
reason =
190192
"[TaskMonitor] Your task ran out of memory. Try increasing the machine specs. If this doesn't fix it there might be a memory leak.";
191193
errorCode = TaskRunErrorCodes.TASK_PROCESS_OOM_KILLED;
@@ -198,7 +200,7 @@ export class TaskMonitor {
198200
exitCode,
199201
reason,
200202
logs,
201-
overrideCompletion,
203+
overrideCompletion: onlyOverrideExistingError,
202204
errorCode,
203205
} satisfies FailureDetails;
204206

apps/webapp/app/v3/handleSocketIo.server.ts

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ import { Redis } from "ioredis";
2424
import { createAdapter } from "@socket.io/redis-adapter";
2525
import { CrashTaskRunService } from "./services/crashTaskRun.server";
2626
import { CreateTaskRunAttemptService } from "./services/createTaskRunAttempt.server";
27+
import { UpdateFatalRunErrorService } from "./services/updateFatalRunError.server";
2728

2829
export const socketIo = singleton("socketIo", initalizeIoServer);
2930

@@ -302,11 +303,13 @@ function createProviderNamespace(io: Server) {
302303
handlers: {
303304
WORKER_CRASHED: async (message) => {
304305
try {
305-
const service = new CrashTaskRunService();
306-
307-
await service.call(message.runId, {
308-
...message,
309-
});
306+
if (message.overrideCompletion) {
307+
const updateErrorService = new UpdateFatalRunErrorService();
308+
await updateErrorService.call(message.runId, { ...message });
309+
} else {
310+
const crashRunService = new CrashTaskRunService();
311+
await crashRunService.call(message.runId, { ...message });
312+
}
310313
} catch (error) {
311314
logger.error("Error while handling crashed worker", { error });
312315
}

apps/webapp/app/v3/services/crashTaskRun.server.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,11 @@ export class CrashTaskRunService extends BaseService {
2929

3030
logger.debug("CrashTaskRunService.call", { runId, opts });
3131

32+
if (options?.overrideCompletion) {
33+
logger.error("CrashTaskRunService.call: overrideCompletion is deprecated", { runId });
34+
return;
35+
}
36+
3237
const taskRun = await this._prisma.taskRun.findFirst({
3338
where: {
3439
id: runId,

packages/core/src/v3/schemas/messages.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -252,6 +252,7 @@ export const ProviderToPlatformMessages = {
252252
exitCode: z.number().optional(),
253253
message: z.string().optional(),
254254
logs: z.string().optional(),
255+
/** This means we should only update the error if one exists */
255256
overrideCompletion: z.boolean().optional(),
256257
errorCode: TaskRunInternalError.shape.code.optional(),
257258
}),

0 commit comments

Comments
 (0)