7
7
} from "@trigger.dev/core/v3" ;
8
8
import {
9
9
BatchTaskRun ,
10
+ isPrismaRaceConditionError ,
11
+ isPrismaRetriableError ,
10
12
isUniqueConstraintError ,
11
13
Prisma ,
12
14
TaskRunAttempt ,
@@ -20,6 +22,7 @@ import { logger } from "~/services/logger.server";
20
22
import { getEntitlement } from "~/services/platform.v3.server" ;
21
23
import { workerQueue } from "~/services/worker.server" ;
22
24
import { generateFriendlyId } from "../friendlyIdentifiers" ;
25
+ import { legacyRunEngineWorker } from "../legacyRunEngineWorker.server" ;
23
26
import { marqs } from "../marqs/index.server" ;
24
27
import { guardQueueSizeLimitsForEnv } from "../queueSizeLimits.server" ;
25
28
import { downloadPacketFromObjectStore , uploadPacketToObjectStore } from "../r2.server" ;
@@ -923,71 +926,123 @@ export async function completeBatchTaskRunItemV3(
923
926
batchTaskRunId : string ,
924
927
tx : PrismaClientOrTransaction ,
925
928
scheduleResumeOnComplete = false ,
926
- taskRunAttemptId ?: string
929
+ taskRunAttemptId ?: string ,
930
+ retryAttempt ?: number
927
931
) {
928
- await $transaction (
929
- tx ,
930
- "completeBatchTaskRunItemV3" ,
931
- async ( tx , span ) => {
932
- span ?. setAttribute ( "batch_id" , batchTaskRunId ) ;
933
-
934
- // Update the item to complete
935
- const updated = await tx . batchTaskRunItem . updateMany ( {
936
- where : {
937
- id : itemId ,
938
- status : "PENDING" ,
939
- } ,
940
- data : {
941
- status : "COMPLETED" ,
942
- taskRunAttemptId,
943
- } ,
944
- } ) ;
932
+ const isRetry = retryAttempt !== undefined ;
933
+
934
+ if ( isRetry ) {
935
+ logger . debug ( "completeBatchTaskRunItemV3 retrying" , {
936
+ itemId,
937
+ batchTaskRunId,
938
+ scheduleResumeOnComplete,
939
+ taskRunAttemptId,
940
+ retryAttempt,
941
+ } ) ;
942
+ }
945
943
946
- if ( updated . count === 0 ) {
947
- return ;
948
- }
944
+ try {
945
+ await $transaction (
946
+ tx ,
947
+ "completeBatchTaskRunItemV3" ,
948
+ async ( tx , span ) => {
949
+ span ?. setAttribute ( "batch_id" , batchTaskRunId ) ;
949
950
950
- const updatedBatchRun = await tx . batchTaskRun . update ( {
951
- where : {
952
- id : batchTaskRunId ,
953
- } ,
954
- data : {
955
- completedCount : {
956
- increment : 1 ,
951
+ // Update the item to complete
952
+ const updated = await tx . batchTaskRunItem . updateMany ( {
953
+ where : {
954
+ id : itemId ,
955
+ status : "PENDING" ,
957
956
} ,
958
- } ,
959
- select : {
960
- sealed : true ,
961
- status : true ,
962
- completedCount : true ,
963
- expectedCount : true ,
964
- dependentTaskAttemptId : true ,
965
- } ,
966
- } ) ;
957
+ data : {
958
+ status : "COMPLETED" ,
959
+ taskRunAttemptId,
960
+ } ,
961
+ } ) ;
967
962
968
- if (
969
- updatedBatchRun . status === "PENDING" &&
970
- updatedBatchRun . completedCount === updatedBatchRun . expectedCount &&
971
- updatedBatchRun . sealed
972
- ) {
973
- await tx . batchTaskRun . update ( {
963
+ if ( updated . count === 0 ) {
964
+ return ;
965
+ }
966
+
967
+ const updatedBatchRun = await tx . batchTaskRun . update ( {
974
968
where : {
975
969
id : batchTaskRunId ,
976
970
} ,
977
971
data : {
978
- status : "COMPLETED" ,
979
- completedAt : new Date ( ) ,
972
+ completedCount : {
973
+ increment : 1 ,
974
+ } ,
975
+ } ,
976
+ select : {
977
+ sealed : true ,
978
+ status : true ,
979
+ completedCount : true ,
980
+ expectedCount : true ,
981
+ dependentTaskAttemptId : true ,
980
982
} ,
981
983
} ) ;
982
984
983
- // We only need to resume the batch if it has a dependent task attempt ID
984
- if ( scheduleResumeOnComplete && updatedBatchRun . dependentTaskAttemptId ) {
985
- await ResumeBatchRunService . enqueue ( batchTaskRunId , true , tx ) ;
985
+ if (
986
+ updatedBatchRun . status === "PENDING" &&
987
+ updatedBatchRun . completedCount === updatedBatchRun . expectedCount &&
988
+ updatedBatchRun . sealed
989
+ ) {
990
+ await tx . batchTaskRun . update ( {
991
+ where : {
992
+ id : batchTaskRunId ,
993
+ } ,
994
+ data : {
995
+ status : "COMPLETED" ,
996
+ completedAt : new Date ( ) ,
997
+ } ,
998
+ } ) ;
999
+
1000
+ // We only need to resume the batch if it has a dependent task attempt ID
1001
+ if ( scheduleResumeOnComplete && updatedBatchRun . dependentTaskAttemptId ) {
1002
+ await ResumeBatchRunService . enqueue ( batchTaskRunId , true , tx ) ;
1003
+ }
986
1004
}
1005
+ } ,
1006
+ {
1007
+ timeout : 10_000 ,
1008
+ maxWait : 4_000 ,
987
1009
}
988
- } ,
989
- {
990
- timeout : 10000 ,
1010
+ ) ;
1011
+ } catch ( error ) {
1012
+ if ( isPrismaRetriableError ( error ) || isPrismaRaceConditionError ( error ) ) {
1013
+ logger . error ( "completeBatchTaskRunItemV3 failed with a Prisma Error, scheduling a retry" , {
1014
+ itemId,
1015
+ batchTaskRunId,
1016
+ error,
1017
+ retryAttempt,
1018
+ isRetry,
1019
+ } ) ;
1020
+
1021
+ if ( isRetry ) {
1022
+ //throwing this error will cause the Redis worker to retry the job
1023
+ throw error ;
1024
+ } else {
1025
+ //schedule a retry
1026
+ await legacyRunEngineWorker . enqueue ( {
1027
+ id : `completeBatchTaskRunItem:${ itemId } ` ,
1028
+ job : "completeBatchTaskRunItem" ,
1029
+ payload : {
1030
+ itemId,
1031
+ batchTaskRunId,
1032
+ scheduleResumeOnComplete,
1033
+ taskRunAttemptId,
1034
+ } ,
1035
+ availableAt : new Date ( Date . now ( ) + 2_000 ) ,
1036
+ } ) ;
1037
+ }
1038
+ } else {
1039
+ logger . error ( "completeBatchTaskRunItemV3 failed with a non-retriable error" , {
1040
+ itemId,
1041
+ batchTaskRunId,
1042
+ error,
1043
+ retryAttempt,
1044
+ isRetry,
1045
+ } ) ;
991
1046
}
992
- ) ;
1047
+ }
993
1048
}
0 commit comments