98
98
import com .linkedin .venice .exceptions .ErrorType ;
99
99
import com .linkedin .venice .exceptions .VeniceException ;
100
100
import com .linkedin .venice .exceptions .VeniceResourceAccessException ;
101
+ import com .linkedin .venice .exceptions .VeniceTimeoutException ;
101
102
import com .linkedin .venice .hadoop .exceptions .VeniceInvalidInputException ;
102
103
import com .linkedin .venice .hadoop .input .kafka .KafkaInputDictTrainer ;
103
104
import com .linkedin .venice .hadoop .mapreduce .datawriter .jobs .DataWriterMRJob ;
164
165
import java .util .Optional ;
165
166
import java .util .Properties ;
166
167
import java .util .Set ;
168
+ import java .util .concurrent .Executors ;
169
+ import java .util .concurrent .ScheduledExecutorService ;
167
170
import java .util .concurrent .TimeUnit ;
168
171
import java .util .stream .Collectors ;
169
172
import org .apache .avro .Schema ;
@@ -245,6 +248,7 @@ public class VenicePushJob implements AutoCloseable {
245
248
private final PushJobHeartbeatSenderFactory pushJobHeartbeatSenderFactory ;
246
249
private PushJobHeartbeatSender pushJobHeartbeatSender = null ;
247
250
private boolean pushJobStatusUploadDisabledHasBeenLogged = false ;
251
+ private final ScheduledExecutorService timeoutExecutor ;
248
252
249
253
/**
250
254
* @param jobId id of the job
@@ -253,6 +257,7 @@ public class VenicePushJob implements AutoCloseable {
253
257
public VenicePushJob (String jobId , Properties vanillaProps ) {
254
258
this .jobId = jobId ;
255
259
this .props = getVenicePropsFromVanillaProps (Objects .requireNonNull (vanillaProps , "VPJ props cannot be null" ));
260
+ this .timeoutExecutor = Executors .newSingleThreadScheduledExecutor ();
256
261
LOGGER .info ("Constructing {}: {}" , VenicePushJob .class .getSimpleName (), props .toString (true ));
257
262
this .sslProperties = Lazy .of (() -> {
258
263
try {
@@ -657,11 +662,7 @@ DataWriterComputeJob getDataWriterComputeJob() {
657
662
*/
658
663
public void run () {
659
664
try {
660
- Optional <SSLFactory > sslFactory = VPJSSLUtils .createSSLFactory (
661
- pushJobSetting .enableSSL ,
662
- props .getString (SSL_FACTORY_CLASS_NAME , DEFAULT_SSL_FACTORY_CLASS_NAME ),
663
- this .sslProperties );
664
- initControllerClient (pushJobSetting .storeName , sslFactory );
665
+ initControllerClient (pushJobSetting .storeName );
665
666
pushJobSetting .clusterName = controllerClient .getClusterName ();
666
667
LOGGER .info (
667
668
"The store {} is discovered in Venice cluster {}" ,
@@ -672,6 +673,7 @@ public void run() {
672
673
initKIFRepushDetails ();
673
674
}
674
675
676
+ setupJobTimeoutMonitor ();
675
677
initPushJobDetails ();
676
678
logGreeting ();
677
679
sendPushJobDetailsToController ();
@@ -887,6 +889,20 @@ public void run() {
887
889
}
888
890
}
889
891
892
+ /**
893
+ * Timeout on the entire push job that kills the job if it runs longer than the store's configured bootstrap timeout.
894
+ */
895
+ private void setupJobTimeoutMonitor () {
896
+ long bootstrapToOnlineTimeoutInHours =
897
+ getStoreResponse (pushJobSetting .storeName ).getStore ().getBootstrapToOnlineTimeoutInHours ();
898
+ timeoutExecutor .schedule (() -> {
899
+ cancel ();
900
+ throw new VeniceTimeoutException (
901
+ "Failing push-job for store " + pushJobSetting .storeName + " which is still running after "
902
+ + bootstrapToOnlineTimeoutInHours + " hours." );
903
+ }, bootstrapToOnlineTimeoutInHours , TimeUnit .HOURS );
904
+ }
905
+
890
906
private void buildHDFSSchemaDir () throws IOException {
891
907
// Build the full path for HDFSRmdSchemaSource:
892
908
// RMD schemas: <job_temp_dir>/rmd_schemas
@@ -1199,9 +1215,12 @@ protected InputDataInfoProvider getInputDataInfoProvider() {
1199
1215
* 2. A mock controller client is provided
1200
1216
*
1201
1217
* @param storeName
1202
- * @param sslFactory
1203
1218
*/
1204
- private void initControllerClient (String storeName , Optional <SSLFactory > sslFactory ) {
1219
+ private void initControllerClient (String storeName ) {
1220
+ Optional <SSLFactory > sslFactory = VPJSSLUtils .createSSLFactory (
1221
+ pushJobSetting .enableSSL ,
1222
+ props .getString (SSL_FACTORY_CLASS_NAME , DEFAULT_SSL_FACTORY_CLASS_NAME ),
1223
+ this .sslProperties );
1205
1224
final String controllerD2ZkHost ;
1206
1225
if (pushJobSetting .multiRegion ) {
1207
1226
// In multi region mode, push jobs will communicate with parent controller
@@ -2312,7 +2331,6 @@ void pollStatusUntilComplete(
2312
2331
* no more than {@link DEFAULT_JOB_STATUS_IN_UNKNOWN_STATE_TIMEOUT_MS}.
2313
2332
*/
2314
2333
long unknownStateStartTimeMs = 0 ;
2315
- long pollStartTimeMs = System .currentTimeMillis ();
2316
2334
2317
2335
String topicToMonitor = getTopicToMonitor (pushJobSetting );
2318
2336
@@ -2382,14 +2400,6 @@ void pollStatusUntilComplete(
2382
2400
}
2383
2401
return ;
2384
2402
}
2385
- long bootstrapToOnlineTimeoutInHours =
2386
- VenicePushJob .this .pushJobSetting .storeResponse .getStore ().getBootstrapToOnlineTimeoutInHours ();
2387
- long durationMs = LatencyUtils .getElapsedTimeFromMsToMs (pollStartTimeMs );
2388
- if (durationMs > TimeUnit .HOURS .toMillis (bootstrapToOnlineTimeoutInHours )) {
2389
- throw new VeniceException (
2390
- "Failing push-job for store " + VenicePushJob .this .pushJobSetting .storeResponse .getName ()
2391
- + " which is still running after " + TimeUnit .MILLISECONDS .toHours (durationMs ) + " hours." );
2392
- }
2393
2403
if (!overallStatus .equals (ExecutionStatus .UNKNOWN )) {
2394
2404
unknownStateStartTimeMs = 0 ;
2395
2405
} else if (unknownStateStartTimeMs == 0 ) {
@@ -2557,9 +2567,8 @@ private String pushJobPropertiesToString(
2557
2567
}
2558
2568
2559
2569
/**
2560
- * A cancel method for graceful cancellation of the running Job to be invoked as a result of user actions.
2561
- *
2562
- * @throws Exception
2570
+ * A cancel method for graceful cancellation of the running Job to be invoked as a result of user actions or due to
2571
+ * the job exceeding bootstrapToOnlineTimeoutInHours.
2563
2572
*/
2564
2573
public void cancel () {
2565
2574
killJob (pushJobSetting , controllerClient );
@@ -2572,7 +2581,7 @@ public void cancel() {
2572
2581
sendPushJobDetailsToController ();
2573
2582
}
2574
2583
2575
- private void killJob (PushJobSetting pushJobSetting , ControllerClient controllerClient ) {
2584
+ void killJob (PushJobSetting pushJobSetting , ControllerClient controllerClient ) {
2576
2585
// Attempting to kill job. There's a race condition, but meh. Better kill when you know it's running
2577
2586
killDataWriterJob ();
2578
2587
if (!pushJobSetting .isIncrementalPush ) {
@@ -2597,7 +2606,7 @@ private void killJob(PushJobSetting pushJobSetting, ControllerClient controllerC
2597
2606
}
2598
2607
}
2599
2608
2600
- private void killDataWriterJob () {
2609
+ void killDataWriterJob () {
2601
2610
if (dataWriterComputeJob == null ) {
2602
2611
LOGGER .warn ("No op to kill a null data writer job" );
2603
2612
return ;
@@ -2671,6 +2680,7 @@ private static Path getLatestPath(Path path, FileSystem fs) throws IOException {
2671
2680
2672
2681
@ Override
2673
2682
public void close () {
2683
+ timeoutExecutor .shutdownNow ();
2674
2684
closeVeniceWriter ();
2675
2685
Utils .closeQuietlyWithErrorLogged (dataWriterComputeJob );
2676
2686
Utils .closeQuietlyWithErrorLogged (controllerClient );
@@ -2684,7 +2694,6 @@ public void close() {
2684
2694
}
2685
2695
2686
2696
public static void main (String [] args ) {
2687
-
2688
2697
if (args .length != 1 ) {
2689
2698
Utils .exit ("USAGE: java -jar venice-push-job-all.jar <VPJ_config_file_path>" );
2690
2699
}
0 commit comments