165
165
import java .util .Properties ;
166
166
import java .util .Set ;
167
167
import java .util .concurrent .Executors ;
168
- import java .util .concurrent .Future ;
169
168
import java .util .concurrent .ScheduledExecutorService ;
170
169
import java .util .concurrent .TimeUnit ;
171
170
import java .util .stream .Collectors ;
@@ -248,6 +247,7 @@ public class VenicePushJob implements AutoCloseable {
248
247
private final PushJobHeartbeatSenderFactory pushJobHeartbeatSenderFactory ;
249
248
private PushJobHeartbeatSender pushJobHeartbeatSender = null ;
250
249
private boolean pushJobStatusUploadDisabledHasBeenLogged = false ;
250
+ private final ScheduledExecutorService timeoutExecutor ;
251
251
252
252
/**
253
253
* @param jobId id of the job
@@ -256,6 +256,7 @@ public class VenicePushJob implements AutoCloseable {
256
256
public VenicePushJob (String jobId , Properties vanillaProps ) {
257
257
this .jobId = jobId ;
258
258
this .props = getVenicePropsFromVanillaProps (Objects .requireNonNull (vanillaProps , "VPJ props cannot be null" ));
259
+ this .timeoutExecutor = Executors .newSingleThreadScheduledExecutor ();
259
260
LOGGER .info ("Constructing {}: {}" , VenicePushJob .class .getSimpleName (), props .toString (true ));
260
261
this .sslProperties = Lazy .of (() -> {
261
262
try {
@@ -659,36 +660,23 @@ DataWriterComputeJob getDataWriterComputeJob() {
659
660
* @throws VeniceException
660
661
*/
661
662
public void run () {
662
- Optional <SSLFactory > sslFactory = VPJSSLUtils .createSSLFactory (
663
- pushJobSetting .enableSSL ,
664
- props .getString (SSL_FACTORY_CLASS_NAME , DEFAULT_SSL_FACTORY_CLASS_NAME ),
665
- this .sslProperties );
666
- initControllerClient (pushJobSetting .storeName , sslFactory );
667
-
668
- ScheduledExecutorService timeoutExecutor = Executors .newSingleThreadScheduledExecutor ();
669
- long bootstrapToOnlineTimeoutInHours =
670
- getStoreResponse (pushJobSetting .storeName ).getStore ().getBootstrapToOnlineTimeoutInHours ();
671
- Future <?> timeoutFuture = timeoutExecutor .schedule (() -> {
672
- LOGGER .error ("Timeout reached. Stopping the job." );
673
- cancel ();
674
- }, bootstrapToOnlineTimeoutInHours , TimeUnit .HOURS );
675
-
676
- try {
677
- runPushJob ();
678
- } finally {
679
- timeoutFuture .cancel (true );
680
- timeoutExecutor .shutdown ();
681
- }
682
- }
683
-
684
- private void runPushJob () {
685
663
try {
664
+ initControllerClient (pushJobSetting .storeName );
686
665
pushJobSetting .clusterName = controllerClient .getClusterName ();
687
666
LOGGER .info (
688
667
"The store {} is discovered in Venice cluster {}" ,
689
668
pushJobSetting .storeName ,
690
669
pushJobSetting .clusterName );
691
670
671
+ long bootstrapToOnlineTimeoutInHours =
672
+ getStoreResponse (pushJobSetting .storeName ).getStore ().getBootstrapToOnlineTimeoutInHours ();
673
+ timeoutExecutor .schedule (() -> {
674
+ cancel ();
675
+ throw new VeniceException (
676
+ "Failing push-job for store " + pushJobSetting .storeName + " which is still running after "
677
+ + bootstrapToOnlineTimeoutInHours + " hours." );
678
+ }, bootstrapToOnlineTimeoutInHours , TimeUnit .HOURS );
679
+
692
680
if (pushJobSetting .isSourceKafka ) {
693
681
initKIFRepushDetails ();
694
682
}
@@ -1214,9 +1202,12 @@ protected InputDataInfoProvider getInputDataInfoProvider() {
1214
1202
* 2. A mock controller client is provided
1215
1203
*
1216
1204
* @param storeName
1217
- * @param sslFactory
1218
1205
*/
1219
- private void initControllerClient (String storeName , Optional <SSLFactory > sslFactory ) {
1206
+ private void initControllerClient (String storeName ) {
1207
+ Optional <SSLFactory > sslFactory = VPJSSLUtils .createSSLFactory (
1208
+ pushJobSetting .enableSSL ,
1209
+ props .getString (SSL_FACTORY_CLASS_NAME , DEFAULT_SSL_FACTORY_CLASS_NAME ),
1210
+ this .sslProperties );
1220
1211
final String controllerD2ZkHost ;
1221
1212
if (pushJobSetting .multiRegion ) {
1222
1213
// In multi region mode, push jobs will communicate with parent controller
@@ -2565,8 +2556,6 @@ private String pushJobPropertiesToString(
2565
2556
/**
2566
2557
* A cancel method for graceful cancellation of the running Job to be invoked as a result of user actions or due to
2567
2558
* the job exceeding bootstrapToOnlineTimeoutInHours.
2568
- *
2569
- * @throws Exception
2570
2559
*/
2571
2560
public void cancel () {
2572
2561
killJob (pushJobSetting , controllerClient );
@@ -2678,6 +2667,7 @@ private static Path getLatestPath(Path path, FileSystem fs) throws IOException {
2678
2667
2679
2668
@ Override
2680
2669
public void close () {
2670
+ timeoutExecutor .shutdownNow ();
2681
2671
closeVeniceWriter ();
2682
2672
Utils .closeQuietlyWithErrorLogged (dataWriterComputeJob );
2683
2673
Utils .closeQuietlyWithErrorLogged (controllerClient );
0 commit comments