Skip to content

Commit 7f68679

Browse files
authored
[controller] Fix bug in ProtocolVersionAutoDetectionService (#1795)
Fix 1: Startup Check for ProtocolVersionAutoDetectionService Leadership check failed due to initialDelay = 0. Increased delay to allow full init. Fix 2: URL Map Mismatch Protocol mismatch (http vs https) broke map lookup. Normalized key format. Integration Test Note: Disabled HTTPS in tests due to getUrl(true) returning same URL for all local controllers. Production uses different hosts, so not affected.
1 parent 862fff2 commit 7f68679

File tree

11 files changed

+100
-92
lines changed

11 files changed

+100
-92
lines changed

internal/venice-common/src/main/java/com/linkedin/venice/controllerapi/AdminOperationProtocolVersionControllerResponse.java

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
public class AdminOperationProtocolVersionControllerResponse extends ControllerResponse {
88
private long localAdminOperationProtocolVersion = -1;
9-
private String requestUrl = "";
10-
private Map<String, Long> controllerUrlToVersionMap = new HashMap<>();
9+
private String localControllerName = "";
10+
private Map<String, Long> controllerNameToVersionMap = new HashMap<>();
1111

1212
public void setLocalAdminOperationProtocolVersion(long adminOperationProtocolVersion) {
1313
this.localAdminOperationProtocolVersion = adminOperationProtocolVersion;
@@ -17,19 +17,19 @@ public long getLocalAdminOperationProtocolVersion() {
1717
return localAdminOperationProtocolVersion;
1818
}
1919

20-
public void setRequestUrl(String url) {
21-
this.requestUrl = url;
20+
public void setLocalControllerName(String controllerName) {
21+
this.localControllerName = controllerName;
2222
}
2323

24-
public String getRequestUrl() {
25-
return requestUrl;
24+
public String getLocalControllerName() {
25+
return localControllerName;
2626
}
2727

28-
public void setControllerUrlToVersionMap(Map<String, Long> urlToVersionMap) {
29-
this.controllerUrlToVersionMap = urlToVersionMap;
28+
public void setControllerNameToVersionMap(Map<String, Long> controllerNameToVersionMap) {
29+
this.controllerNameToVersionMap = controllerNameToVersionMap;
3030
}
3131

32-
public Map<String, Long> getControllerUrlToVersionMap() {
33-
return controllerUrlToVersionMap;
32+
public Map<String, Long> getControllerNameToVersionMap() {
33+
return controllerNameToVersionMap;
3434
}
3535
}

internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/controller/TestAdminOperationVersionDetection.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.linkedin.venice.controller;
22

3+
import static com.linkedin.venice.ConfigKeys.CONTROLLER_SSL_ENABLED;
34
import static org.testng.Assert.assertEquals;
45
import static org.testng.Assert.assertFalse;
56

@@ -36,8 +37,10 @@ public class TestAdminOperationVersionDetection {
3637
@BeforeClass(alwaysRun = true)
3738
public void setUp() throws Exception {
3839
// Create multi-region multi-cluster setup
39-
Properties parentControllerProperties = new Properties();
40+
Properties controllerProperties = new Properties();
41+
controllerProperties.put(CONTROLLER_SSL_ENABLED, "false");
4042
Properties serverProperties = new Properties();
43+
serverProperties.put(CONTROLLER_SSL_ENABLED, "false");
4144

4245
VeniceMultiRegionClusterCreateOptions.Builder optionsBuilder =
4346
new VeniceMultiRegionClusterCreateOptions.Builder().numberOfRegions(NUMBER_OF_CHILD_DATACENTERS)
@@ -50,7 +53,8 @@ public void setUp() throws Exception {
5053
.sslToStorageNodes(true)
5154
.forkServer(false)
5255
.serverProperties(serverProperties)
53-
.parentControllerProperties(parentControllerProperties);
56+
.parentControllerProperties(controllerProperties)
57+
.childControllerProperties(controllerProperties);
5458
multiRegionMultiClusterWrapper =
5559
ServiceFactory.getVeniceTwoLayerMultiRegionMultiClusterWrapper(optionsBuilder.build());
5660
}
@@ -73,8 +77,8 @@ public void testGetAdminOperationVersionForParentControllers() {
7377
assertEquals(
7478
response.getLocalAdminOperationProtocolVersion(),
7579
AdminOperationSerializer.LATEST_SCHEMA_ID_FOR_ADMIN_OPERATION);
76-
Map<String, Long> urlToVersionMap = response.getControllerUrlToVersionMap();
77-
assertEquals(urlToVersionMap.size(), 2);
80+
Map<String, Long> controllerNameToVersionMap = response.getControllerNameToVersionMap();
81+
assertEquals(controllerNameToVersionMap.size(), 2);
7882
assertEquals(response.getCluster(), clusterName);
7983
}
8084

@@ -97,8 +101,8 @@ public void testAdminOperationVersionForChildControllers() {
97101
assertEquals(
98102
response.getLocalAdminOperationProtocolVersion(),
99103
AdminOperationSerializer.LATEST_SCHEMA_ID_FOR_ADMIN_OPERATION);
100-
Map<String, Long> urlToVersionMap = response.getControllerUrlToVersionMap();
101-
assertEquals(urlToVersionMap.size(), 2);
104+
Map<String, Long> controllerNameToVersionMap = response.getControllerNameToVersionMap();
105+
assertEquals(controllerNameToVersionMap.size(), 2);
102106
assertEquals(response.getCluster(), clusterName);
103107
}
104108
}

internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/endToEnd/TestProtocolVersionAutoDetection.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package com.linkedin.venice.endToEnd;
22

3-
import static com.linkedin.venice.ConfigKeys.CONTROLLER_PROTOCOL_VERSION_AUTO_DETECTION_SERVICE_ENABLED;
4-
import static com.linkedin.venice.ConfigKeys.CONTROLLER_PROTOCOL_VERSION_AUTO_DETECTION_SLEEP_MS;
3+
import static com.linkedin.venice.ConfigKeys.*;
54
import static com.linkedin.venice.controller.kafka.protocol.serializer.AdminOperationSerializer.LATEST_SCHEMA_ID_FOR_ADMIN_OPERATION;
65
import static org.testng.Assert.assertEquals;
76
import static org.testng.Assert.assertFalse;
@@ -41,6 +40,10 @@ public void setUp() {
4140
Properties parentControllerProps = new Properties();
4241
parentControllerProps.put(CONTROLLER_PROTOCOL_VERSION_AUTO_DETECTION_SERVICE_ENABLED, true);
4342
parentControllerProps.put(CONTROLLER_PROTOCOL_VERSION_AUTO_DETECTION_SLEEP_MS, SERVICE_INTERVAL_MS);
43+
parentControllerProps.put(CONTROLLER_SSL_ENABLED, "false");
44+
45+
Properties childControllerProps = new Properties();
46+
childControllerProps.put(CONTROLLER_SSL_ENABLED, "false");
4447

4548
VeniceMultiRegionClusterCreateOptions.Builder optionsBuilder =
4649
new VeniceMultiRegionClusterCreateOptions.Builder().numberOfRegions(NUMBER_OF_CHILD_DATACENTERS)
@@ -51,7 +54,9 @@ public void setUp() {
5154
.numberOfRouters(1)
5255
.replicationFactor(1)
5356
.forkServer(false)
54-
.parentControllerProperties(parentControllerProps);
57+
.sslToStorageNodes(true)
58+
.parentControllerProperties(parentControllerProps)
59+
.childControllerProperties(childControllerProps);
5560
multiRegionMultiClusterWrapper =
5661
ServiceFactory.getVeniceTwoLayerMultiRegionMultiClusterWrapper(optionsBuilder.build());
5762
childDatacenters = multiRegionMultiClusterWrapper.getChildRegions();
@@ -100,7 +105,7 @@ public void testGetAdminOperationProtocolVersionFromControllers() {
100105
AdminOperationProtocolVersionControllerResponse parentResponse =
101106
parentControllerClient.getAdminOperationProtocolVersionFromControllers(clusterName);
102107
assertEquals(parentResponse.getLocalAdminOperationProtocolVersion(), LATEST_SCHEMA_ID_FOR_ADMIN_OPERATION);
103-
assertEquals(parentResponse.getControllerUrlToVersionMap().size(), 2);
108+
assertEquals(parentResponse.getControllerNameToVersionMap().size(), 2);
104109
}
105110

106111
// child controller
@@ -109,7 +114,7 @@ public void testGetAdminOperationProtocolVersionFromControllers() {
109114
AdminOperationProtocolVersionControllerResponse childResponse =
110115
dc0ControllerClient.getAdminOperationProtocolVersionFromControllers(clusterName);
111116
assertEquals(childResponse.getLocalAdminOperationProtocolVersion(), LATEST_SCHEMA_ID_FOR_ADMIN_OPERATION);
112-
assertEquals(childResponse.getControllerUrlToVersionMap().size(), 2);
117+
assertEquals(childResponse.getControllerNameToVersionMap().size(), 2);
113118
}
114119
}
115120
}

services/venice-controller/src/main/java/com/linkedin/venice/controller/Admin.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,4 +1045,6 @@ default void clearInstanceMonitor(String clusterName) {
10451045
LogContext getLogContext();
10461046

10471047
VeniceControllerClusterConfig getControllerConfig(String clusterName);
1048+
1049+
String getControllerName();
10481050
}

services/venice-controller/src/main/java/com/linkedin/venice/controller/ProtocolVersionAutoDetectionService.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public ProtocolVersionAutoDetectionService(
5555
@Override
5656
public boolean startInner() throws Exception {
5757
LOGGER.info("Starting {}", getClass().getSimpleName());
58-
executor.scheduleAtFixedRate(getRunnableTask(), 0, sleepIntervalInMs, TimeUnit.MILLISECONDS);
58+
executor.scheduleAtFixedRate(getRunnableTask(), sleepIntervalInMs, sleepIntervalInMs, TimeUnit.MILLISECONDS);
5959
return true;
6060
}
6161

@@ -96,8 +96,8 @@ public long getSmallestLocalAdminOperationProtocolVersionForAllConsumers(String
9696
"Failed to get admin operation protocol version from child controller " + entry.getKey() + ": "
9797
+ response.getError());
9898
}
99-
Map<String, Long> controllerUrlToVersionMap = response.getControllerUrlToVersionMap();
100-
regionToControllerToVersionMap.put(entry.getKey(), controllerUrlToVersionMap);
99+
Map<String, Long> controllerNameToVersionMap = response.getControllerNameToVersionMap();
100+
regionToControllerToVersionMap.put(entry.getKey(), controllerNameToVersionMap);
101101
}
102102

103103
LOGGER.info("All controller versions for cluster {}: {}", clusterName, regionToControllerToVersionMap);

services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceHelixAdmin.java

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7387,6 +7387,7 @@ public List<Instance> getControllersByHelixState(String clusterName, String heli
73877387
if (instanceNameAndState.getValue().equals(helixState)) {
73887388
// Found the Venice controller, adding to the return list
73897389
String id = instanceNameAndState.getKey();
7390+
// TODO: Update the instance constructor when Helix NodeId is changed to use secure port.
73907391
controllers.add(
73917392
new Instance(
73927393
id,
@@ -7791,27 +7792,34 @@ public void updateAdminOperationProtocolVersion(String clusterName, Long adminOp
77917792
/**
77927793
* Get the admin operation protocol versions from controllers (leader + standby) for specific cluster.
77937794
* @param clusterName: the cluster name
7794-
* @return map (controllerUrl: version). Example: {http://localhost:1234=1, http://localhost:1235=1}*/
7795+
* @return map (controllerName: version). Example: {localhost_1234=1, localhost_1235=1}*/
77957796
@Override
77967797
public Map<String, Long> getAdminOperationVersionFromControllers(String clusterName) {
77977798
checkControllerLeadershipFor(clusterName);
77987799

7799-
Map<String, Long> controllerUrlToAdminOperationVersionMap = new HashMap<>();
7800+
Map<String, Long> controllerNameToAdminOperationVersionMap = new HashMap<>();
78007801

78017802
// Get the version from the current controller - leader
7802-
String leaderControllerUrl = getLeaderController(clusterName).getUrl(false);
7803-
controllerUrlToAdminOperationVersionMap.put(leaderControllerUrl, getLocalAdminOperationProtocolVersion());
7803+
Instance leaderController = getLeaderController(clusterName);
7804+
7805+
controllerNameToAdminOperationVersionMap.put(getControllerName(), getLocalAdminOperationProtocolVersion());
78047806

78057807
// Create the controller client to reuse
78067808
// (this is controller client to communicate with other controllers in the same cluster, the same region)
7807-
ControllerClient localControllerClient =
7808-
ControllerClient.constructClusterControllerClient(clusterName, leaderControllerUrl, sslFactory);
7809+
ControllerClient localControllerClient = ControllerClient.constructClusterControllerClient(
7810+
clusterName,
7811+
leaderController.getUrl(getSslFactory().isPresent()),
7812+
getSslFactory());
78097813

78107814
// Get version for standby controllers
78117815
List<Instance> standbyControllers = getControllersByHelixState(clusterName, HelixState.STANDBY_STATE);
78127816

78137817
for (Instance standbyController: standbyControllers) {
7814-
String standbyControllerUrl = standbyController.getUrl(false);
7818+
// In production, leader and standby share the secure port but have different hostnames—no conflict.
7819+
// In tests, both run on 'localhost' with the same secure port, which confuses routing.
7820+
// Hence, we need to disable SSL for local integration tests.
7821+
// RCA: Helix uses an insecure port from the instance ID, while secure port comes from shared multiClusterConfig.
7822+
String standbyControllerUrl = standbyController.getUrl(getSslFactory().isPresent());
78157823

78167824
// Get the admin operation protocol version from standby controller
78177825
AdminOperationProtocolVersionControllerResponse response =
@@ -7821,11 +7829,11 @@ public Map<String, Long> getAdminOperationVersionFromControllers(String clusterN
78217829
"Failed to get admin operation protocol version from standby controller: " + standbyControllerUrl
78227830
+ ", error message: " + response.getError());
78237831
}
7824-
controllerUrlToAdminOperationVersionMap
7825-
.put(response.getRequestUrl(), response.getLocalAdminOperationProtocolVersion());
7832+
controllerNameToAdminOperationVersionMap
7833+
.put(response.getLocalControllerName(), response.getLocalAdminOperationProtocolVersion());
78267834
}
78277835

7828-
return controllerUrlToAdminOperationVersionMap;
7836+
return controllerNameToAdminOperationVersionMap;
78297837
}
78307838

78317839
/**
@@ -8050,7 +8058,8 @@ void addConfig(VeniceControllerClusterConfig config) {
80508058
multiClusterConfigs.addClusterConfig(config);
80518059
}
80528060

8053-
String getControllerName() {
8061+
@Override
8062+
public String getControllerName() {
80548063
return controllerName;
80558064
}
80568065

services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceParentHelixAdmin.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4547,6 +4547,11 @@ public long getLocalAdminOperationProtocolVersion() {
45474547
return getVeniceHelixAdmin().getLocalAdminOperationProtocolVersion();
45484548
}
45494549

4550+
@Override
4551+
public String getControllerName() {
4552+
return getVeniceHelixAdmin().getControllerName();
4553+
}
4554+
45504555
/**
45514556
* Unsupported operation in the parent controller.
45524557
*/

services/venice-controller/src/main/java/com/linkedin/venice/controller/server/ControllerRoutes.java

Lines changed: 5 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
import com.linkedin.venice.controllerapi.PubSubTopicConfigResponse;
2626
import com.linkedin.venice.controllerapi.StoppableNodeStatusResponse;
2727
import com.linkedin.venice.exceptions.ErrorType;
28-
import com.linkedin.venice.exceptions.VeniceException;
2928
import com.linkedin.venice.protocols.controller.LeaderControllerGrpcRequest;
3029
import com.linkedin.venice.protocols.controller.LeaderControllerGrpcResponse;
3130
import com.linkedin.venice.pubsub.PubSubTopicConfiguration;
@@ -34,7 +33,6 @@
3433
import com.linkedin.venice.pubsub.manager.TopicManager;
3534
import com.linkedin.venice.utils.ObjectMapperFactory;
3635
import com.linkedin.venice.utils.Utils;
37-
import java.net.URL;
3836
import java.util.List;
3937
import java.util.Map;
4038
import java.util.Optional;
@@ -258,22 +256,11 @@ public Route getAdminOperationVersionFromControllers(Admin admin) {
258256
response.type(HttpConstants.JSON);
259257
try {
260258
String clusterName = request.queryParams(CLUSTER);
261-
String currentUrl = getRequestURL(request);
262-
263259
responseObject.setCluster(clusterName);
264-
Map<String, Long> controllerUrlToVersionMap = admin.getAdminOperationVersionFromControllers(clusterName);
265-
responseObject.setControllerUrlToVersionMap(controllerUrlToVersionMap);
266-
responseObject.setRequestUrl(currentUrl);
267-
268-
if (controllerUrlToVersionMap.containsKey(currentUrl)) {
269-
responseObject.setLocalAdminOperationProtocolVersion(controllerUrlToVersionMap.get(currentUrl));
270-
} else {
271-
// Should not happen
272-
throw new VeniceException(
273-
"The current controller URL: " + currentUrl + " is not in the urlToVersionMap in the response "
274-
+ controllerUrlToVersionMap);
275-
}
276-
260+
Map<String, Long> controllerNameToVersionMap = admin.getAdminOperationVersionFromControllers(clusterName);
261+
responseObject.setControllerNameToVersionMap(controllerNameToVersionMap);
262+
responseObject.setLocalControllerName(admin.getControllerName());
263+
responseObject.setLocalAdminOperationProtocolVersion(admin.getLocalAdminOperationProtocolVersion());
277264
} catch (Throwable e) {
278265
responseObject.setError(e);
279266
AdminSparkServer.handleError(e, request, response);
@@ -292,7 +279,7 @@ public Route getLocalAdminOperationProtocolVersion(Admin admin) {
292279
response.type(HttpConstants.JSON);
293280
try {
294281
responseObject.setLocalAdminOperationProtocolVersion(admin.getLocalAdminOperationProtocolVersion());
295-
responseObject.setRequestUrl(getRequestURL(request));
282+
responseObject.setLocalControllerName(admin.getControllerName());
296283
} catch (Throwable e) {
297284
responseObject.setError(e);
298285
AdminSparkServer.handleError(e, request, response);
@@ -301,22 +288,6 @@ public Route getLocalAdminOperationProtocolVersion(Admin admin) {
301288
};
302289
}
303290

304-
/**
305-
* Get the request base URL from the request object.
306-
* Example:
307-
* request.url() = https://localhost:8080/venice/cluster/clusterName/leaderController?param1=value1&param2=value2
308-
* base URL: https://localhost:8080
309-
* @return the base URL
310-
*/
311-
private String getRequestURL(Request request) {
312-
try {
313-
URL url = new URL(request.url());
314-
return url.getProtocol() + "://" + url.getHost() + (url.getPort() != -1 ? ":" + url.getPort() : "");
315-
} catch (Exception e) {
316-
throw new VeniceException("Invalid URL: " + request.url(), e);
317-
}
318-
}
319-
320291
@FunctionalInterface
321292
interface UpdateTopicConfigFunction {
322293
void apply(Request request);

services/venice-controller/src/test/java/com/linkedin/venice/controller/TestProtocolVersionAutoDetectionService.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,13 +216,13 @@ private AdminOperationProtocolVersionControllerResponse getAdminOperationProtoco
216216
for (Map.Entry<String, Long> entry: hostToVersionMap.entrySet()) {
217217
String hostName = entry.getKey();
218218
long version = entry.getValue();
219-
response.getControllerUrlToVersionMap().put(hostName, version);
219+
response.getControllerNameToVersionMap().put(hostName, version);
220220
if (hostToClusterToLeaderStateMap.containsKey(hostName)) {
221221
Map<String, Boolean> clusterToStateMap = hostToClusterToLeaderStateMap.get(hostName);
222222
if (clusterToStateMap.get(clusterName) != null && clusterToStateMap.get(clusterName)) {
223223
// Add the local version for leader controller
224224
response.setLocalAdminOperationProtocolVersion(version);
225-
response.setRequestUrl(hostName);
225+
response.setLocalControllerName(hostName);
226226
response.setCluster(clusterName);
227227
}
228228
}

0 commit comments

Comments
 (0)