Skip to content

Commit 4f3602c

Browse files
committed
[client] Add OTel key count metric
1 parent 52af421 commit 4f3602c

File tree

12 files changed

+237
-31
lines changed

12 files changed

+237
-31
lines changed

clients/da-vinci-client/src/main/java/com/linkedin/davinci/client/StatsAvroGenericDaVinciClient.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ public CompletableFuture<V> get(K key, V reusableValue) {
8383
return trackRequest(clientStatsForSingleGet, () -> super.get(key, reusableValue)).whenComplete((v, throwable) -> {
8484
if (throwable == null && v != null) {
8585
clientStatsForSingleGet.recordSuccessRequestKeyCount(1);
86+
clientStatsForSingleGet.recordFailedRequestKeyCount(0);
87+
} else {
88+
clientStatsForSingleGet.recordSuccessRequestKeyCount(0);
89+
clientStatsForSingleGet.recordFailedRequestKeyCount(1);
8690
}
8791
});
8892
}
@@ -91,8 +95,12 @@ public CompletableFuture<V> get(K key, V reusableValue) {
9195
public CompletableFuture<Map<K, V>> batchGet(Set<K> keys) {
9296
clientStatsForBatchGet.recordRequestKeyCount(keys.size());
9397
return trackRequest(clientStatsForBatchGet, () -> super.batchGet(keys)).whenComplete((v, throwable) -> {
94-
if (throwable == null && v != null) {
98+
if (v != null) {
9599
clientStatsForBatchGet.recordSuccessRequestKeyCount(v.size());
100+
clientStatsForBatchGet.recordFailedRequestKeyCount(keys.size() - v.size());
101+
} else {
102+
clientStatsForBatchGet.recordSuccessRequestKeyCount(0);
103+
clientStatsForBatchGet.recordFailedRequestKeyCount(keys.size());
96104
}
97105
});
98106
}

clients/da-vinci-client/src/test/java/com/linkedin/davinci/client/StatsAvroGenericDaVinciClientTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ public void testGet(boolean reuseApi) throws ExecutionException, InterruptedExce
6565
assertTrue(metrics.get(".test_store--healthy_request.OccurrenceRate").value() > 0);
6666
assertTrue(metrics.get(".test_store--unhealthy_request.OccurrenceRate").value() > 0);
6767
assertTrue(metrics.get(".test_store--healthy_request_latency.Avg").value() > 0);
68-
assertEquals(metrics.get(".test_store--success_request_key_count.Avg").value(), 1.0);
68+
assertEquals(metrics.get(".test_store--success_request_key_count.Avg").value(), 1.0 / 2);
6969
assertEquals(metrics.get(".test_store--success_request_key_count.Max").value(), 1.0);
7070
assertTrue(metrics.get(".test_store--success_request_ratio.SimpleRatioStat").value() < 1.0);
7171
assertTrue(metrics.get(".test_store--success_request_key_ratio.SimpleRatioStat").value() < 1.0);
@@ -105,7 +105,7 @@ public void testBatchGet() throws ExecutionException, InterruptedException {
105105
assertTrue(metrics.get(".test_store--multiget_healthy_request.OccurrenceRate").value() > 0);
106106
assertTrue(metrics.get(".test_store--multiget_unhealthy_request.OccurrenceRate").value() > 0);
107107
assertTrue(metrics.get(".test_store--multiget_healthy_request_latency.Avg").value() > 0);
108-
assertEquals(metrics.get(".test_store--multiget_success_request_key_count.Avg").value(), 2.0);
108+
assertEquals(metrics.get(".test_store--multiget_success_request_key_count.Avg").value(), 1.0); // round (2.0 / 3)
109109
assertEquals(metrics.get(".test_store--multiget_success_request_key_count.Max").value(), 2.0);
110110
assertTrue(metrics.get(".test_store--multiget_success_request_ratio.SimpleRatioStat").value() < 1.0);
111111
assertTrue(metrics.get(".test_store--multiget_success_request_key_ratio.SimpleRatioStat").value() < 1.0);

clients/venice-client/src/main/java/com/linkedin/venice/fastclient/StatsAvroGenericStoreClient.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ private <R> CompletableFuture<R> recordRequestMetrics(
158158
}
159159

160160
if (exceptionReceived) {
161+
// We still want to record the partial success key count number even if the request is unhealthy.
161162
clientStats.emitUnhealthyRequestMetrics(latency, throwable);
162163
} else {
163164
clientStats.emitHealthyRequestMetrics(latency, requestContext.successRequestKeyCount.get());
@@ -176,13 +177,15 @@ private <R> CompletableFuture<R> recordRequestMetrics(
176177
if (requestContext.responseDeserializationTime > 0) {
177178
clientStats.recordResponseDeserializationTime(requestContext.responseDeserializationTime);
178179
}
179-
clientStats.recordSuccessRequestKeyCount(requestContext.successRequestKeyCount.get());
180180
}
181181

182+
clientStats.recordSuccessRequestKeyCount(requestContext.successRequestKeyCount.get());
183+
182184
if (requestContext.noAvailableReplica) {
183185
clientStats.recordNoAvailableReplicaRequest();
184186
}
185187

188+
int retrySuccessKeyCount = 0;
186189
if (requestContext instanceof GetRequestContext) {
187190
GetRequestContext getRequestContext = (GetRequestContext) requestContext;
188191

@@ -198,7 +201,8 @@ private <R> CompletableFuture<R> recordRequestMetrics(
198201
if (!exceptionReceived) {
199202
if (getRequestContext.retryContext.retryWin) {
200203
clientStats.recordRetryRequestWin();
201-
clientStats.recordRetryRequestSuccessKeyCount(1);
204+
retrySuccessKeyCount = 1;
205+
clientStats.recordRetryRequestSuccessKeyCount(retrySuccessKeyCount);
202206
}
203207
}
204208
}
@@ -213,14 +217,20 @@ private <R> CompletableFuture<R> recordRequestMetrics(
213217
clientStats.recordRetryRequestKeyCount(retryRequestContext.numKeysInRequest);
214218
clientStats.recordRetryFanoutSize(retryRequestContext.getFanoutSize());
215219
if (!exceptionReceived) {
216-
clientStats.recordRetryRequestSuccessKeyCount(retryRequestContext.numKeysCompleted.get());
220+
retrySuccessKeyCount = retryRequestContext.numKeysCompleted.get();
221+
clientStats.recordRetryRequestSuccessKeyCount(retrySuccessKeyCount);
217222
if (retryRequestContext.numKeysCompleted.get() > 0) {
218223
clientStats.recordRetryRequestWin();
219224
}
220225
}
221226
}
222227
}
223228

229+
// numberOfKeys = successKeyCount + retrySuccessKeyCount + failedKeyCount.
230+
clientStats.recordFailedRequestKeyCount(
231+
numberOfKeys - requestContext.successRequestKeyCount.get() - retrySuccessKeyCount,
232+
throwable);
233+
224234
if (exceptionReceived) {
225235
// throw an exception after incrementing some error related metrics
226236
if (throwable instanceof VeniceClientException) {

clients/venice-client/src/test/java/com/linkedin/venice/fastclient/DispatchingAvroGenericStoreClientTest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -525,8 +525,8 @@ private void validateMetrics(
525525
assertFalse(metrics.get(metricPrefix + "healthy_request_latency.Avg").value() > 0);
526526
assertTrue(metrics.get(metricPrefix + "unhealthy_request.OccurrenceRate").value() > 0);
527527
assertTrue(metrics.get(metricPrefix + "unhealthy_request_latency.Avg").value() > 0);
528-
// as partial healthy request is still considered unhealthy, not incrementing the below metric
529-
assertFalse(metrics.get(metricPrefix + "success_request_key_count.Max").value() > 0);
528+
// as partial healthy request is should still report success key count.
529+
assertTrue(metrics.get(metricPrefix + "success_request_key_count.Max").value() > 0);
530530
if (batchGet) {
531531
assertTrue(metrics.get(metricPrefix + "response_ttfr.Avg").value() > 0);
532532
assertEquals(batchGetRequestContext.successRequestKeyCount.get(), (int) successKeyCount);
@@ -1294,7 +1294,7 @@ public void testComputeWithExceptionFromTransportLayerForOneRoute() throws IOExc
12941294
fail();
12951295
} catch (Exception e) {
12961296
assertTrue(e.getMessage().endsWith("At least one route did not complete"), e.getMessage());
1297-
validateComputeRequestMetrics(false, false, RequestType.COMPUTE, false, 2, 1);
1297+
validateComputeRequestMetrics(false, true, RequestType.COMPUTE, false, 2, 1);
12981298
} finally {
12991299
tearDown();
13001300
}
@@ -1525,7 +1525,7 @@ public void testStreamingComputeWithExceptionFromTransportLayerForOneRoute()
15251525
assertEquals(
15261526
response.get("test_key_1").get("name").toString(),
15271527
COMPUTE_REQUEST_VALUE_RESPONSE.get("test_key_1").get("name"));
1528-
validateComputeRequestMetrics(false, false, RequestType.COMPUTE_STREAMING, false, 2, 1);
1528+
validateComputeRequestMetrics(false, true /*partialHealthyRequest*/, RequestType.COMPUTE_STREAMING, false, 2, 1);
15291529
} finally {
15301530
tearDown();
15311531
}

clients/venice-push-job/src/test/java/com/linkedin/venice/spark/datawriter/partition/VeniceSparkPartitionerTest.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,9 @@ public void setUp() {
4141

4242
@AfterClass(alwaysRun = true)
4343
public void tearDown() {
44-
spark.stop();
44+
if (spark != null) {
45+
spark.stop();
46+
}
4547
}
4648

4749
@Test

clients/venice-push-job/src/test/java/com/linkedin/venice/spark/datawriter/task/SparkDataWriterTaskTrackerTest.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ public void setUp() {
1818

1919
@AfterClass(alwaysRun = true)
2020
public void tearDown() {
21-
spark.stop();
21+
if (spark != null) {
22+
spark.stop();
23+
}
2224
}
2325

2426
@Test

clients/venice-push-job/src/test/java/com/linkedin/venice/spark/engine/SparkEngineTaskConfigProviderTest.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@ public void setUp() {
3333

3434
@AfterClass(alwaysRun = true)
3535
public void tearDown() {
36-
sparkContext.close();
37-
spark.stop();
36+
if (sparkContext != null) {
37+
sparkContext.close();
38+
}
39+
if (spark != null) {
40+
spark.stop();
41+
}
3842
}
3943

4044
@Test

clients/venice-thin-client/src/main/java/com/linkedin/venice/client/stats/BasicClientStats.java

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import static com.linkedin.venice.stats.dimensions.VeniceResponseStatusCategory.SUCCESS;
1212
import static com.linkedin.venice.utils.CollectionUtils.setOf;
1313
import static org.apache.hc.core5.http.HttpStatus.SC_NOT_FOUND;
14+
import static org.apache.hc.core5.http.HttpStatus.SC_NO_CONTENT;
1415
import static org.apache.hc.core5.http.HttpStatus.SC_OK;
1516

1617
import com.linkedin.venice.client.exceptions.VeniceClientHttpException;
@@ -74,6 +75,10 @@ public class BasicClientStats extends AbstractVeniceHttpStats {
7475
private final MetricEntityStateOneEnum<VeniceResponseStatusCategory> healthyLatencyMetricForDavinciClient;
7576
private final MetricEntityStateOneEnum<VeniceResponseStatusCategory> unhealthyLatencyMetricForDavinciClient;
7677
private final Sensor requestKeyCountSensor;
78+
private final MetricEntityStateThreeEnums<HttpResponseStatusEnum, HttpResponseStatusCodeCategory, VeniceResponseStatusCategory> healthyKeyCountMetric;
79+
private final MetricEntityStateThreeEnums<HttpResponseStatusEnum, HttpResponseStatusCodeCategory, VeniceResponseStatusCategory> unhealthyKeyCountMetric;
80+
private final MetricEntityStateOneEnum<VeniceResponseStatusCategory> healthyKeyCountMetricForDavinciClient;
81+
private final MetricEntityStateOneEnum<VeniceResponseStatusCategory> unhealthyKeyCountMetricForDavinciClient;
7782
private final Sensor successRequestKeyCountSensor;
7883
private final Sensor successRequestRatioSensor;
7984
private final Sensor successRequestKeyRatioSensor;
@@ -172,10 +177,31 @@ protected BasicClientStats(
172177
getFullMetricName(BasicClientTehutiMetricName.UNHEALTHY_REQUEST_LATENCY.getMetricName()))),
173178
getBaseDimensionsMap(),
174179
VeniceResponseStatusCategory.class);
180+
181+
healthyKeyCountMetricForDavinciClient = MetricEntityStateOneEnum.create(
182+
BasicClientMetricEntity.KEY_COUNT_DVC.getMetricEntity(),
183+
getOtelRepository(),
184+
this::registerSensor,
185+
BasicClientTehutiMetricName.SUCCESS_REQUEST_KEY_COUNT,
186+
Arrays.asList(successRequestKeyCountRate, new Avg(), new Max()),
187+
baseDimensionsMap,
188+
VeniceResponseStatusCategory.class);
189+
190+
unhealthyKeyCountMetricForDavinciClient = MetricEntityStateOneEnum.create(
191+
BasicClientMetricEntity.KEY_COUNT_DVC.getMetricEntity(),
192+
otelRepository,
193+
this::registerSensor,
194+
BasicClientTehutiMetricName.FAILED_REQUEST_KEY_COUNT,
195+
Arrays.asList(new Rate(), new Avg(), new Max()),
196+
baseDimensionsMap,
197+
VeniceResponseStatusCategory.class);
198+
175199
healthyRequestMetric = null;
176200
unhealthyRequestMetric = null;
177201
healthyLatencyMetric = null;
178202
unhealthyLatencyMetric = null;
203+
healthyKeyCountMetric = null;
204+
unhealthyKeyCountMetric = null;
179205
} else {
180206
healthyRequestMetric = MetricEntityStateThreeEnums.create(
181207
BasicClientMetricEntity.CALL_COUNT.getMetricEntity(),
@@ -226,10 +252,34 @@ protected BasicClientStats(
226252
HttpResponseStatusEnum.class,
227253
HttpResponseStatusCodeCategory.class,
228254
VeniceResponseStatusCategory.class);
255+
// key count
256+
healthyKeyCountMetric = MetricEntityStateThreeEnums.create(
257+
BasicClientMetricEntity.KEY_COUNT.getMetricEntity(),
258+
getOtelRepository(),
259+
this::registerSensor,
260+
BasicClientTehutiMetricName.SUCCESS_REQUEST_KEY_COUNT,
261+
Arrays.asList(successRequestKeyCountRate, new Avg(), new Max()),
262+
baseDimensionsMap,
263+
HttpResponseStatusEnum.class,
264+
HttpResponseStatusCodeCategory.class,
265+
VeniceResponseStatusCategory.class);
266+
unhealthyKeyCountMetric = MetricEntityStateThreeEnums.create(
267+
BasicClientMetricEntity.KEY_COUNT.getMetricEntity(),
268+
otelRepository,
269+
this::registerSensor,
270+
BasicClientTehutiMetricName.FAILED_REQUEST_KEY_COUNT,
271+
Arrays.asList(new Rate(), new Avg(), new Max()),
272+
baseDimensionsMap,
273+
HttpResponseStatusEnum.class,
274+
HttpResponseStatusCodeCategory.class,
275+
VeniceResponseStatusCategory.class);
276+
229277
healthyRequestMetricForDavinciClient = null;
230278
unhealthyRequestMetricForDavinciClient = null;
231279
healthyLatencyMetricForDavinciClient = null;
232280
unhealthyLatencyMetricForDavinciClient = null;
281+
healthyKeyCountMetricForDavinciClient = null;
282+
unhealthyKeyCountMetricForDavinciClient = null;
233283
}
234284

235285
// successRequestRatioSensor will be a derived metric in OTel
@@ -301,6 +351,34 @@ public void recordRequestKeyCount(int keyCount) {
301351

302352
public void recordSuccessRequestKeyCount(int successKeyCount) {
303353
successRequestKeyCountSensor.record(successKeyCount);
354+
if (ClientType.isDavinciClient(this.clientType)) {
355+
healthyKeyCountMetricForDavinciClient.record(successKeyCount, SUCCESS);
356+
} else {
357+
int httpStatus = getHealthyRequestHttpStatus(successKeyCount);
358+
HttpResponseStatusEnum statusEnum = transformIntToHttpResponseStatusEnum(httpStatus);
359+
HttpResponseStatusCodeCategory httpCategory = getVeniceHttpResponseStatusCodeCategory(httpStatus);
360+
healthyKeyCountMetric.record(successKeyCount, statusEnum, httpCategory, SUCCESS);
361+
}
362+
}
363+
364+
public void recordFailedRequestKeyCount(int failedKeyCount, Throwable throwable) {
365+
if (ClientType.isDavinciClient(this.clientType)) {
366+
unhealthyKeyCountMetricForDavinciClient.record(failedKeyCount, FAIL);
367+
} else {
368+
/**
369+
* When throwable is null and the failed key count is 0, it means that the request was successful. However,
370+
* we still need to record the failed key count as 0, and thus we use a default http status of SC_NO_CONTENT
371+
* to indicate success.
372+
*/
373+
int httpStatus = throwable != null ? getUnhealthyRequestHttpStatus(throwable) : SC_NO_CONTENT;
374+
HttpResponseStatusEnum statusEnum = transformIntToHttpResponseStatusEnum(httpStatus);
375+
HttpResponseStatusCodeCategory httpCategory = getVeniceHttpResponseStatusCodeCategory(httpStatus);
376+
unhealthyKeyCountMetric.record(failedKeyCount, statusEnum, httpCategory, FAIL);
377+
}
378+
}
379+
380+
public void recordFailedRequestKeyCount(int failedKeyCount) {
381+
recordFailedRequestKeyCount(failedKeyCount, null);
304382
}
305383

306384
protected final Rate getRequestRate() {
@@ -361,7 +439,8 @@ private Map<VeniceMetricsDimensions, String> getBaseDimensionsMap() {
361439
* Metric names for tehuti metrics used in this class.
362440
*/
363441
public enum BasicClientTehutiMetricName implements TehutiMetricNameEnum {
364-
HEALTHY_REQUEST, UNHEALTHY_REQUEST, HEALTHY_REQUEST_LATENCY, UNHEALTHY_REQUEST_LATENCY;
442+
HEALTHY_REQUEST, UNHEALTHY_REQUEST, HEALTHY_REQUEST_LATENCY, UNHEALTHY_REQUEST_LATENCY, SUCCESS_REQUEST_KEY_COUNT,
443+
FAILED_REQUEST_KEY_COUNT;
365444

366445
private final String metricName;
367446

@@ -401,6 +480,18 @@ public enum BasicClientMetricEntity {
401480
HTTP_RESPONSE_STATUS_CODE_CATEGORY,
402481
VENICE_RESPONSE_STATUS_CODE_CATEGORY)
403482
),
483+
/**
484+
* Count of keys during response handling along with response codes
485+
*/
486+
KEY_COUNT(
487+
MetricType.HISTOGRAM, MetricUnit.NUMBER, "Count of keys during response handling along with response codes",
488+
setOf(
489+
VENICE_STORE_NAME,
490+
VENICE_REQUEST_METHOD,
491+
HTTP_RESPONSE_STATUS_CODE,
492+
HTTP_RESPONSE_STATUS_CODE_CATEGORY,
493+
VENICE_RESPONSE_STATUS_CODE_CATEGORY)
494+
),
404495
/**
405496
* Count of all DaVinci requests: as DaVinci is local reads, we do not track HTTP response codes
406497
* But keeping the same name call_count across all clients for consistency
@@ -416,6 +507,14 @@ public enum BasicClientMetricEntity {
416507
CALL_TIME.name().toLowerCase(), MetricType.HISTOGRAM, MetricUnit.MILLISECOND,
417508
"Latency for all DaVinci Client responses",
418509
setOf(VENICE_STORE_NAME, VENICE_REQUEST_METHOD, VENICE_RESPONSE_STATUS_CODE_CATEGORY)
510+
),
511+
/**
512+
* Count of keys during response handling along with response codes
513+
*/
514+
KEY_COUNT_DVC(
515+
KEY_COUNT.name().toLowerCase(), MetricType.HISTOGRAM, MetricUnit.NUMBER,
516+
"Count of keys for all DaVinci Client responses",
517+
setOf(VENICE_STORE_NAME, VENICE_REQUEST_METHOD, VENICE_RESPONSE_STATUS_CODE_CATEGORY)
419518
);
420519

421520
private final MetricEntity metricEntity;

0 commit comments

Comments
 (0)