Skip to content

Commit e8b6835

Browse files
vitstnblinkov
authored andcommitted
YQL-19495 handle NaNs in TDigest
commit_hash:6ceaf9a8cc4d034c2829780bed37396d25f9056d
1 parent c3d7951 commit e8b6835

File tree

9 files changed

+119
-14
lines changed

9 files changed

+119
-14
lines changed

library/cpp/tdigest/tdigest.cpp

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,53 +3,61 @@
33
#include <library/cpp/tdigest/tdigest.pb.h>
44

55
#include <cmath>
6+
#include <util/generic/yexception.h>
67

78
// TODO: rewrite to https://github.com/tdunning/t-digest/blob/master/src/main/java/com/tdunning/math/stats/MergingDigest.java
89

9-
TDigest::TDigest(double delta, double k)
10+
TDigest::TDigest(double delta, double k, bool supportsNaN)
1011
: N(0)
1112
, Delta(delta)
1213
, K(k)
14+
, SupportsNaN(supportsNaN)
1315
{
1416
}
1517

16-
TDigest::TDigest(double delta, double k, double firstValue)
17-
: TDigest(delta, k)
18+
TDigest::TDigest(double delta, double k, double firstValue, bool supportsNaN)
19+
: TDigest(delta, k, supportsNaN)
1820
{
1921
AddValue(firstValue);
2022
}
2123

22-
TDigest::TDigest(TStringBuf serializedDigest)
24+
TDigest::TDigest(TStringBuf serializedDigest, bool supportsNaN)
2325
: N(0)
26+
, SupportsNaN(supportsNaN)
2427
{
2528
NTDigest::TDigest digest;
2629
Y_ABORT_UNLESS(digest.ParseFromArray(serializedDigest.data(), serializedDigest.size()));
2730
Delta = digest.delta();
2831
K = digest.k();
32+
HasNaN = SupportsNaN && digest.nans();
2933
for (int i = 0; i < digest.centroids_size(); ++i) {
3034
const NTDigest::TDigest::TCentroid& centroid = digest.centroids(i);
3135
Update(centroid.mean(), centroid.weight());
3236
}
3337
}
3438

35-
TDigest::TDigest(const TDigest* digest1, const TDigest* digest2)
39+
TDigest::TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN)
3640
: N(0)
3741
, Delta(std::min(digest1->Delta, digest2->Delta))
3842
, K(std::max(digest1->K, digest2->K))
43+
, SupportsNaN(supportsNaN)
44+
, HasNaN(supportsNaN && (digest1->HasNaN || digest2->HasNaN))
3945
{
4046
Add(*digest1);
4147
Add(*digest2);
4248
}
4349

4450
void TDigest::Add(const TDigest& otherDigest) {
51+
Y_ENSURE(SupportsNaN == otherDigest.SupportsNaN);
4552
for (auto& it : otherDigest.Centroids)
4653
Update(it.Mean, it.Count);
4754
for (auto& it : otherDigest.Unmerged)
4855
Update(it.Mean, it.Count);
4956
}
5057

5158
TDigest TDigest::operator+(const TDigest& other) {
52-
TDigest T(Delta, K);
59+
Y_ENSURE(SupportsNaN == other.SupportsNaN);
60+
TDigest T(Delta, K, SupportsNaN);
5361
T.Add(*this);
5462
T.Add(other);
5563
return T;
@@ -92,6 +100,12 @@ void TDigest::MergeCentroid(TVector<TCentroid>& merged, double& sum, const TCent
92100
}
93101

94102
void TDigest::Update(double x, double w) {
103+
if (SupportsNaN) {
104+
if (std::isnan(x)) {
105+
HasNaN = true;
106+
return;
107+
}
108+
}
95109
AddCentroid(TCentroid(x, w));
96110
if (Unmerged.size() >= K / Delta) {
97111
Compress();
@@ -136,8 +150,17 @@ void TDigest::AddValue(double value) {
136150

137151
double TDigest::GetPercentile(double percentile) {
138152
Compress();
139-
if (Centroids.empty())
153+
if (Centroids.empty()) {
154+
if (HasNaN) {
155+
return std::numeric_limits<double>::quiet_NaN();
156+
}
140157
return 0.0;
158+
}
159+
160+
if (HasNaN && percentile >= 1.0) {
161+
return std::numeric_limits<double>::quiet_NaN();
162+
}
163+
141164
// This algorithm uses C=1/2 with 0.5 optimized away
142165
// See https://en.wikipedia.org/wiki/Percentile#First_Variant.2C
143166
double x = percentile * N;
@@ -159,6 +182,9 @@ double TDigest::GetPercentile(double percentile) {
159182

160183
double TDigest::GetRank(double value) {
161184
Compress();
185+
if (SupportsNaN && std::isnan(value)) {
186+
return 1.0;
187+
}
162188
if (Centroids.empty()) {
163189
return 0.0;
164190
}
@@ -189,6 +215,10 @@ TString TDigest::Serialize() {
189215
NTDigest::TDigest digest;
190216
digest.set_delta(Delta);
191217
digest.set_k(K);
218+
if (HasNaN) {
219+
digest.set_nans(HasNaN);
220+
}
221+
192222
for (const auto& it : Centroids) {
193223
NTDigest::TDigest::TCentroid* centroid = digest.add_centroids();
194224
centroid->set_mean(it.Mean);

library/cpp/tdigest/tdigest.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ class TDigest {
3636
double N;
3737
double Delta;
3838
double K;
39+
bool SupportsNaN = false;
40+
bool HasNaN = false;
3941

4042
void Add(const TDigest& otherDigest);
4143
void AddCentroid(const TCentroid& centroid);
@@ -47,10 +49,10 @@ class TDigest {
4749
void Update(double x, double w = 1.0);
4850

4951
public:
50-
TDigest(double delta = 0.01, double k = 25);
51-
TDigest(double delta, double k, double firstValue);
52-
TDigest(TStringBuf serializedDigest);
53-
TDigest(const TDigest* digest1, const TDigest* digest2); // merge
52+
TDigest(double delta = 0.01, double k = 25, bool supportsNaN = false);
53+
TDigest(double delta, double k, double firstValue, bool supportsNaN = false);
54+
TDigest(TStringBuf serializedDigest, bool supportsNaN = false);
55+
TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN = false); // merge
5456

5557
TString Serialize();
5658

library/cpp/tdigest/tdigest.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ message TDigest {
88
optional double Weight = 2;
99
}
1010
repeated TCentroid Centroids = 3;
11+
optional bool Nans = 4;
1112
}

yql/essentials/udfs/common/stat/static/stat_udf.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ namespace {
2222
UdfTerminate((TStringBuilder() << GetPos() << " Invalid combination of delta/K values").data());
2323
}
2424

25-
return TUnboxedValuePod(new TDigestResource(delta, K, args[0].Get<double>()));
25+
return TUnboxedValuePod(new TDigestResource(delta, K, args[0].Get<double>(), true));
2626
}
2727

2828
SIMPLE_STRICT_UDF(TTDigest_AddValue, TResource<DigestResourceName>(TResource<DigestResourceName>, double)) {
@@ -46,14 +46,17 @@ namespace {
4646

4747
SIMPLE_UDF(TTDigest_Deserialize, TResource<DigestResourceName>(char*)) {
4848
Y_UNUSED(valueBuilder);
49-
return TUnboxedValuePod(new TDigestResource(TString(args[0].AsStringRef())));
49+
return TUnboxedValuePod(new TDigestResource(TString(args[0].AsStringRef()), true));
5050
}
5151

5252
SIMPLE_STRICT_UDF(TTDigest_Merge, TResource<DigestResourceName>(TResource<DigestResourceName>, TResource<DigestResourceName>)) {
5353
Y_UNUSED(valueBuilder);
5454
TDigestResource::Validate(args[0]);
5555
TDigestResource::Validate(args[1]);
56-
return TUnboxedValuePod(new TDigestResource(static_cast<TDigestResource*>(args[0].AsBoxed().Get())->Get(), static_cast<TDigestResource*>(args[1].AsBoxed().Get())->Get()));
56+
return TUnboxedValuePod(new TDigestResource(
57+
static_cast<TDigestResource*>(args[0].AsBoxed().Get())->Get(),
58+
static_cast<TDigestResource*>(args[1].AsBoxed().Get())->Get(),
59+
true));
5760
}
5861

5962
/*
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"test.test[nan]": [
3+
{
4+
"uri": "file://test.test_nan_/results.txt"
5+
}
6+
]
7+
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
[
2+
{
3+
"Write" = [
4+
{
5+
"Type" = [
6+
"ListType";
7+
[
8+
"StructType";
9+
[
10+
[
11+
"column0";
12+
[
13+
"OptionalType";
14+
[
15+
"DataType";
16+
"Double"
17+
]
18+
]
19+
];
20+
[
21+
"column1";
22+
[
23+
"OptionalType";
24+
[
25+
"DataType";
26+
"Double"
27+
]
28+
]
29+
]
30+
]
31+
]
32+
];
33+
"Data" = [
34+
[
35+
[
36+
"1.1"
37+
];
38+
[
39+
"nan"
40+
]
41+
]
42+
]
43+
}
44+
]
45+
}
46+
]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
select percentile(x,0.99),percentile(x,1.0)
2+
from (values (double("nan")),(1.1),(0.5)) as a(x)
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
YQL_UDF_TEST()
2+
3+
DEPENDS(yql/essentials/udfs/common/stat)
4+
5+
TIMEOUT(300)
6+
7+
SIZE(MEDIUM)
8+
9+
IF (SANITIZER_TYPE == "memory")
10+
TAG(ya:not_autocheck) # YQL-15385
11+
ENDIF()
12+
13+
END()

yql/essentials/udfs/common/stat/ya.make

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ YQL_UDF_CONTRIB(stat_udf)
1818

1919
IF (NOT EXPORT_CMAKE)
2020
RECURSE_FOR_TESTS(
21+
test
2122
ut
2223
)
2324
ENDIF()

0 commit comments

Comments
 (0)