@@ -30,11 +30,15 @@ import (
30
30
// are correctly counted (without duplicates).
31
31
func TestDistinctPrivacyKeyNoNoise (t * testing.T ) {
32
32
var triples []testutils.TripleWithIntValue
33
- for i := 0 ; i < 100 ; i ++ { // Add 400 values of which 200 are distinct to Partition 0.
33
+ for i := 0 ; i < 100 ; i ++ { // Add 200 distinct values to Partition 0.
34
34
triples = append (triples , testutils.TripleWithIntValue {ID : i , Partition : 0 , Value : i })
35
35
triples = append (triples , testutils.TripleWithIntValue {ID : i , Partition : 0 , Value : 100 + i })
36
- triples = append (triples , testutils.TripleWithIntValue {ID : i , Partition : 0 , Value : i }) // Duplicate each value. Should be discarded by DistinctPerKey.
37
- triples = append (triples , testutils.TripleWithIntValue {ID : i , Partition : 0 , Value : 100 + i }) // Duplicate each value. Should be discarded by DistinctPerKey.
36
+ }
37
+ for i := 100 ; i < 200 ; i ++ { // Add 200 additional values, all of which are duplicates of the existing distinct values, to Partition 0.
38
+ // The duplicates come from users different from the 100 users above in order to not discard
39
+ // any distinct values during the initial per-partition contribution bounding step.
40
+ triples = append (triples , testutils.TripleWithIntValue {ID : i , Partition : 0 , Value : i - 100 }) // Duplicate. Should be discarded by DistinctPerKey.
41
+ triples = append (triples , testutils.TripleWithIntValue {ID : i , Partition : 0 , Value : i }) // Duplicate. Should be discarded by DistinctPerKey.
38
42
}
39
43
for i := 0 ; i < 50 ; i ++ { // Add 200 values of which 100 are distinct to Partition 1.
40
44
triples = append (triples , testutils.TripleWithIntValue {ID : i , Partition : 1 , Value : i })
@@ -183,6 +187,49 @@ func TestDistinctPerKeyPerKeyCrossPartitionContributionBounding(t *testing.T) {
183
187
}
184
188
}
185
189
190
+ // Checks that DistinctPerKey bounds cross-partition contributions before doing deduplication of
191
+ // values. This is to ensure we don't run into a contribution bounding-related privacy bug in some
192
+ // rare cases.
193
+ func TestDistinctPerKeyPerKeyCrossPartitionContributionBounding_IsAppliedBeforeDeduplication (t * testing.T ) {
194
+ var triples []testutils.TripleWithIntValue
195
+ for i := 0 ; i < 100 ; i ++ { // Add value=1 to 100 partitions.
196
+ triples = append (triples , testutils.TripleWithIntValue {ID : i , Partition : i , Value : 1 })
197
+ }
198
+ for i := 0 ; i < 100 ; i ++ { // Add a user that contributes value=1 to all 100 partitions.
199
+ triples = append (triples , testutils.TripleWithIntValue {ID : 100 , Partition : i , Value : 1 })
200
+ }
201
+ // Assume cross-partition contribution bounding is not done before deduplication of values.
202
+ // Each value=1 in each of the i ∈ {0, ..., 99} partitions would have two users associated
203
+ // with it: user with ID=i and user with ID=100. We pick one of these two users randomly,
204
+ // so in expectation about 50 of 100 partitions' deduplicated values would have user with ID=100
205
+ // associated with them. After cross-partition contribution bounding happens, we would be
206
+ // left with around 50 partitions with a single distinct value each and the test would fail.
207
+ result := []testutils.TestInt64Metric {}
208
+ for i := 0 ; i < 100 ; i ++ {
209
+ result = append (result , testutils.TestInt64Metric {i , 1 })
210
+ }
211
+ p , s , col , want := ptest .CreateList2 (triples , result )
212
+ col = beam .ParDo (s , testutils .ExtractIDFromTripleWithIntValue , col )
213
+
214
+ // ε=50, δ=1-10⁻¹⁵ and l1Sensitivity=1 gives a threshold of ≈2.
215
+ // However, since δ is very large, a partition with a single user
216
+ // is kept with a probability almost 1.
217
+ // We have 100 partitions. So, to get an overall flakiness of 10⁻²³,
218
+ // we can have each partition fail with 10⁻²⁵ probability (k=25).
219
+ epsilon , delta , k , l1Sensitivity := 50.0 , 1 - 1e-15 , 25.0 , 1.0
220
+ // ε is split by 2 for noise and for partition selection, so we use 2*ε to get a Laplace noise with ε.
221
+ pcol := MakePrivate (s , col , NewPrivacySpec (2 * epsilon , delta ))
222
+ pcol = ParDo (s , testutils .TripleWithIntValueToKV , pcol )
223
+ got := DistinctPerKey (s , pcol , DistinctPerKeyParams {MaxPartitionsContributed : 1 , NoiseKind : LaplaceNoise {}, MaxContributionsPerPartition : 1 })
224
+ want = beam .ParDo (s , testutils .Int64MetricToKV , want )
225
+ if err := testutils .ApproxEqualsKVInt64 (s , got , want , testutils .LaplaceTolerance (k , l1Sensitivity , epsilon )); err != nil {
226
+ t .Fatalf ("TestDistinctPerKeyPerKeyCrossPartitionContributionBounding_IsAppliedBeforeDeduplication: %v" , err )
227
+ }
228
+ if err := ptest .Run (p ); err != nil {
229
+ t .Errorf ("TestDistinctPerKeyPerKeyCrossPartitionContributionBounding_IsAppliedBeforeDeduplication: DistinctPerKey(%v) = %v, expected %v: %v" , col , got , want , err )
230
+ }
231
+ }
232
+
186
233
// Checks that DistinctPerKey bounds per-partition contributions correctly.
187
234
func TestDistinctPrivacyKeyPerPartitionContributionBounding (t * testing.T ) {
188
235
var triples []testutils.TripleWithIntValue
@@ -218,19 +265,57 @@ func TestDistinctPrivacyKeyPerPartitionContributionBounding(t *testing.T) {
218
265
// ε=50, δ=10⁻¹⁰⁰ and l1Sensitivity=6 gives a threshold of ≈33.
219
266
// We have 3 partitions. So, to get an overall flakiness of 10⁻²³,
220
267
// we can have each partition fail with 10⁻²⁵ probability (k=25).
221
- // To see the logic and the math behind flakiness and tolerance calculation,
222
- // See https://github.com/google/differential-privacy/blob/main/privacy-on-beam/docs/Tolerance_Calculation.pdf.
223
268
epsilon , delta , k , l1Sensitivity := 50.0 , 1e-100 , 25.0 , 6.0
224
269
// ε is split by 2 for noise and for partition selection, so we use 2*ε to get a Laplace noise with ε.
225
270
pcol := MakePrivate (s , col , NewPrivacySpec (2 * epsilon , delta ))
226
271
pcol = ParDo (s , testutils .TripleWithIntValueToKV , pcol )
227
272
got := DistinctPerKey (s , pcol , DistinctPerKeyParams {MaxPartitionsContributed : 3 , NoiseKind : LaplaceNoise {}, MaxContributionsPerPartition : 2 })
228
273
want = beam .ParDo (s , testutils .Int64MetricToKV , want )
229
274
if err := testutils .ApproxEqualsKVInt64 (s , got , want , testutils .LaplaceTolerance (k , l1Sensitivity , epsilon )); err != nil {
230
- t .Fatalf ("TestDistinctPerKeyNoNoise : %v" , err )
275
+ t .Fatalf ("TestDistinctPrivacyKeyPerPartitionContributionBounding : %v" , err )
231
276
}
232
277
if err := ptest .Run (p ); err != nil {
233
- t .Errorf ("TestDistinctPerKeyNoNoise: DistinctPerKey(%v) = %v, expected %v: %v" , col , got , want , err )
278
+ t .Errorf ("TestDistinctPrivacyKeyPerPartitionContributionBounding: DistinctPerKey(%v) = %v, expected %v: %v" , col , got , want , err )
279
+ }
280
+ }
281
+
282
+ // Checks that DistinctPerKey bounds per-partition contributions before doing deduplication of
283
+ // values. This is to ensure we don't run into a contribution bounding-related privacy bug in some
284
+ // rare cases.
285
+ func TestDistinctPrivacyKeyPerPartitionContributionBounding_IsAppliedBeforeDeduplication (t * testing.T ) {
286
+ var triples []testutils.TripleWithIntValue
287
+ for i := 0 ; i < 100 ; i ++ { // Add 100 distinct values to Partition 0.
288
+ triples = append (triples , testutils.TripleWithIntValue {ID : i , Partition : 0 , Value : i })
289
+ }
290
+ for i := 0 ; i < 100 ; i ++ { // Add a user that contributes all these 100 distinct values to Partition 0.
291
+ triples = append (triples , testutils.TripleWithIntValue {ID : 100 , Partition : 0 , Value : i })
292
+ }
293
+ // Assume per-partition contribution bounding is not done before deduplication of values.
294
+ // Each value i ∈ {0, ..., 99} would have two users associated with it: user with ID=i and
295
+ // user with ID=100. We pick one of these two users randomly, so in expectation about 50
296
+ // of 100 deduplicated values would have user with ID=100 associated with them. After
297
+ // per-partition contribution bounding happens, we would be left with around 50 distinct
298
+ // values and the test would fail.
299
+ result := []testutils.TestInt64Metric {
300
+ {0 , 100 },
301
+ }
302
+ p , s , col , want := ptest .CreateList2 (triples , result )
303
+ col = beam .ParDo (s , testutils .ExtractIDFromTripleWithIntValue , col )
304
+
305
+ // ε=50, δ=10⁻¹⁰⁰ and l1Sensitivity=1 gives a threshold of ≈6.
306
+ // We have 1 partition. So, to get an overall flakiness of 10⁻²³,
307
+ // we need to have each partition pass with 1-10⁻²³ probability (k=23).
308
+ epsilon , delta , k , l1Sensitivity := 50.0 , 1e-100 , 23.0 , 1.0
309
+ // ε is split by 2 for noise and for partition selection, so we use 2*ε to get a Laplace noise with ε.
310
+ pcol := MakePrivate (s , col , NewPrivacySpec (2 * epsilon , delta ))
311
+ pcol = ParDo (s , testutils .TripleWithIntValueToKV , pcol )
312
+ got := DistinctPerKey (s , pcol , DistinctPerKeyParams {MaxPartitionsContributed : 1 , NoiseKind : LaplaceNoise {}, MaxContributionsPerPartition : 1 })
313
+ want = beam .ParDo (s , testutils .Int64MetricToKV , want )
314
+ if err := testutils .ApproxEqualsKVInt64 (s , got , want , testutils .LaplaceTolerance (k , l1Sensitivity , epsilon )); err != nil {
315
+ t .Fatalf ("TestDistinctPrivacyKeyPerPartitionContributionBounding_IsAppliedBeforeDeduplication: %v" , err )
316
+ }
317
+ if err := ptest .Run (p ); err != nil {
318
+ t .Errorf ("TestDistinctPrivacyKeyPerPartitionContributionBounding_IsAppliedBeforeDeduplication: DistinctPerKey(%v) = %v, expected %v: %v" , col , got , want , err )
234
319
}
235
320
}
236
321
0 commit comments