Skip to content

Commit 11413cc

Browse files
authored
Merge pull request #360 from diffix/piotr/no-sql-layer-lcf
Remove bucket seed from LCF noise calculation
2 parents bc2285f + 4232bde commit 11413cc

File tree

6 files changed

+31
-19
lines changed

6 files changed

+31
-19
lines changed

pg_diffix/aggregation/noise.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,8 @@ extern double generate_layered_noise(const seed_t *seeds, int seeds_count,
1515
const char *step_name, double layer_sd);
1616

1717
/*
18-
* Returns the noisy LCF threshold for the given noise layers.
18+
* Returns the noisy LCF threshold for the given noise layer.
1919
*/
20-
extern double generate_lcf_threshold(const seed_t *seeds, int seeds_count);
20+
extern double generate_lcf_threshold(seed_t seed);
2121

2222
#endif /* PG_DIFFIX_NOISE_H */

src/aggregation/count_distinct.c

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -80,33 +80,31 @@ static void set_value_sorting_globals(Oid element_type)
8080
g_compare_values_func = &g_compare_values_typentry->cmp_proc_finfo;
8181
}
8282

83-
static bool aid_set_is_high_count(seed_t bucket_seed, const List *aid_values_set)
83+
static bool aid_set_is_high_count(const List *aid_values_set)
8484
{
8585
if (list_length(aid_values_set) < g_config.low_count_min_threshold)
8686
return false; /* Fewer AID values than minimum threshold, value is low-count. */
8787

8888
seed_t aid_seed = hash_set_to_seed(aid_values_set);
89-
90-
seed_t seeds[] = {bucket_seed, aid_seed};
91-
double threshold = generate_lcf_threshold(seeds, ARRAY_LENGTH(seeds));
89+
double threshold = generate_lcf_threshold(aid_seed);
9290

9391
return list_length(aid_values_set) >= threshold;
9492
}
9593

96-
static bool aid_sets_are_high_count(seed_t bucket_seed, const List *aid_values_sets)
94+
static bool aid_sets_are_high_count(const List *aid_values_sets)
9795
{
9896
ListCell *cell;
9997
foreach (cell, aid_values_sets)
10098
{
10199
const List *aid_values_set = (const List *)lfirst(cell);
102-
if (!aid_set_is_high_count(bucket_seed, aid_values_set))
100+
if (!aid_set_is_high_count(aid_values_set))
103101
return false;
104102
}
105103
return true;
106104
}
107105

108106
/* Returns a list with the tracker entries that are low count. */
109-
static List *filter_lc_entries(seed_t bucket_seed, DistinctTracker_hash *tracker)
107+
static List *filter_lc_entries(DistinctTracker_hash *tracker)
110108
{
111109
List *lc_entries = NIL;
112110

@@ -115,7 +113,7 @@ static List *filter_lc_entries(seed_t bucket_seed, DistinctTracker_hash *tracker
115113
DistinctTrackerHashEntry *entry = NULL;
116114
while ((entry = DistinctTracker_iterate(tracker, &it)) != NULL)
117115
{
118-
if (!aid_sets_are_high_count(bucket_seed, entry->aid_values_sets))
116+
if (!aid_sets_are_high_count(entry->aid_values_sets))
119117
lc_entries = lappend(lc_entries, entry);
120118
}
121119

@@ -352,7 +350,7 @@ static CountDistinctResult count_distinct_calculate_final(CountDistinctState *st
352350

353351
DistinctTracker_hash *tracker = state->tracker;
354352

355-
List *lc_entries = filter_lc_entries(bucket_seed, tracker);
353+
List *lc_entries = filter_lc_entries(tracker);
356354
list_sort(lc_entries, &compare_tracker_entries_by_value); /* Needed to ensure determinism. */
357355

358356
CountDistinctResult result = {0};

src/aggregation/low_count.c

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,12 +14,10 @@ typedef struct AidResult
1414
bool low_count;
1515
} AidResult;
1616

17-
static AidResult calculate_aid_result(seed_t bucket_seed, const AidTrackerState *tracker)
17+
static AidResult calculate_aid_result(const AidTrackerState *tracker)
1818
{
1919
AidResult result = {.aid_seed = tracker->aid_seed};
20-
21-
seed_t seeds[] = {bucket_seed, tracker->aid_seed};
22-
result.threshold = generate_lcf_threshold(seeds, ARRAY_LENGTH(seeds));
20+
result.threshold = generate_lcf_threshold(tracker->aid_seed);
2321
result.low_count = tracker->aid_set->members < result.threshold;
2422

2523
return result;
@@ -83,11 +81,10 @@ static Datum agg_finalize(AnonAggState *base_state, Bucket *bucket, BucketDescri
8381
LowCountState *state = (LowCountState *)base_state;
8482

8583
bool low_count = false;
86-
seed_t bucket_seed = compute_bucket_seed(bucket, bucket_desc);
8784

8885
for (int i = 0; i < state->trackers_count; i++)
8986
{
90-
AidResult result = calculate_aid_result(bucket_seed, state->trackers[i]);
87+
AidResult result = calculate_aid_result(state->trackers[i]);
9188
low_count = low_count || result.low_count;
9289
}
9390

src/aggregation/noise.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -112,15 +112,15 @@ double generate_layered_noise(const seed_t *seeds, int seeds_count,
112112
return noise;
113113
}
114114

115-
double generate_lcf_threshold(const seed_t *seeds, int seeds_count)
115+
double generate_lcf_threshold(seed_t seed)
116116
{
117117
/*
118118
* `low_count_mean_gap` is the number of (total!) standard deviations between
119119
* `low_count_min_threshold` and desired mean.
120120
*/
121121
double threshold_mean = (double)g_config.low_count_min_threshold +
122122
g_config.low_count_mean_gap * g_config.low_count_layer_sd * sqrt(2.0);
123-
double noise = generate_layered_noise(seeds, seeds_count, "suppress", g_config.low_count_layer_sd);
123+
double noise = generate_layered_noise(&seed, 1, "suppress", g_config.low_count_layer_sd);
124124
double noisy_threshold = threshold_mean + noise;
125125
return Max(noisy_threshold, g_config.low_count_min_threshold);
126126
}

test/expected/noisy.out

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -212,6 +212,19 @@ SELECT COUNT(*), COUNT(city), COUNT(DISTINCT city) FROM london_customers;
212212
0 | 0 | 0
213213
(1 row)
214214

215+
-- LCF doesn't depend on the bucket seed, both queries should have same noisy threshold.
216+
SELECT diffix.floor_by(age, 30), COUNT(*) FROM test_patients GROUP BY 1;
217+
floor_by | count
218+
----------+-------
219+
| 23
220+
(1 row)
221+
222+
SELECT diffix.floor_by(age, 30), diffix.floor_by(age, 106), COUNT(*) FROM test_patients GROUP BY 1, 2;
223+
floor_by | floor_by | count
224+
----------+----------+-------
225+
| | 9
226+
(1 row)
227+
215228
----------------------------------------------------------------
216229
-- Empty tables
217230
----------------------------------------------------------------

test/sql/noisy.sql

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,10 @@ SELECT city FROM test_customers GROUP BY 1 HAVING length(city) <> 4;
5656

5757
SELECT COUNT(*), COUNT(city), COUNT(DISTINCT city) FROM london_customers;
5858

59+
-- LCF doesn't depend on the bucket seed, both queries should have same noisy threshold.
60+
SELECT diffix.floor_by(age, 30), COUNT(*) FROM test_patients GROUP BY 1;
61+
SELECT diffix.floor_by(age, 30), diffix.floor_by(age, 106), COUNT(*) FROM test_patients GROUP BY 1, 2;
62+
5963
----------------------------------------------------------------
6064
-- Empty tables
6165
----------------------------------------------------------------

0 commit comments

Comments
 (0)