Skip to content

Commit 7d7b670

Browse files
shs037 and tensorflower-gardener
authored and committed
Add functions to derive epsilon lower bounds.
PiperOrigin-RevId: 484021227
1 parent 3f16540 commit 7d7b670

3 files changed

Lines changed: 651 additions & 0 deletions

File tree

tensorflow_privacy/privacy/privacy_tests/BUILD

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,23 @@ py_test(
1818
deps = [":utils"],
1919
)
2020

21+
py_test(
22+
name = "epsilon_lower_bound_test",
23+
srcs = ["epsilon_lower_bound_test.py"],
24+
deps = [":epsilon_lower_bound"],
25+
)
26+
2127
py_library(
2228
name = "utils",
2329
srcs = ["utils.py"],
2430
srcs_version = "PY3",
2531
)
32+
33+
py_library(
34+
name = "epsilon_lower_bound",
35+
srcs = ["epsilon_lower_bound.py"],
36+
deps = [
37+
"//third_party/py/immutabledict",
38+
"//third_party/py/statsmodels",
39+
],
40+
)
Lines changed: 360 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,360 @@
1+
# Copyright 2022, The TensorFlow Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Various functions to convert MIA or secret sharer to epsilon lower bounds."""
15+
16+
import enum
17+
import numbers
18+
from typing import Dict, Iterable, Optional, Sequence, Union
19+
20+
import immutabledict
21+
import numpy as np
22+
import numpy.typing as npt
23+
import scipy.integrate
24+
import scipy.optimize
25+
import scipy.stats
26+
import sklearn.metrics
27+
from statsmodels.stats import proportion
28+
29+
30+
def _get_tp_fp_for_thresholds(pos_scores: np.ndarray,
                              neg_scores: np.ndarray,
                              thresholds: Optional[np.ndarray] = None):
  """Gets all the tp and fp for a given array of thresholds.

  Args:
    pos_scores: per-example scores for the positive class.
    neg_scores: per-example scores for the negative class.
    thresholds: an array of thresholds to consider. Elements strictly
      **above** a threshold are counted as positive (a score equal to the
      threshold is not counted). If not provided, the computation is delegated
      to scikit-learn's `_binary_clf_curve` on the concatenated scores.

  Returns:
    A tuple `(tp, fp)` of arrays with the true positive and false positive
    counts. When `thresholds` is given, entry i belongs to the i-th largest
    threshold, i.e. the results are ordered by descending threshold and not by
    the order of the input array.
  """
  if thresholds is None:
    # NOTE: `_binary_clf_curve` is a private scikit-learn helper; it returns
    # (fps, tps, thresholds), hence the swapped unpacking below.
    # pylint:disable=protected-access
    fp, tp, _ = sklearn.metrics._ranking._binary_clf_curve(
        y_true=np.concatenate([
            np.ones_like(pos_scores, dtype=int),
            np.zeros_like(neg_scores, dtype=int)
        ]),
        y_score=np.concatenate([pos_scores, neg_scores]))
    return tp, fp

  def get_cum_sum(scores, thresholds):
    """Counts, for each threshold, the scores strictly above it."""
    values = np.concatenate([scores, thresholds])
    # 1 marks a score, 0 marks a threshold.
    indicators = np.concatenate(
        [np.ones_like(scores, dtype=int),
         np.zeros_like(thresholds, dtype=int)])
    # The sort must be stable: in ascending order a score then stays ahead of
    # an equal threshold (scores come first in `values`), so after reversing
    # the threshold precedes the tied score and the tie is never counted.
    # With the default (unstable) sort, tie handling was arbitrary.
    sort_idx = np.argsort(values, kind='stable')[::-1]  # Descending
    indicators = indicators[sort_idx]
    # Running number of scores seen so far, read off at threshold positions.
    return np.cumsum(indicators)[indicators == 0]

  tp = get_cum_sum(pos_scores, thresholds)
  fp = get_cum_sum(neg_scores, thresholds)
  return tp, fp
67+
68+
69+
class BoundMethod(enum.Enum):
  """Methods to use for bound of ratio of binomial proportions.

  Each member identifies one way of computing a statistical lower bound on a
  ratio of binomial proportions. `RatioBound.available_methods` maps every
  member to the method that implements it.
  """
  # Logarithm Katz method (`RatioBound._bound_katz_log`).
  KATZ_LOG = 'katz-log'
  # Adjusted-log (Walters) method (`RatioBound._bound_adjusted_log`).
  ADJUSTED_LOG = 'adjusted-log'
  # Bailey method (`RatioBound._bound_bailey`).
  BAILEY = 'bailey'
  # Inverse hyperbolic sine method (`RatioBound._bound_inv_hyperbolic_sine`).
  INV_SINH = 'inv-sinh'
  # Clopper-Pearson method (`RatioBound._bound_clopper_pearson`).
  CLOPPER_PEARSON = 'clopper-pearson'
76+
77+
78+
class EpsilonLowerBound:
  """Differential privacy (DP) epsilon lower bound.

  This class computes a statistical epsilon lower bound by looking at the log
  ratio of tpr and fpr. The tpr / fpr ratio bound is from `RatioBound` class.

  For example, in membership inference attack, the attacker sets a threshold and
  predicts samples with top probability larger than the thresholds as member.
  If the model is trained with a DP guarantee, then we should expect
  log(tpr / fpr) <= epsilon, where tpr and fpr are the true positive and false
  positive rates of the attacker. Therefore, we can use log(tpr / fpr) to derive
  an epsilon lower bound.

  The idea of using Clopper Pearson for estimating epsilon lower bound is from
  https://arxiv.org/pdf/2006.07709.pdf.
  The idea of using log Katz is from https://arxiv.org/pdf/2210.08643.pdf.

  Examples:
    >>> lb = elb.EpsilonLowerBound(train_top_probs, test_top_probs, alpha=0.05)
    >>> methods = [BoundMethod.BAILEY, BoundMethod.KATZ_LOG]
    >>> lb.compute_epsilon_lower_bounds(methods, k=5)
  """

  def __init__(self,
               pos_scores: np.ndarray,
               neg_scores: np.ndarray,
               alpha: float,
               two_sided_threshold: bool = True,
               thresholds: Optional[np.ndarray] = None):
    """Initializes the epsilon lower bound class.

    Args:
      pos_scores: per-example scores for the positive class, 1-dimensional.
      neg_scores: per-example scores for the negative class, 1-dimensional.
      alpha: the significance level of the bound (e.g. 0.05 for a 95%
        confidence bound), must be < 0.5. It is forwarded unchanged to every
        `RatioBound`.
      two_sided_threshold: if False, will consider thresholds such that elements
        **above** are predicted as positive, i.e., tpr / fpr and tnr / fnr. If
        True, will also consider fpr / tpr and fnr / tnr.
      thresholds: an array of thresholds to consider. If not provided, will
        enumerate through all possible thresholds.

    Raises:
      ValueError: if `pos_scores` or `neg_scores` is not 1-dimensional, or if
        `alpha` >= 0.5.
    """
    if pos_scores.ndim != 1:
      raise ValueError('pos_score should be a 1-dimensional array, '
                       f'but got {pos_scores.ndim}.')
    if neg_scores.ndim != 1:
      # The message used to name `pos_score` here (copy-paste slip), which
      # pointed users at the wrong argument.
      raise ValueError('neg_score should be a 1-dimensional array, '
                       f'but got {neg_scores.ndim}.')
    if alpha >= 0.5:
      raise ValueError('alpha should be < 0.5, e.g. alpha=0.05, '
                       f'but got {alpha}.')

    pos_size, neg_size = pos_scores.size, neg_scores.size
    tp, fp = _get_tp_fp_for_thresholds(pos_scores, neg_scores, thresholds)
    # Remaining confusion-matrix cells follow from the class sizes.
    fn, tn = pos_size - tp, neg_size - fp

    # We consider both tpr / fpr and tnr / fnr.
    self._rbs = [
        RatioBound(tp, fp, pos_size, neg_size, alpha),
        RatioBound(tn, fn, neg_size, pos_size, alpha)
    ]
    if two_sided_threshold:
      # The inverse ratios: fpr / tpr and fnr / tnr.
      self._rbs.extend([
          # pylint: disable-next=arguments-out-of-order
          RatioBound(fp, tp, neg_size, pos_size, alpha),
          RatioBound(fn, tn, pos_size, neg_size, alpha)
      ])

  def compute_epsilon_lower_bound(self,
                                  method: BoundMethod,
                                  k: Optional[int] = None
                                 ) -> npt.NDArray[float]:
    """Computes lower bound w/ a specified method and returns top-k epsilons.

    Args:
      method: the method to use for ratio bound.
      k: if specified, will return top-k values.

    Returns:
      An array of epsilon bounds sorted in descending order. Ratio bounds that
      are not strictly positive are dropped before taking the logarithm, so
      the array can be shorter than the number of thresholds.

    Raises:
      ValueError: if `method` is not one of the available methods.
    """
    if method not in self._rbs[0].available_methods:
      raise ValueError(f'Method {method} not recognized.')
    ratio_bound = np.concatenate([rb.compute_bound(method) for rb in self._rbs])
    # log is only defined for positive ratios; the rest carry no information.
    bounds = np.log(ratio_bound[ratio_bound > 0])
    bounds = np.sort(bounds)[::-1]
    if k is None or k >= bounds.size:
      return bounds
    return bounds[:k]

  def compute_epsilon_lower_bounds(
      self,
      methods: Optional[Iterable[BoundMethod]] = None,
      k: Optional[int] = None) -> Dict[BoundMethod, npt.NDArray[float]]:
    """Computes lower bounds with all methods and returns the top-k epsilons.

    Args:
      methods: the methods to use for ratio bound. If not specified (or
        empty), will use all available methods.
      k: if specified, will return top-k values for each method.

    Returns:
      A dictionary, mapping method to the corresponding bound array.
    """
    return {
        method: self.compute_epsilon_lower_bound(method, k)
        for method in methods or self._rbs[0].available_methods
    }
185+
186+
187+
class RatioBound:
  """Statistical lower bound on a ratio of two binomial proportions.

  Given counts such as true positives and false positives together with the
  sizes of the two populations, this class offers several estimators of a
  lower bound of the ratio of the proportions, e.g. tpr / fpr.
  Most of the methods follow https://doi.org/10.1111/2041-210X.12304 and the
  accompanying code at https://CRAN.R-project.org/package=asbio.
  The Clopper-Pearson variant follows https://arxiv.org/pdf/2006.07709.pdf.

  Examples:
    >>> tp, fp = np.array([100, 90]), np.array([10, 5])
    >>> pos_size, neg_size = 110, 80
    >>> rb = elb.RatioBound(tp, fp, pos_size, neg_size, 0.05)
    >>> rb.compute_bound(BoundMethod.BAILEY)
    array([4.61953896, 6.87647915])
    >>> rb.compute_bounds([BoundMethod.BAILEY, BoundMethod.KATZ_LOG])
    {<BoundMethod.BAILEY: 'bailey'>: array([4.61953896, 6.87647915]),
     <BoundMethod.KATZ_LOG: 'katz-log'>: array([4.45958661, 6.39712581])}

  Attributes:
    available_methods: immutable mapping from each BoundMethod to the bound
      function implementing it.
  """

  def __init__(self, tp: Union[Sequence[int], int],
               fp: Union[Sequence[int], int], pos_size: int, neg_size: int,
               alpha: float):
    """Validates the counts and precomputes the shared quantities.

    Args:
      tp: true positives, a scalar or a sequence.
      fp: false positives, with as many elements as tp.
      pos_size: number of real positive samples.
      neg_size: number of real negative samples.
      alpha: level passed to the normal quantile and to the Clopper-Pearson
        interval (a significance level, e.g. 0.05); must be < 0.5.

    Raises:
      ValueError: if alpha >= 0.5, if tp and fp differ in length, or if any
        count lies outside [0, pos_size] / [0, neg_size].
    """
    if alpha >= 0.5:
      raise ValueError('alpha should be < 0.5, e.g. alpha=0.05, '
                       f'but got {alpha}.')

    # A scalar `tp` makes `compute_bound` hand back a scalar as well.
    self._is_scalar = isinstance(tp, numbers.Number)
    if self._is_scalar:
      tp = [tp]
    if isinstance(fp, numbers.Number):
      fp = [fp]
    num_tp, num_fp = len(tp), len(fp)
    if num_tp != num_fp:
      raise ValueError('tp and fp should have the same number of elements, '
                       f'but get {num_tp} and {num_fp} respectively.')

    # Untouched counts; some estimators start from these.
    self._tp_orig = np.array(tp, dtype=float)
    self._fp_orig = np.array(fp, dtype=float)
    tp_invalid = (self._tp_orig > pos_size) | (self._tp_orig < 0)
    if np.any(tp_invalid):
      raise ValueError('tp needs to be in [0, pos_size].')
    fp_invalid = (self._fp_orig > neg_size) | (self._fp_orig < 0)
    if np.any(fp_invalid):
      raise ValueError('fp needs to be in [0, neg_size].')

    self._pos_size = pos_size
    self._neg_size = neg_size
    self._alpha = alpha
    # Normal quantile at alpha; negative because alpha < 0.5.
    self._z = scipy.stats.norm.ppf(alpha)
    self.available_methods = immutabledict.immutabledict({
        BoundMethod.KATZ_LOG: self._bound_katz_log,
        BoundMethod.ADJUSTED_LOG: self._bound_adjusted_log,
        BoundMethod.BAILEY: self._bound_bailey,
        BoundMethod.INV_SINH: self._bound_inv_hyperbolic_sine,
        BoundMethod.CLOPPER_PEARSON: self._bound_clopper_pearson,
    })

    # Entries where both counts hit their maximum are pulled back by 0.5;
    # the adjusted arrays are stored for the estimators that need them.
    both_saturated = np.logical_and(self._tp_orig == self._pos_size,
                                    self._fp_orig == self._neg_size)
    self._tp = np.where(both_saturated, self._pos_size - 0.5, self._tp_orig)
    self._fp = np.where(both_saturated, self._neg_size - 0.5, self._fp_orig)

    # Masks of zero counts, which several estimators treat specially.
    self._idx_tp_0 = self._tp == 0
    self._idx_fp_0 = self._fp == 0

  def _get_statistics(self, tp, fp):
    """Returns tpr, fpr, fnr, tnr for given tp, fp."""
    tpr = tp / self._pos_size
    fpr = fp / self._neg_size
    fnr = 1 - tpr
    tnr = 1 - fpr
    return tpr, fpr, fnr, tnr

  def compute_bound(self,
                    method: BoundMethod) -> Union[float, npt.NDArray[float]]:
    """Evaluates the ratio bound with one method.

    Args:
      method: the method to use for ratio bound.

    Returns:
      An array of bounds, or a single value when tp was given as a scalar.

    Raises:
      ValueError: if the method is not among `available_methods`.
    """
    if method not in self.available_methods:
      raise ValueError(f'Method {method} not recognized.')
    values = self.available_methods[method]()
    # Unwrap the length-1 array created for a scalar input.
    return values[0] if self._is_scalar else values

  def compute_bounds(
      self,
      methods: Optional[Iterable[BoundMethod]] = None
  ) -> Dict[BoundMethod, Union[float, npt.NDArray[float]]]:
    """Evaluates the ratio bound with several methods.

    Args:
      methods: the methods to use for ratio bound. When missing or empty, all
        available methods are used.

    Returns:
      A dictionary, mapping method to the corresponding bound.
    """
    if not methods:
      methods = self.available_methods.keys()
    results = {}
    for method in methods:
      results[method] = self.compute_bound(method)
    return results

  def _bound_katz_log(self) -> npt.NDArray[float]:
    """Uses the logarithm Katz method to compute lower bound of ratio."""
    # A zero fp is replaced by 0.5 so that the ratio stays finite.
    fp_adj = np.where(self._idx_fp_0, 0.5, self._fp)
    tp_adj = self._tp
    tpr, fpr, fnr, tnr = self._get_statistics(tp_adj, fp_adj)
    ratio = tpr / fpr
    spread = np.sqrt(fnr / tp_adj + tnr / fp_adj)
    shrink = np.exp(self._z * spread)
    # With tp == 0 the bound is defined as 0.
    return np.where(self._idx_tp_0, 0, ratio * shrink)

  def _bound_adjusted_log(self) -> npt.NDArray[float]:
    """Uses the logarithm Walters method to compute lower bound of ratio."""
    # Every count and size is shifted by 0.5.
    tp_half = self._tp + 0.5
    fp_half = self._fp + 0.5
    pos_half = self._pos_size + 0.5
    neg_half = self._neg_size + 0.5
    log_ratio = np.log(tp_half / pos_half) - np.log(fp_half / neg_half)
    spread = np.sqrt(1 / tp_half - 1 / pos_half + 1 / fp_half - 1 / neg_half)
    # Only the case of both counts being zero is forced to 0.
    both_zero = np.logical_and(self._idx_tp_0, self._idx_fp_0)
    return np.where(both_zero, 0,
                    np.exp(log_ratio) * np.exp(self._z * spread))

  def _bound_bailey(self) -> npt.NDArray[float]:
    """Uses the Bailey method to compute lower bound of ratio."""
    # Here each count is pulled back from its maximum independently, starting
    # from the original (unadjusted) counts.
    tp_adj = np.where(self._tp_orig == self._pos_size, self._pos_size - 0.5,
                      self._tp_orig)
    fp_adj = np.where(self._fp_orig == self._neg_size, self._neg_size - 0.5,
                      self._fp_orig)
    fp_adj = np.where(self._idx_fp_0, 0.5, fp_adj)
    tpr, fpr, fnr, tnr = self._get_statistics(tp_adj, fp_adj)
    ratio = tpr / fpr
    z_sq = self._z**2
    under_root = (fnr / tp_adj + tnr / fp_adj -
                  (z_sq * fnr * tnr) / (9 * tp_adj * fp_adj))
    numer = 1 + self._z / 3 * np.sqrt(under_root)
    denom = 1 - (z_sq * tnr) / (9 * fp_adj)
    return np.where(self._idx_tp_0, 0, ratio * (numer / denom)**3)

  def _bound_inv_hyperbolic_sine(self) -> npt.NDArray[float]:
    """Uses the inverse sinh method to compute lower bound of ratio."""
    # A zero fp is replaced by z**2 for this estimator.
    fp_adj = np.where(self._idx_fp_0, self._z**2, self._fp)
    tp_adj = self._tp
    ratio = (tp_adj / fp_adj) / (self._pos_size / self._neg_size)
    spread = np.sqrt(1 / tp_adj - 1 / self._pos_size + 1 / fp_adj -
                     1 / self._neg_size)
    sinh_arg = self._z / 2 * spread
    return np.where(self._idx_tp_0, 0,
                    ratio * np.exp(2 * np.arcsinh(sinh_arg)))

  def _bound_clopper_pearson(self) -> npt.NDArray[float]:
    """Uses the Clopper-Pearson method to compute lower bound of ratio."""
    # proportion_confint spends alpha / 2 on each side of the interval. We take
    # the lower end for the numerator and the upper end for the denominator,
    # so the total budget is 2 * alpha / 2 = alpha.
    numer_lower, _ = proportion.proportion_confint(
        self._tp_orig, self._pos_size, self._alpha, method='beta')
    _, denom_upper = proportion.proportion_confint(
        self._fp_orig, self._neg_size, self._alpha, method='beta')
    # Entries that would be uninformative or divide badly are set to 0.
    degenerate = np.logical_or(numer_lower <= 0, denom_upper >= 1)
    return np.where(degenerate, 0, numer_lower / denom_upper)

0 commit comments

Comments
 (0)