Skip to content

Commit d5d60e2

Browse files
galenmandrew and tensorflower-gardener
authored and committed
Adds compute_dp_sgd_privacy_statement for accurate privacy accounting report.
PiperOrigin-RevId: 518934979
1 parent 52806ba commit d5d60e2

2 files changed

Lines changed: 279 additions & 16 deletions

File tree

tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_lib.py

Lines changed: 178 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"""Library for computing privacy values for DP-SGD."""
1616

1717
import math
18+
import textwrap
1819
from typing import Optional
1920

2021
from absl import app
@@ -224,19 +225,178 @@ def _compute_dp_sgd_example_privacy(
224225
return accountant.get_epsilon(example_delta)
225226

226227

228+
def compute_dp_sgd_privacy_statement(
    number_of_examples: int,
    batch_size: int,
    num_epochs: float,
    noise_multiplier: float,
    delta: float,
    used_microbatching: bool = True,
    max_examples_per_user: Optional[int] = None,
) -> str:
  """Builds a human-readable report describing the DP-SGD privacy guarantee.

  The report states the training configuration, then the example-level epsilon
  under two data-ordering assumptions (every example seen once per epoch, and
  Poisson sampling with probability `batch_size / number_of_examples`). When
  `max_examples_per_user` is supplied, the matching user-level epsilons are
  reported as well; otherwise the report says that no user-level guarantee is
  available.

  Args:
    number_of_examples: Total number of examples in the dataset. For DP-SGD, an
      "example" corresponds to one row in a minibatch. E.g., for sequence models
      this would be a sequence of maximum length.
    batch_size: The number of examples in a batch, *regardless of whether/how
      they are grouped into microbatches*.
    num_epochs: The number of epochs of training. May be fractional.
    noise_multiplier: The ratio of the Gaussian noise to the clip norm at each
      round. It is assumed that the noise_multiplier is constant although the
      clip norm may be variable if, for example, adaptive clipping is used.
    delta: The target delta.
    used_microbatching: Whether microbatching was used (with microbatch size
      greater than one). Microbatching inflates sensitivity by a factor of two
      in add-or-remove-one adjacency DP. (See "How to DP-fy ML: A Practical
      Guide to Machine Learning with Differential Privacy",
      https://arxiv.org/abs/2303.00654, Sec 5.6.)
    max_examples_per_user: If the data set is constructed to cap the maximum
      number of examples each user contributes, provide this argument to also
      print a user-level DP guarantee.

  Returns:
    The report as one string: prose paragraphs wrapped at 80 columns, separated
    by blank lines, and terminated by a single newline.
  """
  # Row labels shared by the example-level and user-level epsilon tables.
  once_per_epoch_label = 'Epsilon with each example occurring once per epoch: '
  poisson_label = 'Epsilon assuming Poisson sampling (*): '
  # NOTE(review): presumably padded to line up with the 12-wide numeric column
  # used for finite values -- confirm the exact padding against the real file.
  infinite_eps_text = ' inf (**)'

  def wrap(text: str) -> str:
    """Wraps one prose paragraph at 80 columns."""
    return textwrap.fill(text, width=80)

  # --- Paragraph 1: training configuration. ---
  microbatching_word = 'with' if used_microbatching else 'without'
  setup_summary = (
      f'DP-SGD performed over {number_of_examples} examples with '
      f'{batch_size} examples per iteration, noise multiplier '
      f'{noise_multiplier} for {num_epochs} epochs {microbatching_word} '
      'microbatching'
  )
  if max_examples_per_user is None:
    setup_summary += ', and no bound on number of examples per user.'
  else:
    setup_summary += (
        f', and at most {max_examples_per_user} examples per user.'
    )

  sections = [
      wrap(setup_summary),
      # --- Paragraph 2: scope of the guarantee. ---
      wrap(
          'This privacy guarantee protects the release of all model '
          'checkpoints in addition to the final model.'
      ),
  ]

  # --- Paragraph 3: example-level epsilons. ---
  example_header = wrap(
      'Example-level DP with add-or-remove-one adjacency at delta = '
      f'{delta} computed with RDP accounting:'
  )
  example_eps_fixed_order = _compute_dp_sgd_example_privacy(
      num_epochs, noise_multiplier, delta, used_microbatching
  )
  # Evaluated only after the first accounting call, exactly where the original
  # first performed this division; the value is reused for user-level DP.
  sampling_probability = batch_size / number_of_examples
  example_eps_poisson = _compute_dp_sgd_example_privacy(
      num_epochs,
      noise_multiplier,
      delta,
      used_microbatching,
      poisson_subsampling_probability=sampling_probability,
  )
  # The epsilon rows are appended after wrapping so they stay on their own
  # lines instead of being re-flowed into the header text.
  sections.append(
      '\n'.join([
          example_header,
          f'{once_per_epoch_label}{example_eps_fixed_order:12.3f}',
          f'{poisson_label}{example_eps_poisson:12.3f}',
      ])
  )

  # --- Paragraph 4: user-level epsilons, or a note that none is available. ---
  any_infinite_user_eps = False
  if max_examples_per_user is None:
    # NOTE(review): "witout" is a typo for "without". The unit tests compare
    # against this exact text, so the string and the tests must be fixed
    # together in one change.
    sections.append(
        wrap(
            'No user-level privacy guarantee is possible witout a bound on '
            'the number of examples per user.'
        )
    )
  else:
    user_eps_fixed_order = _compute_dp_sgd_user_privacy(
        num_epochs,
        noise_multiplier,
        delta,
        max_examples_per_user,
        used_microbatching,
    )
    user_eps_poisson = _compute_dp_sgd_user_privacy(
        num_epochs,
        noise_multiplier,
        delta,
        max_examples_per_user,
        used_microbatching,
        poisson_subsampling_probability=sampling_probability,
    )

    # Infinite bounds get a footnote marker instead of a formatted number.
    user_eps_texts = []
    for user_eps in (user_eps_fixed_order, user_eps_poisson):
      if math.isinf(user_eps):
        user_eps_texts.append(infinite_eps_text)
        any_infinite_user_eps = True
      else:
        user_eps_texts.append(f'{user_eps:12.3f}')
    fixed_order_text, poisson_text = user_eps_texts

    user_header = wrap(
        'User-level DP with add-or-remove-one adjacency at delta = '
        f'{delta} computed using RDP accounting and group privacy:'
    )
    sections.append(
        '\n'.join([
            user_header,
            f'{once_per_epoch_label}{fixed_order_text}',
            f'{poisson_label}{poisson_text}',
        ])
    )

  # --- Footnotes. ---
  sections.append(
      wrap(
          '(*) Poisson sampling is not usually done in training pipelines, '
          'but assuming that the data was randomly shuffled, it is believed '
          'the actual epsilon should be closer to this value than the '
          'conservative assumption of an arbitrary data order.'
      )
  )
  if any_infinite_user_eps:
    sections.append(
        wrap(
            '(**) A finite example-level epsilon implies a finite user-level '
            'epsilon at any `max_examples_per_user`, but because conversion '
            'from example-level to user-level DP is not exact, it is possible '
            'for the upper bound on the user-level epsilon to still be '
            'infinite.'
        )
    )

  return '\n\n'.join(sections) + '\n'
386+
387+
227388
def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
228389
"""Compute epsilon based on the given hyperparameters.
229390
230391
This function is deprecated. It does not account for doubling of sensitivity
231392
with microbatching, and assumes Poisson subsampling, which is rarely used in
232393
practice. (See "How to DP-fy ML: A Practical Guide to Machine Learning with
233394
Differential Privacy", https://arxiv.org/abs/2303.00654, Sec 5.6.) Most users
234-
should call `compute_dp_sgd_privacy_statement` (which will be added shortly),
235-
which provides appropriate context for the guarantee (see the reporting
236-
recommendations in "How to DP-fy ML", Sec 5.3). If you need a numeric epsilon
237-
value under specific assumptions, it is recommended to use the `dp_accounting`
238-
libraries directly to compute epsilon, with the precise and correct
239-
assumptions of your application.
395+
should call `compute_dp_sgd_privacy_statement`, which provides appropriate
396+
context for the guarantee (see the reporting recommendations in "How to DP-fy
397+
ML", Sec 5.3). If you need a numeric epsilon value under specific assumptions,
398+
it is recommended to use the `dp_accounting` libraries directly to compute
399+
epsilon, with the precise and correct assumptions of your application.
240400
241401
Args:
242402
n: Number of examples in the training data.
@@ -248,20 +408,22 @@ def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
248408
Returns:
249409
A 2-tuple containing the value of epsilon and the optimal RDP order.
250410
"""
251-
# TODO(b/265168958): Update this text for `compute_dp_sgd_privacy_statement`.
252-
logging.warn(
253-
'`compute_dp_sgd_privacy` is deprecated. It does not account '
254-
'for doubling of sensitivity with microbatching, and assumes Poisson '
255-
'subsampling, which is rarely used in practice. Please use the '
256-
'`dp_accounting` libraries directly to compute epsilon, using the '
257-
'precise and correct assumptions of your application.'
258-
)
411+
logging.warn("""\
412+
`compute_dp_sgd_privacy` is deprecated. It does not account for doubling of \
413+
sensitivity with microbatching, and assumes Poisson subsampling, which is \
414+
rarely used in practice. Please use `compute_dp_sgd_privacy_statement`, which \
415+
provides appropriate context for the guarantee. To compute epsilon under \
416+
different assumptions than those in `compute_dp_sgd_privacy_statement`, call \
417+
the `dp_accounting` libraries directly.""")
259418

260419
q = batch_size / n # q - the sampling ratio.
261420
if q > 1:
262421
raise app.UsageError('n must be larger than the batch size.')
263-
orders = ([1.25, 1.5, 1.75, 2., 2.25, 2.5, 3., 3.5, 4., 4.5] +
264-
list(range(5, 64)) + [128, 256, 512])
422+
orders = (
423+
[1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 3.0, 3.5, 4.0, 4.5]
424+
+ list(range(5, 64))
425+
+ [128, 256, 512]
426+
)
265427
steps = int(math.ceil(epochs * n / batch_size))
266428
accountant = dp_accounting.rdp.RdpAccountant(orders)
267429

tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_test.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,15 @@
2525
_user_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_user_privacy
2626

2727

28+
# Hyperparameters shared by the `compute_dp_sgd_privacy_statement` tests; each
# test unpacks these as keyword arguments and adds its own overrides.
DP_SGD_STATEMENT_KWARGS = {
    'number_of_examples': 10000,
    'batch_size': 64,
    'num_epochs': 5.0,
    'noise_multiplier': 2.0,
    'delta': 1e-6,
}
35+
36+
2837
class ComputeDpSgdPrivacyTest(parameterized.TestCase):
2938

3039
@parameterized.named_parameters(
@@ -145,6 +154,98 @@ def test_user_privacy_epsilon_delta_consistency(self, z, k):
145154
)
146155
self.assertAlmostEqual(user_eps, example_eps * k)
147156

157+
def test_dp_sgd_privacy_statement_no_user_dp(self):
  """Pins the full report text when no per-user example bound is given."""
  # NOTE(review): the library formats epsilons in a 12-wide field; confirm the
  # column padding in this literal against the real file.
  expected_report = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and no bound on number of
examples per user.

This privacy guarantee protects the release of all model checkpoints in addition
to the final model.

Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
RDP accounting:
Epsilon with each example occurring once per epoch: 13.376
Epsilon assuming Poisson sampling (*): 1.616

No user-level privacy guarantee is possible witout a bound on the number of
examples per user.

(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.
"""
  make_statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement
  actual_report = make_statement(**DP_SGD_STATEMENT_KWARGS)
  self.assertEqual(actual_report, expected_report)
183+
184+
def test_dp_sgd_privacy_statement_user_dp(self):
  """Pins the full report text when both user-level epsilons are finite."""
  # NOTE(review): the library formats epsilons in a 12-wide field; confirm the
  # column padding in this literal against the real file.
  expected_report = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and at most 3 examples per
user.

This privacy guarantee protects the release of all model checkpoints in addition
to the final model.

Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
RDP accounting:
Epsilon with each example occurring once per epoch: 13.376
Epsilon assuming Poisson sampling (*): 1.616

User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using
RDP accounting and group privacy:
Epsilon with each example occurring once per epoch: 113.899
Epsilon assuming Poisson sampling (*): 8.129

(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.
"""
  make_statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement
  actual_report = make_statement(
      **DP_SGD_STATEMENT_KWARGS, max_examples_per_user=3
  )
  self.assertEqual(actual_report, expected_report)
213+
214+
def test_dp_sgd_privacy_statement_user_dp_infinite(self):
  """Pins the report text, (**) footnote included, for infinite user epsilons."""
  # NOTE(review): the library pads the "inf (**)" placeholder and the numeric
  # epsilons into a fixed-width column; confirm the padding in this literal
  # against the real file.
  expected_report = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and at most 9 examples per
user.

This privacy guarantee protects the release of all model checkpoints in addition
to the final model.

Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
RDP accounting:
Epsilon with each example occurring once per epoch: 13.376
Epsilon assuming Poisson sampling (*): 1.616

User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using
RDP accounting and group privacy:
Epsilon with each example occurring once per epoch: inf (**)
Epsilon assuming Poisson sampling (*): inf (**)

(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.

(**) A finite example-level epsilon implies a finite user-level epsilon at any
`max_examples_per_user`, but because conversion from example-level to user-level
DP is not exact, it is possible for the upper bound on the user-level epsilon to
still be infinite.
"""
  make_statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement
  actual_report = make_statement(
      **DP_SGD_STATEMENT_KWARGS, max_examples_per_user=9
  )
  self.assertEqual(actual_report, expected_report)
248+
148249

149250
if __name__ == '__main__':
150251
absltest.main()

0 commit comments

Comments
 (0)