1515"""Library for computing privacy values for DP-SGD."""
1616
1717import math
18+ import textwrap
1819from typing import Optional
1920
2021from absl import app
@@ -224,19 +225,178 @@ def _compute_dp_sgd_example_privacy(
224225 return accountant .get_epsilon (example_delta )
225226
226227
def compute_dp_sgd_privacy_statement(
    number_of_examples: int,
    batch_size: int,
    num_epochs: float,
    noise_multiplier: float,
    delta: float,
    used_microbatching: bool = True,
    max_examples_per_user: Optional[int] = None,
) -> str:
    """Produces a privacy report summarizing the DP guarantee.

    The report is a sequence of paragraphs separated by blank lines: a summary
    of the training configuration, a note on checkpoint release, the
    example-level epsilons, the user-level epsilons (or a note that none can
    be given), and footnotes for the `(*)` / `(**)` markers.

    Args:
      number_of_examples: Total number of examples in the dataset. For DP-SGD,
        an "example" corresponds to one row in a minibatch. E.g., for sequence
        models this would be a sequence of maximum length.
      batch_size: The number of examples in a batch. This should be the number
        of examples in a batch, *regardless of whether/how they are grouped
        into microbatches*.
      num_epochs: The number of epochs of training. May be fractional.
      noise_multiplier: The ratio of the Gaussian noise to the clip norm at
        each round. It is assumed that the noise_multiplier is constant
        although the clip norm may be variable if, for example, adaptive
        clipping is used.
      delta: The target delta.
      used_microbatching: Whether microbatching was used (with microbatch size
        greater than one). Microbatching inflates sensitivity by a factor of
        two in add-or-remove-one adjacency DP. (See "How to DP-fy ML: A
        Practical Guide to Machine Learning with Differential Privacy",
        https://arxiv.org/abs/2303.00654, Sec 5.6.)
      max_examples_per_user: If the data set is constructed to cap the maximum
        number of examples each user contributes, provide this argument to
        also print a user-level DP guarantee.

    Returns:
      A str precisely articulating the privacy guarantee.
    """

    def wrap(text: str) -> str:
        # Every prose paragraph of the report is wrapped to 80 columns.
        return textwrap.fill(text, width=80)

    def epsilon_rows(once_per_epoch: str, poisson: str) -> str:
        # The two result rows are deliberately NOT wrapped. Labels are padded
        # to a common width (longest label plus one space) so that the
        # 12-character epsilon columns line up underneath each other.
        rows = (
            ('Epsilon with each example occurring once per epoch:',
             once_per_epoch),
            ('Epsilon assuming Poisson sampling (*):', poisson),
        )
        return ''.join(f'\n    {label:<52}{value}' for label, value in rows)

    microbatching = 'with' if used_microbatching else 'without'
    if max_examples_per_user is None:
        user_bound = 'no bound on number of examples per user'
    else:
        user_bound = f'at most {max_examples_per_user} examples per user'

    paragraphs = [
        wrap(
            f'DP-SGD performed over {number_of_examples} examples with '
            f'{batch_size} examples per iteration, noise multiplier '
            f'{noise_multiplier} for {num_epochs} epochs {microbatching} '
            f'microbatching, and {user_bound}.'
        ),
        wrap(
            'This privacy guarantee protects the release of all model '
            'checkpoints in addition to the final model.'
        ),
    ]

    # Example-level guarantee: once without subsampling amplification, once
    # with the Poisson subsampling probability batch_size / number_of_examples.
    example_eps_no_subsampling = _compute_dp_sgd_example_privacy(
        num_epochs, noise_multiplier, delta, used_microbatching
    )
    sampling_probability = batch_size / number_of_examples
    example_eps_subsampling = _compute_dp_sgd_example_privacy(
        num_epochs,
        noise_multiplier,
        delta,
        used_microbatching,
        poisson_subsampling_probability=sampling_probability,
    )
    paragraphs.append(
        wrap(
            'Example-level DP with add-or-remove-one adjacency at delta = '
            f'{delta} computed with RDP accounting:'
        )
        + epsilon_rows(
            f'{example_eps_no_subsampling:12.3f}',
            f'{example_eps_subsampling:12.3f}',
        )
    )

    inf_user_eps = False
    if max_examples_per_user is not None:
        user_eps_no_subsampling = _compute_dp_sgd_user_privacy(
            num_epochs,
            noise_multiplier,
            delta,
            max_examples_per_user,
            used_microbatching,
        )
        user_eps_subsampling = _compute_dp_sgd_user_privacy(
            num_epochs,
            noise_multiplier,
            delta,
            max_examples_per_user,
            used_microbatching,
            poisson_subsampling_probability=sampling_probability,
        )

        user_eps_strs = []
        for user_eps in (user_eps_no_subsampling, user_eps_subsampling):
            # `:12.3f` renders math.inf as a right-aligned 'inf', so finite and
            # infinite values share one column; infinite ones additionally
            # get the (**) footnote marker.
            eps_str = f'{user_eps:12.3f}'
            if math.isinf(user_eps):
                eps_str += ' (**)'
                inf_user_eps = True
            user_eps_strs.append(eps_str)

        paragraphs.append(
            wrap(
                'User-level DP with add-or-remove-one adjacency at delta = '
                f'{delta} computed using RDP accounting and group privacy:'
            )
            + epsilon_rows(*user_eps_strs)
        )
    else:
        paragraphs.append(
            wrap(
                'No user-level privacy guarantee is possible without a bound '
                'on the number of examples per user.'
            )
        )

    paragraphs.append(
        wrap(
            '(*) Poisson sampling is not usually done in training pipelines, '
            'but assuming that the data was randomly shuffled, it is believed '
            'the actual epsilon should be closer to this value than the '
            'conservative assumption of an arbitrary data order.'
        )
    )

    # The (**) footnote is only emitted when some user-level epsilon above was
    # actually reported as infinite.
    if inf_user_eps:
        paragraphs.append(
            wrap(
                '(**) A finite example-level epsilon implies a finite '
                'user-level epsilon at any `max_examples_per_user`, but '
                'because conversion from example-level to user-level DP is '
                'not exact, it is possible for the upper bound on the '
                'user-level epsilon to still be infinite.'
            )
        )

    return '\n\n'.join(paragraphs) + '\n'
386+
387+
227388def compute_dp_sgd_privacy (n , batch_size , noise_multiplier , epochs , delta ):
228389 """Compute epsilon based on the given hyperparameters.
229390
230391 This function is deprecated. It does not account for doubling of sensitivity
231392 with microbatching, and assumes Poisson subsampling, which is rarely used in
232393 practice. (See "How to DP-fy ML: A Practical Guide to Machine Learning with
233394 Differential Privacy", https://arxiv.org/abs/2303.00654, Sec 5.6.) Most users
234- should call `compute_dp_sgd_privacy_statement` (which will be added shortly),
235- which provides appropriate context for the guarantee (see the reporting
236- recommendations in "How to DP-fy ML", Sec 5.3). If you need a numeric epsilon
237- value under specific assumptions, it is recommended to use the `dp_accounting`
238- libraries directly to compute epsilon, with the precise and correct
239- assumptions of your application.
395+ should call `compute_dp_sgd_privacy_statement`, which provides appropriate
396+ context for the guarantee (see the reporting recommendations in "How to DP-fy
397+ ML", Sec 5.3). If you need a numeric epsilon value under specific assumptions,
398+ it is recommended to use the `dp_accounting` libraries directly to compute
399+ epsilon, with the precise and correct assumptions of your application.
240400
241401 Args:
242402 n: Number of examples in the training data.
@@ -248,20 +408,22 @@ def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
248408 Returns:
249409 A 2-tuple containing the value of epsilon and the optimal RDP order.
250410 """
251- # TODO(b/265168958): Update this text for `compute_dp_sgd_privacy_statement`.
252- logging .warn (
253- '`compute_dp_sgd_privacy` is deprecated. It does not account '
254- 'for doubling of sensitivity with microbatching, and assumes Poisson '
255- 'subsampling, which is rarely used in practice. Please use the '
256- '`dp_accounting` libraries directly to compute epsilon, using the '
257- 'precise and correct assumptions of your application.'
258- )
411+ logging .warn ("""\
412+ `compute_dp_sgd_privacy` is deprecated. It does not account for doubling of \
413+ sensitivity with microbatching, and assumes Poisson subsampling, which is \
414+ rarely used in practice. Please use `compute_dp_sgd_privacy_statement`, which \
415+ provides appropriate context for the guarantee. To compute epsilon under \
416+ different assumptions than those in `compute_dp_sgd_privacy_statement`, call \
417+ the `dp_accounting` libraries directly.""" )
259418
260419 q = batch_size / n # q - the sampling ratio.
261420 if q > 1 :
262421 raise app .UsageError ('n must be larger than the batch size.' )
263- orders = ([1.25 , 1.5 , 1.75 , 2. , 2.25 , 2.5 , 3. , 3.5 , 4. , 4.5 ] +
264- list (range (5 , 64 )) + [128 , 256 , 512 ])
422+ orders = (
423+ [1.25 , 1.5 , 1.75 , 2.0 , 2.25 , 2.5 , 3.0 , 3.5 , 4.0 , 4.5 ]
424+ + list (range (5 , 64 ))
425+ + [128 , 256 , 512 ]
426+ )
265427 steps = int (math .ceil (epochs * n / batch_size ))
266428 accountant = dp_accounting .rdp .RdpAccountant (orders )
267429
0 commit comments