From 5c7079a4beae7075525875159e3d5387570c8606 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 6 Mar 2026 16:33:24 -0500 Subject: [PATCH 1/7] use pure DP where we can; slightly better generated notebooks --- dp_wizard/utils/code_generators/__init__.py | 7 ++-- .../code_generators/abstract_generator.py | 35 ++++++++----------- 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/dp_wizard/utils/code_generators/__init__.py b/dp_wizard/utils/code_generators/__init__.py index be8f83f4..87c4d19a 100644 --- a/dp_wizard/utils/code_generators/__init__.py +++ b/dp_wizard/utils/code_generators/__init__.py @@ -226,9 +226,6 @@ def template(CONTRIBUTIONS, CONTRIBUTIONS_ENTITY): def make_privacy_loss_block(pure: bool, epsilon: float, max_rows: int): """ - Comments in the *pure* privacy loss block reference synthetic data generation - ("cuts dict"), so don't use "pure=True" for stats code! - >>> print( ... 'pure DP: ', ... make_privacy_loss_block(pure=True, epsilon=1, max_rows=1000) @@ -249,8 +246,8 @@ def template(EPSILON, MAX_ROWS): privacy_loss = dp.loss_of( # noqa: F841 # EPSILON_COMMENT_BLOCK epsilon=EPSILON, - # If your columns don't match your cuts dict, - # you will also need to provide a very small "delta" value. + # Not necessary in this case, + # but other analyses require a very small "delta" value. # https://docs.opendp.org/en/OPENDP_V_VERSION/getting-started/tabular-data/grouping.html#Stable-Keys delta=0, # or 1 / max(1e7, MAX_ROWS), ) diff --git a/dp_wizard/utils/code_generators/abstract_generator.py b/dp_wizard/utils/code_generators/abstract_generator.py index d4049ee1..0ee142c6 100644 --- a/dp_wizard/utils/code_generators/abstract_generator.py +++ b/dp_wizard/utils/code_generators/abstract_generator.py @@ -11,7 +11,6 @@ make_privacy_loss_block, make_privacy_unit_block, ) -from dp_wizard.utils.code_generators.analyses import histogram from dp_wizard.utils.dp_helper import confidence from dp_wizard.utils.shared.bins import make_cut_points @@ -148,12 +147,13 @@ def bin_template(GROUPS, BIN_NAME): DefaultsTemplate(basic_template) .fill_values(GROUPS=groups, MAX_ROWS=max_rows) .finish() - ] + [ - DefaultsTemplate(bin_template) - .fill_values(GROUPS=groups, BIN_NAME=bin_name) - .finish() - for bin_name in bin_names ] + for bin_name in bin_names: + margins.append( + DefaultsTemplate(bin_template) + .fill_values(GROUPS=groups, BIN_NAME=bin_name) + .finish() + ) margins_list = "[" + ", ".join(margins) + "\n ]" return margins_list @@ -249,30 +249,23 @@ def _make_partial_stats_context(self): for name, plan in self.analysis_plan.analysis_columns.items() if has_bins(get_statistic_by_name(plan[0].statistic_name)) ] + all_groups_have_keys = all( + len(group_keys) > 0 for group_keys in self.analysis_plan.groups.values() + ) privacy_unit_block = make_privacy_unit_block( contributions=self.analysis_plan.contributions, contributions_entity=self.analysis_plan.contributions_entity, ) privacy_loss_block = make_privacy_loss_block( - pure=False, + pure=all_groups_have_keys, epsilon=self.analysis_plan.epsilon, max_rows=self.analysis_plan.max_rows, ) - - is_just_histograms = all( - plan_column[0].statistic_name == histogram.name - for plan_column in self.analysis_plan.analysis_columns.values() - ) - margins_list = ( - # Histograms don't need margins. - "[]" - if is_just_histograms - else self._make_margins_list( - bin_names=[f"{name}_bin" for name in bin_column_names], - groups=self.analysis_plan.groups, - max_rows=self.analysis_plan.max_rows, - ) + margins_list = self._make_margins_list( + bin_names=[f"{name}_bin" for name in bin_column_names], + groups=self.analysis_plan.groups, + max_rows=self.analysis_plan.max_rows, ) extra_columns = ", ".join( [ From 719b72ca536d1f62e8e1732e4d135fa4ec6db241 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 6 Mar 2026 16:46:04 -0500 Subject: [PATCH 2/7] update docs --- docs/index.md | 12 ++++++++++-- tests/test_docs.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/index.md b/docs/index.md index c98a4bac..35068a89 100644 --- a/docs/index.md +++ b/docs/index.md @@ -545,7 +545,7 @@ Next, we'll define our Context. This is where we set the privacy budget, and set >>> >>> privacy_loss = dp.loss_of( ... epsilon=1.0, -... delta=1 / max(1e7, 100000), +... delta=0, # or 1 / max(1e7, 100000), ... ) >>> >>> # See the OpenDP Library docs for more on Context: @@ -561,7 +561,15 @@ Next, we'll define our Context. This is where we set the privacy budget, and set ... split_by_weights=[ # With only one query, the entire budget is allocated to that query: ... 1, # grade ... ], -... margins=[], +... margins=[ +... dp.polars.Margin( +... by=list({}.keys()), +... invariant="keys", +... max_length=100000, +... max_groups=100, +... ), +... dp.polars.Margin(by=(["grade_bin"] + list({}.keys())), invariant="keys"), +... ], ... ) ``` diff --git a/tests/test_docs.py b/tests/test_docs.py index a79058c6..f159341c 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -62,7 +62,7 @@ def test_doc_examples_up_to_date(): if any( # path is expanded to an absolute path, so ignore it: - line not in expected_code and path not in line + line.strip() not in expected_code and path not in line for line in doc_code.splitlines() ): # It's fine for the docs to be a subset of the generated code, From 5cdd3d695f92fe0c07ff03f9d77c1f5937bb139b Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 6 Mar 2026 16:47:23 -0500 Subject: [PATCH 3/7] revert strip --- tests/test_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_docs.py b/tests/test_docs.py index f159341c..a79058c6 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -62,7 +62,7 @@ def test_doc_examples_up_to_date(): if any( # path is expanded to an absolute path, so ignore it: - line.strip() not in expected_code and path not in line + line not in expected_code and path not in line for line in doc_code.splitlines() ): # It's fine for the docs to be a subset of the generated code, From 1606ff473b7cef0662e6dbb260e23a169d56b7d9 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 6 Mar 2026 17:12:22 -0500 Subject: [PATCH 4/7] update index.html --- docs/index.html | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/index.html b/docs/index.html index 73790393..55bf8da9 100644 --- a/docs/index.html +++ b/docs/index.html @@ -797,7 +797,7 @@

Context

>>> >>> privacy_loss = dp.loss_of( ... epsilon=1.0, -... delta=1 / max(1e7, 100000), +... delta=0, # or 1 / max(1e7, 100000), ... ) >>> >>> # See the OpenDP Library docs for more on Context: @@ -813,7 +813,15 @@

Context

... split_by_weights=[ # With only one query, the entire budget is allocated to that query: ... 1, # grade ... ], -... margins=[], +... margins=[ +... dp.polars.Margin( +... by=list({}.keys()), +... invariant="keys", +... max_length=100000, +... max_groups=100, +... ), +... dp.polars.Margin(by=(["grade_bin"] + list({}.keys())), invariant="keys"), +... ], ... ) From 95c4f2d903efe7aaad77e6e9b9d79013e9ee5551 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 7 Apr 2026 14:59:37 -0400 Subject: [PATCH 5/7] bin keys are public --- dp_wizard/utils/code_generators/abstract_generator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dp_wizard/utils/code_generators/abstract_generator.py b/dp_wizard/utils/code_generators/abstract_generator.py index 6ee807d0..2818f6f3 100644 --- a/dp_wizard/utils/code_generators/abstract_generator.py +++ b/dp_wizard/utils/code_generators/abstract_generator.py @@ -140,7 +140,10 @@ def basic_template(GROUPS, MAX_ROWS): ) def bin_template(GROUPS, BIN_NAME): - dp.polars.Margin(by=([BIN_NAME] + list(GROUPS.keys()))) + dp.polars.Margin( + by=([BIN_NAME] + list(GROUPS.keys())), + invariant="keys", # Consider the bin values to be public information. + ) margins = [ DefaultsTemplate(basic_template) From 6cbbc5879ceccdcf4e7e1a3e8c947fc86640f800 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 7 Apr 2026 15:09:08 -0400 Subject: [PATCH 6/7] update docs --- docs/index.md | 5 ++++- tests/test_misc.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 45246300..3a315bc3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -568,7 +568,10 @@ Next, we'll define our Context. This is where we set the privacy budget, and set ... max_length=100000, ... max_groups=100, ... ), -... dp.polars.Margin(by=(["grade_bin"] + list({}.keys())), invariant="keys"), +... dp.polars.Margin( +... by=(["grade_bin"] + list({}.keys())), +... invariant="keys", # Consider the bin values to be public information. +... ), ... ], ... ) diff --git a/tests/test_misc.py b/tests/test_misc.py index 7b58235c..3f72488d 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -84,6 +84,7 @@ def get_file_paths() -> list[Path]: for path in package_root.parent.iterdir() if not ( path.match("*venv*") + or path.name.startswith(".coverage") or path.name in [ "docs", From fea78d2f1449be5b9042ae8894d1ed07f68f5512 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 7 Apr 2026 15:10:00 -0400 Subject: [PATCH 7/7] update index.html --- docs/index.html | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/index.html b/docs/index.html index 65672299..48b14f93 100644 --- a/docs/index.html +++ b/docs/index.html @@ -820,7 +820,10 @@

Context

... max_length=100000, ... max_groups=100, ... ), -... dp.polars.Margin(by=(["grade_bin"] + list({}.keys())), invariant="keys"), +... dp.polars.Margin( +... by=(["grade_bin"] + list({}.keys())), +... invariant="keys", # Consider the bin values to be public information. +... ), ... ], ... )