@@ -39,6 +39,16 @@ def _round_requester_count(count: int, modulo: int, minimum: int) -> str | int:
3939 return round (count / modulo ) * modulo
4040
4141
def _privacy_round_request_download_columns(
    summary_table: pandas.DataFrame, *, modulo: int = 20, minimum: int = 50
) -> pandas.DataFrame:
    """
    Apply privacy rounding to the request and download count columns of a summary table.

    Every value in the ``number_of_requests`` and ``number_of_downloads`` columns is cast to
    ``int`` and passed through ``_round_requester_count``.

    Parameters
    ----------
    summary_table : pandas.DataFrame
        Table containing ``number_of_requests`` and ``number_of_downloads`` columns.
        The input table is left unmodified.
    modulo : int
        Rounding step forwarded to ``_round_requester_count``. Default is ``20``.
    minimum : int
        Minimum disclosure threshold forwarded to ``_round_requester_count``.
        Default is ``50``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``summary_table`` whose two count columns hold the rounded values.
        ``_round_requester_count`` is annotated as returning ``str | int``, so the
        resulting columns may contain mixed types.
    """
    # Work on a copy so the caller's table is never altered as a hidden side effect;
    # callers are expected to use the returned table.
    rounded_table = summary_table.copy()
    for column_name in ("number_of_requests", "number_of_downloads"):
        rounded_table[column_name] = rounded_table[column_name].map(
            lambda count: _round_requester_count(count=int(count), modulo=modulo, minimum=minimum)
        )
    return rounded_table
50+
51+
4252def _collect_unique_ips (asset_directories : list [pathlib .Path ], use_encryption : bool = True ) -> set [str ]:
4353 """
4454 Collect all unique IP addresses across the given asset directories.
@@ -106,7 +116,10 @@ def _summarize_dataset_requester_count(
106116
107117
108118def generate_summaries (
109- level : int = 0 , cache_directory : str | pathlib .Path | None = None , use_encryption : bool = True
119+ level : int = 0 ,
120+ cache_directory : str | pathlib .Path | None = None ,
121+ use_encryption : bool = True ,
122+ privacy_threshold_minimum : int = 50 ,
110123) -> None :
111124 """
112125 Generate summaries for each dataset in the extraction directory.
@@ -127,6 +140,9 @@ def generate_summaries(
127140 use_encryption : bool
128141 If ``True`` (default), ``ips.txt`` and IP cache files are decrypted when read.
129142 If ``False``, files are read as plaintext.
143+ privacy_threshold_minimum : int
144+ Minimum disclosure threshold for privacy-rounded requester/request/download
145+ values. Default is ``50``.
130146 """
131147 if level != 0 :
132148 message = (
@@ -164,6 +180,7 @@ def generate_summaries(
164180 summary_directory = summary_directory ,
165181 ip_to_region = ip_to_region ,
166182 use_encryption = use_encryption ,
183+ privacy_threshold_minimum = privacy_threshold_minimum ,
167184 )
168185
169186 all_archive_unique_ips .update (
@@ -172,7 +189,9 @@ def generate_summaries(
172189 if all_archive_unique_ips :
173190 archive_directory = summary_directory / "archive"
174191 archive_directory .mkdir (exist_ok = True )
175- rounded_archive_count = _round_requester_count (count = len (all_archive_unique_ips ), modulo = 20 , minimum = 50 )
192+ rounded_archive_count = _round_requester_count (
193+ count = len (all_archive_unique_ips ), modulo = 20 , minimum = privacy_threshold_minimum
194+ )
176195 (archive_directory / "requester_count.tsv" ).write_text (str (rounded_archive_count ))
177196
178197
@@ -183,29 +202,36 @@ def _summarize_dataset(
183202 summary_directory : pathlib .Path ,
184203 ip_to_region : dict [str , str ],
185204 use_encryption : bool = True ,
205+ privacy_threshold_minimum : int = 50 ,
186206) -> None :
187207 _summarize_dataset_by_day (
188208 asset_directories = asset_directories ,
189209 summary_file_path = summary_directory / dataset_id / "by_day.tsv" ,
210+ privacy_threshold_minimum = privacy_threshold_minimum ,
190211 )
191212 _summarize_dataset_by_asset (
192213 asset_directories = asset_directories ,
193214 summary_file_path = summary_directory / dataset_id / "by_asset.tsv" ,
215+ privacy_threshold_minimum = privacy_threshold_minimum ,
194216 )
195217 _summarize_dataset_by_region (
196218 asset_directories = asset_directories ,
197219 summary_file_path = summary_directory / dataset_id / "by_region.tsv" ,
198220 ip_to_region = ip_to_region ,
199221 use_encryption = use_encryption ,
222+ privacy_threshold_minimum = privacy_threshold_minimum ,
200223 )
201224 _summarize_dataset_requester_count (
202225 asset_directories = asset_directories ,
203226 summary_file_path = summary_directory / dataset_id / "requester_count.tsv" ,
227+ minimum = privacy_threshold_minimum ,
204228 use_encryption = use_encryption ,
205229 )
206230
207231
208- def _summarize_dataset_by_day (* , asset_directories : list [pathlib .Path ], summary_file_path : pathlib .Path ) -> None :
232+ def _summarize_dataset_by_day (
233+ * , asset_directories : list [pathlib .Path ], summary_file_path : pathlib .Path , privacy_threshold_minimum : int = 50
234+ ) -> None :
209235 all_dates = []
210236 all_bytes_sent = []
211237 all_downloads = []
@@ -258,10 +284,15 @@ def _summarize_dataset_by_day(*, asset_directories: list[pathlib.Path], summary_
258284 )
259285 summary_table .sort_values (by = "date" , inplace = True )
260286 summary_table .index = range (len (summary_table ))
287+ summary_table = _privacy_round_request_download_columns (
288+ summary_table = summary_table , minimum = privacy_threshold_minimum
289+ )
261290 summary_table .to_csv (path_or_buf = summary_file_path , mode = "w" , sep = "\t " , header = True , index = False )
262291
263292
264- def _summarize_dataset_by_asset (* , asset_directories : list [pathlib .Path ], summary_file_path : pathlib .Path ) -> None :
293+ def _summarize_dataset_by_asset (
294+ * , asset_directories : list [pathlib .Path ], summary_file_path : pathlib .Path , privacy_threshold_minimum : int = 50
295+ ) -> None :
265296 dataset_id = summary_file_path .parent .name
266297 extraction_base_path = summary_file_path .parent .parent .parent / "extraction" / dataset_id # Assumes same cache dir
267298
@@ -302,6 +333,9 @@ def _summarize_dataset_by_asset(*, asset_directories: list[pathlib.Path], summar
302333 "number_of_downloads" : [number_of_downloads_by_asset [path ] for path in all_asset_paths ],
303334 }
304335 )
336+ summary_table = _privacy_round_request_download_columns (
337+ summary_table = summary_table , minimum = privacy_threshold_minimum
338+ )
305339 summary_table .to_csv (path_or_buf = summary_file_path , mode = "w" , sep = "\t " , header = True , index = False )
306340
307341
@@ -311,6 +345,7 @@ def _summarize_dataset_by_region(
311345 summary_file_path : pathlib .Path ,
312346 ip_to_region : dict [str , str ],
313347 use_encryption : bool = True ,
348+ privacy_threshold_minimum : int = 50 ,
314349) -> None :
315350 all_regions = []
316351 all_bytes_sent = []
@@ -359,4 +394,7 @@ def _summarize_dataset_by_region(
359394 "number_of_downloads" : [number_of_downloads_by_region [region ] for region in all_regions_ordered ],
360395 }
361396 )
397+ summary_table = _privacy_round_request_download_columns (
398+ summary_table = summary_table , minimum = privacy_threshold_minimum
399+ )
362400 summary_table .to_csv (path_or_buf = summary_file_path , mode = "w" , sep = "\t " , header = True , index = False )
0 commit comments