Skip to content

Commit 0222aa2

Browse files
Copilot, pre-commit-ci[bot], and CodyCBakerPhD
authored
Apply requester-style privacy thresholding to request/download counts across all summaries and totals, with configurable minimum threshold (#272)
* Initial plan
* Threshold archive request and download totals for privacy
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Apply privacy thresholding to dataset summaries and totals
* Update pyproject.toml
* Parameterize privacy threshold for summary and total request/download counts

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
1 parent 997e8a8 commit 0222aa2

16 files changed

Lines changed: 216 additions & 33 deletions

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ packages = ["src/s3_log_extraction"]
1212

1313
[project]
1414
name = "s3-log-extraction"
15-
version="1.10.1"
15+
version="1.10.2"
1616
authors = [
1717
{ name="Cody Baker", email="cody.c.baker.phd@gmail.com" },
1818
]

src/s3_log_extraction/summarize/_generate_all_dataset_totals.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33

44
import pandas
55

6+
from ._generate_summaries import _round_requester_count
67
from ..config import get_cache_subdirectory
78
from ..ip_utils._globals import EXCLUDED_REGION_LABELS
89

910

1011
def generate_all_dataset_totals(
1112
cache_directory: str | pathlib.Path | None = None,
13+
privacy_threshold_minimum: int = 50,
1214
) -> None:
1315
"""
1416
Generate top-level totals of summarized access activity for all datasets.
@@ -18,6 +20,9 @@ def generate_all_dataset_totals(
1820
cache_directory : path-like, optional
1921
The top-level cache directory from which the summary directory is derived.
2022
If not provided, the default cache directory is used.
23+
privacy_threshold_minimum : int
24+
Minimum disclosure threshold for privacy-rounded request/download totals.
25+
Default is ``50``.
2126
"""
2227
summary_directory = get_cache_subdirectory(cache_directory=cache_directory, name="summaries")
2328

@@ -34,6 +39,8 @@ def generate_all_dataset_totals(
3439
if not summary_file_path.exists():
3540
continue
3641
summary = pandas.read_table(filepath_or_buffer=summary_file_path)
42+
for column_name in ("number_of_requests", "number_of_downloads"):
43+
summary[column_name] = pandas.to_numeric(summary[column_name], errors="coerce").fillna(0).astype("int64")
3744

3845
unique_countries: set[str] = set()
3946
for region in summary["region"]:
@@ -60,8 +67,12 @@ def generate_all_dataset_totals(
6067
"total_bytes_sent": int(summary["bytes_sent"].sum()),
6168
"number_of_unique_regions": number_of_unique_regions,
6269
"number_of_unique_countries": number_of_unique_countries,
63-
"total_number_of_requests": int(summary["number_of_requests"].sum()),
64-
"total_number_of_downloads": int(summary["number_of_downloads"].sum()),
70+
"total_number_of_requests": _round_requester_count(
71+
count=int(summary["number_of_requests"].sum()), modulo=20, minimum=privacy_threshold_minimum
72+
),
73+
"total_number_of_downloads": _round_requester_count(
74+
count=int(summary["number_of_downloads"].sum()), modulo=20, minimum=privacy_threshold_minimum
75+
),
6576
"number_of_requesters": number_of_requesters,
6677
}
6778

src/s3_log_extraction/summarize/_generate_archive_summaries.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@
44
import natsort
55
import pandas
66

7+
from ._generate_summaries import _privacy_round_request_download_columns
78
from ..config import get_cache_subdirectory
89

910

1011
@beartype.beartype
1112
def generate_archive_summaries(
12-
cache_directory: str | pathlib.Path | None = None, asset_types_in_order: tuple[str, ...] | list[str] | None = None
13+
cache_directory: str | pathlib.Path | None = None,
14+
asset_types_in_order: tuple[str, ...] | list[str] | None = None,
15+
privacy_threshold_minimum: int = 50,
1316
) -> None:
1417
"""
1518
Generate summaries by day and region for the entire archive from the mapped S3 logs.
@@ -22,6 +25,9 @@ def generate_archive_summaries(
2225
asset_types_in_order : sequence[str], optional
2326
Preferred output column ordering for known asset types in the archive
2427
``by_asset_type_per_week.tsv`` summary.
28+
privacy_threshold_minimum : int
29+
Minimum disclosure threshold for privacy-rounded request/download
30+
summary values. Default is ``50``.
2531
"""
2632
asset_types_in_order = list(dict.fromkeys(asset_types_in_order)) if asset_types_in_order is not None else []
2733

@@ -36,6 +42,9 @@ def generate_archive_summaries(
3642
for dataset_by_day_summary_file_path in summary_directory.rglob(pattern="by_day.tsv")
3743
if dataset_by_day_summary_file_path.parent.name != "archive"
3844
]
45+
for summary in all_dataset_summaries_by_day:
46+
for column_name in ("number_of_requests", "number_of_downloads"):
47+
summary[column_name] = pandas.to_numeric(summary[column_name], errors="coerce").fillna(0).astype("int64")
3948
aggregated_dataset_summaries_by_day = pandas.concat(objs=all_dataset_summaries_by_day, ignore_index=True)
4049

4150
pre_aggregated = aggregated_dataset_summaries_by_day.groupby(by="date", as_index=False)[
@@ -49,6 +58,9 @@ def generate_archive_summaries(
4958
aggregated_activity_by_day = aggregated_activity_by_day.astype(
5059
dtype={"bytes_sent": "int64", "number_of_requests": "int64", "number_of_downloads": "int64"}
5160
)
61+
aggregated_activity_by_day = _privacy_round_request_download_columns(
62+
summary_table=aggregated_activity_by_day, minimum=privacy_threshold_minimum
63+
)
5264

5365
archive_summary_by_day_file_path = archive_directory / "by_day.tsv"
5466
aggregated_activity_by_day.to_csv(
@@ -61,6 +73,9 @@ def generate_archive_summaries(
6173
for dataset_by_region_summary_file_path in summary_directory.rglob(pattern="by_region.tsv")
6274
if dataset_by_region_summary_file_path.parent.name != "archive"
6375
]
76+
for summary in all_dataset_summaries_by_region:
77+
for column_name in ("number_of_requests", "number_of_downloads"):
78+
summary[column_name] = pandas.to_numeric(summary[column_name], errors="coerce").fillna(0).astype("int64")
6479
aggregated_dataset_summaries_by_region = pandas.concat(objs=all_dataset_summaries_by_region, ignore_index=True)
6580

6681
pre_aggregated = aggregated_dataset_summaries_by_region.groupby(by="region", as_index=False)[
@@ -74,6 +89,9 @@ def generate_archive_summaries(
7489
aggregated_activity_by_region = aggregated_activity_by_region.astype(
7590
dtype={"bytes_sent": "int64", "number_of_requests": "int64", "number_of_downloads": "int64"}
7691
)
92+
aggregated_activity_by_region = _privacy_round_request_download_columns(
93+
summary_table=aggregated_activity_by_region, minimum=privacy_threshold_minimum
94+
)
7795

7896
archive_summary_by_region_file_path = archive_directory / "by_region.tsv"
7997
aggregated_activity_by_region.to_csv(
@@ -87,7 +105,11 @@ def generate_archive_summaries(
87105
if summary_file_path.parent.name != "archive" and "<" not in (value := summary_file_path.read_text().strip())
88106
]
89107
total_requester_count: int = sum(requester_counts)
90-
archive_requester_count: str = "<50" if total_requester_count < 50 else str(total_requester_count)
108+
archive_requester_count: str = (
109+
f"<{privacy_threshold_minimum}"
110+
if total_requester_count < privacy_threshold_minimum
111+
else str(total_requester_count)
112+
)
91113

92114
archive_requester_count_file_path = archive_directory / "requester_count.tsv"
93115
archive_requester_count_file_path.write_text(archive_requester_count)

src/s3_log_extraction/summarize/_generate_archive_totals.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@
44
import beartype
55
import pandas
66

7+
from ._generate_summaries import _round_requester_count
78
from ..config import get_cache_subdirectory
89
from ..ip_utils._globals import EXCLUDED_REGION_LABELS
910

1011

1112
@beartype.beartype
1213
def generate_archive_totals(
1314
cache_directory: str | pathlib.Path | None = None,
15+
privacy_threshold_minimum: int = 50,
1416
) -> None:
1517
"""
1618
Generate top-level totals of the entire archive from the archive summaries in the mapped S3 logs folder.
@@ -20,13 +22,18 @@ def generate_archive_totals(
2022
cache_directory : path-like, optional
2123
The top-level cache directory from which the summary directory is derived.
2224
If not provided, the default cache directory is used.
25+
privacy_threshold_minimum : int
26+
Minimum disclosure threshold for privacy-rounded request/download totals.
27+
Default is ``50``.
2328
"""
2429
summary_directory = get_cache_subdirectory(cache_directory=cache_directory, name="summaries")
2530
archive_directory = summary_directory / "archive"
2631
archive_directory.mkdir(exist_ok=True)
2732

2833
summary_file_path = archive_directory / "by_region.tsv"
2934
summary = pandas.read_table(filepath_or_buffer=summary_file_path)
35+
for column_name in ("number_of_requests", "number_of_downloads"):
36+
summary[column_name] = pandas.to_numeric(summary[column_name], errors="coerce").fillna(0).astype("int64")
3037

3138
unique_countries: set[str] = set()
3239
for region in summary["region"]:
@@ -60,8 +67,12 @@ def generate_archive_totals(
6067
"total_bytes_sent": int(summary["bytes_sent"].sum()),
6168
"number_of_unique_regions": number_of_unique_regions,
6269
"number_of_unique_countries": number_of_unique_countries,
63-
"total_number_of_requests": int(summary["number_of_requests"].sum()),
64-
"total_number_of_downloads": int(summary["number_of_downloads"].sum()),
70+
"total_number_of_requests": _round_requester_count(
71+
count=int(summary["number_of_requests"].sum()), modulo=20, minimum=privacy_threshold_minimum
72+
),
73+
"total_number_of_downloads": _round_requester_count(
74+
count=int(summary["number_of_downloads"].sum()), modulo=20, minimum=privacy_threshold_minimum
75+
),
6576
"number_of_requesters": number_of_requesters,
6677
}
6778

src/s3_log_extraction/summarize/_generate_summaries.py

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,16 @@ def _round_requester_count(count: int, modulo: int, minimum: int) -> str | int:
3939
return round(count / modulo) * modulo
4040

4141

42+
def _privacy_round_request_download_columns(
    summary_table: pandas.DataFrame, *, modulo: int = 20, minimum: int = 50
) -> pandas.DataFrame:
    """
    Privacy-round the request and download count columns of a summary table.

    Each value in the ``number_of_requests`` and ``number_of_downloads`` columns is cast to ``int`` and
    replaced by the result of ``_round_requester_count`` called with the given ``modulo`` and ``minimum``.

    Parameters
    ----------
    summary_table : pandas.DataFrame
        Table containing ``number_of_requests`` and ``number_of_downloads`` columns.
        The two columns are overwritten on this same object (the table is not copied).
    modulo : int
        Forwarded unchanged to ``_round_requester_count``. Default is ``20``.
    minimum : int
        Forwarded unchanged to ``_round_requester_count``. Default is ``50``.

    Returns
    -------
    pandas.DataFrame
        The same ``summary_table`` object that was passed in, with both count columns replaced.
    """

    def _rounded(raw_count):
        # `_round_requester_count` is annotated as returning `str | int`, so the resulting
        # columns may hold a mix of strings and integers.
        return _round_requester_count(count=int(raw_count), modulo=modulo, minimum=minimum)

    for count_column in ("number_of_requests", "number_of_downloads"):
        summary_table[count_column] = summary_table[count_column].map(_rounded)

    return summary_table
50+
51+
4252
def _collect_unique_ips(asset_directories: list[pathlib.Path], use_encryption: bool = True) -> set[str]:
4353
"""
4454
Collect all unique IP addresses across the given asset directories.
@@ -106,7 +116,10 @@ def _summarize_dataset_requester_count(
106116

107117

108118
def generate_summaries(
109-
level: int = 0, cache_directory: str | pathlib.Path | None = None, use_encryption: bool = True
119+
level: int = 0,
120+
cache_directory: str | pathlib.Path | None = None,
121+
use_encryption: bool = True,
122+
privacy_threshold_minimum: int = 50,
110123
) -> None:
111124
"""
112125
Generate summaries for each dataset in the extraction directory.
@@ -127,6 +140,9 @@ def generate_summaries(
127140
use_encryption : bool
128141
If ``True`` (default), ``ips.txt`` and IP cache files are decrypted when read.
129142
If ``False``, files are read as plaintext.
143+
privacy_threshold_minimum : int
144+
Minimum disclosure threshold for privacy-rounded requester/request/download
145+
values. Default is ``50``.
130146
"""
131147
if level != 0:
132148
message = (
@@ -164,6 +180,7 @@ def generate_summaries(
164180
summary_directory=summary_directory,
165181
ip_to_region=ip_to_region,
166182
use_encryption=use_encryption,
183+
privacy_threshold_minimum=privacy_threshold_minimum,
167184
)
168185

169186
all_archive_unique_ips.update(
@@ -172,7 +189,9 @@ def generate_summaries(
172189
if all_archive_unique_ips:
173190
archive_directory = summary_directory / "archive"
174191
archive_directory.mkdir(exist_ok=True)
175-
rounded_archive_count = _round_requester_count(count=len(all_archive_unique_ips), modulo=20, minimum=50)
192+
rounded_archive_count = _round_requester_count(
193+
count=len(all_archive_unique_ips), modulo=20, minimum=privacy_threshold_minimum
194+
)
176195
(archive_directory / "requester_count.tsv").write_text(str(rounded_archive_count))
177196

178197

@@ -183,29 +202,36 @@ def _summarize_dataset(
183202
summary_directory: pathlib.Path,
184203
ip_to_region: dict[str, str],
185204
use_encryption: bool = True,
205+
privacy_threshold_minimum: int = 50,
186206
) -> None:
187207
_summarize_dataset_by_day(
188208
asset_directories=asset_directories,
189209
summary_file_path=summary_directory / dataset_id / "by_day.tsv",
210+
privacy_threshold_minimum=privacy_threshold_minimum,
190211
)
191212
_summarize_dataset_by_asset(
192213
asset_directories=asset_directories,
193214
summary_file_path=summary_directory / dataset_id / "by_asset.tsv",
215+
privacy_threshold_minimum=privacy_threshold_minimum,
194216
)
195217
_summarize_dataset_by_region(
196218
asset_directories=asset_directories,
197219
summary_file_path=summary_directory / dataset_id / "by_region.tsv",
198220
ip_to_region=ip_to_region,
199221
use_encryption=use_encryption,
222+
privacy_threshold_minimum=privacy_threshold_minimum,
200223
)
201224
_summarize_dataset_requester_count(
202225
asset_directories=asset_directories,
203226
summary_file_path=summary_directory / dataset_id / "requester_count.tsv",
227+
minimum=privacy_threshold_minimum,
204228
use_encryption=use_encryption,
205229
)
206230

207231

208-
def _summarize_dataset_by_day(*, asset_directories: list[pathlib.Path], summary_file_path: pathlib.Path) -> None:
232+
def _summarize_dataset_by_day(
233+
*, asset_directories: list[pathlib.Path], summary_file_path: pathlib.Path, privacy_threshold_minimum: int = 50
234+
) -> None:
209235
all_dates = []
210236
all_bytes_sent = []
211237
all_downloads = []
@@ -258,10 +284,15 @@ def _summarize_dataset_by_day(*, asset_directories: list[pathlib.Path], summary_
258284
)
259285
summary_table.sort_values(by="date", inplace=True)
260286
summary_table.index = range(len(summary_table))
287+
summary_table = _privacy_round_request_download_columns(
288+
summary_table=summary_table, minimum=privacy_threshold_minimum
289+
)
261290
summary_table.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True, index=False)
262291

263292

264-
def _summarize_dataset_by_asset(*, asset_directories: list[pathlib.Path], summary_file_path: pathlib.Path) -> None:
293+
def _summarize_dataset_by_asset(
294+
*, asset_directories: list[pathlib.Path], summary_file_path: pathlib.Path, privacy_threshold_minimum: int = 50
295+
) -> None:
265296
dataset_id = summary_file_path.parent.name
266297
extraction_base_path = summary_file_path.parent.parent.parent / "extraction" / dataset_id # Assumes same cache dir
267298

@@ -302,6 +333,9 @@ def _summarize_dataset_by_asset(*, asset_directories: list[pathlib.Path], summar
302333
"number_of_downloads": [number_of_downloads_by_asset[path] for path in all_asset_paths],
303334
}
304335
)
336+
summary_table = _privacy_round_request_download_columns(
337+
summary_table=summary_table, minimum=privacy_threshold_minimum
338+
)
305339
summary_table.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True, index=False)
306340

307341

@@ -311,6 +345,7 @@ def _summarize_dataset_by_region(
311345
summary_file_path: pathlib.Path,
312346
ip_to_region: dict[str, str],
313347
use_encryption: bool = True,
348+
privacy_threshold_minimum: int = 50,
314349
) -> None:
315350
all_regions = []
316351
all_bytes_sent = []
@@ -359,4 +394,7 @@ def _summarize_dataset_by_region(
359394
"number_of_downloads": [number_of_downloads_by_region[region] for region in all_regions_ordered],
360395
}
361396
)
397+
summary_table = _privacy_round_request_download_columns(
398+
summary_table=summary_table, minimum=privacy_threshold_minimum
399+
)
362400
summary_table.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True, index=False)
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
date bytes_sent number_of_requests number_of_downloads
2-
2020-01-01 6286489 3 2
3-
2022-04-06 12 1 1
4-
2022-08-03 0 1 0
5-
2023-11-13 0 1 1
6-
2024-11-13 1194552 1 1
2+
2020-01-01 6286489 <50 <50
3+
2022-04-06 12 <50 <50
4+
2022-08-03 0 <50 <50
5+
2023-11-13 0 <50 <50
6+
2024-11-13 1194552 <50 <50
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
region bytes_sent number_of_requests number_of_downloads
2-
missing 7481053 7 5
2+
missing 7481053 <50 <50
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"total_bytes_sent": 7481053, "number_of_unique_regions": 1, "number_of_unique_countries": 0, "total_number_of_requests": 7, "total_number_of_downloads": 5, "number_of_requesters": "<50"}
1+
{"total_bytes_sent": 7481053, "number_of_unique_regions": 1, "number_of_unique_countries": 0, "total_number_of_requests": "<50", "total_number_of_downloads": "<50", "number_of_requesters": "<50"}
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
asset_path bytes_sent number_of_requests number_of_downloads
2-
dataset_description.json 1194552 1 1
3-
sub-23/anat/sub-23_run-01_T1w.nii.gz 0 1 1
4-
sub-23/func/sub-23_task-effortmixedgambles_run-01_bold.nii.gz 0 1 0
5-
task-effortmixedgambles_bold.json 12 1 1
2+
dataset_description.json 1194552 <50 <50
3+
sub-23/anat/sub-23_run-01_T1w.nii.gz 0 <50 <50
4+
sub-23/func/sub-23_task-effortmixedgambles_run-01_bold.nii.gz 0 <50 <50
5+
task-effortmixedgambles_bold.json 12 <50 <50
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
date bytes_sent number_of_requests number_of_downloads
2-
2022-04-06 12 1 1
3-
2022-08-03 0 1 0
4-
2023-11-13 0 1 1
5-
2024-11-13 1194552 1 1
2+
2022-04-06 12 <50 <50
3+
2022-08-03 0 <50 <50
4+
2023-11-13 0 <50 <50
5+
2024-11-13 1194552 <50 <50

0 commit comments

Comments
 (0)