Skip to content

Commit 997e8a8

Browse files
CopilotCodyCBakerPhDpre-commit-ci[bot]
authored
Require archive requester count for totals, generate archive-level requester/asset-type summaries, and support configurable asset-type ordering (#270)
* Initial plan * Fix archive totals to error when requester count is missing * Add archive requester and optional weekly asset-type aggregations * Stabilize archive asset-type column ordering * Allow archive asset type order via API and CLI * Apply suggestion from @CodyCBakerPhD * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply suggestions from code review Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> * Update test_generic_summaries.py * Fix CLI option wiring for asset type ordering * Remove archive asset-type defaults from generic summaries * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply suggestion from @CodyCBakerPhD * Simplify asset_types_in_order initialization * Fix conditions for dataset summary file paths * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update _generate_archive_summaries.py * Fix conditional statement in requester count extraction * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix type conversion for requester counts * Refactor requester_counts and archive_requester_count * Fix variable name for total requester counts * Update pyproject.toml --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 929b494 commit 997e8a8

6 files changed

Lines changed: 220 additions & 11 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ packages = ["src/s3_log_extraction"]
1212

1313
[project]
1414
name = "s3-log-extraction"
15-
version="1.10.0"
15+
version="1.10.1"
1616
authors = [
1717
{ name="Cody Baker", email="cody.c.baker.phd@gmail.com" },
1818
]

src/s3_log_extraction/_command_line_interface/_cli.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,13 @@ def _update_ip_coordinates_cli(cache_directory: str | None = None, use_encryptio
375375
type=rich_click.IntRange(min=-os.cpu_count() + 1, max=os.cpu_count()),
376376
default=-2,
377377
)
378+
@rich_click.option(
379+
"--asset-types-in-order",
380+
help="Archive mode only: comma-separated list of known asset types used for output column ordering (no spaces).",
381+
required=False,
382+
type=rich_click.STRING,
383+
default=None,
384+
)
378385
@rich_click.option(
379386
"--cache",
380387
"cache_directory",
@@ -398,14 +405,19 @@ def _update_summaries_cli(
398405
pick: str | None = None,
399406
skip: str | None = None,
400407
workers: int = -2,
408+
asset_types_in_order: str | None = None,
401409
cache_directory: str | None = None,
402410
use_encryption: bool = True,
403411
) -> None:
404412
"""Generate condensed summaries of activity."""
405413
cache_path = pathlib.Path(cache_directory) if cache_directory is not None else None
406414
match mode:
407415
case "archive":
408-
generate_archive_summaries(cache_directory=cache_path)
416+
parsed_asset_types_in_order = asset_types_in_order.split(",") if asset_types_in_order is not None else None
417+
generate_archive_summaries(
418+
cache_directory=cache_path,
419+
asset_types_in_order=parsed_asset_types_in_order,
420+
)
409421
case _:
410422
generate_summaries(cache_directory=cache_path, use_encryption=use_encryption)
411423

src/s3_log_extraction/summarize/_generate_archive_summaries.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88

99

1010
@beartype.beartype
11-
def generate_archive_summaries(cache_directory: str | pathlib.Path | None = None) -> None:
11+
def generate_archive_summaries(
12+
cache_directory: str | pathlib.Path | None = None, asset_types_in_order: tuple[str, ...] | list[str] | None = None
13+
) -> None:
1214
"""
1315
Generate summaries by day and region for the entire archive from the mapped S3 logs.
1416
@@ -17,7 +19,12 @@ def generate_archive_summaries(cache_directory: str | pathlib.Path | None = None
1719
cache_directory : path-like, optional
1820
The top-level cache directory from which the summary directory is derived.
1921
If not provided, the default cache directory is used.
22+
asset_types_in_order : sequence[str], optional
23+
Preferred output column ordering for known asset types in the archive
24+
``by_asset_type_per_week.tsv`` summary.
2025
"""
26+
asset_types_in_order = list(dict.fromkeys(asset_types_in_order)) if asset_types_in_order is not None else []
27+
2128
summary_directory = get_cache_subdirectory(cache_directory=cache_directory, name="summaries")
2229
archive_directory = summary_directory / "archive"
2330
archive_directory.mkdir(exist_ok=True)
@@ -72,3 +79,45 @@ def generate_archive_summaries(cache_directory: str | pathlib.Path | None = None
7279
aggregated_activity_by_region.to_csv(
7380
path_or_buf=archive_summary_by_region_file_path, mode="w", sep="\t", header=True, index=False
7481
)
82+
83+
# Requester count (aggregated from dataset requester_count.tsv files)
84+
requester_counts: list[int] = [
85+
int(value)
86+
for summary_file_path in summary_directory.rglob(pattern="requester_count.tsv")
87+
if summary_file_path.parent.name != "archive" and "<" not in (value := summary_file_path.read_text().strip())
88+
]
89+
total_requester_count: int = sum(requester_counts)
90+
archive_requester_count: str = "<50" if total_requester_count < 50 else str(total_requester_count)
91+
92+
archive_requester_count_file_path = archive_directory / "requester_count.tsv"
93+
archive_requester_count_file_path.write_text(archive_requester_count)
94+
95+
# Optional by_asset_type_per_week aggregation
96+
all_dataset_summaries_by_asset_type_per_week = [
97+
pandas.read_table(filepath_or_buffer=summary_file_path)
98+
for summary_file_path in summary_directory.rglob(pattern="by_asset_type_per_week.tsv")
99+
if summary_file_path.parent.name != "archive"
100+
]
101+
if all_dataset_summaries_by_asset_type_per_week:
102+
all_summary_data = pandas.concat(objs=all_dataset_summaries_by_asset_type_per_week, ignore_index=True)
103+
all_summary_data.fillna(value=0, inplace=True)
104+
105+
all_asset_type_columns = [
106+
column_name for column_name in all_summary_data.columns if column_name != "week_start"
107+
]
108+
known_asset_type_columns = [
109+
column_name for column_name in asset_types_in_order if column_name in all_asset_type_columns
110+
]
111+
additional_asset_type_columns = sorted(set(all_asset_type_columns).difference(asset_types_in_order))
112+
asset_type_columns = [*known_asset_type_columns, *additional_asset_type_columns]
113+
if asset_type_columns:
114+
archive_summary = (
115+
all_summary_data.groupby(by="week_start", as_index=False)[asset_type_columns]
116+
.sum()
117+
.reindex(columns=["week_start", *asset_type_columns])
118+
)
119+
archive_summary = archive_summary.astype(dtype={column_name: "int64" for column_name in asset_type_columns})
120+
archive_summary.sort_values(by="week_start", key=natsort.natsort_keygen(), inplace=True)
121+
122+
archive_summary_file_path = archive_directory / "by_asset_type_per_week.tsv"
123+
archive_summary.to_csv(path_or_buf=archive_summary_file_path, mode="w", sep="\t", header=True, index=False)

src/s3_log_extraction/summarize/_generate_archive_totals.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,14 @@ def generate_archive_totals(
4545
number_of_unique_countries = len(unique_countries)
4646

4747
requester_count_file_path = archive_directory / "requester_count.tsv"
48-
number_of_requesters: str | int = (
49-
requester_count_file_path.read_text().strip() if requester_count_file_path.exists() else 0
50-
)
48+
if not requester_count_file_path.exists():
49+
msg = (
50+
f"Archive requester count file not found: {requester_count_file_path}. "
51+
"Run archive summaries before archive totals."
52+
)
53+
raise FileNotFoundError(msg)
54+
55+
number_of_requesters: str | int = requester_count_file_path.read_text().strip()
5156
if isinstance(number_of_requesters, str) and not number_of_requesters.startswith("<"):
5257
number_of_requesters = int(number_of_requesters)
5358

tests/test_generic_summaries.py

Lines changed: 131 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,14 +91,14 @@ def test_generate_all_dataset_totals_skips_archive(tmpdir: py.path.local):
9191
dataset_dir = summary_dir / "ds001161"
9292
dataset_dir.mkdir(parents=True)
9393
(dataset_dir / "by_region.tsv").write_text(
94-
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\n" "missing\t1194564\t4\t3\n"
94+
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\nmissing\t1194564\t4\t3\n"
9595
)
9696

9797
# Set up an archive summary that should be excluded
9898
archive_dir = summary_dir / "archive"
9999
archive_dir.mkdir(parents=True)
100100
(archive_dir / "by_region.tsv").write_text(
101-
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\n" "missing\t7481053\t7\t5\n"
101+
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\nmissing\t7481053\t7\t5\n"
102102
)
103103

104104
s3_log_extraction.summarize.generate_all_dataset_totals(cache_directory=test_dir)
@@ -108,6 +108,135 @@ def test_generate_all_dataset_totals_skips_archive(tmpdir: py.path.local):
108108
assert "archive" not in totals, "'archive' should be excluded from totals.json"
109109

110110

111+
@pytest.mark.ai_generated
112+
def test_generate_archive_totals_raises_without_archive_requester_count(tmpdir: py.path.local) -> None:
113+
"""Archive totals should fail if archive requester count has not been generated."""
114+
test_dir = pathlib.Path(tmpdir)
115+
archive_dir = test_dir / "summaries" / "archive"
116+
archive_dir.mkdir(parents=True)
117+
(archive_dir / "by_region.tsv").write_text(
118+
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\nmissing\t7481053\t7\t5\n"
119+
)
120+
121+
with pytest.raises(FileNotFoundError, match="Archive requester count file not found"):
122+
s3_log_extraction.summarize.generate_archive_totals(cache_directory=test_dir)
123+
124+
125+
@pytest.mark.ai_generated
126+
def test_generate_archive_summaries_aggregates_requester_count(tmpdir: py.path.local) -> None:
127+
test_dir = pathlib.Path(tmpdir)
128+
summary_dir = test_dir / "summaries"
129+
130+
ds001_dir = summary_dir / "ds001"
131+
ds001_dir.mkdir(parents=True)
132+
(ds001_dir / "by_day.tsv").write_text(
133+
"date\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\n2026-01-01\t10\t1\t1\n"
134+
)
135+
(ds001_dir / "by_region.tsv").write_text(
136+
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\nmissing\t10\t1\t1\n"
137+
)
138+
(ds001_dir / "requester_count.tsv").write_text("60\n")
139+
140+
ds002_dir = summary_dir / "ds002"
141+
ds002_dir.mkdir(parents=True)
142+
(ds002_dir / "by_day.tsv").write_text(
143+
"date\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\n2026-01-01\t40\t2\t1\n"
144+
)
145+
(ds002_dir / "by_region.tsv").write_text(
146+
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\nmissing\t40\t2\t1\n"
147+
)
148+
(ds002_dir / "requester_count.tsv").write_text("40\n")
149+
150+
s3_log_extraction.summarize.generate_archive_summaries(cache_directory=test_dir)
151+
152+
archive_requester_count_file_path = summary_dir / "archive" / "requester_count.tsv"
153+
assert archive_requester_count_file_path.exists()
154+
assert archive_requester_count_file_path.read_text().strip() == "100"
155+
156+
157+
@pytest.mark.ai_generated
158+
def test_generate_archive_summaries_aggregates_optional_by_asset_type_per_week(tmpdir: py.path.local) -> None:
159+
test_dir = pathlib.Path(tmpdir)
160+
summary_dir = test_dir / "summaries"
161+
162+
ds001_dir = summary_dir / "ds001"
163+
ds001_dir.mkdir(parents=True)
164+
(ds001_dir / "by_day.tsv").write_text(
165+
"date\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\n2026-01-01\t10\t1\t1\n"
166+
)
167+
(ds001_dir / "by_region.tsv").write_text(
168+
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\nmissing\t10\t1\t1\n"
169+
)
170+
(ds001_dir / "requester_count.tsv").write_text("20\n")
171+
(ds001_dir / "by_asset_type_per_week.tsv").write_text(
172+
"week_start\tNeurophysiology\tMiscellaneous\n2025-12-29\t1\t2\n2026-01-05\t3\t4\n"
173+
)
174+
175+
ds002_dir = summary_dir / "ds002"
176+
ds002_dir.mkdir(parents=True)
177+
(ds002_dir / "by_day.tsv").write_text(
178+
"date\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\n2026-01-01\t40\t2\t1\n"
179+
)
180+
(ds002_dir / "by_region.tsv").write_text(
181+
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\nmissing\t40\t2\t1\n"
182+
)
183+
(ds002_dir / "requester_count.tsv").write_text("20\n")
184+
(ds002_dir / "by_asset_type_per_week.tsv").write_text("week_start\tVideo\n2025-12-29\t5\n2026-01-05\t7\n")
185+
186+
s3_log_extraction.summarize.generate_archive_summaries(cache_directory=test_dir)
187+
188+
archive_file_path = summary_dir / "archive" / "by_asset_type_per_week.tsv"
189+
assert archive_file_path.exists()
190+
archive_summary = pandas.read_table(filepath_or_buffer=archive_file_path)
191+
expected_summary = pandas.DataFrame(
192+
data={
193+
"week_start": ["2025-12-29", "2026-01-05"],
194+
"Miscellaneous": [2, 4],
195+
"Neurophysiology": [1, 3],
196+
"Video": [5, 7],
197+
}
198+
)
199+
pandas.testing.assert_frame_equal(left=archive_summary, right=expected_summary)
200+
201+
202+
@pytest.mark.ai_generated
203+
def test_generate_archive_summaries_accepts_custom_asset_type_order(tmpdir: py.path.local) -> None:
204+
test_dir = pathlib.Path(tmpdir)
205+
summary_dir = test_dir / "summaries"
206+
207+
ds001_dir = summary_dir / "ds001"
208+
ds001_dir.mkdir(parents=True)
209+
(ds001_dir / "by_day.tsv").write_text(
210+
"date\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\n2026-01-01\t10\t1\t1\n"
211+
)
212+
(ds001_dir / "by_region.tsv").write_text(
213+
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\nmissing\t10\t1\t1\n"
214+
)
215+
(ds001_dir / "requester_count.tsv").write_text("20\n")
216+
(ds001_dir / "by_asset_type_per_week.tsv").write_text(
217+
"week_start\tNeurophysiology\tMiscellaneous\n2025-12-29\t1\t2\n"
218+
)
219+
220+
ds002_dir = summary_dir / "ds002"
221+
ds002_dir.mkdir(parents=True)
222+
(ds002_dir / "by_day.tsv").write_text(
223+
"date\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\n2026-01-01\t40\t2\t1\n"
224+
)
225+
(ds002_dir / "by_region.tsv").write_text(
226+
"region\tbytes_sent\tnumber_of_requests\tnumber_of_downloads\nmissing\t40\t2\t1\n"
227+
)
228+
(ds002_dir / "requester_count.tsv").write_text("20\n")
229+
(ds002_dir / "by_asset_type_per_week.tsv").write_text("week_start\tVideo\n2025-12-29\t5\n")
230+
231+
s3_log_extraction.summarize.generate_archive_summaries(
232+
cache_directory=test_dir, asset_types_in_order=["Video", "Neurophysiology", "Miscellaneous"]
233+
)
234+
235+
archive_file_path = summary_dir / "archive" / "by_asset_type_per_week.tsv"
236+
archive_summary = pandas.read_table(filepath_or_buffer=archive_file_path)
237+
assert archive_summary.columns.tolist() == ["week_start", "Video", "Neurophysiology", "Miscellaneous"]
238+
239+
111240
@pytest.mark.ai_generated
112241
@pytest.mark.parametrize(
113242
("count", "modulo", "minimum", "expected"),

tests/test_log_bucket_stats.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -332,22 +332,36 @@ def test_update_summaries_archive_forwards_cache_directory(
332332
"""
333333
``update summaries --mode archive`` passes ``cache_directory`` directly to ``generate_archive_summaries``.
334334
"""
335-
captured: dict[str, pathlib.Path] = {}
335+
captured: dict[str, pathlib.Path | list[str] | None] = {}
336336

337-
def _stub_generate_archive_summaries(cache_directory: pathlib.Path | str | None = None) -> None:
337+
def _stub_generate_archive_summaries(
338+
cache_directory: pathlib.Path | str | None = None,
339+
asset_types_in_order: tuple[str, ...] | list[str] | None = None,
340+
) -> None:
338341
captured["cache_directory"] = pathlib.Path(cache_directory) if cache_directory is not None else None
342+
captured["asset_types_in_order"] = list(asset_types_in_order) if asset_types_in_order is not None else None
339343

340344
monkeypatch.setattr(cli_module, "generate_archive_summaries", _stub_generate_archive_summaries)
341345

342346
cache_dir = tmp_path / "custom-cache"
343347
runner = CliRunner()
344348
result = runner.invoke(
345349
s3logextraction_cli,
346-
["update", "summaries", "--mode", "archive", "--cache", str(cache_dir)],
350+
[
351+
"update",
352+
"summaries",
353+
"--mode",
354+
"archive",
355+
"--asset-types-in-order",
356+
"Video,Neurophysiology,Miscellaneous",
357+
"--cache",
358+
str(cache_dir),
359+
],
347360
)
348361

349362
assert result.exit_code == 0, f"CLI failed: {result.output}"
350363
assert captured["cache_directory"] == cache_dir
364+
assert captured["asset_types_in_order"] == ["Video", "Neurophysiology", "Miscellaneous"]
351365

352366

353367
@pytest.mark.ai_generated

0 commit comments

Comments
 (0)