Skip to content

Commit e0e4dac

Browse files
authored
feat(core): require TrackerResult from collectors and migrate in-repo trackers (#279)
* feat(protocols): introduce LibraryDocsTrackerResult and CollectBoostLibrariesResult DTOs
* refactor(tests): remove unused Wg21Reflector classes from test_collector_protocol_conformance.py
* feat(protocols): add MappingProxyType for counts and extras in various tracker result classes
* feat(collectors): enhance collector behavior and error reporting
* feat(collectors): update collect method to return TrackerResult
1 parent a5805a7 commit e0e4dac

55 files changed

Lines changed: 1298 additions & 148 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2020

2121
### Changed
2222

23+
- **core.collectors:** `AbstractCollector.last_result` is set only after `post_collect()` completes successfully (including default incremental checkpoint persistence), matching the documented “most recent successful run” semantics.
24+
- **discord_activity_tracker:** `backfill_discord_activity_tracker` reports per-file import failures on `DiscordCollectionTrackerResult` (`success=False`, `errors`, `failed_files` count) instead of always returning `success=True`.
2325
- **core.protocols / ActivityRecord:** `occurred_at` is timezone-aware UTC `datetime | None`; `source_system` is `SourceSystem` (`StrEnum`); `activity_type` is branded `ActivityType`; `actor_external_id` is `ActorExternalId` (`NewType`). Legacy string payloads use `core.activity_types.migrate_legacy_activity_fields` and `activity_record_to_legacy_dict` on GitHub/Discord `protocol_impl` dataclasses.
2426
- **Celery schedule:** Added `discord` group to `config/boost_collector_schedule.yaml` (`run_discord_activity_tracker` daily at 16:40 UTC).
2527
- **core.collectors:** Removed deprecated `CollectorBase` and `DjangoCommandCollector`; the supported collector contract is **`AbstractCollector`** + **`BaseCollectorCommand`** (see docs).

STABILITY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ Supported in production with **forward migrations** and **CHANGELOG** notes. Not
139139
| --- | --- |
140140
| **PostgreSQL schema** | Changed only via Django migrations; every deploy runs `python manage.py migrate` |
141141
| **`services.py` functions** | Per-app write API; signatures may change in minor `0.x` releases when [docs/service_api/](docs/service_api/) and all callers are updated together. Cross-app reads should use **`services`** or **`sync_api`**, not foreign models (see [CONTRIBUTING.md](CONTRIBUTING.md)) |
142+
| **Collector run outcomes** | `TrackerResult.success` and `errors` must reflect the real outcome (e.g. batch backfills must not report `success=True` when individual files fail). `AbstractCollector.last_result` is the most recent **fully** successful `run()` — after `collect()` **and** `post_collect()` (including checkpoint persistence) complete without error. |
142143

143144
### Tier C — Unstable
144145

boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
from django.core.management.base import CommandError
4545

4646
from core.collectors import AbstractCollector, BaseCollectorCommand
47+
from core.protocols import TrackerResult
48+
from boost_library_docs_tracker.protocol_impl import LibraryDocsTrackerResult
4749

4850
from boost_library_docs_tracker import fetcher, services, workspace
4951
from boost_library_docs_tracker.preprocessor import preprocess_for_pinecone
@@ -72,10 +74,10 @@ def validate_config(self) -> None:
7274
if max_pages is not None and max_pages < 1:
7375
raise CommandError("--max-pages must be at least 1.")
7476

75-
def collect(self) -> None:
77+
def collect(self) -> TrackerResult:
7678
o = self.options
7779
try:
78-
self.cmd._run(
80+
return self.cmd._run(
7981
versions_arg=o["versions"],
8082
library_filter=o["library"],
8183
dry_run=o["dry_run"],
@@ -169,16 +171,17 @@ def _run(
169171
max_pages,
170172
use_local,
171173
cleanup_extract,
172-
):
174+
) -> LibraryDocsTrackerResult:
173175
versions = self._resolve_versions(versions_arg)
174176
self.stdout.write(
175177
f"Processing {len(versions)} version(s): {', '.join(versions)}"
176178
)
177179
mode = "local-zip" if use_local else "HTTP crawl"
178180
self.stdout.write(f"Scrape mode: {mode}")
179181

182+
total_pages = 0
180183
for version in versions:
181-
self._process_version(
184+
total_pages += self._process_version(
182185
version=version,
183186
library_filter=library_filter,
184187
dry_run=dry_run,
@@ -191,6 +194,12 @@ def _run(
191194
reason = "dry run" if dry_run else "--skip-pinecone set"
192195
self.stdout.write(f"Skipping Pinecone sync ({reason}).")
193196

197+
return LibraryDocsTrackerResult.from_run(
198+
versions=len(versions),
199+
pages=total_pages,
200+
dry_run=dry_run,
201+
)
202+
194203
def _process_version(
195204
self, *, version, library_filter, dry_run, max_pages, use_local, cleanup_extract
196205
):
@@ -247,6 +256,7 @@ def _process_version(
247256
)
248257

249258
self.stdout.write(f"[{version}] Done — {total_pages} pages total.")
259+
return total_pages
250260

251261
def _prepare_local_source(self, *, version: str) -> tuple[Path, Path]:
252262
"""Download and extract the Boost source zip for a version.
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""Frozen DTOs implementing :mod:`core.protocols` for library docs tracker."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass, field
6+
from types import MappingProxyType
7+
from typing import Mapping
8+
9+
10+
@dataclass(frozen=True)
class LibraryDocsTrackerResult:
    """Structured :class:`~core.protocols.TrackerResult` for docs scrape runs.

    Instances are frozen. ``counts`` is copied into a read-only mapping and
    ``errors`` is normalised to a tuple, so later mutation of the caller's
    original containers cannot alter a result after it has been created.
    """

    # Whether the run is reported as successful.
    success: bool
    # Named integer counters for the run; exposed as a read-only mapping.
    counts: Mapping[str, int]
    # Error messages; always stored as a tuple (see ``__post_init__``).
    errors: tuple[str, ...] = field(default_factory=tuple)
    # Not populated by ``from_run``; defaults to ``None``.
    duration_seconds: float | None = None

    def __post_init__(self) -> None:
        """Snapshot ``counts`` and ``errors`` into immutable containers."""
        # The dataclass is frozen, so normal attribute assignment is blocked;
        # object.__setattr__ is the supported escape hatch in __post_init__.
        object.__setattr__(self, "counts", MappingProxyType(dict(self.counts)))

        raw_errors = self.errors
        if isinstance(raw_errors, str):
            # A bare string would otherwise be split into single characters.
            raw_errors = (raw_errors,)
        object.__setattr__(self, "errors", tuple(raw_errors))

    @classmethod
    def from_run(
        cls,
        *,
        versions: int,
        pages: int = 0,
        dry_run: bool = False,
    ) -> LibraryDocsTrackerResult:
        """Build a successful result from run totals.

        Args:
            versions: Stored under the ``"versions"`` count.
            pages: Stored under the ``"pages"`` count.
            dry_run: Stored under ``"dry_run"`` as ``0`` or ``1``.

        Returns:
            A result with ``success=True`` and no errors.
        """
        return cls(
            success=True,
            counts={
                "versions": versions,
                "pages": pages,
                "dry_run": int(dry_run),
            },
        )

boost_library_docs_tracker/tests/test_run_boost_library_docs_tracker_command.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,8 +443,16 @@ def test_call_command_dry_run_skips_pinecone(boost_library_version):
443443
ver.version = "boost-1.81.0"
444444
ver.save()
445445
buf = StringIO()
446+
from boost_library_docs_tracker.protocol_impl import LibraryDocsTrackerResult
447+
446448
with (
447-
patch.object(Command, "_run") as run_mock,
449+
patch.object(
450+
Command,
451+
"_run",
452+
return_value=LibraryDocsTrackerResult.from_run(
453+
versions=1, pages=0, dry_run=True
454+
),
455+
) as run_mock,
448456
patch.object(Command, "_sync_pinecone") as sync_mock,
449457
):
450458
call_command(

boost_library_tracker/management/commands/collect_boost_libraries.py

Lines changed: 68 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,13 @@
1717

1818
import logging
1919
import re
20+
from collections.abc import Sequence
2021

21-
from django.core.management.base import BaseCommand, CommandError
22+
from django.core.management.base import CommandError
23+
24+
from core.collectors import AbstractCollector, BaseCollectorCommand
25+
from core.protocols import TrackerResult
26+
from boost_library_tracker.protocol_impl import CollectBoostLibrariesResult
2227
from django.db import transaction
2328

2429
from boost_library_tracker.models import (
@@ -174,7 +179,7 @@ def _collect_libraries_for_version(
174179
boost_version,
175180
ref: str,
176181
*,
177-
client: GitHubAPIClient = None,
182+
client: GitHubAPIClient | None = None,
178183
dry_run: bool = False,
179184
) -> tuple[int, int]:
180185
"""
@@ -238,7 +243,53 @@ def _collect_libraries_for_version(
238243
return created_total, len(lib_submodules)
239244

240245

241-
class Command(BaseCommand):
246+
class CollectBoostLibrariesCollector(AbstractCollector):
247+
"""Collect Boost versions and library metadata from boostorg/boost."""
248+
249+
def __init__(self, cmd: "Command", options: dict) -> None:
250+
self.cmd = cmd
251+
self.options = options
252+
253+
@property
254+
def name(self) -> str:
255+
return "collect_boost_libraries"
256+
257+
def validate_config(self) -> None:
258+
return None
259+
260+
def collect(self) -> TrackerResult:
261+
dry_run = self.options.get("dry_run", False)
262+
limit = self.options.get("limit")
263+
264+
try:
265+
boost_versions_list = _parse_boost_version_option(
266+
self.options.get("boost_version")
267+
)
268+
except CommandError as e:
269+
logger.error("Error parsing --boost-version: %s", e)
270+
raise
271+
272+
target_releases: Sequence[tuple[str, str | None]] = []
273+
274+
if boost_versions_list and "all" == boost_versions_list[0]:
275+
target_releases = all_boost_versions_from_api() or []
276+
elif boost_versions_list and "new" not in boost_versions_list:
277+
target_releases = [(ref, None) for ref in boost_versions_list]
278+
elif not boost_versions_list or "new" in boost_versions_list:
279+
target_releases = new_boost_versions_from_api()
280+
281+
if not target_releases:
282+
logger.warning("No releases to process")
283+
return CollectBoostLibrariesResult.empty(dry_run=dry_run)
284+
285+
if limit:
286+
target_releases = target_releases[:limit]
287+
logger.info("Processing first %s releases", limit)
288+
289+
return self.cmd._process_refs(target_releases, dry_run=dry_run)
290+
291+
292+
class Command(BaseCollectorCommand):
242293
"""Management command: collect Boost versions and library metadata."""
243294

244295
help = (
@@ -268,44 +319,15 @@ def add_arguments(self, parser):
268319
help="Fetch and report what would be done; no DB writes.",
269320
)
270321

271-
def handle(self, *_args, **options):
272-
273-
dry_run = options.get("dry_run", False)
274-
limit = options.get("limit")
275-
276-
try:
277-
boost_versions_list = _parse_boost_version_option(
278-
options.get("boost_version")
279-
)
280-
except CommandError as e:
281-
logger.error("Error parsing --boost-version: %s", e)
282-
return
283-
284-
target_releases: list[tuple[str, str]] = []
285-
286-
if boost_versions_list and "all" == boost_versions_list[0]:
287-
target_releases = all_boost_versions_from_api()
288-
elif boost_versions_list and "new" not in boost_versions_list:
289-
target_releases = [(ref, None) for ref in boost_versions_list]
290-
elif not boost_versions_list or "new" in boost_versions_list:
291-
target_releases = new_boost_versions_from_api()
292-
293-
if not target_releases:
294-
logger.warning("No releases to process")
295-
return
296-
297-
if limit:
298-
target_releases = target_releases[:limit]
299-
logger.info("Processing first %s releases", limit)
300-
301-
self._process_refs(target_releases, dry_run=dry_run)
322+
def get_collector(self, **options) -> AbstractCollector:
    """Return the collector for this command, given a copy of ``options``."""
    # Copy so the collector does not share the caller's kwargs dict.
    collector_options = dict(options)
    return CollectBoostLibrariesCollector(cmd=self, options=collector_options)
302324

303325
def _process_refs(
304326
self,
305-
target_releases: list[tuple[str, str | None]],
327+
target_releases: Sequence[tuple[str, str | None]],
306328
*,
307329
dry_run: bool = False,
308-
) -> None:
330+
) -> CollectBoostLibrariesResult:
309331
"""Process (ref, published_at) pairs; each ref in its own transaction.
310332
311333
``published_at`` is set when refs came from the GitHub releases API; use None
@@ -315,14 +337,18 @@ def _process_refs(
315337
if dry_run:
316338
logger.info("Dry run: no DB writes.")
317339
logger.info("Would process %s releases", len(target_releases))
318-
return
340+
return CollectBoostLibrariesResult.from_totals(
341+
versions_created=0,
342+
library_versions_created=0,
343+
dry_run=True,
344+
)
319345
total_versions_created = 0
320346
total_lib_versions_created = 0
321347

322348
client = get_github_client(use="scraping")
323349
if not client:
324350
logger.error("Could not create GitHub Client")
325-
return
351+
return CollectBoostLibrariesResult.empty()
326352

327353
for tag, sha in target_releases:
328354
if not sha:
@@ -363,3 +389,7 @@ def _process_refs(
363389
total_versions_created,
364390
total_lib_versions_created,
365391
)
392+
return CollectBoostLibrariesResult.from_totals(
393+
versions_created=total_versions_created,
394+
library_versions_created=total_lib_versions_created,
395+
)

boost_library_tracker/management/commands/run_boost_github_activity_tracker.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
ensure_repository_owner,
2929
get_or_create_repository,
3030
)
31+
from core.protocols import TrackerResult
32+
from github_activity_tracker.protocol_impl import GitHubSyncTrackerResult
3133
from github_activity_tracker.sync import sync_github
3234

3335
from boost_library_tracker.services import get_or_create_boost_library_repo
@@ -374,8 +376,8 @@ def validate_config(self) -> None:
374376
except ValueError as e:
375377
raise CommandError(str(e)) from e
376378

377-
def collect(self) -> None:
378-
self.cmd._handle_core(self.options)
379+
def collect(self) -> TrackerResult:
    """Run the command's core handler and return its result."""
    result = self.cmd._handle_core(self.options)
    return result
379381

380382
def sync_pinecone(self) -> None:
381383
o = self.options
@@ -454,7 +456,7 @@ def add_arguments(self, parser):
454456
def get_collector(self, **options):
455457
return BoostGithubActivityCollector(cmd=self, options=dict(options))
456458

457-
def _handle_core(self, options):
459+
def _handle_core(self, options) -> GitHubSyncTrackerResult:
458460
dry_run = options["dry_run"]
459461
skip_github_sync = options["skip_github_sync"]
460462
skip_markdown_export = options["skip_markdown_export"]
@@ -511,7 +513,7 @@ def _handle_core(self, options):
511513
if not skip_pinecone:
512514
logger.info("dry-run would run Pinecone upsert for issues and PRs")
513515
logger.info("finished successfully")
514-
return
516+
return GitHubSyncTrackerResult(success=True, counts={})
515517

516518
synced_repos: list = []
517519
if not skip_github_sync:
@@ -555,6 +557,11 @@ def _handle_core(self, options):
555557
logger.info("skipping Pinecone (--skip-pinecone)")
556558

557559
logger.info("finished successfully")
560+
repo_results = [
561+
GitHubSyncTrackerResult.from_sync_dict(sr)
562+
for _own, _repo, _boost_repo, sr in synced_repos
563+
]
564+
return GitHubSyncTrackerResult.merge(*repo_results)
558565
except Exception as e:
559566
logger.exception("command failed: %s", e)
560567
raise
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""Frozen DTOs implementing :mod:`core.protocols` for boost library tracker."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass, field
6+
from types import MappingProxyType
7+
from typing import Mapping
8+
9+
10+
@dataclass(frozen=True)
class CollectBoostLibrariesResult:
    """Structured :class:`~core.protocols.TrackerResult` for library metadata collection.

    Instances are frozen. ``counts`` is copied into a read-only mapping and
    ``errors`` is normalised to a tuple, so later mutation of the caller's
    original containers cannot alter a result after it has been created.
    """

    # Whether the run is reported as successful.
    success: bool
    # Named integer counters for the run; exposed as a read-only mapping.
    counts: Mapping[str, int]
    # Error messages; always stored as a tuple (see ``__post_init__``).
    errors: tuple[str, ...] = field(default_factory=tuple)
    # Not populated by the factory methods below; defaults to ``None``.
    duration_seconds: float | None = None

    def __post_init__(self) -> None:
        """Snapshot ``counts`` and ``errors`` into immutable containers."""
        # The dataclass is frozen, so normal attribute assignment is blocked;
        # object.__setattr__ is the supported escape hatch in __post_init__.
        object.__setattr__(self, "counts", MappingProxyType(dict(self.counts)))

        raw_errors = self.errors
        if isinstance(raw_errors, str):
            # A bare string would otherwise be split into single characters.
            raw_errors = (raw_errors,)
        object.__setattr__(self, "errors", tuple(raw_errors))

    @classmethod
    def from_totals(
        cls,
        *,
        versions_created: int,
        library_versions_created: int,
        dry_run: bool = False,
    ) -> CollectBoostLibrariesResult:
        """Build a successful result from creation totals.

        Args:
            versions_created: Stored under the ``"versions"`` count.
            library_versions_created: Stored under ``"library_versions"``.
            dry_run: Stored under ``"dry_run"`` as ``0`` or ``1``.

        Returns:
            A result with ``success=True`` and no errors.
        """
        return cls(
            success=True,
            counts={
                "versions": versions_created,
                "library_versions": library_versions_created,
                "dry_run": int(dry_run),
            },
        )

    @classmethod
    def empty(cls, *, dry_run: bool = False) -> CollectBoostLibrariesResult:
        """Build a successful result that carries only the ``"dry_run"`` flag."""
        return cls(success=True, counts={"dry_run": int(dry_run)})

0 commit comments

Comments
 (0)