Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- **core.collectors:** `AbstractCollector.last_result` is set only after `post_collect()` completes successfully (including default incremental checkpoint persistence), matching the documented “most recent successful run” semantics.
- **discord_activity_tracker:** `backfill_discord_activity_tracker` reports per-file import failures on `DiscordCollectionTrackerResult` (`success=False`, `errors`, `failed_files` count) instead of always returning `success=True`.
- **core.protocols / ActivityRecord:** `occurred_at` is timezone-aware UTC `datetime | None`; `source_system` is `SourceSystem` (`StrEnum`); `activity_type` is branded `ActivityType`; `actor_external_id` is `ActorExternalId` (`NewType`). Legacy string payloads use `core.activity_types.migrate_legacy_activity_fields` and `activity_record_to_legacy_dict` on GitHub/Discord `protocol_impl` dataclasses.
- **Celery schedule:** Added `discord` group to `config/boost_collector_schedule.yaml` (`run_discord_activity_tracker` daily at 16:40 UTC).
- **core.collectors:** Removed deprecated `CollectorBase` and `DjangoCommandCollector`; the supported collector contract is **`AbstractCollector`** + **`BaseCollectorCommand`** (see docs).
Expand Down
1 change: 1 addition & 0 deletions STABILITY.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ Supported in production with **forward migrations** and **CHANGELOG** notes. Not
| --- | --- |
| **PostgreSQL schema** | Changed only via Django migrations; every deploy runs `python manage.py migrate` |
| **`services.py` functions** | Per-app write API; signatures may change in minor `0.x` releases when [docs/service_api/](docs/service_api/) and all callers are updated together. Cross-app reads should use **`services`** or **`sync_api`**, not foreign models (see [CONTRIBUTING.md](CONTRIBUTING.md)) |
| **Collector run outcomes** | `TrackerResult.success` and `errors` must reflect the real outcome (e.g. batch backfills must not report `success=True` when individual files fail). `AbstractCollector.last_result` is the most recent **fully** successful `run()` — after `collect()` **and** `post_collect()` (including checkpoint persistence) complete without error. |

### Tier C — Unstable

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
from django.core.management.base import CommandError

from core.collectors import AbstractCollector, BaseCollectorCommand
from core.protocols import TrackerResult
from boost_library_docs_tracker.protocol_impl import LibraryDocsTrackerResult

from boost_library_docs_tracker import fetcher, services, workspace
from boost_library_docs_tracker.preprocessor import preprocess_for_pinecone
Expand Down Expand Up @@ -72,10 +74,10 @@ def validate_config(self) -> None:
if max_pages is not None and max_pages < 1:
raise CommandError("--max-pages must be at least 1.")

def collect(self) -> None:
def collect(self) -> TrackerResult:
o = self.options
try:
self.cmd._run(
return self.cmd._run(
versions_arg=o["versions"],
library_filter=o["library"],
dry_run=o["dry_run"],
Expand Down Expand Up @@ -169,16 +171,17 @@ def _run(
max_pages,
use_local,
cleanup_extract,
):
) -> LibraryDocsTrackerResult:
versions = self._resolve_versions(versions_arg)
self.stdout.write(
f"Processing {len(versions)} version(s): {', '.join(versions)}"
)
mode = "local-zip" if use_local else "HTTP crawl"
self.stdout.write(f"Scrape mode: {mode}")

total_pages = 0
for version in versions:
self._process_version(
total_pages += self._process_version(
version=version,
library_filter=library_filter,
dry_run=dry_run,
Expand All @@ -191,6 +194,12 @@ def _run(
reason = "dry run" if dry_run else "--skip-pinecone set"
self.stdout.write(f"Skipping Pinecone sync ({reason}).")

return LibraryDocsTrackerResult.from_run(
versions=len(versions),
pages=total_pages,
dry_run=dry_run,
)

def _process_version(
self, *, version, library_filter, dry_run, max_pages, use_local, cleanup_extract
):
Expand Down Expand Up @@ -247,6 +256,7 @@ def _process_version(
)

self.stdout.write(f"[{version}] Done — {total_pages} pages total.")
return total_pages

def _prepare_local_source(self, *, version: str) -> tuple[Path, Path]:
"""Download and extract the Boost source zip for a version.
Expand Down
37 changes: 37 additions & 0 deletions boost_library_docs_tracker/protocol_impl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Frozen DTOs implementing :mod:`core.protocols` for library docs tracker."""

from __future__ import annotations

from dataclasses import dataclass, field
from types import MappingProxyType
from typing import Mapping


@dataclass(frozen=True)
class LibraryDocsTrackerResult:
"""Structured :class:`~core.protocols.TrackerResult` for docs scrape runs."""

success: bool
counts: Mapping[str, int]
errors: tuple[str, ...] = field(default_factory=tuple)
duration_seconds: float | None = None
Comment thread
leostar0412 marked this conversation as resolved.

def __post_init__(self) -> None:
object.__setattr__(self, "counts", MappingProxyType(dict(self.counts)))

@classmethod
def from_run(
cls,
*,
versions: int,
pages: int = 0,
dry_run: bool = False,
) -> LibraryDocsTrackerResult:
return cls(
success=True,
counts={
"versions": versions,
"pages": pages,
"dry_run": int(dry_run),
},
)
Original file line number Diff line number Diff line change
Expand Up @@ -443,8 +443,16 @@ def test_call_command_dry_run_skips_pinecone(boost_library_version):
ver.version = "boost-1.81.0"
ver.save()
buf = StringIO()
from boost_library_docs_tracker.protocol_impl import LibraryDocsTrackerResult

with (
patch.object(Command, "_run") as run_mock,
patch.object(
Command,
"_run",
return_value=LibraryDocsTrackerResult.from_run(
versions=1, pages=0, dry_run=True
),
) as run_mock,
patch.object(Command, "_sync_pinecone") as sync_mock,
):
call_command(
Expand Down
106 changes: 68 additions & 38 deletions boost_library_tracker/management/commands/collect_boost_libraries.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,13 @@

import logging
import re
from collections.abc import Sequence

from django.core.management.base import BaseCommand, CommandError
from django.core.management.base import CommandError

from core.collectors import AbstractCollector, BaseCollectorCommand
from core.protocols import TrackerResult
from boost_library_tracker.protocol_impl import CollectBoostLibrariesResult
from django.db import transaction

from boost_library_tracker.models import (
Expand Down Expand Up @@ -174,7 +179,7 @@ def _collect_libraries_for_version(
boost_version,
ref: str,
*,
client: GitHubAPIClient = None,
client: GitHubAPIClient | None = None,
dry_run: bool = False,
) -> tuple[int, int]:
"""
Expand Down Expand Up @@ -238,7 +243,53 @@ def _collect_libraries_for_version(
return created_total, len(lib_submodules)


class Command(BaseCommand):
class CollectBoostLibrariesCollector(AbstractCollector):
    """Collector for Boost versions and library metadata from boostorg/boost.

    This class decides which releases to process from the command options and
    hands the actual per-release work to the owning command's ``_process_refs``.
    """

    def __init__(self, cmd: "Command", options: dict) -> None:
        # Keep the owning command (for ``_process_refs``) and its parsed options.
        self.options = options
        self.cmd = cmd

    @property
    def name(self) -> str:
        """Identifier of this collector."""
        return "collect_boost_libraries"

    def validate_config(self) -> None:
        """Perform no up-front validation; options are parsed in ``collect``."""
        return None

    def collect(self) -> TrackerResult:
        """Resolve the target releases and process them.

        Returns:
            An empty result when there is nothing to process, otherwise
            whatever the command's ``_process_refs`` returns.

        Raises:
            CommandError: Re-raised (after logging) when ``--boost-version``
                cannot be parsed.
        """
        opts = self.options
        dry_run = opts.get("dry_run", False)
        limit = opts.get("limit")

        try:
            requested = _parse_boost_version_option(opts.get("boost_version"))
        except CommandError as exc:
            logger.error("Error parsing --boost-version: %s", exc)
            raise

        releases: Sequence[tuple[str, str | None]]
        if requested and requested[0] == "all":
            # A leading "all" takes precedence over anything else in the list.
            releases = all_boost_versions_from_api() or []
        elif requested and "new" not in requested:
            # Explicit refs carry no publication timestamp.
            releases = [(ref, None) for ref in requested]
        else:
            # Nothing requested, or "new" requested.
            releases = new_boost_versions_from_api()

        if not releases:
            logger.warning("No releases to process")
            return CollectBoostLibrariesResult.empty(dry_run=dry_run)

        if limit:
            releases = releases[:limit]
            logger.info("Processing first %s releases", limit)

        return self.cmd._process_refs(releases, dry_run=dry_run)


class Command(BaseCollectorCommand):
"""Management command: collect Boost versions and library metadata."""

help = (
Expand Down Expand Up @@ -268,44 +319,15 @@ def add_arguments(self, parser):
help="Fetch and report what would be done; no DB writes.",
)

def handle(self, *_args, **options):

dry_run = options.get("dry_run", False)
limit = options.get("limit")

try:
boost_versions_list = _parse_boost_version_option(
options.get("boost_version")
)
except CommandError as e:
logger.error("Error parsing --boost-version: %s", e)
return

target_releases: list[tuple[str, str]] = []

if boost_versions_list and "all" == boost_versions_list[0]:
target_releases = all_boost_versions_from_api()
elif boost_versions_list and "new" not in boost_versions_list:
target_releases = [(ref, None) for ref in boost_versions_list]
elif not boost_versions_list or "new" in boost_versions_list:
target_releases = new_boost_versions_from_api()

if not target_releases:
logger.warning("No releases to process")
return

if limit:
target_releases = target_releases[:limit]
logger.info("Processing first %s releases", limit)

self._process_refs(target_releases, dry_run=dry_run)
def get_collector(self, **options) -> AbstractCollector:
return CollectBoostLibrariesCollector(cmd=self, options=dict(options))

def _process_refs(
self,
target_releases: list[tuple[str, str | None]],
target_releases: Sequence[tuple[str, str | None]],
*,
dry_run: bool = False,
) -> None:
) -> CollectBoostLibrariesResult:
"""Process (ref, published_at) pairs; each ref in its own transaction.

``published_at`` is set when refs came from the GitHub releases API; use None
Expand All @@ -315,14 +337,18 @@ def _process_refs(
if dry_run:
logger.info("Dry run: no DB writes.")
logger.info("Would process %s releases", len(target_releases))
return
return CollectBoostLibrariesResult.from_totals(
versions_created=0,
library_versions_created=0,
dry_run=True,
)
total_versions_created = 0
total_lib_versions_created = 0

client = get_github_client(use="scraping")
if not client:
logger.error("Could not create GitHub Client")
return
return CollectBoostLibrariesResult.empty()

for tag, sha in target_releases:
if not sha:
Expand Down Expand Up @@ -363,3 +389,7 @@ def _process_refs(
total_versions_created,
total_lib_versions_created,
)
return CollectBoostLibrariesResult.from_totals(
versions_created=total_versions_created,
library_versions_created=total_lib_versions_created,
)
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
ensure_repository_owner,
get_or_create_repository,
)
from core.protocols import TrackerResult
from github_activity_tracker.protocol_impl import GitHubSyncTrackerResult
from github_activity_tracker.sync import sync_github

from boost_library_tracker.services import get_or_create_boost_library_repo
Expand Down Expand Up @@ -374,8 +376,8 @@ def validate_config(self) -> None:
except ValueError as e:
raise CommandError(str(e)) from e

def collect(self) -> None:
self.cmd._handle_core(self.options)
def collect(self) -> TrackerResult:
return self.cmd._handle_core(self.options)

def sync_pinecone(self) -> None:
o = self.options
Expand Down Expand Up @@ -454,7 +456,7 @@ def add_arguments(self, parser):
def get_collector(self, **options):
return BoostGithubActivityCollector(cmd=self, options=dict(options))

def _handle_core(self, options):
def _handle_core(self, options) -> GitHubSyncTrackerResult:
dry_run = options["dry_run"]
skip_github_sync = options["skip_github_sync"]
skip_markdown_export = options["skip_markdown_export"]
Expand Down Expand Up @@ -511,7 +513,7 @@ def _handle_core(self, options):
if not skip_pinecone:
logger.info("dry-run would run Pinecone upsert for issues and PRs")
logger.info("finished successfully")
return
return GitHubSyncTrackerResult(success=True, counts={})

synced_repos: list = []
if not skip_github_sync:
Expand Down Expand Up @@ -555,6 +557,11 @@ def _handle_core(self, options):
logger.info("skipping Pinecone (--skip-pinecone)")

logger.info("finished successfully")
repo_results = [
GitHubSyncTrackerResult.from_sync_dict(sr)
for _own, _repo, _boost_repo, sr in synced_repos
]
return GitHubSyncTrackerResult.merge(*repo_results)
except Exception as e:
logger.exception("command failed: %s", e)
raise
41 changes: 41 additions & 0 deletions boost_library_tracker/protocol_impl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Frozen DTOs implementing :mod:`core.protocols` for boost library tracker."""

from __future__ import annotations

from dataclasses import dataclass, field
from types import MappingProxyType
from typing import Mapping


@dataclass(frozen=True)
class CollectBoostLibrariesResult:
"""Structured :class:`~core.protocols.TrackerResult` for library metadata collection."""

success: bool
counts: Mapping[str, int]
errors: tuple[str, ...] = field(default_factory=tuple)
duration_seconds: float | None = None

Comment thread
leostar0412 marked this conversation as resolved.
def __post_init__(self) -> None:
object.__setattr__(self, "counts", MappingProxyType(dict(self.counts)))

@classmethod
def from_totals(
cls,
*,
versions_created: int,
library_versions_created: int,
dry_run: bool = False,
) -> CollectBoostLibrariesResult:
return cls(
success=True,
counts={
"versions": versions_created,
"library_versions": library_versions_created,
"dry_run": int(dry_run),
},
)

@classmethod
def empty(cls, *, dry_run: bool = False) -> CollectBoostLibrariesResult:
return cls(success=True, counts={"dry_run": int(dry_run)})
Loading
Loading