diff --git a/.env.example b/.env.example index 4cc3e74f..8fd9918f 100644 --- a/.env.example +++ b/.env.example @@ -116,10 +116,12 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # CELERY_BROKER_URL=redis://localhost:6379/0 # CELERY_RESULT_BACKEND=redis://localhost:6379/0 # Worker: celery -A config worker -l info -# Optional override: set true to require config/boost_collector_schedule.yaml at startup +# Optional override: set true to require the collector schedule YAML at startup # when DEBUG=True (e.g. CI / staging). Unset or false for typical local dev; production # already enforces this when DEBUG=False. # BOOST_COLLECTOR_SCHEDULE_STRICT=false +# Path to schedule YAML (relative to repo root or absolute). Default: config/boost_collector_schedule.yaml +# BOOST_COLLECTOR_SCHEDULE_YAML=config/boost_collector_schedule.yaml # ============================================================================== # Workspace @@ -204,16 +206,9 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # GIT_AUTHOR_EMAIL=unknown@noreply.github.com # ============================================================================== -# Slack (slack_event_handler, cppa_slack_tracker) +# Slack (cppa_slack_tracker — public batch collector) # ============================================================================== -# Used by Socket Mode listeners, huddle transcript collection, and the Slack PR bot. -# See docs/Docker.md (Slack session tokens) and SECURITY.md (secret handling). - -# --- Huddle transcripts (GitHub upload target) --- -# GITHUB_SLACK_HUDDLE_REPO_OWNER=your-org -# GITHUB_SLACK_HUDDLE_REPO_NAME=your-repo - -# --- Bot tokens (required for API calls and Socket Mode) --- +# --- Bot tokens (required for cppa_slack_tracker API calls) --- # List every workspace team ID, then set one bot token per team. 
# SLACK_TEAM_IDS=T01234ABCD,T05678EFGH # SLACK_BOT_TOKEN_T01234ABCD=xoxb-your-bot-token @@ -222,83 +217,6 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # Optional single-team shorthand when SLACK_TEAM_IDS is unset: # SLACK_TEAM_ID=T01234ABCD -# --- Socket Mode (App-Level Token; scope: connections:write) --- -# SLACK_APP_TOKEN_T01234ABCD=xapp-your-app-token -# SLACK_APP_TOKEN_T05678EFGH=xapp-your-app-token - -# --- Per-team features (slack_event_handler) --- -# SLACK_TEAM_SCOPE_: comma-separated scopes; omit or leave empty for both. -# 0 = huddle AI note / transcript pipeline -# 1 = Slack PR comment bot -# SLACK_TEAM_SCOPE_T01234ABCD=0 -# SLACK_TEAM_SCOPE_T05678EFGH=1 - -# --- Slack PR bot (GitHub comments from Slack) --- -# SLACK_PR_BOT_TEAM=your-github-org-or-user -# SLACK_PR_BOT_GITHUB_TOKEN=ghp_your_token -# SLACK_PR_BOT_CHANNEL_NAME=slack-bot -# SLACK_PR_BOT_COMMENT_TEMPLATE=Automated comment from Slack bot. -# SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW=5 -# SLACK_PR_BOT_COMMENTS_WINDOW_SECONDS=3600 - -# --- Internal session tokens (xoxc/xoxd; compliance-gated) --- -# Do not put xoxc/xoxd in .env. When enabled, tokens live in workspace JSON and are -# loaded at runtime (not at Django startup). Huddle fetch can re-extract from the -# Chrome profile when JSON tokens are stale but the browser session is still valid. -# ALLOW_INTERNAL_SLACK_TOKENS=false -# SLACK_INTERNAL_TOKENS_JSON= -# Default path: workspace/slack_event_handler/slack_internal_tokens.json -# -# Chrome user-data directory (logged-in Slack session on disk): -# CHROME_PROFILE_PATH= -# Default: workspace/slack_event_handler/chrome_profile - -# ============================================================================== -# Discord (discord_activity_tracker) -# ============================================================================== -# Preferred: bot token. -# DISCORD_TOKEN=your.bot.token -# -# User token violates Discord ToS; use only if the bot path is impossible. 
-# See docs/operations/discord_chat_exporter.md (Tyrrrz upstream: Token and IDs, CLI guide). -# DISCORD_USER_TOKEN=your.user.token -# -# --- Internal Discord user token (compliance-gated) --- -# Do not put user token in .env when using workspace JSON. When enabled, tokens live in -# workspace JSON and are loaded at runtime (not at Django startup). Export can re-extract -# from the Chrome profile when JSON tokens are stale but the browser session is still valid. -# ALLOW_INTERNAL_DISCORD_TOKENS=false -# DISCORD_INTERNAL_TOKENS_JSON= -# Default path: workspace/discord_activity_tracker/discord_internal_tokens.json -# -# Chrome user-data directory (logged-in Discord session on disk): -# DISCORD_CHROME_PROFILE_PATH= -# Default: workspace/discord_activity_tracker/chrome_profile -# -# DISCORD_SERVER_ID=987654321098765432 -# DISCORD_CONTEXT_REPO_PATH=/absolute/path/to/discord-cplusplus-together-context -# DISCORD_CONTEXT_AUTO_COMMIT=false -# -# DiscordChatExporter CLI: -# https://github.com/Tyrrrz/DiscordChatExporter/releases -# DISCORD_CHAT_EXPORTER_CLI=/path/to/DiscordChatExporter.Cli -# macOS: system dotnet + DLL (avoids quarantined bundled runtime on some disks): -# DISCORD_CHAT_EXPORTER_DOTNET_DLL=/path/to/DiscordChatExporter.Cli.dll -# DISCORD_CHAT_EXPORTER_DOTNET=/usr/local/share/dotnet/dotnet -# DISCORD_CHAT_EXPORTER_MACOS_CLEAR_QUARANTINE=false -# DISCORD_CHAT_EXPORTER_PARALLEL=1 -# DISCORD_CHAT_EXPORTER_INCLUDE_VC=false -# DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT=true -# -# Injected into the exporter subprocess unless overridden (macOS memory pressure): -# DOTNET_GCConserveMemory=9 -# DOTNET_GCHighMemPercent=50 -# DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 -# -# DISCORD_CHANNEL_IDS=851121440425639956,123456789012345678 -# PINECONE_DISCORD_APP_TYPE=discord -# PINECONE_DISCORD_NAMESPACE=discord-messages - # ============================================================================== # Reddit (reddit_activity_tracker) # 
============================================================================== diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 1aff7524..d79abc84 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -17,10 +17,8 @@ boost_mailing_list_tracker/ @jonathanMLDev @wpak-ai cppa_pinecone_sync/ @jonathanMLDev @wpak-ai clang_github_tracker/ @snowfox1003 @wpak-ai cppa_slack_tracker/ @snowfox1003 @wpak-ai -discord_activity_tracker/ @snowfox1003 @wpak-ai wg21_paper_tracker/ @snowfox1003 @wpak-ai cppa_youtube_script_tracker/ @jonathanMLDev @wpak-ai -slack_event_handler/ @snowfox1003 @wpak-ai core/ @snowfox1003 @jonathanMLDev @wpak-ai .github/workflows/ @snowfox1003 @wpak-ai diff --git a/.gitignore b/.gitignore index d0b95fcd..79dc502c 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ db.sqlite3 staticfiles/ media/ .test_artifacts/ +config/boost_collector_schedule.local.yaml # Ephemeral probe app + pyrightconfig for scripts/validate_collector_scaffold.py (entire tree ignored above). # Testing / coverage @@ -46,8 +47,6 @@ celerybeat.pid *.swo .cursor/ -# Optional legacy CLI folder under the Django app (default CLI lives in workspace/.../script/) -discord_activity_tracker/tools/ # macOS .DS_Store ._* diff --git a/.importlinter b/.importlinter index b917510e..6cd05779 100644 --- a/.importlinter +++ b/.importlinter @@ -11,9 +11,7 @@ root_packages = cppa_slack_tracker cppa_user_tracker cppa_youtube_script_tracker - discord_activity_tracker github_activity_tracker - slack_event_handler wg21_paper_tracker [importlinter:contract:forbid-tech-debt-pinecone] diff --git a/CHANGELOG.md b/CHANGELOG.md index d170f974..c8c4d026 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Removed + +- **`discord_activity_tracker`** and **`slack_event_handler`** apps, workspace layouts, service API docs, and related operations guides from this repository. 
+- **`DiscordProfile`** from `cppa_user_tracker` Django state (`0010_remove_discordprofile`); physical table `cppa_user_tracker_discordprofile` is unchanged. + ## [0.2.0] - 2026-06-12 ### Added diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6ce53bcc..0adf559c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -64,7 +64,6 @@ Each Django app that has **models** provides a **`services.py`** module. This is | `boost_library_docs_tracker` | `boost_library_docs_tracker/services.py` | BoostDocContent and BoostLibraryDocumentation (doc scrape and sync status). | | `boost_usage_tracker` | `boost_usage_tracker/services.py` | External repos, Boost usage, missing-header tmp. | | `cppa_pinecone_sync` | `cppa_pinecone_sync/services.py` | Pinecone fail list and sync status writes. | -| `discord_activity_tracker` | `discord_activity_tracker/services.py` | Servers, channels, messages, reactions (Discord user profiles in cppa_user_tracker). | | `cppa_youtube_script_tracker` | `cppa_youtube_script_tracker/services.py` | YouTube channels, videos, tags, transcript state, speaker links. | | `clang_github_tracker` | `clang_github_tracker/services.py` | Clang/llvm GitHub issue, PR, and commit upserts; fetch watermarks. | | `boost_mailing_list_tracker` | `boost_mailing_list_tracker/services.py` | Mailing list messages and names. 
| diff --git a/Makefile b/Makefile index 16803880..0d7648fe 100644 --- a/Makefile +++ b/Makefile @@ -52,22 +52,6 @@ help: @echo " test-fast Run tests, stop on first failure" @echo " test-cov Run tests with coverage report" @echo "" - @echo " Slack session (xoxc/xoxd token extraction)" - @echo " slack-login Start slack-chromium (noVNC http://127.0.0.1:7900)" - @echo " slack-wait-profile Wait until Slack login wrote Cookies + LevelDB" - @echo " slack-login-stop Stop slack-chromium before extract" - @echo " extract-slack-tokens Extract tokens to workspace JSON (one-shot)" - @echo " slack-tokens-reextract Stop chromium → extract JSON" - @echo " slack-tokens-refresh Login (noVNC) → wait → extract JSON" - @echo "" - @echo " Discord session (user token extraction)" - @echo " discord-login Start discord-chromium (noVNC http://127.0.0.1:7901)" - @echo " discord-wait-profile Wait until Discord login wrote Cookies + LevelDB" - @echo " discord-login-stop Stop discord-chromium before extract" - @echo " extract-discord-tokens Extract token to workspace JSON (one-shot)" - @echo " discord-tokens-reextract Stop chromium → extract JSON" - @echo " discord-tokens-refresh Login (noVNC) → wait → extract JSON" - @echo "" @echo " Utilities" @echo " clean-mac Remove macOS ._* resource-fork files" @echo " clean-pyc Remove compiled Python files" @@ -184,63 +168,6 @@ test-fast: test-cov: python -m pytest --tb=short --cov=. 
--cov-report=term-missing -# ── Slack session ───────────────────────────────────────────────────────────── - -.PHONY: slack-login slack-wait-profile slack-login-stop extract-slack-tokens \ - slack-tokens-reextract slack-tokens-refresh - -slack-login: - @mkdir -p workspace/slack_event_handler/chrome_profile - $(COMPOSE) --profile slack-session up -d --force-recreate slack-chromium - @echo "Open http://127.0.0.1:7900 and sign in at https://app.slack.com (wait until Slack is fully loaded)" - @command -v open >/dev/null 2>&1 && open "http://127.0.0.1:7900" || true - -slack-wait-profile: - @chmod +x scripts/wait_slack_chrome_profile.sh - @./scripts/wait_slack_chrome_profile.sh - -slack-login-stop: - $(COMPOSE) --profile slack-session stop slack-chromium - -extract-slack-tokens: slack-login-stop - $(MANAGE) extract_slack_tokens - -# Profile already exists (re-extract without opening noVNC again). -slack-tokens-reextract: extract-slack-tokens - -# Login in noVNC, wait for profile files, then extract JSON. 
-slack-tokens-refresh: slack-login slack-wait-profile extract-slack-tokens - -# ── Discord session ─────────────────────────────────────────────────────────── - -.PHONY: discord-login discord-wait-profile discord-login-stop extract-discord-tokens \ - discord-tokens-reextract discord-tokens-refresh - -discord-login: - @mkdir -p workspace/discord_activity_tracker/chrome_profile - @rm -f workspace/discord_activity_tracker/chrome_profile/SingletonLock \ - workspace/discord_activity_tracker/chrome_profile/SingletonCookie \ - workspace/discord_activity_tracker/chrome_profile/SingletonSocket - $(COMPOSE) --profile discord-session up -d --force-recreate discord-chromium - @echo "noVNC (password: secret) — Chrome does NOT open automatically:" - @echo " http://127.0.0.1:7901/?autoconnect=1&resize=scale&password=secret" - @echo "Right-click desktop → Web Browsing → Google Chrome → https://discord.com" - @command -v open >/dev/null 2>&1 && open "http://127.0.0.1:7901/?autoconnect=1&resize=scale&password=secret" || true - -discord-wait-profile: - @chmod +x scripts/wait_discord_chrome_profile.sh - @./scripts/wait_discord_chrome_profile.sh - -discord-login-stop: - $(COMPOSE) --profile discord-session stop discord-chromium - -extract-discord-tokens: discord-login-stop - $(MANAGE) extract_discord_tokens - -discord-tokens-reextract: extract-discord-tokens - -discord-tokens-refresh: discord-login discord-wait-profile extract-discord-tokens - # ── Utilities ───────────────────────────────────────────────────────────────── .PHONY: clean-mac diff --git a/README.md b/README.md index ae863625..537a370e 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ python -m pytest github_activity_tracker/tests/test_sync_utils.py -v CI runs pytest with coverage (`--cov`, HTML/XML reports). To match a **local** coverage gate, use **`--cov-fail-under=90`** (see step 5 above). 
If coverage fails locally or you need a fresh test DB schema after model changes, run once with `python -m pytest --create-db`. -**Pyright (local):** with dev dependencies installed (`uv pip install -r requirements-dev.lock`), run **`uv run pyright`** from the repo root to match the **`pyright`** CI job (`pyrightconfig.json` scopes `core`, `github_activity_tracker`, `discord_activity_tracker`, `cppa_slack_tracker`, `cppa_user_tracker`, and `cppa_pinecone_sync`). +**Pyright (local):** with dev dependencies installed (`uv pip install -r requirements-dev.lock`), run **`uv run pyright`** from the repo root to match the **`pyright`** CI job (`pyrightconfig.json` scopes `core`, `github_activity_tracker`, `cppa_slack_tracker`, `cppa_user_tracker`, and `cppa_pinecone_sync`). See [docs/Development_guideline.md](docs/Development_guideline.md#testing-workflow) for when to run tests during development. @@ -231,7 +231,7 @@ Typical top-level layout after clone (folder name is usually **`boost-data-colle │ ├── shared/ │ ├── scripts/ │ ├── github_activity_tracker/ -│ └── … # e.g. boost_library_tracker/, discord_activity_tracker/, … +│ └── … # e.g. boost_library_tracker/, cppa_slack_tracker/, … ├── scripts/ # Repo maintenance and codegen helpers ├── core/ # Shared collectors + operations (GitHub, Slack, markdown, files) ├── boost_collector_runner/ # YAML schedule → run_scheduled_collectors @@ -245,9 +245,7 @@ Typical top-level layout after clone (folder name is usually **`boost-data-colle ├── cppa_slack_tracker/ ├── cppa_user_tracker/ ├── cppa_youtube_script_tracker/ -├── discord_activity_tracker/ ├── github_activity_tracker/ -├── slack_event_handler/ └── wg21_paper_tracker/ ``` @@ -274,8 +272,6 @@ Some Django apps include a **README.md** at the app package root when that helps | [`boost_library_usage_dashboard/`](boost_library_usage_dashboard/README.md) | Library usage data for dashboards. | | [`cppa_slack_tracker/`](cppa_slack_tracker/README.md) | CPPA Slack workspace collection. 
| | [`cppa_user_tracker/`](cppa_user_tracker/README.md) | CPPA users and GitHub account linkage. | -| [`discord_activity_tracker/`](discord_activity_tracker/README.md) | Discord activity ingestion (exporter + workspace). | -| [`slack_event_handler/`](slack_event_handler/README.md) | Slack Socket Mode listener (dev `runserver` integration). | ## How it works diff --git a/SECURITY.md b/SECURITY.md index 28e893b9..2e81e35f 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -83,9 +83,9 @@ If you operate a deployment and suspect a leak or breach, **rotate** at least th | Category | Examples / environment variables | | --- | --- | -| **GitHub** | `GITHUB_TOKEN`, `GITHUB_TOKENS_SCRAPING` (multi-token pool), `GITHUB_TOKEN_WRITE`; PAT-style tokens used by integrations (for example `SLACK_PR_BOT_GITHUB_TOKEN` if it is a PAT) | +| **GitHub** | `GITHUB_TOKEN`, `GITHUB_TOKENS_SCRAPING` (multi-token pool), `GITHUB_TOKEN_WRITE` | | **Slack** | `SLACK_BOT_TOKEN_`, `SLACK_APP_TOKEN_` | -| **Discord** | `DISCORD_TOKEN` | +| **Notifications** | `DISCORD_WEBHOOK_URL`, `SLACK_WEBHOOK_URL` (optional error alerting) | | **Pinecone** | `PINECONE_API_KEY`, `PINECONE_PRIVATE_API_KEY`, and any host/index settings that grant write access | | **YouTube** | `YOUTUBE_API_KEY` | diff --git a/STABILITY.md b/STABILITY.md index c886ce16..745d17b3 100644 --- a/STABILITY.md +++ b/STABILITY.md @@ -61,7 +61,6 @@ These per-collector command names appear in [config/boost_collector_schedule.yam - `collect_boost_libraries` - `run_wg21_paper_tracker` - `run_cppa_slack_tracker` -- `run_discord_activity_tracker` - `run_boost_mailing_list_tracker` Other `manage.py` commands exist for manual runs, backfills, and development; only commands **listed in your deployed schedule YAML** (plus **`run_scheduled_collectors`**) are Tier A for that deployment. @@ -149,7 +148,7 @@ No compatibility promise. May change in any release without deprecation. - Imports of tracker internals bypassing `sync_api` (e.g. 
`github_activity_tracker.fetcher`, `cppa_pinecone_sync.sync` from apps covered by import-linter). - Workspace directory layouts under `WORKSPACE_DIR`, except paths explicitly documented in [`.env.example`](.env.example) and [docs/Workspace.md](docs/Workspace.md). **Per-app JSON schemas** under `workspace/` are not stable. - Docker Compose service names (`web`, `celery_worker`, `celery_beat`) and host ports are not Tier A unless documented here in a future release. -- `slack_event_handler` internals, management commands not in your schedule, scripts under `scripts/`, tests, and Django admin customization. +- Optional apps registered via `config/local_settings.py`, management commands not in your schedule, scripts under `scripts/`, tests, and Django admin customization. ## Deprecation diff --git a/boost_collector_runner/schedule_config.py b/boost_collector_runner/schedule_config.py index 24936e6d..6438028e 100644 --- a/boost_collector_runner/schedule_config.py +++ b/boost_collector_runner/schedule_config.py @@ -74,6 +74,27 @@ ) +def resolve_schedule_yaml_path( + *, + base_dir: Path, + env_path: str = "", +) -> Path: + """ + Resolve the collector schedule YAML path. + + Precedence: ``env_path`` (from parent ``BOOST_COLLECTOR_SCHEDULE_YAML`` in ``.env``), + then the default ``config/boost_collector_schedule.yaml`` under ``base_dir``. + Relative paths are resolved under ``base_dir``. 
+ """ + raw = (env_path or "").strip() + if not raw: + return (base_dir / "config" / "boost_collector_schedule.yaml").resolve() + path = Path(raw) + if not path.is_absolute(): + path = base_dir / path + return path.resolve() + + class ScheduleConfigurationError(ImproperlyConfigured): """Raised when the collector schedule YAML is missing or invalid in strict mode.""" diff --git a/boost_collector_runner/tests/test_schedule_config.py b/boost_collector_runner/tests/test_schedule_config.py index 8b946af9..84724aba 100644 --- a/boost_collector_runner/tests/test_schedule_config.py +++ b/boost_collector_runner/tests/test_schedule_config.py @@ -13,13 +13,42 @@ DEFAULT_GROUP_BATCH_SCHEDULE_KIND, INTERVAL_MINUTES_MAX, ScheduleConfigurationError, + ensure_schedule_yaml_loaded, get_beat_schedule, + get_groups_and_tasks, get_tasks_for_schedule, + is_schedule_strict, + iter_beat_schedule_entry_keys, load_config, + resolve_schedule_yaml_path, + _parse_time, _validate_task, ) +# --- resolve_schedule_yaml_path --- + + +def test_resolve_schedule_yaml_path_default(tmp_path): + assert ( + resolve_schedule_yaml_path(base_dir=tmp_path) + == (tmp_path / "config" / "boost_collector_schedule.yaml").resolve() + ) + + +def test_resolve_schedule_yaml_path_from_env_relative(tmp_path): + path = resolve_schedule_yaml_path( + base_dir=tmp_path, + env_path="custom/schedule.yaml", + ) + assert path == (tmp_path / "custom" / "schedule.yaml").resolve() + + +def test_resolve_schedule_yaml_path_from_env_absolute(tmp_path): + custom = tmp_path / "abs" / "schedule.yaml" + assert resolve_schedule_yaml_path(base_dir=tmp_path, env_path=str(custom)) == custom + + # --- load_config validation --- @@ -822,3 +851,272 @@ def test_committed_schedule_yaml_loads_non_empty_beat_schedule(settings): assert cmd in registered, f"unknown management command in YAML: {cmd!r}" schedule = get_beat_schedule() assert schedule, "CELERY_BEAT_SCHEDULE must not be empty when committed YAML exists" + + +# --- is_schedule_strict / 
ensure_schedule_yaml_loaded --- + + +def test_is_schedule_strict_explicit_override(): + assert is_schedule_strict(strict=True) is True + assert is_schedule_strict(strict=False) is False + + +@pytest.mark.django_db +def test_is_schedule_strict_from_settings(settings): + settings.DEBUG = True + settings.BOOST_COLLECTOR_SCHEDULE_STRICT = False + assert is_schedule_strict() is False + + settings.DEBUG = False + assert is_schedule_strict() is True + + settings.DEBUG = True + settings.BOOST_COLLECTOR_SCHEDULE_STRICT = True + assert is_schedule_strict() is True + + +def test_ensure_schedule_yaml_loaded_raises_when_missing(tmp_path): + missing = tmp_path / "missing.yaml" + with pytest.raises(ScheduleConfigurationError, match="not found"): + with patch( + "boost_collector_runner.schedule_config._get_yaml_path", + return_value=missing, + ): + ensure_schedule_yaml_loaded() + + +# --- iter_beat_schedule_entry_keys / get_groups_and_tasks --- + + +def test_iter_beat_schedule_entry_keys_matches_get_beat_schedule_keys( + tmp_path, settings +): + yaml_path = tmp_path / "schedule.yaml" + yaml_path.write_text( + yaml.dump( + { + "groups": { + "g1": { + "default_time": "04:10", + "tasks": [ + {"command": "run_foo", "schedule": "daily"}, + { + "command": "run_interval", + "schedule": "interval", + "minutes": 30, + }, + ], + }, + }, + } + ), + encoding="utf-8", + ) + data = load_config(yaml_path) + keys = list(iter_beat_schedule_entry_keys(data)) + assert keys == [ + "boost-collector-group-g1-04-10", + "boost-collector-interval-30min", + ] + + +def test_get_groups_and_tasks_skips_disabled_tasks(tmp_path): + path = tmp_path / "schedule.yaml" + path.write_text( + yaml.dump( + { + "groups": { + "g1": { + "default_time": "04:10", + "tasks": [ + {"command": "run_on", "schedule": "daily"}, + { + "command": "run_off", + "schedule": "daily", + "enabled": False, + }, + ], + }, + }, + } + ), + encoding="utf-8", + ) + data = load_config(path) + groups = get_groups_and_tasks(data=data) + assert 
len(groups) == 1 + assert len(groups[0][2]) == 1 + assert groups[0][2][0]["command"] == "run_on" + + +def test_get_groups_and_tasks_raises_when_default_time_blank(): + data = { + "groups": { + "g2": { + "default_time": " ", + "tasks": [], + }, + }, + } + with pytest.raises(ValueError, match="must have 'default_time'"): + get_groups_and_tasks(data=data) + + +def test_parse_time_rejects_non_numeric_parts(): + with pytest.raises(ValueError, match="Invalid time"): + _parse_time("ab:cd") + + +# --- get_tasks_for_schedule validation --- + + +def test_get_tasks_for_schedule_rejects_invalid_schedule_kind(tmp_path): + data = load_config(_minimal_schedule_yaml(tmp_path)) + with pytest.raises(ValueError, match="schedule_kind must be one of"): + get_tasks_for_schedule("not-a-schedule", data=data) + + +def test_get_tasks_for_schedule_weekly_requires_day_of_week(tmp_path): + data = load_config(_minimal_schedule_yaml(tmp_path)) + with pytest.raises(ValueError, match="day_of_week required"): + get_tasks_for_schedule("weekly", data=data) + + +def test_get_tasks_for_schedule_weekly_rejects_invalid_day(tmp_path): + data = load_config(_minimal_schedule_yaml(tmp_path)) + with pytest.raises(ValueError, match="day_of_week must be monday"): + get_tasks_for_schedule("weekly", day_of_week="notaday", data=data) + + +def test_get_tasks_for_schedule_monthly_requires_day_of_month(tmp_path): + data = load_config(_minimal_schedule_yaml(tmp_path)) + with pytest.raises(ValueError, match="day_of_month required"): + get_tasks_for_schedule("monthly", data=data) + + +def test_get_tasks_for_schedule_monthly_rejects_non_integer(tmp_path): + data = load_config(_minimal_schedule_yaml(tmp_path)) + with pytest.raises(ValueError, match="day_of_month must be an integer"): + get_tasks_for_schedule("monthly", day_of_month="x", data=data) + + +def test_get_tasks_for_schedule_monthly_rejects_out_of_range(tmp_path): + data = load_config(_minimal_schedule_yaml(tmp_path)) + with pytest.raises(ValueError, 
match="day_of_month must be 1-31"): + get_tasks_for_schedule("monthly", day_of_month=32, data=data) + + +def test_get_tasks_for_schedule_interval_requires_minutes(tmp_path): + data = load_config(_minimal_schedule_yaml(tmp_path)) + with pytest.raises(ValueError, match="interval_minutes required"): + get_tasks_for_schedule("interval", data=data) + + +def test_get_tasks_for_schedule_interval_rejects_non_integer(tmp_path): + data = load_config(_minimal_schedule_yaml(tmp_path)) + with pytest.raises(ValueError, match="interval_minutes must be an integer"): + get_tasks_for_schedule("interval", interval_minutes="x", data=data) + + +def test_get_tasks_for_schedule_interval_rejects_out_of_range(tmp_path): + data = load_config(_minimal_schedule_yaml(tmp_path)) + with pytest.raises( + ValueError, match=f"interval_minutes must be 1-{INTERVAL_MINUTES_MAX}" + ): + get_tasks_for_schedule( + "interval", + interval_minutes=INTERVAL_MINUTES_MAX + 1, + data=data, + ) + + +def test_get_tasks_for_schedule_daily_and_weekly_filters(tmp_path): + path = tmp_path / "schedule.yaml" + path.write_text( + yaml.dump( + { + "groups": { + "g1": { + "default_time": "04:10", + "tasks": [ + {"command": "run_daily", "schedule": "daily"}, + { + "command": "run_weekly", + "schedule": "weekly", + "on": "tuesday", + }, + ], + }, + }, + } + ), + encoding="utf-8", + ) + data = load_config(path) + daily = get_tasks_for_schedule("daily", data=data) + assert len(daily) == 1 + assert daily[0][1]["command"] == "run_daily" + + weekly = get_tasks_for_schedule("weekly", day_of_week="tuesday", data=data) + assert len(weekly) == 1 + assert weekly[0][1]["command"] == "run_weekly" + + +def test_get_tasks_for_schedule_group_batch_excludes_interval(tmp_path): + path = tmp_path / "schedule.yaml" + path.write_text( + yaml.dump( + { + "groups": { + "g1": { + "default_time": "04:10", + "tasks": [ + {"command": "run_daily", "schedule": "daily"}, + { + "command": "run_interval", + "schedule": "interval", + "minutes": 15, + }, + 
], + }, + }, + } + ), + encoding="utf-8", + ) + data = load_config(path) + batch = get_tasks_for_schedule(DEFAULT_GROUP_BATCH_SCHEDULE_KIND, data=data) + commands = {t[1]["command"] for t in batch} + assert commands == {"run_daily"} + + +@pytest.mark.django_db +def test_get_beat_schedule_invalid_yaml_non_strict_returns_empty( + tmp_path, caplog, settings +): + settings.DEBUG = True + settings.BOOST_COLLECTOR_SCHEDULE_STRICT = False + bad = tmp_path / "bad.yaml" + bad.write_text(yaml.dump({"groups": []}), encoding="utf-8") + caplog.set_level(logging.WARNING) + schedule = get_beat_schedule(strict=False, yaml_path=bad) + assert schedule == {} + assert any("Invalid schedule YAML" in r.getMessage() for r in caplog.records) + + +def _minimal_schedule_yaml(tmp_path: Path) -> Path: + path = tmp_path / "schedule.yaml" + path.write_text( + yaml.dump( + { + "groups": { + "g1": { + "default_time": "04:10", + "tasks": [{"command": "run_foo", "schedule": "daily"}], + }, + }, + } + ), + encoding="utf-8", + ) + return path diff --git a/config/boost_collector_schedule.yaml b/config/boost_collector_schedule.yaml index e747c5bb..2f0b388c 100644 --- a/config/boost_collector_schedule.yaml +++ b/config/boost_collector_schedule.yaml @@ -43,12 +43,6 @@ groups: - command: run_cppa_slack_tracker schedule: daily - discord: - default_time: "16:40" - tasks: - - command: run_discord_activity_tracker - schedule: daily - mailing_list: default_time: "00:10" tasks: diff --git a/config/boost_collector_schedule.yaml.example b/config/boost_collector_schedule.yaml.example index 71af1d2e..fca40ec7 100644 --- a/config/boost_collector_schedule.yaml.example +++ b/config/boost_collector_schedule.yaml.example @@ -47,9 +47,6 @@ groups: tasks: - command: run_cppa_slack_tracker schedule: daily - - command: run_discord_activity_tracker - schedule: interval - minutes: 60 mailing_list: default_time: "00:10" diff --git a/config/settings.py b/config/settings.py index e9e78b55..fdc22e6d 100644 --- 
a/config/settings.py +++ b/config/settings.py @@ -12,6 +12,27 @@ # Build paths BASE_DIR = Path(__file__).resolve().parent.parent +# Optional machine-specific Django overrides (config/local_settings.py). +try: + from . import local_settings as _local_settings +except ModuleNotFoundError as exc: + if exc.name not in {"config.local_settings", "local_settings"}: + raise + _local_settings = None +except ImportError as exc: + # Missing submodule file raises ImportError(name=__package__), not ModuleNotFoundError. + if getattr(exc, "name", None) != __package__: + raise + _local_settings = None + +_local_app_dir = None +if _local_settings is not None: + _local_app_dir = getattr(_local_settings, "LOCAL_APP_DIR", None) + if _local_app_dir: + _local_app_root = (BASE_DIR / _local_app_dir).resolve() + if _local_app_root.is_dir(): + sys.path.insert(0, str(_local_app_root)) + # Load environment env = environ.Env( DEBUG=(bool, False), @@ -68,11 +89,9 @@ "cppa_pinecone_sync", "clang_github_tracker", "cppa_slack_tracker", - "discord_activity_tracker", "reddit_activity_tracker", "wg21_paper_tracker", "cppa_youtube_script_tracker", - "slack_event_handler", ] MIDDLEWARE = [ @@ -162,16 +181,19 @@ "boost_library_usage_dashboard", "boost_usage_tracker", "cppa_slack_tracker", - "discord_activity_tracker", "reddit_activity_tracker", "boost_mailing_list_tracker", "wg21_paper_tracker", "cppa_youtube_script_tracker", - "slack_event_handler", "shared", ) +_EXTRA_WORKSPACE_SLUGS = ( + tuple(getattr(_local_settings, "EXTRA_WORKSPACE_APP_SLUGS", ())) + if _local_settings is not None + else () +) WORKSPACE_DIR.mkdir(parents=True, exist_ok=True) -for _slug in _WORKSPACE_APP_SLUGS: +for _slug in (*_WORKSPACE_APP_SLUGS, *_EXTRA_WORKSPACE_SLUGS): (WORKSPACE_DIR / _slug).mkdir(parents=True, exist_ok=True) # Orphan workspace cleanup (github_activity_tracker JSON cache — see docs/Workspace.md) @@ -351,7 +373,7 @@ ).strip() -# Slack (bot + app token for operations.slack_ops and slack_event_handler) +# Slack 
(bot + app token for operations.slack_ops and cppa_slack_tracker) # SLACK_BOT_TOKEN: built from env (prefixed vars). In settings it is a dict (team_id -> token). # Env: SLACK_TEAM_IDS=id1,id2 and SLACK_BOT_TOKEN_id1=xoxb-..., etc. @@ -416,128 +438,6 @@ def _slack_team_scope_from_env(): SLACK_TEAM_SCOPE = _slack_team_scope_from_env() -ALLOW_INTERNAL_SLACK_TOKENS = ( - env("ALLOW_INTERNAL_SLACK_TOKENS", default="") or "" -).strip().lower() == "true" -SLACK_INTERNAL_TOKENS_JSON = ( - env("SLACK_INTERNAL_TOKENS_JSON", default="") or "" -).strip() -# xoxc/xoxd are read at runtime from workspace JSON (see slack_internal_tokens_store), -# not loaded into settings at Django startup. -SLACK_XOXC_TOKEN = "" -SLACK_XOXD_TOKEN = "" -# Chrome user-data dir for Slack xoxc/xoxd extraction (logged-in session on disk) -_DEFAULT_CHROME_PROFILE = str(WORKSPACE_DIR / "slack_event_handler" / "chrome_profile") -CHROME_PROFILE_PATH = ( - env("CHROME_PROFILE_PATH", default=_DEFAULT_CHROME_PROFILE) or "" -).strip() - -# Slack PR Bot configuration (for slack_event_handler) -SLACK_PR_BOT_TEAM = (env("SLACK_PR_BOT_TEAM", default="") or "").strip() -SLACK_PR_BOT_GITHUB_TOKEN = (env("SLACK_PR_BOT_GITHUB_TOKEN", default="") or "").strip() -SLACK_PR_BOT_CHANNEL_NAME = ( - env("SLACK_PR_BOT_CHANNEL_NAME", default="slack-bot") or "slack-bot" -).strip() -SLACK_PR_BOT_COMMENT_TEMPLATE = ( - env( - "SLACK_PR_BOT_COMMENT_TEMPLATE", - default="Automated comment from Slack bot.", - ) - or "" -).strip() or "Automated comment from Slack bot." 
-SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = int( - env("SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW", default="5") or "5" -) -SLACK_PR_BOT_COMMENTS_WINDOW_SECONDS = int( - env("SLACK_PR_BOT_COMMENTS_WINDOW_SECONDS", default="3600") or "3600" -) - -# Discord configuration (for discord_activity_tracker) -DISCORD_TOKEN = (env("DISCORD_TOKEN", default="") or "").strip() -DISCORD_USER_TOKEN = (env("DISCORD_USER_TOKEN", default="") or "").strip() -ALLOW_INTERNAL_DISCORD_TOKENS = ( - env("ALLOW_INTERNAL_DISCORD_TOKENS", default="") or "" -).strip().lower() == "true" -DISCORD_INTERNAL_TOKENS_JSON = ( - env("DISCORD_INTERNAL_TOKENS_JSON", default="") or "" -).strip() -# Chrome user-data dir for Discord user token extraction (logged-in session on disk) -_DEFAULT_DISCORD_CHROME_PROFILE = str( - WORKSPACE_DIR / "discord_activity_tracker" / "chrome_profile" -) -DISCORD_CHROME_PROFILE_PATH = ( - env("DISCORD_CHROME_PROFILE_PATH", default=_DEFAULT_DISCORD_CHROME_PROFILE) or "" -).strip() -_discord_server_id_str = (env("DISCORD_SERVER_ID", default="") or "").strip() -DISCORD_SERVER_ID: int | None = ( - int(_discord_server_id_str) if _discord_server_id_str.isdigit() else None -) -# Comma-separated channel snowflake IDs to scrape; empty = scrape all channels -_discord_channel_ids_str = (env("DISCORD_CHANNEL_IDS", default="") or "").strip() -DISCORD_CHANNEL_IDS: list[int] = [ - int(c.strip()) for c in _discord_channel_ids_str.split(",") if c.strip().isdigit() -] -DISCORD_CONTEXT_REPO_PATH = Path( - env( - "DISCORD_CONTEXT_REPO_PATH", - default=str( - WORKSPACE_DIR - / "discord_activity_tracker" - / "discord-cplusplus-together-context" - ), - ) -).resolve() -# Full path to DiscordChatExporter CLI executable (optional). 
-# Default: workspace/discord_activity_tracker/script/DiscordChatExporter.Cli.exe (Windows) -# or .../DiscordChatExporter.Cli (macOS/Linux) -# Releases: https://github.com/Tyrrrz/DiscordChatExporter/releases/latest -_discord_chat_exporter_cli = ( - env("DISCORD_CHAT_EXPORTER_CLI", default="") or "" -).strip() -DISCORD_CHAT_EXPORTER_CLI: str | None = ( - _discord_chat_exporter_cli if _discord_chat_exporter_cli else None -) -# Run via ``dotnet /path/to/DiscordChatExporter.Cli.dll`` (uses system .NET host; avoids blocked -# bundled libhostfxr on macOS external volumes / quarantine). Requires ``dotnet`` on PATH or -# DISCORD_CHAT_EXPORTER_DOTNET below. -_discord_chat_exporter_dotnet_dll = ( - env("DISCORD_CHAT_EXPORTER_DOTNET_DLL", default="") or "" -).strip() -DISCORD_CHAT_EXPORTER_DOTNET_DLL: str | None = ( - _discord_chat_exporter_dotnet_dll if _discord_chat_exporter_dotnet_dll else None -) -_discord_chat_exporter_dotnet = ( - env("DISCORD_CHAT_EXPORTER_DOTNET", default="") or "" -).strip() -DISCORD_CHAT_EXPORTER_DOTNET: str | None = ( - _discord_chat_exporter_dotnet if _discord_chat_exporter_dotnet else None -) -# macOS: run ``xattr -cr`` on the CLI bundle directory before export (helps when Gatekeeper -# blocks downloaded binaries; use only if you trust the DiscordChatExporter files). -DISCORD_CHAT_EXPORTER_MACOS_CLEAR_QUARANTINE = env.bool( - "DISCORD_CHAT_EXPORTER_MACOS_CLEAR_QUARANTINE", default=False -) -# DiscordChatExporter --parallel (default 1: lower RAM; 3+ can SIGKILL/OOM on small machines). -DISCORD_CHAT_EXPORTER_PARALLEL: int = env.int( - "DISCORD_CHAT_EXPORTER_PARALLEL", default=1 -) -# Voice channels are rarely needed for text analytics; excluding them cuts exportguild work. -DISCORD_CHAT_EXPORTER_INCLUDE_VC = env.bool( - "DISCORD_CHAT_EXPORTER_INCLUDE_VC", default=False -) -# One CLI process per channel after `channels` listing — slower but avoids macOS SIGKILL on exportguild. 
-DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT = env.bool( - "DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT", - default=(sys.platform == "darwin"), -) -PINECONE_DISCORD_APP_TYPE: str = ( - env("PINECONE_DISCORD_APP_TYPE", default="discord-together-c-cpp") - or "discord-together-c-cpp" -).strip() -PINECONE_DISCORD_NAMESPACE: str = ( - env("PINECONE_DISCORD_NAMESPACE", default="discord-together-c-cpp") - or "discord-together-c-cpp" -).strip() # Reddit configuration (for reddit_activity_tracker) REDDIT_CLIENT_ID = (env("REDDIT_CLIENT_ID", default="") or "").strip() @@ -668,14 +568,18 @@ def _slack_team_scope_from_env(): # Schedule from YAML (boost_collector_runner). Strict mode (DEBUG=False or BOOST_COLLECTOR_SCHEDULE_STRICT): # missing/invalid YAML raises ScheduleConfigurationError at import time. In strict mode, any other # load failure is also re-raised after logging. Non-strict: unexpected errors fall back to {}. -BOOST_COLLECTOR_SCHEDULE_YAML = BASE_DIR / "config" / "boost_collector_schedule.yaml" +from boost_collector_runner.schedule_config import ( # noqa: E402 + ScheduleConfigurationError, + get_beat_schedule, + resolve_schedule_yaml_path, +) + +BOOST_COLLECTOR_SCHEDULE_YAML = resolve_schedule_yaml_path( + base_dir=BASE_DIR, + env_path=env("BOOST_COLLECTOR_SCHEDULE_YAML", default=""), +) _schedule_strict = BOOST_COLLECTOR_SCHEDULE_STRICT or not DEBUG try: - from boost_collector_runner.schedule_config import ( - ScheduleConfigurationError, - get_beat_schedule, - ) - # Pass strict and yaml_path explicitly; settings proxy is not ready during this import. CELERY_BEAT_SCHEDULE = get_beat_schedule( strict=_schedule_strict, @@ -737,13 +641,10 @@ def _slack_team_scope_from_env(): YOUTUBE_DEFAULT_PUBLISHED_AFTER = ( env("YOUTUBE_DEFAULT_PUBLISHED_AFTER", default="") or "" ).strip() -# You can add your own Django apps here by adding them to the EXTRA_INSTALLED_APPS list in config/local_settings.py. -try: - from . 
import local_settings as _local_settings - - _LOCAL_EXTRA_INSTALLED_APPS = tuple( - getattr(_local_settings, "EXTRA_INSTALLED_APPS", ()) - ) -except ImportError: - _LOCAL_EXTRA_INSTALLED_APPS = () +# Optional extra apps via config/local_settings.py (EXTRA_INSTALLED_APPS). +_LOCAL_EXTRA_INSTALLED_APPS = ( + tuple(getattr(_local_settings, "EXTRA_INSTALLED_APPS", ())) + if _local_settings is not None + else () +) INSTALLED_APPS = [*INSTALLED_APPS, *_LOCAL_EXTRA_INSTALLED_APPS] diff --git a/config/test_settings.py b/config/test_settings.py index d5280b16..07cfe11e 100644 --- a/config/test_settings.py +++ b/config/test_settings.py @@ -13,7 +13,10 @@ from django.core.exceptions import ImproperlyConfigured from .settings import * # noqa: F401, F403 -from .settings import DATABASES # explicit import for ruff F405 (after star import) +from .settings import ( # explicit imports for ruff F405 (after star import) + DATABASES, + _EXTRA_WORKSPACE_SLUGS, +) # Never run workspace orphan cleanup during tests (CoreConfig.ready). WORKSPACE_ORPHAN_CLEANUP_ENABLED = False @@ -82,15 +85,11 @@ "github_activity_tracker", "boost_library_tracker", "clang_github_tracker", - "discord_activity_tracker", "reddit_activity_tracker", "shared", + *_EXTRA_WORKSPACE_SLUGS, ): (WORKSPACE_DIR / _slug).mkdir(parents=True, exist_ok=True) -# Base settings computed DISCORD_CONTEXT_REPO_PATH before WORKSPACE_DIR was overridden above. -DISCORD_CONTEXT_REPO_PATH = ( - WORKSPACE_DIR / "discord_activity_tracker" / "discord-cplusplus-together-context" -).resolve() LOG_DIR = _test_dir / "logs" LOG_DIR.mkdir(exist_ok=True) @@ -105,11 +104,3 @@ CLANG_GITHUB_CONTEXT_REPO_OWNER = "" CLANG_GITHUB_CONTEXT_REPO_NAME = "" CLANG_GITHUB_CONTEXT_REPO_BRANCH = "" - -# Tests patch a single subprocess.Popen for DiscordChatExporter. 
-DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT = False - -# Tests set DISCORD_USER_TOKEN via monkeypatch; do not inherit internal-token mode -# from developer .env (get_or_load_discord_user_token would ignore env token). -ALLOW_INTERNAL_DISCORD_TOKENS = False -DISCORD_USER_TOKEN = "" diff --git a/config/tests/test_settings_slack_helpers.py b/config/tests/test_settings_slack_helpers.py new file mode 100644 index 00000000..a5c6fc1f --- /dev/null +++ b/config/tests/test_settings_slack_helpers.py @@ -0,0 +1,56 @@ +"""Tests for Slack-related settings helpers in config.settings.""" + +import pytest + +from config.settings import ( + _slack_per_team_tokens_from_env, + _slack_team_ids_from_env, + _slack_team_scope_from_env, +) + + +@pytest.fixture(autouse=True) +def _clear_slack_env(monkeypatch): + for key in list(__import__("os").environ): + if key.startswith("SLACK_"): + monkeypatch.delenv(key, raising=False) + + +def test_slack_team_ids_from_env_empty(monkeypatch): + monkeypatch.delenv("SLACK_TEAM_IDS", raising=False) + assert _slack_team_ids_from_env() == [] + + +def test_slack_team_ids_from_env_parses_comma_separated(monkeypatch): + monkeypatch.setenv("SLACK_TEAM_IDS", " T1 , T2, ,T3 ") + assert _slack_team_ids_from_env() == ["T1", "T2", "T3"] + + +def test_slack_per_team_tokens_from_env(monkeypatch): + monkeypatch.setenv("SLACK_TEAM_IDS", "T1,T2") + monkeypatch.setenv("SLACK_BOT_TOKEN_T1", "xoxb-one") + monkeypatch.setenv("SLACK_BOT_TOKEN_T2", "") + assert _slack_per_team_tokens_from_env("SLACK_BOT_TOKEN") == {"T1": "xoxb-one"} + + +def test_slack_team_scope_defaults_when_missing(monkeypatch): + monkeypatch.setenv("SLACK_TEAM_IDS", "T1") + assert _slack_team_scope_from_env() == {"T1": [0, 1]} + + +def test_slack_team_scope_parses_valid_entries(monkeypatch): + monkeypatch.setenv("SLACK_TEAM_IDS", "T1") + monkeypatch.setenv("SLACK_TEAM_SCOPE_T1", "0, 1") + assert _slack_team_scope_from_env() == {"T1": [0, 1]} + + +def 
test_slack_team_scope_skips_invalid_and_defaults(monkeypatch): + monkeypatch.setenv("SLACK_TEAM_IDS", "T1") + monkeypatch.setenv("SLACK_TEAM_SCOPE_T1", "99, bogus, 1") + assert _slack_team_scope_from_env() == {"T1": [1]} + + +def test_slack_team_scope_all_invalid_falls_back_to_both(monkeypatch): + monkeypatch.setenv("SLACK_TEAM_IDS", "T1") + monkeypatch.setenv("SLACK_TEAM_SCOPE_T1", "99, abc") + assert _slack_team_scope_from_env() == {"T1": [0, 1]} diff --git a/core/errors.py b/core/errors.py index 40e180fd..f3986ab0 100644 --- a/core/errors.py +++ b/core/errors.py @@ -279,11 +279,6 @@ def classify_failure(exc: BaseException) -> CollectorFailureCategory: for mod_name, exc_name in ( ("github_activity_tracker.api_schemas", "GitHubApiValidationError"), ("cppa_slack_tracker.api_schemas", "SlackApiValidationError"), - ("discord_activity_tracker.staging_schema", "StagingValidationError"), - ( - "discord_activity_tracker.api_schemas", - "DiscordLiveSyncValidationError", - ), ): try: import importlib diff --git a/core/operations/README.md b/core/operations/README.md index de4458f5..83084cda 100644 --- a/core/operations/README.md +++ b/core/operations/README.md @@ -8,7 +8,7 @@ Shared **external I/O** (GitHub, Slack, markdown export, filenames). Not a separ | --- | --- | --- | | [`github_ops/`](github_ops/) | [github_ops/README.md](github_ops/README.md) | GitHub REST/GraphQL client, tokens, git clone/push/upload. | | [`slack_ops/`](slack_ops/) | [slack_ops/README.md](slack_ops/README.md) | Slack API client, tokens, channels, messages, file fetch. | -| [`md_ops/`](md_ops/) | [md_ops/README.md](md_ops/README.md) | JSON/HTML → Markdown (issues, PRs, transcripts, GitHub export). | +| [`md_ops/`](md_ops/) | [md_ops/README.md](md_ops/README.md) | JSON/HTML → Markdown (issues, PRs, GitHub export). | | [`file_ops/`](file_ops/) | [file_ops/README.md](file_ops/README.md) | Cross-platform `sanitize_filename`. 
| ## Docs diff --git a/core/operations/md_ops/README.md b/core/operations/md_ops/README.md index d52218a6..35a2ce10 100644 --- a/core/operations/md_ops/README.md +++ b/core/operations/md_ops/README.md @@ -9,10 +9,9 @@ Convert GitHub/Slack payloads and HTML into Markdown files on disk. | [`issue_to_md.py`](issue_to_md.py) | GitHub issue JSON → Markdown. | | [`pr_to_md.py`](pr_to_md.py) | Pull request JSON → Markdown (comments, diffs). | | [`html_to_md.py`](html_to_md.py) | Generic HTML → Markdown (`HTMLToMarkdownConverter`). | -| [`transcript.py`](transcript.py) | Slack huddle / export JSON → transcript Markdown. | | [`github_export.py`](github_export.py) | Batch export layout for GitHub activity workspace trees. | | [`_write.py`](_write.py) | Shared `write_markdown()` helper. | ## Tests -[`../../tests/operations/`](../../tests/operations/) (`test_*_md*.py`, `test_github_export*.py`, `test_stdlib_html_to_md.py`, `test_transcript_ops_more.py`) +[`../../tests/operations/`](../../tests/operations/) (`test_*_md*.py`, `test_github_export*.py`, `test_stdlib_html_to_md.py`) diff --git a/core/operations/md_ops/__init__.py b/core/operations/md_ops/__init__.py index ef6bd3b7..71f23ab6 100644 --- a/core/operations/md_ops/__init__.py +++ b/core/operations/md_ops/__init__.py @@ -12,24 +12,14 @@ ) from core.operations.md_ops.issue_to_md import issue_json_to_md from core.operations.md_ops.pr_to_md import pr_json_to_md -from core.operations.md_ops.transcript import ( - generate_transcript_from_json, - parse_datetime_range, - parse_html_summary, - write_huddle_transcript_md, -) __all__ = [ "HTMLToMarkdownConverter", "convert_html_file_to_markdown", "detect_renames", "detect_renames_from_dirs", - "generate_transcript_from_json", "html_to_markdown", "issue_json_to_md", - "parse_datetime_range", - "parse_html_summary", "pr_json_to_md", - "write_huddle_transcript_md", "write_md_files", ] diff --git a/core/operations/md_ops/transcript.py b/core/operations/md_ops/transcript.py deleted 
file mode 100644 index 50cdab5a..00000000 --- a/core/operations/md_ops/transcript.py +++ /dev/null @@ -1,319 +0,0 @@ -""" -Transcript markdown: parse huddle HTML/JSON and write transcript .md files. -Used by slack_event_handler; caller provides channel_name and user_info_map (from Slack). -""" - -from __future__ import annotations - -import logging -import re -from datetime import datetime -from pathlib import Path - -import pytz - -from core.operations.md_ops._write import write_markdown - -logger = logging.getLogger(__name__) -PST = pytz.timezone("America/Los_Angeles") - - -def parse_datetime_range(datetime_str: str, date_str: str | None = None) -> str: - """Parse datetime string and convert to PST formatted range.""" - try: - tz_match = re.search(r"\s+([A-Z]{2,4})\s*$", datetime_str) - input_tz_str = tz_match.group(1) if tz_match else None - tz_map = { - "PST": "America/Los_Angeles", - "PDT": "America/Los_Angeles", - "EST": "America/New_York", - "EDT": "America/New_York", - "CST": "America/Chicago", - "CDT": "America/Chicago", - "MST": "America/Denver", - "MDT": "America/Denver", - "UTC": "UTC", - "GMT": "UTC", - } - if input_tz_str and input_tz_str.upper() in tz_map: - source_tz = pytz.timezone(tz_map[input_tz_str.upper()]) - else: - source_tz = PST - time_match = re.search( - r"(\d+:\d+:\d+\s+[AP]M)\s*-\s*(\d+:\d+:\d+\s+[AP]M)", datetime_str - ) - if not time_match: - return datetime_str - start_time_str, end_time_str = time_match.group(1), time_match.group(2) - if date_str: - try: - parsed_date = ( - datetime.strptime(date_str, "%m/%d/%y") - if len(date_str.split("/")[-1]) == 2 - else datetime.strptime(date_str, "%m/%d/%Y") - ) - date_naive = parsed_date - except ValueError: - date_naive = datetime.now(PST).replace(tzinfo=None) - else: - date_naive = datetime.now(PST).replace(tzinfo=None) - try: - start_dt_naive = datetime.strptime( - f"{date_naive.strftime('%Y-%m-%d')} {start_time_str}", - "%Y-%m-%d %I:%M:%S %p", - ) - end_dt_naive = datetime.strptime( - 
f"{date_naive.strftime('%Y-%m-%d')} {end_time_str}", - "%Y-%m-%d %I:%M:%S %p", - ) - start_dt_source = source_tz.localize(start_dt_naive) - end_dt_source = source_tz.localize(end_dt_naive) - start_dt = start_dt_source.astimezone(PST) - end_dt = end_dt_source.astimezone(PST) - if end_dt < start_dt: - from datetime import timedelta - - end_dt_naive_next = end_dt_naive + timedelta(days=1) - end_dt_source = source_tz.localize(end_dt_naive_next) - end_dt = end_dt_source.astimezone(PST) - return f"{start_dt.strftime('%Y-%m-%d_%H-%M')} PST - {end_dt.strftime('%Y-%m-%d_%H-%M')} PST" - except ValueError: - return datetime_str - except Exception: - return datetime_str - - -def parse_html_summary(html_content: str) -> dict: - """Parse HTML content from Slack huddle summary file.""" - html_data = { - "channel_id": None, - "attendee_ids": [], - "datetime": "", - "datetime_formatted": "", - } - try: - channel_match = re.search(r"#(C[A-Z0-9]+)", html_content) - if channel_match: - html_data["channel_id"] = channel_match.group(1) - date_match = re.search(r"Huddle notes:\s*(\d+/\d+/\d+)", html_content) - date_str = date_match.group(1) if date_match else None - datetime_match = re.search(r"([^<]+)", html_content) - if datetime_match: - datetime_str = datetime_match.group(1) - html_data["datetime"] = datetime_str - html_data["datetime_formatted"] = parse_datetime_range( - datetime_str, date_str - ) - attendees_match = re.search( - r"]*>.*?Attendees.*?.*?]*>(.*?)

", - html_content, - re.DOTALL | re.IGNORECASE, - ) - if attendees_match: - attendees_section = attendees_match.group(1) - attendee_matches = re.findall(r"@(U[A-Z0-9]+)", attendees_section) - seen: set[str] = set() - html_data["attendee_ids"] = [ - x for x in attendee_matches if x not in seen and not seen.add(x) - ] - else: - attendee_matches = re.findall(r"@(U[A-Z0-9]+)", html_content) - seen = set() - html_data["attendee_ids"] = [ - x for x in attendee_matches if x not in seen and not seen.add(x) - ] - except Exception as e: - logger.debug("Error parsing HTML: %s", e) - return html_data - - -def replace_user_ids_with_usernames(markdown_content: str, user_info_map: dict) -> str: - """Replace user IDs with usernames in markdown.""" - - def replace_user_id(match: re.Match) -> str: - user_id = match.group(1) - if user_id in user_info_map: - u = user_info_map[user_id] - username = ( - u.get("display_name") or u.get("real_name") or u.get("name", user_id) - ) - return f"**@{username}**" - return match.group(0) - - return re.sub(r"@(U[A-Z0-9]+)", replace_user_id, markdown_content) - - -def replace_channel_ids_with_names( - markdown_content: str, channel_id: str | None, channel_name: str -) -> str: - """Replace channel IDs with channel names in markdown.""" - if channel_id and channel_name: - markdown_content = re.sub( - rf"#({re.escape(channel_id)})", f"#{channel_name}", markdown_content - ) - return markdown_content - - -def generate_transcript_from_json(result_json: dict) -> list[dict]: - """Generate transcript entries from Slack huddle result JSON.""" - transcript = [] - try: - file_data = result_json.get("file", {}) - transcription = file_data.get("huddle_transcription", {}) - blocks = transcription.get("blocks", []) - if isinstance(blocks, dict): - blocks = blocks.get("elements", []) - if not isinstance(blocks, list): - blocks = [] - for block in blocks: - if not isinstance(block, dict): - continue - elements = block.get("elements", []) - for element in elements: - 
if element.get("type") == "rich_text_section": - section_elements = element.get("elements", []) - user_id = None - time_str = "" - content_parts = [] - for sub_elem in section_elements: - if sub_elem.get("type") == "user": - user_id = sub_elem.get("user_id") - elif sub_elem.get("type") == "text": - text = sub_elem.get("text", "") - time_m = re.match(r"^\s*\[(\d+:\d+)\]:\s*$", text) - if time_m: - time_str = time_m.group(1) - else: - content_parts.append(text) - if user_id and (time_str or content_parts): - transcript.append( - { - "user_id": user_id, - "time": time_str, - "content": "".join(content_parts).strip(), - } - ) - except Exception as e: - logger.debug("Error parsing transcript: %s", e) - return transcript - - -def write_huddle_transcript_md( - output_dir: str | Path, - *, - html_content: str, - result_json: dict, - channel_name: str, - user_info_map: dict, - summary_markdown: str, -) -> Path | None: - """ - Build and write a huddle transcript markdown file. - - Caller must provide: - - html_content: raw HTML from huddle summary - - result_json: Slack transcript API response (file with huddle_transcription) - - channel_name: Slack channel name (from Slack API) - - user_info_map: dict user_id -> {display_name, real_name, name} - - summary_markdown: HTML converted to markdown, with @user/#channel replaced (caller does html_to_markdown + replace_*) - """ - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - html_data = parse_html_summary(html_content) - transcript = generate_transcript_from_json(result_json) - - if html_data.get("datetime_formatted"): - date_match = re.search( - r"^(\d{4}-\d{2}-\d{2})_\d{2}-\d{2}", html_data["datetime_formatted"] - ) - date_part = ( - date_match.group(1) - if date_match - else datetime.now(PST).strftime("%Y-%m-%d") - ) - time_match = re.search( - r"^\d{4}-\d{2}-\d{2}_(\d{2}-\d{2})", html_data["datetime_formatted"] - ) - time_part = ( - time_match.group(1) if time_match else 
datetime.now(PST).strftime("%H-%M") - ) - date_str = f"{date_part}_{time_part}" - else: - date_str = datetime.now(PST).strftime("%Y-%m-%d_%H-%M") - - usernames = [] - for user_id in html_data["attendee_ids"]: - u = user_info_map.get(user_id, {}) - usernames.append( - u.get("display_name") or u.get("real_name") or u.get("name", user_id) - ) - username_str = "_".join(usernames[:5]) - if len(usernames) > 5: - username_str += "_and_more" - filename = f"{channel_name}_{date_str}_{username_str}.md" - filename = re.sub(r'[<>:"/\\|?*]', "_", filename) - filepath = output_dir / filename - - markdown_lines = [] - title_date = date_str.replace("_", " ") - markdown_lines.append(f"# {channel_name} Huddle - {title_date}") - markdown_lines.append("") - if html_data.get("datetime_formatted"): - markdown_lines.append(f"**datetime:** {html_data['datetime_formatted']} ") - else: - original_datetime = html_data.get("datetime", "") - if original_datetime: - date_match = re.search(r"Huddle notes:\s*(\d+/\d+/\d+)", html_content) - date_str_parse = date_match.group(1) if date_match else None - markdown_lines.append( - f"**datetime:** {parse_datetime_range(original_datetime, date_str_parse)} " - ) - else: - markdown_lines.append( - f"**datetime:** {datetime.now(PST).strftime('%Y-%m-%d_%H-%M')} PST " - ) - markdown_lines.append(f"**location:** #{channel_name} Slack channel ") - markdown_lines.append("**type:** HUDDLE ") - attendee_names = [ - user_info_map[uid].get("display_name") - or user_info_map[uid].get("real_name") - or user_info_map[uid].get("name", uid) - for uid in html_data["attendee_ids"] - if uid in user_info_map - ] - if not attendee_names: - attendees_str = "Unknown" - elif len(attendee_names) == 1: - attendees_str = attendee_names[0] - elif len(attendee_names) == 2: - attendees_str = f"{attendee_names[0]} and {attendee_names[1]}" - else: - attendees_str = ", ".join(attendee_names[:-1]) + f", and {attendee_names[-1]}" - markdown_lines.append(f"**attendees:** {attendees_str} ") 
- markdown_lines.append("") - markdown_lines.append("## Slack AI Summary") - markdown_lines.append("") - if summary_markdown.strip(): - markdown_lines.append(summary_markdown.strip()) - markdown_lines.append("") - markdown_lines.append("## Transcript") - markdown_lines.append("") - for entry in transcript: - user_id = entry["user_id"] - time_str = entry.get("time", "") - content = entry.get("content", "") - u = user_info_map.get(user_id, {}) - username = u.get("display_name") or u.get("real_name") or u.get("name", user_id) - if time_str: - markdown_lines.append(f"**@{username} [{time_str}]:** {content} ") - else: - markdown_lines.append(f"**@{username}:** {content} ") - markdown_lines.append("") - - try: - write_markdown(filepath, "\n".join(markdown_lines)) - logger.debug("Markdown file generated: %s", filepath) - return filepath - except Exception as e: - logger.exception("Error writing markdown file %s: %s", filepath, e) - return None diff --git a/core/operations/slack_ops/__init__.py b/core/operations/slack_ops/__init__.py index 8e776203..acd50b02 100644 --- a/core/operations/slack_ops/__init__.py +++ b/core/operations/slack_ops/__init__.py @@ -1,5 +1,5 @@ """ -Slack operations: channel list/join, messages, client, fetcher (file download, huddle transcript). +Slack operations: channel list/join, messages, client, fetcher (file download). Similar layout to core.operations.github_ops; use get_slack_client() or token helpers. 
""" @@ -14,7 +14,6 @@ from core.operations.slack_ops.fetcher import ( SlackFetcher, download_file, - fetch_huddle_transcript, get_file_info, get_slack_fetcher, ) @@ -30,7 +29,6 @@ "channel_join", "channel_list", "download_file", - "fetch_huddle_transcript", "get_channel_messages", "get_default_team_key", "get_file_info", diff --git a/core/operations/slack_ops/fetcher.py b/core/operations/slack_ops/fetcher.py index fbad0669..7f1328b8 100644 --- a/core/operations/slack_ops/fetcher.py +++ b/core/operations/slack_ops/fetcher.py @@ -1,6 +1,6 @@ """ -Slack Fetcher: file download, user/channel info, huddle transcript. -Uses SlackAPIClient for API calls; huddle transcript uses workspace session credentials. +Slack Fetcher: file download, user/channel info. +Uses SlackAPIClient for API calls. """ import os @@ -21,7 +21,7 @@ class SlackFetcher: - """Slack Fetcher: user/channel/file info via SlackAPIClient; file download and huddle transcript.""" + """Slack Fetcher: user/channel/file info via SlackAPIClient; file download.""" def __init__(self, bot_token=None): token = (bot_token or "").strip() @@ -209,102 +209,3 @@ def download_file(file_url, save_path=None, filename=None, bot_token=None): """Download a file from Slack (standalone function).""" fetcher = SlackFetcher(bot_token) return fetcher.download_file(file_url, save_path, filename) - - -def fetch_huddle_transcript(file_id): - """ - Fetch huddle transcript/file info using session credentials from workspace JSON. - - Stale credentials are refreshed automatically. On auth errors, one refresh retry - is attempted before giving up. 
- """ - from slack_event_handler.utils.slack_internal_tokens_store import ( - SLACK_TOKENS_RELOGIN_HINT, - _extract_validate_and_return, - get_or_load_slack_internal_token_pair, - log_slack_internal_tokens_still_invalid, - ) - from slack_event_handler.utils.slack_tokens import ( - is_slack_internal_token_auth_error, - ) - - team_id = get_default_team_key() or None - pair = get_or_load_slack_internal_token_pair(team_id) - if not pair: - if team_id: - logger.error( - "Cannot fetch huddle transcript for file %s: no valid session " - "credentials for team %s. %s", - file_id, - team_id, - SLACK_TOKENS_RELOGIN_HINT, - ) - else: - logger.error( - "Cannot fetch huddle transcript for file %s: no Slack team id " - "(set SLACK_TEAM_IDS) and no valid session credentials. %s", - file_id, - SLACK_TOKENS_RELOGIN_HINT, - ) - return None - - xoxc_token, xoxd_token = pair - url = "https://slack.com/api/files.info" - headers = {"Authorization": f"Bearer {xoxc_token}"} - cookies = {"d": xoxd_token} - data = {"file": file_id, "include_transcription": "true"} - max_retries, retry_delay = 3, 2 - reextracted = False - for attempt in range(max_retries): - try: - response = requests.post( - url, headers=headers, cookies=cookies, data=data, timeout=30 - ) - response.raise_for_status() - result = response.json() - if result.get("ok"): - logger.debug("Fetched file info for: %s", file_id) - return result - err = (result.get("error") or "").strip() - if team_id and is_slack_internal_token_auth_error(err) and not reextracted: - reextracted = True - logger.info( - "Slack auth error (%s); refreshing session credentials", - err, - ) - new_pair = _extract_validate_and_return(team_id) - if new_pair: - xoxc_token, xoxd_token = new_pair - headers = {"Authorization": f"Bearer {xoxc_token}"} - cookies = {"d": xoxd_token} - continue - logger.error( - "Cannot fetch huddle transcript for file %s: credential refresh did not " - "yield valid session for team %s. 
%s", - file_id, - team_id, - SLACK_TOKENS_RELOGIN_HINT, - ) - return None - if reextracted and team_id and is_slack_internal_token_auth_error(err): - log_slack_internal_tokens_still_invalid(team_id) - logger.error( - "Cannot fetch huddle transcript for file %s: Slack auth error (%s) " - "after credential refresh. %s", - file_id, - err, - SLACK_TOKENS_RELOGIN_HINT, - ) - return result - logger.warning("Slack API error: %s", err or "Unknown error") - return result - except (ConnectionError, Timeout, RequestException) as e: - if attempt < max_retries - 1: - time.sleep(retry_delay * (2**attempt)) - else: - logger.exception("Slack API request error: %s", e) - return None - except Exception as e: - logger.exception("Unexpected error: %s", e) - return None - return None diff --git a/core/tests/operations/test_slack_fetcher.py b/core/tests/operations/test_slack_fetcher.py index a8c055c9..25964269 100644 --- a/core/tests/operations/test_slack_fetcher.py +++ b/core/tests/operations/test_slack_fetcher.py @@ -5,16 +5,13 @@ import pytest import requests -from django.test.utils import override_settings from core.operations.slack_ops.fetcher import ( SlackFetcher, download_file, - fetch_huddle_transcript, get_file_info, get_slack_fetcher, ) -from slack_event_handler.utils import slack_internal_tokens_store as token_store @pytest.fixture(autouse=True) @@ -212,44 +209,6 @@ def test_get_slack_fetcher_factory(): assert isinstance(get_slack_fetcher("x"), SlackFetcher) -@override_settings(WORKSPACE_DIR="/tmp/ws") -def test_save_slack_internal_tokens_json(tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - path = token_store.save_slack_internal_tokens("T1", "xc", "xd") - assert path.is_file() - loaded = token_store.load_slack_internal_tokens("T1") - assert loaded["xoxc"] == "xc" - assert loaded["xoxd"] == "xd" - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True, WORKSPACE_DIR="/tmp/ws") -@patch( - 
"slack_event_handler.utils.slack_internal_tokens_store.get_or_load_slack_internal_token_pair", - return_value=("xc", "xd"), -) -@patch("core.operations.slack_ops.fetcher.requests.post") -@patch("core.operations.slack_ops.fetcher.get_default_team_key", return_value="T1") -def test_fetch_huddle_transcript_ok(_mock_team, mock_post, _mock_load): - mock_resp = MagicMock() - mock_resp.raise_for_status = MagicMock() - mock_resp.json.return_value = {"ok": True, "file": {}} - mock_post.return_value = mock_resp - assert fetch_huddle_transcript("F1")["ok"] is True - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.get_or_load_slack_internal_token_pair", - return_value=("xc", "xd"), -) -@patch("core.operations.slack_ops.fetcher.requests.post") -@patch("core.operations.slack_ops.fetcher.get_default_team_key", return_value="T1") -def test_fetch_huddle_transcript_connection_error(_mock_team, mock_post, _pair): - mock_post.side_effect = requests.exceptions.ConnectionError("down") - with patch("core.operations.slack_ops.fetcher.time.sleep"): - assert fetch_huddle_transcript("F1") is None - - def test_get_file_info_returns_slack_error_payload(): client = MagicMock() client.files_info.return_value = {"ok": False, "error": "file_not_found"} @@ -341,169 +300,3 @@ def test_get_file_and_download_with_private_url(tmp_path): finfo, path = f.get_file_and_download("F1", save_path=str(tmp_path)) assert finfo["ok"] assert path.endswith("n.txt") - - -def test_save_slack_internal_tokens_write_error(tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - with patch.object(token_store, "_write_document", side_effect=OSError("perm")): - with pytest.raises(OSError): - token_store.save_slack_internal_tokens("T1", "a", "b") - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.get_or_load_slack_internal_token_pair", - return_value=("xc", "xd"), -) 
-@patch("core.operations.slack_ops.fetcher.requests.post") -@patch("core.operations.slack_ops.fetcher.get_default_team_key", return_value="T1") -def test_fetch_huddle_loads_tokens_from_json( - _mock_team, - mock_post, - _mock_load, -): - mock_resp = MagicMock() - mock_resp.raise_for_status = MagicMock() - mock_resp.json.return_value = {"ok": True, "file": {}} - mock_post.return_value = mock_resp - assert fetch_huddle_transcript("F9")["ok"] is True - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.get_or_load_slack_internal_token_pair", - return_value=None, -) -@patch("core.operations.slack_ops.fetcher.get_default_team_key", return_value=None) -def test_fetch_huddle_missing_tokens_and_no_team(_mock_team, _mock_load): - assert fetch_huddle_transcript("F1") is None - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.get_or_load_slack_internal_token_pair", - return_value=None, -) -@patch("core.operations.slack_ops.fetcher.get_default_team_key", return_value="T1") -def test_fetch_huddle_extract_returns_invalid(_mock_team, _mock_load): - assert fetch_huddle_transcript("F1") is None - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store._extract_validate_and_return", - return_value=("nxc", "nxd"), -) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.get_or_load_slack_internal_token_pair", - return_value=("xc", "xd"), -) -@patch("core.operations.slack_ops.fetcher.requests.post") -@patch("core.operations.slack_ops.fetcher.get_default_team_key", return_value="T1") -def test_fetch_huddle_reextracts_from_profile_on_auth_error( - _mock_team, - mock_post, - _mock_load, - _mock_reextract, -): - ok_resp = MagicMock() - ok_resp.raise_for_status = MagicMock() - ok_resp.json.return_value = {"ok": True, "file": {}} - bad_resp = MagicMock() - 
bad_resp.raise_for_status = MagicMock() - bad_resp.json.return_value = {"ok": False, "error": "token_revoked"} - mock_post.side_effect = [bad_resp, ok_resp] - assert fetch_huddle_transcript("Fx")["ok"] is True - _mock_reextract.assert_called_once_with("T1") - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store._extract_validate_and_return", - return_value=("nxc", "nxd"), -) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.get_or_load_slack_internal_token_pair", - return_value=("xc", "xd"), -) -@patch("core.operations.slack_ops.fetcher.requests.post") -@patch("core.operations.slack_ops.fetcher.get_default_team_key", return_value="T1") -def test_fetch_huddle_reextracts_after_connection_error_then_auth_error( - _mock_team, - mock_post, - _mock_load, - _mock_reextract, -): - """Auth on a later attempt must still trigger one re-extract (not gated on attempt == 0).""" - ok_resp = MagicMock() - ok_resp.raise_for_status = MagicMock() - ok_resp.json.return_value = {"ok": True, "file": {}} - bad_resp = MagicMock() - bad_resp.raise_for_status = MagicMock() - bad_resp.json.return_value = {"ok": False, "error": "token_revoked"} - mock_post.side_effect = [ - requests.exceptions.ConnectionError("down"), - bad_resp, - ok_resp, - ] - with patch("core.operations.slack_ops.fetcher.time.sleep"): - assert fetch_huddle_transcript("Fx")["ok"] is True - _mock_reextract.assert_called_once_with("T1") - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store._extract_validate_and_return", - return_value=None, -) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.get_or_load_slack_internal_token_pair", - return_value=("xc", "xd"), -) -@patch("core.operations.slack_ops.fetcher.requests.post") -@patch("core.operations.slack_ops.fetcher.get_default_team_key", return_value="T1") -def 
test_fetch_huddle_auth_error_when_reextract_fails( - _mock_team, mock_post, _mock_load, _mock_reextract, caplog -): - import logging - - bad_resp = MagicMock() - bad_resp.raise_for_status = MagicMock() - bad_resp.json.return_value = {"ok": False, "error": "token_revoked"} - mock_post.return_value = bad_resp - with caplog.at_level(logging.ERROR): - assert fetch_huddle_transcript("Fx") is None - _mock_reextract.assert_called_once_with("T1") - assert ".env.example" in caplog.text - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.get_or_load_slack_internal_token_pair", - return_value=("xc", "xd"), -) -@patch("core.operations.slack_ops.fetcher.requests.post") -@patch("core.operations.slack_ops.fetcher.get_default_team_key", return_value="T1") -def test_fetch_huddle_returns_error_payload_when_not_ok( - _mock_team, mock_post, _mock_pair -): - mock_resp = MagicMock() - mock_resp.raise_for_status = MagicMock() - mock_resp.json.return_value = {"ok": False, "error": "invalid"} - mock_post.return_value = mock_resp - out = fetch_huddle_transcript("Fz") - assert out["ok"] is False - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.get_or_load_slack_internal_token_pair", - return_value=("xc", "xd"), -) -@patch("core.operations.slack_ops.fetcher.requests.post") -@patch("core.operations.slack_ops.fetcher.get_default_team_key", return_value="T1") -def test_fetch_huddle_unexpected_exception_returns_none( - _mock_team, mock_post, _mock_pair -): - mock_post.side_effect = ValueError("weird") - assert fetch_huddle_transcript("Fe") is None diff --git a/core/tests/operations/test_transcript_ops_more.py b/core/tests/operations/test_transcript_ops_more.py deleted file mode 100644 index df6b6f9c..00000000 --- a/core/tests/operations/test_transcript_ops_more.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Tests for core.operations.md_ops.transcript.""" - 
-from unittest.mock import patch - -import pytest - -from core.operations.md_ops.transcript import ( - generate_transcript_from_json, - parse_datetime_range, - parse_html_summary, - replace_channel_ids_with_names, - replace_user_ids_with_usernames, - write_huddle_transcript_md, -) - - -def test_parse_datetime_range_preserves_on_bad_match(): - assert parse_datetime_range("no times here") == "no times here" - - -def test_parse_datetime_range_with_pst_suffix(): - out = parse_datetime_range( - "10:00:00 AM - 11:00:00 AM PST", - date_str="01/15/24", - ) - assert "PST" in out - - -def test_parse_html_summary_extracts_channel_and_attendees(): - html = """ - #C01234567 Huddle notes: 1/15/24 - 10:00:00 AM - 11:00:00 AM PST -

Attendees

@U111 @U222

- """ - data = parse_html_summary(html) - assert data["channel_id"] == "C01234567" - assert "U111" in data["attendee_ids"] - - -def test_replace_user_ids(): - md = "Hello @U1 there" - out = replace_user_ids_with_usernames(md, {"U1": {"display_name": "Alice"}}) - assert "Alice" in out - - -def test_replace_channel_ids(): - md = "Join #C99 please" - assert "#general" in replace_channel_ids_with_names(md, "C99", "general") - - -def test_generate_transcript_from_json_list_blocks(): - payload = { - "file": { - "huddle_transcription": { - "blocks": [ - { - "elements": [ - { - "type": "rich_text_section", - "elements": [ - {"type": "user", "user_id": "U9"}, - {"type": "text", "text": "[10:15]: "}, - {"type": "text", "text": "hello"}, - ], - } - ] - } - ] - } - } - } - rows = generate_transcript_from_json(payload) - assert rows and rows[0]["user_id"] == "U9" - - -def test_generate_transcript_dict_blocks_elements(): - payload = { - "file": { - "huddle_transcription": { - "blocks": {"elements": []}, - } - } - } - assert generate_transcript_from_json(payload) == [] - - -@pytest.mark.django_db -def test_write_huddle_transcript_md_writes_file(tmp_path): - html = """ - #C01234567 Huddle notes: 1/15/24 - 10:00:00 AM - 11:00:00 AM PST -

Attendees

@U1

- """ - result_json = { - "file": { - "huddle_transcription": { - "blocks": [ - { - "elements": [ - { - "type": "rich_text_section", - "elements": [ - {"type": "user", "user_id": "U1"}, - {"type": "text", "text": "hi"}, - ], - } - ] - } - ] - } - } - } - path = write_huddle_transcript_md( - tmp_path, - html_content=html, - result_json=result_json, - channel_name="team-chat", - user_info_map={"U1": {"display_name": "Sam"}}, - summary_markdown="Summary line", - ) - assert path is not None - assert path.exists() - text = path.read_text(encoding="utf-8") - assert "team-chat" in text - assert "Summary line" in text - - -def test_write_huddle_transcript_md_write_error(tmp_path): - with patch( - "core.operations.md_ops.transcript.write_markdown", - side_effect=OSError("fail"), - ): - out = write_huddle_transcript_md( - tmp_path, - html_content="#C1 Huddle notes: 1/1/24t", - result_json={"file": {"huddle_transcription": {"blocks": []}}}, - channel_name="c", - user_info_map={}, - summary_markdown="", - ) - assert out is None - - -def test_parse_datetime_range_with_utc_suffix(): - out = parse_datetime_range( - "10:00:00 AM - 11:00:00 AM UTC", - date_str="01/15/2024", - ) - assert "PST" in out - - -def test_parse_datetime_range_end_before_start_crosses_midnight(): - out = parse_datetime_range( - "11:00:00 PM - 01:00:00 AM PST", - date_str="06/01/24", - ) - assert "PST" in out and "_" in out - - -def test_parse_datetime_range_invalid_date_str_falls_back(): - out = parse_datetime_range( - "10:00:00 AM - 11:00:00 AM PST", - date_str="not-a-date", - ) - assert "PST" in out - - -def test_replace_channel_ids_noop_without_channel_id(): - assert replace_channel_ids_with_names("#C1 here", None, "general") == "#C1 here" - - -def test_generate_transcript_skips_non_dict_block(): - payload = { - "file": { - "huddle_transcription": { - "blocks": ["bad", {"elements": []}], - } - } - } - assert generate_transcript_from_json(payload) == [] - - -def 
test_generate_transcript_non_dict_file_data_returns_empty(): - assert generate_transcript_from_json({"file": object()}) == [] - - -def test_replace_user_ids_prefers_real_name_when_no_display(): - md = "Hi @U55" - out = replace_user_ids_with_usernames( - md, - {"U55": {"real_name": "Real N", "name": "u55"}}, - ) - assert "Real N" in out diff --git a/core/tests/test_collector_protocol_conformance.py b/core/tests/test_collector_protocol_conformance.py index 0ea10160..56527e30 100644 --- a/core/tests/test_collector_protocol_conformance.py +++ b/core/tests/test_collector_protocol_conformance.py @@ -22,10 +22,6 @@ from cppa_pinecone_sync.protocol_impl import PineconeSyncTrackerResult from cppa_slack_tracker.protocol_impl import SlackIncrementalState, SlackTrackerResult from cppa_youtube_script_tracker.protocol_impl import YoutubeScriptTrackerResult -from discord_activity_tracker.protocol_impl import ( - DiscordCollectionTrackerResult, - DiscordIncrementalState, -) from github_activity_tracker.protocol_impl import ( GitHubIncrementalState, GitHubSyncTrackerResult, @@ -38,7 +34,6 @@ [ GenericTrackerResult.ok(), GitHubSyncTrackerResult(success=True, counts={"issues": 1}), - DiscordCollectionTrackerResult(success=True, counts={"messages": 2}), PineconeSyncTrackerResult.from_sync_dict( {"upserted": 1, "total": 1, "failed_count": 0} ), @@ -62,7 +57,6 @@ def test_tracker_result_isinstance(result: TrackerResult) -> None: [ GenericIncrementalState(checkpoint_token="t", human_readable_marker="m"), GitHubIncrementalState.from_repo_watermark(repo_id=1, marker="2024"), - DiscordIncrementalState.from_after_date(after=None), MailingListIncrementalState.from_start_date("2024-01-01"), SlackIncrementalState.from_team(team_id="T1", start_date="2024-01-01"), ClangGithubIncrementalState.from_watermarks( diff --git a/core/tests/test_protocol_serialization.py b/core/tests/test_protocol_serialization.py index 078f2fa6..45f37180 100644 --- a/core/tests/test_protocol_serialization.py +++ 
b/core/tests/test_protocol_serialization.py @@ -25,11 +25,6 @@ from cppa_pinecone_sync.protocol_impl import PineconeSyncTrackerResult from cppa_slack_tracker.protocol_impl import SlackIncrementalState, SlackTrackerResult from cppa_youtube_script_tracker.protocol_impl import YoutubeScriptTrackerResult -from discord_activity_tracker.protocol_impl import ( - DiscordActivityRecord, - DiscordCollectionTrackerResult, - DiscordIncrementalState, -) from github_activity_tracker.protocol_impl import ( GitHubActivityRecord, GitHubIncrementalState, @@ -47,7 +42,6 @@ duration_seconds=1.5, ), GitHubSyncTrackerResult(success=True, counts={"issues": 1}), - DiscordCollectionTrackerResult(success=True, counts={"messages": 2}), PineconeSyncTrackerResult.from_sync_dict( {"upserted": 1, "total": 1, "failed_count": 0} ), @@ -64,12 +58,6 @@ _INCREMENTAL_STATES = [ GenericIncrementalState(checkpoint_token="t", human_readable_marker="m"), GitHubIncrementalState.from_repo_watermark(repo_id=1, marker="2024"), - DiscordIncrementalState.from_after_date(after=None), - DiscordIncrementalState.from_after_date( - after=datetime(2024, 6, 1, 12, 0, 0, tzinfo=timezone.utc), - last_message_id=100, - channel_id=55, - ), MailingListIncrementalState.from_start_date("2024-01-01"), SlackIncrementalState.from_team(team_id="T1", start_date="2024-01-01"), ClangGithubIncrementalState.from_watermarks(start_commit="abc", start_item="2024"), @@ -77,17 +65,6 @@ _ACTIVITY_RECORDS = [ GitHubActivityRecord.from_issue(repo_id=7, issue_number=123, summary="title"), - DiscordActivityRecord.from_converted_export_dict( - { - "id": 5, - "created_at": "2024-01-01T00:00:00.0000000Z", - "message_type": "Reply", - "content": "hello", - "author": {"id": 7}, - }, - server_id=1, - channel_id=2, - ), GenericActivityRecord( source_system=SourceSystem.GITHUB, external_id="1:issue:1", diff --git a/core/tests/test_protocols.py b/core/tests/test_protocols.py index adae3652..e62c577b 100644 --- a/core/tests/test_protocols.py +++ 
b/core/tests/test_protocols.py @@ -9,12 +9,6 @@ import pytest -from core.activity_types import ( - ActivityType, - SourceSystem, - actor_external_id, - parse_activity_occurred_at, -) from core.protocols import ( ActivityRecord, IncrementalState, @@ -24,10 +18,6 @@ require_tracker_result, ) from core.tracker_result import GenericTrackerResult -from discord_activity_tracker.protocol_impl import ( - DiscordActivityRecord, - DiscordCollectionTrackerResult, -) from github_activity_tracker.protocol_impl import ( GitHubActivityRecord, GitHubIncrementalState, @@ -60,19 +50,6 @@ class NotState: require_incremental_state(NotState()) -def test_activity_record_isinstance_discord_dataclass() -> None: - rec = DiscordActivityRecord( - source_system=SourceSystem.DISCORD, - external_id="1:2:3", - occurred_at=parse_activity_occurred_at("2024-01-01T00:00:00Z"), - activity_type=ActivityType.discord_message("Default"), - actor_external_id=actor_external_id("99"), - source_url="https://discord.com/channels/1/2/3", - summary="hi", - ) - assert isinstance(rec, ActivityRecord) - - def test_incremental_state_isinstance_github() -> None: st = GitHubIncrementalState.from_repo_watermark(repo_id=42, marker="2024-06") assert isinstance(st, IncrementalState) @@ -84,13 +61,6 @@ def test_activity_record_isinstance_github_from_issue() -> None: assert "7:issue:123" in rec.external_id -def test_tracker_result_isinstance_discord_dataclass() -> None: - r = DiscordCollectionTrackerResult( - success=True, counts={"messages": 5, "channels": 1} - ) - assert isinstance(r, TrackerResult) - - def test_require_tracker_result_raises_type_error_on_bad_object() -> None: class NotAResult: success = True diff --git a/core/utils/README.md b/core/utils/README.md index 7282d69a..8fc07370 100644 --- a/core/utils/README.md +++ b/core/utils/README.md @@ -7,7 +7,7 @@ Stateless helpers imported across apps. 
Prefer adding a focused module here rath | Module | Role | | --- | --- | | [`datetime_parsing.py`](datetime_parsing.py) | CLI/API date strings → timezone-aware `datetime`. | -| [`text_processing.py`](text_processing.py) | Slack/Discord message cleaning and filler filtering. | +| [`text_processing.py`](text_processing.py) | Slack message cleaning and filler filtering. | | [`boost_version_operations.py`](boost_version_operations.py) | Boost version parse, encode, and loose compare for metadata keys. | ## Tests diff --git a/core/utils/text_processing.py b/core/utils/text_processing.py index 0b8a7631..a9cf51d2 100644 --- a/core/utils/text_processing.py +++ b/core/utils/text_processing.py @@ -1,9 +1,8 @@ """ Shared text cleaning and light filtering helpers. -Used by ``cppa_slack_tracker`` and ``discord_activity_tracker`` for normalizing -message text. ``SLACK_*`` phrase lists feed :func:`filter_sentence` (Slack) and -:func:`clean_discord_text` (Discord markup strip + same filler removal). +Used by ``cppa_slack_tracker`` for normalizing message text. ``SLACK_*`` phrase lists +feed :func:`filter_sentence`. """ from __future__ import annotations @@ -88,56 +87,6 @@ } ) -# Discord message / export plaintext: user, role, channel mentions and custom emoji tokens. -_DISCORD_USER_MENTION_RE = re.compile(r"<@!?(\d+)>") -_DISCORD_ROLE_MENTION_RE = re.compile(r"<@&(\d+)>") -_DISCORD_CHANNEL_MENTION_RE = re.compile(r"<#(\d+)>") -_DISCORD_CUSTOM_EMOJI_RE = re.compile(r"") -_DISCORD_COLLAPSE_WHITESPACE_RE = re.compile(r"\s+") - - -def clean_discord_text( - text: str, - *, - greeting_words: Optional[Iterable[str]] = None, - unessential_words: Optional[Iterable[str]] = None, - min_words_after: int = 0, -) -> str: - """ - Strip Discord markup, then greeting / unessential phrases (``SLACK_*`` lists). - - User mentions ``<@123>`` / ``<@!123>``, roles ``<@&id>``, channels ``<#id>`` - are removed. Custom emoji ``<:name:id>`` and animated ```` become - ``:name:``. 
Whitespace is collapsed to single spaces, then :func:`filter_sentence` - removes filler phrases (same defaults as Slack). Output is **lowercased** - because ``filter_sentence`` lowercases for matching. - - Args: - text: Raw Discord message content. - greeting_words: Optional override for ``filter_sentence`` (default: - ``SLACK_GREETING_WORDS``). - unessential_words: Optional override for ``filter_sentence`` (default: - ``SLACK_UNESSENTIAL_WORDS``). - min_words_after: Passed to ``filter_sentence`` (default ``0`` so short - messages are not blanked by word-count rules after phrase removal). - - Returns: - Plaintext suitable for search / embedding pipelines. - """ - if not text: - return "" - text = _DISCORD_USER_MENTION_RE.sub("", text) - text = _DISCORD_ROLE_MENTION_RE.sub("", text) - text = _DISCORD_CHANNEL_MENTION_RE.sub("", text) - text = _DISCORD_CUSTOM_EMOJI_RE.sub(r":\1:", text) - text = _DISCORD_COLLAPSE_WHITESPACE_RE.sub(" ", text).strip() - return filter_sentence( - text, - greeting_words=greeting_words, - unessential_words=unessential_words, - min_words_after=min_words_after, - ) - def clean_text(text: str | None, remove_extra_spaces: bool = True) -> str: """ diff --git a/cppa_slack_tracker/README.md b/cppa_slack_tracker/README.md index b3c7453f..1451e317 100644 --- a/cppa_slack_tracker/README.md +++ b/cppa_slack_tracker/README.md @@ -18,7 +18,7 @@ Slack **teams, users, channels, memberships, and messages** are upserted into th ### How content is published to GitHub -**Not applicable** for the scheduled collector. Markdown or repo uploads for Slack-derived content, if any, are handled elsewhere (for example the long-running [`slack_event_handler`](../slack_event_handler/README.md) for huddle transcripts). +**Not applicable** for the scheduled collector. 
### How vectors sync to Pinecone diff --git a/cppa_user_tracker/README.md b/cppa_user_tracker/README.md index 781a5e9e..7380ff12 100644 --- a/cppa_user_tracker/README.md +++ b/cppa_user_tracker/README.md @@ -2,9 +2,9 @@ ## Overview -**Identity and profiles for CPPA workflows** — GitHub accounts, Slack/Discord profiles, mailing identities, staging rows, and helpers other apps call while they ingest. This app is **not** a standalone “hit an API and fill the DB” collector today. +**Identity and profiles for CPPA workflows** — GitHub accounts, Slack profiles, mailing identities, staging rows, and helpers other apps call while they ingest. This app is **not** a standalone “hit an API and fill the DB” collector today. -**`run_cppa_user_tracker`** is still a **stub** (it logs and exits successfully). **Real writes** happen when **other apps** import [`services.py`](services.py) during Slack, GitHub, Discord, mailing list, or similar runs. +**`run_cppa_user_tracker`** is still a **stub** (it logs and exits successfully). **Real writes** happen when **other apps** import [`services.py`](services.py) during Slack, GitHub, mailing list, or similar runs. **Docs:** [docs/service_api/cppa_user_tracker.md](../docs/service_api/cppa_user_tracker.md) · [docs/Schema.md, section 1 — Base tables, Identity, and profiles](../docs/Schema.md#1-base-tables-identity-and-profiles) · [`models.py`](models.py) diff --git a/cppa_user_tracker/migrations/0010_remove_discordprofile.py b/cppa_user_tracker/migrations/0010_remove_discordprofile.py new file mode 100644 index 00000000..a724a45b --- /dev/null +++ b/cppa_user_tracker/migrations/0010_remove_discordprofile.py @@ -0,0 +1,26 @@ +"""Drop DiscordProfile from cppa_user_tracker Django state (table unchanged). + +The physical table ``cppa_user_tracker_discordprofile`` remains for existing rows. +When the Discord collector app is installed, it adopts this model in its own app +label via a separate state-only migration there. 
+""" + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0009_reddituser_alter_baseprofile_type"), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.DeleteModel( + name="DiscordProfile", + ), + ], + database_operations=[], + ), + ] diff --git a/cppa_user_tracker/models.py b/cppa_user_tracker/models.py index 8cf284e9..dfb2309c 100644 --- a/cppa_user_tracker/models.py +++ b/cppa_user_tracker/models.py @@ -180,22 +180,6 @@ def save(self, *args, **kwargs): updated_at = models.DateTimeField(auto_now=True) -class DiscordProfile(BaseProfile): - """Profile for Discord; extends BaseProfile.""" - - def save(self, *args, **kwargs): - self.type = ProfileType.DISCORD - super().save(*args, **kwargs) - - discord_user_id = models.BigIntegerField(unique=True, db_index=True) - username = models.CharField(max_length=255, db_index=True, blank=True) - display_name = models.CharField(max_length=255, db_index=True, blank=True) - avatar_url = models.URLField(max_length=512, blank=True) - is_bot = models.BooleanField(default=False) - created_at = models.DateTimeField(auto_now_add=True) - updated_at = models.DateTimeField(auto_now=True) - - class RedditUser(BaseProfile): """Profile for Reddit; extends BaseProfile.""" diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 5e0dd7d9..9a8e6b3a 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -28,7 +28,6 @@ GitHubAccountType, MailingListProfile, SlackUser, - DiscordProfile, RedditUser, WG21PaperAuthorProfile, YoutubeSpeaker, @@ -363,41 +362,6 @@ def get_or_create_unknown_github_account( return account, True -def get_or_create_discord_profile( - discord_user_id: int, - username: str = "", - display_name: str = "", - avatar_url: str = "", - is_bot: bool = False, - identity: Identity | None = None, -) -> tuple[DiscordProfile, bool]: - """Get or create a DiscordProfile by 
discord_user_id. Returns (profile, created). - - If profile exists, updates username, display_name, avatar_url, is_bot if provided. - identity is only set on creation; to update identity use a separate service function. - """ - username_val = username or "" - display_name_val = display_name or "" - avatar_url_val = avatar_url or "" - profile, created = DiscordProfile.objects.get_or_create( - discord_user_id=discord_user_id, - defaults={ - "username": username_val, - "display_name": display_name_val, - "avatar_url": avatar_url_val, - "is_bot": is_bot, - "identity": identity, - }, - ) - if not created: - profile.username = username_val or profile.username - profile.display_name = display_name_val or profile.display_name - profile.avatar_url = avatar_url_val or profile.avatar_url - profile.is_bot = is_bot - profile.save() - return profile, created - - def get_or_create_wg21_paper_author_profile( display_name: str, email: str | None = None, diff --git a/cppa_user_tracker/tests/test_services.py b/cppa_user_tracker/tests/test_services.py index 0586d0a9..9dc1b2c7 100644 --- a/cppa_user_tracker/tests/test_services.py +++ b/cppa_user_tracker/tests/test_services.py @@ -5,7 +5,6 @@ import pytest from cppa_user_tracker.models import ( - DiscordProfile, Email, GitHubAccount, GitHubAccountType, @@ -781,34 +780,6 @@ def test_get_or_create_slack_user_updates_existing(): assert user.emails.filter(email="slack@example.com").exists() -# --- get_or_create_discord_profile --- - - -@pytest.mark.django_db -def test_get_or_create_discord_profile_updates_existing(): - """Updating paths merge username, display_name, avatar_url, is_bot.""" - DiscordProfile.objects.create( - discord_user_id=999, - username="u", - display_name="d", - avatar_url="", - is_bot=False, - ) - profile, created = services.get_or_create_discord_profile( - 999, - username="newu", - display_name="newd", - avatar_url="http://img", - is_bot=True, - ) - assert created is False - profile.refresh_from_db() - assert 
profile.username == "newu" - assert profile.display_name == "newd" - assert profile.avatar_url == "http://img" - assert profile.is_bot is True - - # --- get_or_create_reddit_user --- diff --git a/discord_activity_tracker/README.md b/discord_activity_tracker/README.md deleted file mode 100644 index f54b60e8..00000000 --- a/discord_activity_tracker/README.md +++ /dev/null @@ -1,73 +0,0 @@ -# Discord Activity Tracker - -## Overview - -Ingests **Discord server activity** (messages, threads, exports) into PostgreSQL and related stores, using workspace paths and optional preprocessors. Uses **DiscordChatExporter** and shared operations documented under [docs/operations/discord_chat_exporter.md](../docs/operations/discord_chat_exporter.md). - -## Data workflow - -`run_discord_activity_tracker` chains **exporter fetch → PostgreSQL → Markdown on disk → optional git push → optional Pinecone**. Service API: [docs/service_api/discord_activity_tracker.md](../docs/service_api/discord_activity_tracker.md). Architecture context: [docs/Architecture_data_flow.md](../docs/Architecture_data_flow.md). - -### Where we fetch data - -**Discord** via **DiscordChatExporter** (configured credentials + server/channel configuration) within the `--since`/`--until` window, honoring resume semantics documented in the command help. - -### How data is saved to the database - -Messages, threads, and related entities are upserted into this app’s models. **Raw JSON** exports and intermediate artifacts are archived under `WORKSPACE_DIR` for replay and backfills (`backfill_discord_activity_tracker` reads the fixed import subtree). **References:** [docs/Schema.md, section 11 — Discord Activity Tracker](../docs/Schema.md#11-discord-activity-tracker-discord_activity_tracker) · [`models.py`](models.py) · [docs/service_api/discord_activity_tracker.md](../docs/service_api/discord_activity_tracker.md). - -### How content is published to GitHub - -Markdown is written under **`DISCORD_CONTEXT_REPO_PATH`**. 
When auto-commit is enabled and `--skip-remote-push` is **not** set, the collector **commits and pushes** that context repository using local git (see [`sync/export.py`](sync/export.py)). Configure credentials and remotes per your deployment docs. - -### How vectors sync to Pinecone - -Unless `--skip-pinecone` (or deprecated `--ignore-pinecone`) is set, the run invokes **`run_cppa_pinecone_sync`** with the Discord preprocessor so message text becomes searchable vectors in the configured namespace. See [docs/Pinecone_preprocess_guideline.md](../docs/Pinecone_preprocess_guideline.md). - -## Common tasks - -- Run the main tracker: `python manage.py run_discord_activity_tracker --help`. -- Historical repair: `python manage.py backfill_discord_activity_tracker --help`. -- App-specific service API: [docs/service_api/discord_activity_tracker.md](../docs/service_api/discord_activity_tracker.md). - -## Main command: `run_discord_activity_tracker` - -Orchestrates exporter fetch → DB upsert + raw JSON → Markdown export to `DISCORD_CONTEXT_REPO_PATH` → optional Pinecone via `run_cppa_pinecone_sync`. Requires configured Discord credentials (see `.env.example`), plus `DISCORD_SERVER_ID`; channel scope from `DISCORD_CHANNEL_IDS` unless `--channels` is set. - -| Option | Description | -| --- | --- | -| `--dry-run` | Log planned steps only; no fetch, export, push, or Pinecone writes. | -| `--skip-discord-sync` | Skip DiscordChatExporter fetch, DB upserts, and raw JSON archival. | -| `--skip-markdown-export` | Skip writing Markdown from the DB to `DISCORD_CONTEXT_REPO_PATH`. | -| `--skip-remote-push` | Skip git commit/push after Markdown export (when auto-commit is enabled). | -| `--skip-pinecone` / `--ignore-pinecone` | Skip Pinecone upsert for Discord messages (`--ignore-pinecone` is a deprecated alias). | -| `--since`, `--from-date`, `--start-time` | Exporter lower bound (`--after`): `YYYY-MM-DD` or ISO-8601 UTC. 
If omitted, resumes from latest DB message for the guild (or today UTC only if empty). | -| `--until`, `--to-date`, `--end-time` | Exporter upper bound (`--before`); same formats. Omitted = through present. | -| `--channels` | Comma-separated channel IDs (overrides `DISCORD_CHANNEL_IDS`). | -| `--task` | **Deprecated.** `sync` \| `export` \| `all` — prefer `--skip-*` flags. | - -### `backfill_discord_activity_tracker` - -Imports DiscordChatExporter JSON from the fixed workspace subtree (see command `help`), deletes each file after a successful import. - -| Option | Description | -| --- | --- | -| `--skip-pinecone` / `--ignore-pinecone` | Skip Pinecone after import (`--ignore-pinecone` is a deprecated alias). | -| `--dry-run` | List JSON files that would be imported without writing or deleting them. | - -## Management commands - -| Command | Purpose | -| --- | --- | -| `run_discord_activity_tracker` | Primary sync / collection command. | -| `backfill_discord_activity_tracker` | Backfill or repair historical activity data. | - -Run `python manage.py COMMAND --help` for options. - -## Tests - -```bash -python -m pytest discord_activity_tracker/tests/ -v -``` - -(from repo root; see root [README](../README.md#running-tests).) diff --git a/discord_activity_tracker/__init__.py b/discord_activity_tracker/__init__.py deleted file mode 100644 index d21746bb..00000000 --- a/discord_activity_tracker/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Discord Activity Tracker Django app. - -Persists Discord guild, channel, message, and reaction data for analytics, Markdown -context export, and Pinecone indexing. All writes to app models go through -``discord_activity_tracker.services``. Ingestion is driven by management commands and -sync helpers (DiscordChatExporter and optional discord.py paths). - -App config: ``discord_activity_tracker.apps.DiscordActivityTrackerConfig``. 
-""" diff --git a/discord_activity_tracker/admin.py b/discord_activity_tracker/admin.py deleted file mode 100644 index 8dd4f52e..00000000 --- a/discord_activity_tracker/admin.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Django admin configuration for Discord Activity Tracker.""" - -from django.contrib import admin -from .models import ( - DiscordServer, - DiscordChannel, - DiscordMessage, - DiscordReaction, -) - - -@admin.register(DiscordServer) -class DiscordServerAdmin(admin.ModelAdmin): - list_display = ("server_name", "server_id", "created_at", "updated_at") - search_fields = ("server_name", "server_id") - readonly_fields = ("created_at", "updated_at") - - -@admin.register(DiscordChannel) -class DiscordChannelAdmin(admin.ModelAdmin): - list_display = ( - "channel_name", - "channel_id", - "server", - "channel_type", - "category_name", - ) - list_filter = ("channel_type", "server") - search_fields = ("channel_name", "channel_id", "category_name") - readonly_fields = ("created_at", "updated_at") - - -@admin.register(DiscordMessage) -class DiscordMessageAdmin(admin.ModelAdmin): - list_display = ( - "message_id", - "channel", - "author", - "message_type", - "is_pinned", - "message_created_at", - "is_deleted", - ) - list_filter = ( - "is_deleted", - "has_attachments", - "message_type", - "is_pinned", - "channel", - ) - search_fields = ("content", "message_id", "author__username") - readonly_fields = ("created_at", "updated_at") - date_hierarchy = "message_created_at" - - -@admin.register(DiscordReaction) -class DiscordReactionAdmin(admin.ModelAdmin): - list_display = ("emoji", "message", "count") - search_fields = ("emoji", "message__message_id") - readonly_fields = ("created_at", "updated_at") diff --git a/discord_activity_tracker/api_schemas.py b/discord_activity_tracker/api_schemas.py deleted file mode 100644 index 88db17e0..00000000 --- a/discord_activity_tracker/api_schemas.py +++ /dev/null @@ -1,95 +0,0 @@ -"""Pydantic models for discord.py API payloads at ingestion 
boundaries (live sync).""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any, NoReturn - -from pydantic import BaseModel, ConfigDict, Field, ValidationError - - -class DiscordLiveSyncValidationError(ValueError): - """Discord API payload failed Pydantic validation (live-sync path).""" - - -class DiscordLiveUserPayload(BaseModel): - """Normalized author from Bot API or exporter-shaped dict.""" - - model_config = ConfigDict(extra="allow") - - user_id: int - username: str = "unknown" - display_name: str = "" - avatar_url: str = "" - is_bot: bool = False - - -class DiscordReactionPayload(BaseModel): - model_config = ConfigDict(extra="allow") - - discord_message_id: int - emoji: str = Field(min_length=1) - count: int = Field(default=0, ge=0) - - -class DiscordLivePreparedMessage(BaseModel): - """Output of ``_prepare_message_data`` for bulk DB upsert.""" - - model_config = ConfigDict(extra="allow") - - message_id: int - author: DiscordLiveUserPayload - content: str = "" - message_type: str = "Default" - is_pinned: bool = False - message_created_at: datetime - message_edited_at: datetime | None = None - reply_to_message_id: int | None = None - attachment_urls: list[str] = Field(default_factory=list) - reactions: list[Any] = Field(default_factory=list) - - -def _validation_error(prefix: str, err: ValidationError) -> NoReturn: - detail = err.errors()[:5] - msg = f"{prefix}: " + "; ".join( - f"{e.get('loc', ())}: {e.get('msg', '')}" for e in detail - ) - if len(err.errors()) > 5: - msg += f" … ({len(err.errors())} errors total)" - raise DiscordLiveSyncValidationError(msg) from err - - -def parse_live_user( - data: dict[str, Any], - *, - source: str | None = None, -) -> DiscordLiveUserPayload: - prefix = f"Invalid Discord live user{f' ({source})' if source else ''}" - try: - return DiscordLiveUserPayload.model_validate(data) - except ValidationError as e: - _validation_error(prefix, e) - - -def parse_live_message( - data: dict[str, 
Any], - *, - source: str | None = None, -) -> DiscordLivePreparedMessage: - prefix = f"Invalid Discord live message{f' ({source})' if source else ''}" - try: - return DiscordLivePreparedMessage.model_validate(data) - except ValidationError as e: - _validation_error(prefix, e) - - -def parse_reaction( - data: dict[str, Any], - *, - source: str | None = None, -) -> DiscordReactionPayload: - prefix = f"Invalid Discord reaction{f' ({source})' if source else ''}" - try: - return DiscordReactionPayload.model_validate(data) - except ValidationError as e: - _validation_error(prefix, e) diff --git a/discord_activity_tracker/apps.py b/discord_activity_tracker/apps.py deleted file mode 100644 index e5ce1ba2..00000000 --- a/discord_activity_tracker/apps.py +++ /dev/null @@ -1,7 +0,0 @@ -from django.apps import AppConfig - - -class DiscordActivityTrackerConfig(AppConfig): - default_auto_field = "django.db.models.BigAutoField" - name = "discord_activity_tracker" - verbose_name = "Discord Activity Tracker" diff --git a/discord_activity_tracker/management/__init__.py b/discord_activity_tracker/management/__init__.py deleted file mode 100644 index 12d73c60..00000000 --- a/discord_activity_tracker/management/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Django ``management`` package for ``discord_activity_tracker`` (``manage.py`` commands).""" diff --git a/discord_activity_tracker/management/commands/__init__.py b/discord_activity_tracker/management/commands/__init__.py deleted file mode 100644 index 3ee9b00a..00000000 --- a/discord_activity_tracker/management/commands/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Management commands: ``run_discord_activity_tracker``, ``backfill_discord_activity_tracker``.""" diff --git a/discord_activity_tracker/management/commands/backfill_discord_activity_tracker.py b/discord_activity_tracker/management/commands/backfill_discord_activity_tracker.py deleted file mode 100644 index 23a1438a..00000000 --- 
a/discord_activity_tracker/management/commands/backfill_discord_activity_tracker.py +++ /dev/null @@ -1,265 +0,0 @@ -"""Django management command ``backfill_discord_activity_tracker``. - -Imports **pre-exported** DiscordChatExporter JSON from the workspace drop folder -(``workspace/discord_activity_tracker/Discussion - c-cpp-discussion/``, -recursively), validates envelope and normalized messages, upserts into the database -via the service layer, then **deletes** each file after a successful import so it is -not processed again. - -This command does **not** invoke DiscordChatExporter; place JSON exports in the drop -folder manually or from another host. - -Optional arguments: ``--dry-run`` (list files only), ``--skip-pinecone`` / -``--ignore-pinecone`` (skip ``task_discord_pinecone_sync`` after import). See -``Command.add_arguments`` and ``docs/service_api/discord_activity_tracker.md``. - -Side effects: DB writes to ``DiscordServer``, ``DiscordChannel``, ``DiscordMessage``, -``DiscordReaction``, and ``DiscordProfile`` (via services); filesystem deletes on -success; Pinecone sync when enabled. - -Raises: - Per-file parse/validation failures are caught inside ``DiscordBackfillCollector.collect`` - (logged and reported on stdout); they do not abort the whole command. Uncaught - exceptions from ``sync_pinecone`` or the base command layer may still propagate. 
-""" - -from __future__ import annotations - -import asyncio -import logging -from pathlib import Path -from typing import Any - -from asgiref.sync import sync_to_async - -from core.collectors import AbstractCollector, BaseCollectorCommand -from core.protocols import TrackerResult -from discord_activity_tracker.protocol_impl import DiscordCollectionTrackerResult -from discord_activity_tracker.pinecone_runner import task_discord_pinecone_sync -from discord_activity_tracker.services import ( - get_or_create_discord_channel, - get_or_create_discord_server, -) -from discord_activity_tracker.staging_schema import ( - validate_envelope, - validate_normalized_message, -) -from discord_activity_tracker.sync.chat_exporter import ( - convert_exporter_message_to_dict, - filter_discord_export_json_paths, - parse_exported_json, - _safe_int, -) -from discord_activity_tracker.sync.messages import _process_messages_in_batches -from discord_activity_tracker.workspace import get_cpp_discussion_import_dir - -logger = logging.getLogger(__name__) - - -def _json_display_path(import_dir: Path, json_path: Path) -> str: - """Short path for logs (relative to import root when possible).""" - try: - return str(json_path.relative_to(import_dir)) - except ValueError: - return json_path.name - - -class DiscordBackfillCollector(AbstractCollector): - """Backfill collector: scan drop folder, import each JSON, delete on success. - - ``collect()`` lists JSON under ``get_cpp_discussion_import_dir()``, optionally - dry-run prints paths, else for each file parses, validates staging schema, - upserts messages in batches, unlinks the file on success, or logs failure and - keeps the file. - - ``sync_pinecone()`` runs after a successful collector run (unless dry-run or - ``skip_pinecone``). - - Side effects: Same as module docstring (DB, deletes, optional Pinecone). 
- """ - - def __init__(self, *, stdout, style, **opts: Any) -> None: - self.stdout = stdout - self.style = style - self.dry_run: bool = opts["dry_run"] - self.skip_pinecone: bool = bool(opts.get("skip_pinecone")) - - @property - def name(self) -> str: - return "discord_activity_tracker_backfill" - - def validate_config(self) -> None: - return None - - def collect(self) -> TrackerResult: - import_dir = get_cpp_discussion_import_dir() - json_files = sorted( - filter_discord_export_json_paths(import_dir.rglob("*.json")) - ) - - self.stdout.write("=== Discord JSON import (c-cpp-discussion) ===") - self.stdout.write(f" Folder: {import_dir}") - self.stdout.write(f" Files: {len(json_files)}") - - if self.dry_run: - for p in json_files: - self.stdout.write( - f" (dry-run) would import {_json_display_path(import_dir, p)}" - ) - self.stdout.write(self.style.WARNING("DRY RUN — no writes or deletes")) - return DiscordCollectionTrackerResult( - success=True, counts={"files": len(json_files), "dry_run": 1} - ) - - processed_total = 0 - failed_files = 0 - errors: list[str] = [] - for i, json_path in enumerate(json_files, 1): - rel = _json_display_path(import_dir, json_path) - try: - data = parse_exported_json(json_path) - envelope = validate_envelope(data, source=rel) - guild_info = envelope.guild.model_dump(by_alias=True) - channel_info = envelope.channel.model_dump(by_alias=True) - messages = envelope.messages - - ch_name = channel_info.get("name", "?") - self.stdout.write( - f" [{i}/{len(json_files)}] {rel} — #{ch_name}: {len(messages)} messages" - ) - count = asyncio.run( - self._persist_channel(guild_info, channel_info, messages) - ) - processed_total += count - json_path.unlink(missing_ok=True) - self.stdout.write( - self.style.SUCCESS(f" Imported {count}; removed {rel}") - ) - except Exception as exc: - failed_files += 1 - err_msg = f"{rel}: {exc}" - errors.append(err_msg) - logger.error("Failed to process %s: %s", rel, exc) - self.stdout.write(self.style.ERROR(f" Failed 
{rel}: {exc}")) - - summary = ( - f"Import complete: {processed_total} messages from " - f"{len(json_files)} file(s)" - ) - if failed_files: - summary += f" ({failed_files} failed)" - self.stdout.write(self.style.WARNING(summary)) - else: - self.stdout.write(self.style.SUCCESS(summary)) - - return DiscordCollectionTrackerResult( - success=failed_files == 0, - counts={ - "messages": processed_total, - "files": len(json_files), - "failed_files": failed_files, - }, - errors=tuple(errors), - ) - - async def _persist_channel( - self, - guild_info: dict, - channel_info: dict, - messages: list, - ) -> int: - server, _ = await sync_to_async(get_or_create_discord_server)( - server_id=_safe_int(guild_info.get("id", 0)), - server_name=guild_info.get("name", ""), - icon_url=guild_info.get("iconUrl", ""), - ) - - raw_cat_id = channel_info.get("categoryId") - category_id = _safe_int(raw_cat_id) if raw_cat_id else None - - channel, _ = await sync_to_async(get_or_create_discord_channel)( - server=server, - channel_id=_safe_int(channel_info.get("id", 0)), - channel_name=channel_info.get("name", ""), - channel_type=channel_info.get("type", "GuildTextChat"), - topic=channel_info.get("topic") or "", - position=0, - category_id=category_id, - category_name=channel_info.get("category") or "", - ) - - srv_id = _safe_int(guild_info.get("id", 0)) - ch_id = _safe_int(channel_info.get("id", 0)) - converted = [ - convert_exporter_message_to_dict(m, server_id=srv_id, channel_id=ch_id) - for m in messages - ] - for idx, cmsg in enumerate(converted): - validate_normalized_message(cmsg, source=f"message[{idx}]") - count = await _process_messages_in_batches(channel, converted) - return count - - def sync_pinecone(self) -> None: - if self.dry_run or self.skip_pinecone: - return - task_discord_pinecone_sync(dry_run=False) - - -class Command(BaseCollectorCommand): - """``manage.py backfill_discord_activity_tracker`` — import JSON from the drop folder. - - Uses ``DiscordBackfillCollector``. 
Required layout: JSON files under - ``{WORKSPACE_DIR}/discord_activity_tracker/Discussion - c-cpp-discussion/``. - - Optional arguments: ``--dry-run``, ``--skip-pinecone`` / ``--ignore-pinecone``. - - Examples: - ``python manage.py backfill_discord_activity_tracker`` - - ``python manage.py backfill_discord_activity_tracker --dry-run`` - - ``python manage.py backfill_discord_activity_tracker --skip-pinecone`` - - Raises: - Per-file errors are swallowed in the collector loop; see class docstring. - Base command / Pinecone task may raise if misconfigured. - - See Also: - ``docs/service_api/discord_activity_tracker.md`` - """ - - help = ( - "Import DiscordChatExporter JSON from " - "workspace/discord_activity_tracker/Discussion - c-cpp-discussion/ " - "(recursively) into the database; delete each file after successful import." - ) - - def add_arguments(self, parser): - parser.add_argument( - "--skip-pinecone", - action="store_true", - dest="skip_pinecone", - help="Skip Pinecone sync after import", - ) - parser.add_argument( - "--ignore-pinecone", - action="store_true", - dest="skip_pinecone", - help="Deprecated alias for --skip-pinecone.", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="List JSON files that would be imported without writing or deleting", - ) - - def get_collector(self, **options: Any) -> AbstractCollector: - opts = dict(options) - if opts.get("skip_pinecone") is None: - opts["skip_pinecone"] = False - return DiscordBackfillCollector( - stdout=self.stdout, - style=self.style, - dry_run=opts["dry_run"], - skip_pinecone=opts["skip_pinecone"], - ) diff --git a/discord_activity_tracker/management/commands/extract_discord_tokens.py b/discord_activity_tracker/management/commands/extract_discord_tokens.py deleted file mode 100644 index d5fa6929..00000000 --- a/discord_activity_tracker/management/commands/extract_discord_tokens.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -Management command: extract_discord_tokens - -Persist Discord session 
credentials to workspace JSON. -""" - -import logging - -from django.conf import settings -from django.core.management.base import BaseCommand, CommandError - -from discord_activity_tracker.utils.discord_internal_tokens_store import ( - discord_internal_tokens_json_path, - extract_and_save_discord_internal_tokens, -) -from discord_activity_tracker.utils.discord_tokens import ( - _resolve_discord_chrome_profile_root, -) -from discord_activity_tracker.workspace import get_chrome_profile_path - -logger = logging.getLogger(__name__) - - -class Command(BaseCommand): - help = ( - "Persist Discord session credentials to " - "workspace/discord_activity_tracker/discord_internal_tokens.json." - ) - - def handle(self, *args, **options): - allow_raw = getattr(settings, "ALLOW_INTERNAL_DISCORD_TOKENS", "") or "" - if isinstance(allow_raw, bool): - allow = allow_raw - else: - allow = str(allow_raw).strip().lower() == "true" - if not allow: - self.stderr.write( - self.style.WARNING( - "Internal Discord session mode is not enabled: credentials will be saved to " - "workspace JSON but ignored by Django until enabled. " - "Restart web/celery after enabling. See .env.example." - ) - ) - - try: - profile = _resolve_discord_chrome_profile_root() - except ValueError as e: - raise CommandError(str(e)) from e - profile_path = str(profile) - if not profile.is_dir(): - raise CommandError( - "Session storage not found " - f"({profile_path}). Expected: {get_chrome_profile_path()}. " - "See .env.example." - ) - - token = extract_and_save_discord_internal_tokens() - if not token: - raise CommandError("Failed to load session credentials. 
See .env.example.") - out_path = discord_internal_tokens_json_path() - self.stdout.write( - self.style.SUCCESS(f"Saved Discord session credentials to {out_path}.") - ) diff --git a/discord_activity_tracker/management/commands/run_discord_activity_tracker.py b/discord_activity_tracker/management/commands/run_discord_activity_tracker.py deleted file mode 100644 index 21d33025..00000000 --- a/discord_activity_tracker/management/commands/run_discord_activity_tracker.py +++ /dev/null @@ -1,705 +0,0 @@ -"""Django management command ``run_discord_activity_tracker``. - -Orchestrates the scheduled Discord ingest pipeline: workspace prep, optional -DiscordChatExporter fetch with DB upsert and raw JSON archival, Markdown export to -``DISCORD_CONTEXT_REPO_PATH``, and optional Pinecone sync via ``run_cppa_pinecone_sync``. - -Phases (see ``DiscordActivityCollector`` and task helpers in this module): - - 1. **Workspace** — Ensure raw/staging dirs under ``WORKSPACE_DIR`` (see - ``discord_activity_tracker.workspace``). - 2. **Sync** — Run DiscordChatExporter (unless ``--skip-discord-sync``), parse JSON, - validate staging schema, upsert via ``discord_activity_tracker.services``, - move exports under - ``{WORKSPACE_DIR}/raw/discord_activity_tracker///``. - 3. **Markdown** — Export DB rows to the context repo (unless ``--skip-markdown-export``); - optional git push when ``DISCORD_CONTEXT_AUTO_COMMIT`` is true and - ``--skip-remote-push`` is not set. - 4. **Pinecone** — ``task_discord_pinecone_sync`` when ``PINECONE_DISCORD_*`` are set - and ``--skip-pinecone`` is not used. - -Required settings for a full sync: configured Discord credentials (see ``.env.example``), -``DISCORD_SERVER_ID``. -Channel scope uses ``DISCORD_CHANNEL_IDS`` unless overridden by ``--channels``. - -CLI flags are documented on ``Command.add_argument`` ``help=`` strings and in -``docs/service_api/discord_activity_tracker.md``. 
- -Raises: - django.core.management.base.CommandError: Missing credentials/guild, invalid - ``--since``/``--until`` parse, or DiscordChatExporter failure (wrapped from - ``DiscordChatExporterError``). Other exceptions from the collector may propagate - after logging from ``_handle_core``. -""" - -from __future__ import annotations - -import asyncio -import logging -from datetime import datetime, timezone -from pathlib import Path -from typing import Any - -from asgiref.sync import sync_to_async -from django.conf import settings -from django.core.management.base import CommandError - -from core.collectors import AbstractCollector, BaseCollectorCommand -from core.protocols import IncrementalState, TrackerResult -from discord_activity_tracker.protocol_impl import ( - DiscordCollectionTrackerResult, - DiscordIncrementalState, -) -from core.utils.datetime_parsing import parse_iso_datetime -from discord_activity_tracker.models import DiscordServer -from discord_activity_tracker.pinecone_runner import task_discord_pinecone_sync -from discord_activity_tracker.services import ( - get_or_create_discord_channel, - get_or_create_discord_server, -) -from discord_activity_tracker.staging_schema import ( - StagingValidationError, - validate_envelope, - validate_normalized_message, -) -from discord_activity_tracker.sync.exporter_window import ( - latest_message_created_at_for_guild, -) -from discord_activity_tracker.sync.chat_exporter import ( - ChannelDayExport, - DiscordChatExporterError, - _safe_int, - convert_exporter_message_to_dict, - export_guild_to_json, - parse_exported_json, -) -from discord_activity_tracker.sync.raw_archive import merge_exporter_json -from discord_activity_tracker.sync.messages import _process_messages_in_batches -from discord_activity_tracker.workspace import ( - clear_exporter_staging_dir, - get_channel_raw_dir, - get_exporter_staging_dir, - get_raw_dir, -) - -logger = logging.getLogger(__name__) - - -def _parse_channel_ids(raw: str) -> list[int]: - 
"""Parse comma-separated channel ID strings to a list of ints.""" - return [int(c.strip()) for c in raw.split(",") if c.strip().isdigit()] - - -def _naive_utc_to_aware_utc(dt: datetime) -> datetime: - """``parse_iso_datetime`` returns naive UTC; attach UTC tzinfo for exporter bounds.""" - if dt.tzinfo is None: - return dt.replace(tzinfo=timezone.utc) - return dt.astimezone(timezone.utc) - - -def _resolve_exporter_date_bounds( - options: dict, - *, - guild_snowflake: int, - channel_ids: list[int], -) -> tuple[datetime | None, datetime | None, bool]: - """Compute exporter date bounds and whether incremental mode is per-channel. - - - With ``--since``: lower bound is that timestamp for every channel. - - Without ``--since``: each channel resumes from the UTC day start of its own latest - stored message (overlap re-export; duplicates merged by message id). Channels with - no rows export today (UTC) only. - - With ``--until``: upper bound is that timestamp. - - Without ``--until``: upper bound is ``None`` (export through the present; no ``--before``). - - Returns ``(after_date, before_date, per_channel_incremental)``. When - ``per_channel_incremental`` is true, ``after_date`` is only used for logging / - checkpoint display (guild-wide latest), not passed to DiscordChatExporter. 
- """ - since_s = (options.get("since") or "").strip() or None - until_s = (options.get("until") or "").strip() or None - try: - since = parse_iso_datetime(since_s) - until = parse_iso_datetime(until_s) - except ValueError as e: - raise CommandError(str(e)) from e - - if since and until and since > until: - logger.warning( - "invalid date range: since (%s) is after until (%s); falling back to defaults", - since.isoformat(), - until.isoformat(), - ) - since, until = None, None - - scope = channel_ids if channel_ids else None - - if since is not None: - after_date = _naive_utc_to_aware_utc(since) - else: - latest_row = latest_message_created_at_for_guild( - guild_snowflake, - channel_ids=scope, - ) - after_date = ( - latest_row.astimezone(timezone.utc) if latest_row is not None else None - ) - if after_date is not None: - logger.debug( - "exporter lower bound from DB (--since omitted): %s", - after_date.isoformat(), - ) - else: - logger.debug( - "exporter lower bound: today UTC only (--since omitted, empty DB for guild scope)", - ) - - if until is not None: - before_date = _naive_utc_to_aware_utc(until) - else: - before_date = None - - per_channel_incremental = since is None - return after_date, before_date, per_channel_incremental - - -def task_preprocess_workspace(*, dry_run: bool) -> None: - """Ensure ``WORKSPACE_DIR/raw/discord_activity_tracker`` and staging dirs exist.""" - get_exporter_staging_dir() - get_raw_dir() - if dry_run: - logger.info( - "dry-run would ensure raw workspace under %s", - get_raw_dir(), - ) - - -def task_discord_sync( - *, - dry_run: bool, - skip_discord_sync: bool, - user_token: str, - guild_id: int, - channel_ids: list[int], - after_date: datetime | None, - before_date: datetime | None, - per_channel_incremental: bool, - collector: "DiscordActivityCollector", -) -> int: - """DiscordChatExporter → parse → db_sync → archive JSON per channel.""" - if skip_discord_sync: - logger.info("skipping Discord fetch / DB / raw (--skip-discord-sync)") 
- return 0 - - if dry_run: - logger.info( - "dry-run would run DiscordChatExporter and persist messages + raw JSON" - ) - return 0 - - raw_root = get_raw_dir() - staging = get_exporter_staging_dir() - clear_exporter_staging_dir() - - collector.stdout.write("=== Discord sync (fetch → db_sync → save_raw) ===") - if per_channel_incremental: - collector.stdout.write( - "Incremental: per-channel lower bound (UTC day start of latest stored " - "message per channel; duplicates merged by message id)" - ) - elif after_date: - collector.stdout.write( - f"Incremental: fetching messages after {after_date.isoformat()} UTC" - ) - else: - collector.stdout.write("Full mode: fetching all messages (no --after filter)") - if before_date: - collector.stdout.write( - f"Upper bound: messages before {before_date.isoformat()} UTC" - ) - - try: - exports: list[ChannelDayExport] = export_guild_to_json( - user_token=user_token, - guild_id=guild_id, - output_dir=staging, - after_date=after_date if not per_channel_incremental else None, - before_date=before_date, - channel_ids=channel_ids or None, - per_channel_incremental=per_channel_incremental, - ) - except DiscordChatExporterError as exc: - raise CommandError(f"DiscordChatExporter failed: {exc}") from exc - - collector.stdout.write(f"Exported {len(exports)} channel-day file(s)") - - processed_total = 0 - for i, export in enumerate(exports, 1): - json_path = export.path - day_str = export.day_str - try: - data = parse_exported_json(json_path) - envelope = validate_envelope(data, source=json_path.name) - guild_info = envelope.guild.model_dump(by_alias=True) - channel_info = envelope.channel.model_dump(by_alias=True) - messages = envelope.messages - - ch_name = channel_info.get("name", "?") - ch_id = _safe_int(channel_info.get("id", 0)) - srv_id = _safe_int(guild_info.get("id", 0)) - - if channel_ids and ch_id not in channel_ids: - logger.debug("Skipping channel %s (not in allowlist)", ch_id) - json_path.unlink(missing_ok=True) - continue - - 
collector.stdout.write( - f" [{i}/{len(exports)}] #{ch_name} / {day_str}: " - f"{len(messages)} message(s) fetched" - ) - count = asyncio.run( - collector._persist_channel(guild_info, channel_info, messages) - ) - processed_total += count - - channel_raw_dir = get_channel_raw_dir(srv_id, ch_id) - dest = channel_raw_dir / f"{day_str}.json" - merged_count = merge_exporter_json(dest, data, day=day_str) - collector.stdout.write( - f" archived {merged_count} message(s) -> {dest.name}" - ) - json_path.unlink(missing_ok=True) - - except StagingValidationError as exc: - logger.error( - "Staging validation failed for %s (file left in staging): %s", - json_path.name, - exc, - ) - continue - except ValueError as exc: - logger.error("Failed to process %s: %s", json_path.name, exc) - json_path.unlink(missing_ok=True) - continue - except Exception as exc: - logger.error("Failed to process %s: %s", json_path.name, exc) - json_path.unlink(missing_ok=True) - continue - - collector.stdout.write( - collector.style.SUCCESS( - f"Synced {processed_total} messages across all channels" - ) - ) - logger.debug("raw archive root: %s", raw_root) - return processed_total - - -def task_markdown_export_and_push( - *, - dry_run: bool, - skip_markdown_export: bool, - skip_remote_push: bool, - guild_id: int, - collector: "DiscordActivityCollector", -) -> None: - """Export Markdown to DISCORD_CONTEXT_REPO_PATH; optional git commit/push.""" - if skip_markdown_export: - logger.info("skipping Markdown export (--skip-markdown-export)") - return - - from discord_activity_tracker.sync.export import export_and_push - - context_repo_path = getattr(settings, "DISCORD_CONTEXT_REPO_PATH", None) - if not context_repo_path: - collector.stdout.write( - collector.style.WARNING( - "DISCORD_CONTEXT_REPO_PATH not set; skipping export" - ) - ) - return - - if dry_run: - collector.stdout.write( - "dry-run would export Markdown to " + str(context_repo_path) - ) - return - - try: - server = 
DiscordServer.objects.get(server_id=guild_id) - except DiscordServer.DoesNotExist: - collector.stdout.write( - collector.style.WARNING("Server not in DB — run sync first") - ) - return - - auto_commit = bool( - (not skip_remote_push) - and getattr(settings, "DISCORD_CONTEXT_AUTO_COMMIT", False) - ) - if skip_remote_push: - logger.info( - "skipping remote git push (--skip-remote-push); " - "files still written unless export fails" - ) - - success = export_and_push( - context_repo_path=Path(context_repo_path), - server=server, - auto_commit=auto_commit, - ) - if success: - collector.stdout.write( - collector.style.SUCCESS(f"Exported to {context_repo_path}") - ) - else: - collector.stdout.write(collector.style.WARNING("No markdown files exported")) - - -class DiscordActivityCollector(AbstractCollector): - """Collector implementation for ``run_discord_activity_tracker``. - - Holds stdout/style, resolved ``channel_ids`` (from ``--channels`` or - ``settings.DISCORD_CHANNEL_IDS``), and delegates to ``Command._handle_core``. - - ``collect()`` drives fetch → Markdown → Pinecone according to options. - ``sync_pinecone()`` runs ``task_discord_pinecone_sync`` when not dry-run and not - skipping Pinecone. - - Side effects: Same as the management command (DB, filesystem, subprocess calls - to DiscordChatExporter and Pinecone tooling via configured runners). 
- """ - - def __init__(self, cmd: "Command", options: dict) -> None: - self.cmd = cmd - self.options = options - self.stdout = cmd.stdout - self.style = cmd.style - - raw_channels = (options.get("channels") or "").strip() - if raw_channels: - self.channel_ids: list[int] = _parse_channel_ids(raw_channels) - else: - self.channel_ids = list(getattr(settings, "DISCORD_CHANNEL_IDS", [])) - - @property - def name(self) -> str: - return "discord_activity_tracker" - - def validate_config(self) -> None: - return None - - def load_incremental_state(self) -> IncrementalState | None: - guild_id: int | None = getattr(settings, "DISCORD_SERVER_ID", None) - if not guild_id: - return None - after_date, _before, _per_ch = _resolve_exporter_date_bounds( - self.options, - guild_snowflake=guild_id, - channel_ids=self.channel_ids, - ) - return DiscordIncrementalState.from_after_date(after=after_date) - - def collect(self) -> TrackerResult: - return self.cmd._handle_core(self.options, collector=self) - - def sync_pinecone(self) -> None: - if self.options.get("dry_run") or self.options.get("skip_pinecone"): - return - task_discord_pinecone_sync(dry_run=False) - - async def _persist_channel( - self, - guild_info: dict, - channel_info: dict, - messages: list, - ) -> int: - """Persist one channel's messages to DB.""" - server, _ = await sync_to_async(get_or_create_discord_server)( - server_id=_safe_int(guild_info.get("id", 0)), - server_name=guild_info.get("name", ""), - icon_url=guild_info.get("iconUrl", ""), - ) - - raw_cat_id = channel_info.get("categoryId") - category_id = _safe_int(raw_cat_id) if raw_cat_id else None - - channel, _ = await sync_to_async(get_or_create_discord_channel)( - server=server, - channel_id=_safe_int(channel_info.get("id", 0)), - channel_name=channel_info.get("name", ""), - channel_type=channel_info.get("type", "GuildTextChat"), - topic=channel_info.get("topic") or "", - position=0, - category_id=category_id, - category_name=channel_info.get("category") or "", - 
) - - srv_id = _safe_int(guild_info.get("id", 0)) - ch_id = _safe_int(channel_info.get("id", 0)) - converted = [ - convert_exporter_message_to_dict(m, server_id=srv_id, channel_id=ch_id) - for m in messages - ] - for idx, cmsg in enumerate(converted): - validate_normalized_message(cmsg, source=f"message[{idx}]") - count = await _process_messages_in_batches(channel, converted) - return count - - -class Command(BaseCollectorCommand): - """``manage.py run_discord_activity_tracker`` — incremental Discord ingest and exports. - - Wraps ``DiscordActivityCollector`` with ``BaseCollectorCommand`` (dry-run, logging, - collector phases). See module docstring for phases and required settings. - - Optional arguments (full text on each ``add_argument``): - - ``--dry-run``, ``--skip-discord-sync``, ``--skip-markdown-export``, - ``--skip-remote-push``, ``--skip-pinecone`` / ``--ignore-pinecone``, - ``--since`` / ``--until`` (and aliases), ``--channels``, ``--task`` (deprecated). - - Examples: - ``python manage.py run_discord_activity_tracker`` — full pipeline with - settings-based channel allowlist. - - ``python manage.py run_discord_activity_tracker --dry-run`` — log planned - steps only. - - ``python manage.py run_discord_activity_tracker --channels 123,456 --skip-pinecone`` — - restrict channels and skip Pinecone. - - Raises: - CommandError: If Discord credentials or ``DISCORD_SERVER_ID`` is unset, or - date options fail to parse, or DiscordChatExporter fails (see ``task_discord_sync``). - - See Also: - ``docs/service_api/discord_activity_tracker.md`` - ``docs/operations/discord_chat_exporter.md`` - """ - - help = ( - "Discord activity tracker: (1) fetch via DiscordChatExporter + DB + raw archive; " - "(2) export Markdown to DISCORD_CONTEXT_REPO_PATH; " - "(3) Pinecone upsert (PINECONE_DISCORD_* settings). " - "Use --skip-* to skip steps; default runs all." 
- ) - - def add_arguments(self, parser): - parser.add_argument( - "--dry-run", - action="store_true", - help="No fetch, export, push, or Pinecone writes; planned steps logged at INFO.", - ) - parser.add_argument( - "--skip-discord-sync", - action="store_true", - help="Skip DiscordChatExporter fetch, DB upserts, and raw JSON archival.", - ) - parser.add_argument( - "--skip-markdown-export", - action="store_true", - help="Skip writing Markdown from the DB to DISCORD_CONTEXT_REPO_PATH.", - ) - parser.add_argument( - "--skip-remote-push", - action="store_true", - help="Skip git commit/push after Markdown export (requires DISCORD_CONTEXT_AUTO_COMMIT).", - ) - parser.add_argument( - "--skip-pinecone", - action="store_true", - dest="skip_pinecone", - help="Skip run_cppa_pinecone_sync for Discord messages.", - ) - parser.add_argument( - "--ignore-pinecone", - action="store_true", - dest="skip_pinecone", - help="Deprecated alias for --skip-pinecone.", - ) - parser.add_argument( - "--since", - "--from-date", - "--start-time", - type=str, - default=None, - dest="since", - help="Exporter lower bound (--after): YYYY-MM-DD or ISO-8601 (UTC). " - "If omitted, uses the latest message time already in the DB for this guild " - "(and channel allowlist), or today (UTC) only when the DB has no rows.", - ) - parser.add_argument( - "--until", - "--to-date", - "--end-time", - type=str, - default=None, - dest="until", - help="Exporter upper bound (--before): same formats as --since; " - "default when omitted: no upper bound (through present). " - "--to-date is deprecated; --end-time is an alias for --until.", - ) - parser.add_argument( - "--channels", - type=str, - default="", - help="Comma-separated channel IDs (overrides DISCORD_CHANNEL_IDS setting).", - ) - parser.add_argument( - "--task", - choices=["sync", "export", "all"], - default=None, - help="Deprecated: prefer --skip-*. 
sync=fetch only; export=markdown only; all=all phases.", - ) - - def get_collector(self, **options: Any) -> AbstractCollector: - opts = dict(options) - if opts.get("skip_pinecone") is None: - opts["skip_pinecone"] = False - return DiscordActivityCollector(cmd=self, options=opts) - - def _handle_core( - self, options: dict, collector: DiscordActivityCollector - ) -> DiscordCollectionTrackerResult: - dry_run = options["dry_run"] - skip_discord_sync = options["skip_discord_sync"] - skip_markdown_export = options["skip_markdown_export"] - skip_remote_push = options["skip_remote_push"] - skip_pinecone = options.get("skip_pinecone") or False - - task = options.get("task") - if task == "sync": - skip_markdown_export = True - skip_remote_push = True - elif task == "export": - skip_discord_sync = True - skip_pinecone = True - elif task == "all": - pass - - collector.options.update( - { - "skip_discord_sync": skip_discord_sync, - "skip_markdown_export": skip_markdown_export, - "skip_remote_push": skip_remote_push, - "skip_pinecone": skip_pinecone, - } - ) - - from discord_activity_tracker.utils.discord_internal_tokens_store import ( - get_or_load_discord_user_token, - ) - - user_token = get_or_load_discord_user_token() - guild_id: int | None = getattr(settings, "DISCORD_SERVER_ID", None) - - if not user_token: - raise CommandError("Discord credentials not configured. 
See .env.example.") - if not guild_id: - raise CommandError("DISCORD_SERVER_ID not configured.") - - try: - after_date, before_date, per_channel_incremental = ( - _resolve_exporter_date_bounds( - options, - guild_snowflake=guild_id, - channel_ids=collector.channel_ids, - ) - ) - except CommandError: - raise - - logger.debug( - "starting (dry_run=%s, skip_discord_sync=%s, skip_md=%s, skip_push=%s, skip_pinecone=%s)", - dry_run, - skip_discord_sync, - skip_markdown_export, - skip_remote_push, - skip_pinecone, - ) - - try: - if dry_run: - task_preprocess_workspace(dry_run=True) - if not skip_discord_sync: - logger.info("dry-run would run DiscordChatExporter + DB + raw JSON") - else: - logger.info("dry-run skipping Discord sync (--skip-discord-sync)") - if not skip_markdown_export: - logger.info("dry-run would export Markdown from DB") - if not skip_remote_push: - logger.info( - "dry-run would push Markdown if DISCORD_CONTEXT_AUTO_COMMIT is enabled" - ) - if not skip_pinecone: - logger.info( - "dry-run would run Pinecone upsert for Discord messages" - ) - collector.stdout.write(collector.style.WARNING("DRY RUN — no writes")) - collector.stdout.write(f" Guild ID: {guild_id}") - collector.stdout.write( - f" Channel allowlist: {collector.channel_ids or 'all channels'}" - ) - if per_channel_incremental: - collector.stdout.write( - " Lower bound (--after): per-channel (UTC day of latest " - "stored message; empty channel = today UTC only)" - ) - elif after_date: - collector.stdout.write( - f" Lower bound (--after): {after_date.isoformat()} UTC" - ) - else: - collector.stdout.write( - " Lower bound (--after): today (UTC) only (empty DB, no --since)" - ) - if before_date: - collector.stdout.write( - f" Upper bound (--before): {before_date.isoformat()} UTC" - ) - else: - collector.stdout.write( - " Upper bound (--before): none (through present)" - ) - logger.info("finished successfully (dry-run)") - return DiscordCollectionTrackerResult( - success=True, counts={"dry_run": 1} - 
) - - messages_synced = task_discord_sync( - dry_run=False, - skip_discord_sync=skip_discord_sync, - user_token=user_token, - guild_id=guild_id, - channel_ids=collector.channel_ids, - after_date=after_date, - before_date=before_date, - per_channel_incremental=per_channel_incremental, - collector=collector, - ) - - task_markdown_export_and_push( - dry_run=False, - skip_markdown_export=skip_markdown_export, - skip_remote_push=skip_remote_push, - guild_id=guild_id, - collector=collector, - ) - - if skip_pinecone: - logger.info("skipping Pinecone (--skip-pinecone)") - - logger.info("finished successfully") - return DiscordCollectionTrackerResult( - success=True, - counts={ - "messages": messages_synced, - "channels": ( - len(collector.channel_ids) if collector.channel_ids else 0 - ), - }, - ) - except Exception as e: - logger.exception("command failed: %s", e) - raise diff --git a/discord_activity_tracker/migrations/0001_initial.py b/discord_activity_tracker/migrations/0001_initial.py deleted file mode 100644 index 0e084d0a..00000000 --- a/discord_activity_tracker/migrations/0001_initial.py +++ /dev/null @@ -1,140 +0,0 @@ -# Generated by Django 4.2.28 on 2026-02-09 21:00 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - initial = True - - dependencies = [ - ] - - operations = [ - migrations.CreateModel( - name='DiscordChannel', - fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('channel_id', models.BigIntegerField(db_index=True, unique=True)), - ('channel_name', models.CharField(db_index=True, max_length=255)), - ('channel_type', models.CharField(max_length=50)), - ('topic', models.TextField(blank=True)), - ('position', models.IntegerField(default=0)), - ('last_synced_at', models.DateTimeField(blank=True, db_index=True, null=True)), - ('last_activity_at', models.DateTimeField(blank=True, db_index=True, null=True)), - 
('created_at', models.DateTimeField(auto_now_add=True)), - ('updated_at', models.DateTimeField(auto_now=True)), - ], - options={ - 'db_table': 'discord_activity_tracker_discordchannel', - 'ordering': ['server', 'position', 'channel_name'], - }, - ), - migrations.CreateModel( - name='DiscordMessage', - fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('message_id', models.BigIntegerField(db_index=True, unique=True)), - ('content', models.TextField(blank=True)), - ('message_created_at', models.DateTimeField(db_index=True)), - ('message_edited_at', models.DateTimeField(blank=True, null=True)), - ('is_deleted', models.BooleanField(db_index=True, default=False)), - ('deleted_at', models.DateTimeField(blank=True, null=True)), - ('reply_to_message_id', models.BigIntegerField(blank=True, db_index=True, null=True)), - ('has_attachments', models.BooleanField(default=False)), - ('attachment_urls', models.JSONField(default=list)), - ('created_at', models.DateTimeField(auto_now_add=True)), - ('updated_at', models.DateTimeField(auto_now=True)), - ], - options={ - 'db_table': 'discord_activity_tracker_discordmessage', - 'ordering': ['channel', 'message_created_at'], - }, - ), - migrations.CreateModel( - name='DiscordServer', - fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('server_id', models.BigIntegerField(db_index=True, unique=True)), - ('server_name', models.CharField(db_index=True, max_length=255)), - ('icon_url', models.URLField(blank=True, max_length=512)), - ('created_at', models.DateTimeField(auto_now_add=True)), - ('updated_at', models.DateTimeField(auto_now=True)), - ], - options={ - 'db_table': 'discord_activity_tracker_discordserver', - 'ordering': ['server_name'], - }, - ), - migrations.CreateModel( - name='DiscordUser', - fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 
- ('user_id', models.BigIntegerField(db_index=True, unique=True)), - ('username', models.CharField(db_index=True, max_length=255)), - ('display_name', models.CharField(blank=True, max_length=255)), - ('avatar_url', models.URLField(blank=True, max_length=512)), - ('is_bot', models.BooleanField(db_index=True, default=False)), - ('created_at', models.DateTimeField(auto_now_add=True)), - ('updated_at', models.DateTimeField(auto_now=True)), - ], - options={ - 'db_table': 'discord_activity_tracker_discorduser', - 'ordering': ['username'], - }, - ), - migrations.CreateModel( - name='DiscordReaction', - fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('emoji', models.CharField(db_index=True, max_length=255)), - ('count', models.IntegerField(default=1)), - ('created_at', models.DateTimeField(auto_now_add=True)), - ('updated_at', models.DateTimeField(auto_now=True)), - ('message', models.ForeignKey(db_column='message_id', on_delete=django.db.models.deletion.CASCADE, related_name='reactions', to='discord_activity_tracker.discordmessage')), - ], - options={ - 'db_table': 'discord_activity_tracker_discordreaction', - }, - ), - migrations.AddField( - model_name='discordmessage', - name='author', - field=models.ForeignKey(db_column='author_id', on_delete=django.db.models.deletion.CASCADE, related_name='messages', to='discord_activity_tracker.discorduser'), - ), - migrations.AddField( - model_name='discordmessage', - name='channel', - field=models.ForeignKey(db_column='channel_id', on_delete=django.db.models.deletion.CASCADE, related_name='messages', to='discord_activity_tracker.discordchannel'), - ), - migrations.AddField( - model_name='discordchannel', - name='server', - field=models.ForeignKey(db_column='server_id', on_delete=django.db.models.deletion.CASCADE, related_name='channels', to='discord_activity_tracker.discordserver'), - ), - migrations.AddConstraint( - model_name='discordreaction', - 
constraint=models.UniqueConstraint(fields=('message', 'emoji'), name='discord_activity_tracker_msg_emoji_uniq'), - ), - migrations.AddIndex( - model_name='discordmessage', - index=models.Index(fields=['channel', 'message_created_at'], name='discord_act_channel_b1b3fe_idx'), - ), - migrations.AddIndex( - model_name='discordmessage', - index=models.Index(fields=['message_created_at'], name='discord_act_message_bfb7a8_idx'), - ), - migrations.AddIndex( - model_name='discordmessage', - index=models.Index(fields=['is_deleted'], name='discord_act_is_dele_3b2257_idx'), - ), - migrations.AddIndex( - model_name='discordchannel', - index=models.Index(fields=['server', 'channel_name'], name='discord_act_server__3c1258_idx'), - ), - migrations.AddIndex( - model_name='discordchannel', - index=models.Index(fields=['last_activity_at'], name='discord_act_last_ac_87ebfd_idx'), - ), - ] diff --git a/discord_activity_tracker/migrations/0002_migrate_users_to_discord_profile.py b/discord_activity_tracker/migrations/0002_migrate_users_to_discord_profile.py deleted file mode 100644 index 5e807bd2..00000000 --- a/discord_activity_tracker/migrations/0002_migrate_users_to_discord_profile.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Data migration: copy DiscordUser → DiscordProfile, remap DiscordMessage.author_id.""" - -from django.db import migrations - - -def migrate_users_forward(apps, schema_editor): - """Create DiscordProfile for each DiscordUser, remap DiscordMessage.author_id.""" - DiscordUser = apps.get_model("discord_activity_tracker", "DiscordUser") - BaseProfile = apps.get_model("cppa_user_tracker", "BaseProfile") - DiscordProfile = apps.get_model("cppa_user_tracker", "DiscordProfile") - - # Build mapping: old DiscordUser.pk → new DiscordProfile.pk - pk_map = {} - for du in DiscordUser.objects.all(): - # Create BaseProfile row first (multi-table inheritance) - bp = BaseProfile.objects.create(type="discord") - # Create DiscordProfile row - DiscordProfile.objects.create( - 
baseprofile_ptr_id=bp.pk, - discord_user_id=du.user_id, - username=du.username, - display_name=du.display_name, - avatar_url=du.avatar_url, - is_bot=du.is_bot, - ) - pk_map[du.pk] = bp.pk - - # Remap author_id in DiscordMessage using raw SQL for performance - if pk_map: - # Build CASE WHEN for bulk update - case_parts = " ".join( - f"WHEN {old_pk} THEN {new_pk}" for old_pk, new_pk in pk_map.items() - ) - old_pks = ",".join(str(pk) for pk in pk_map.keys()) - sql = ( - f"UPDATE discord_activity_tracker_discordmessage " - f"SET author_id = CASE author_id {case_parts} END " - f"WHERE author_id IN ({old_pks})" - ) - schema_editor.execute(sql) - - -def migrate_users_reverse(apps, schema_editor): - """Reverse: this is a one-way migration. Raise error.""" - raise RuntimeError( - "Cannot reverse DiscordUser → DiscordProfile migration. " - "Restore from backup if needed." - ) - - -class Migration(migrations.Migration): - - dependencies = [ - ("cppa_user_tracker", "0003_discordprofile_alter_baseprofile_type"), - ("discord_activity_tracker", "0001_initial"), - ] - - operations = [ - # Step 1: Drop the old FK constraint so we can remap author_id values - migrations.RunSQL( - sql="ALTER TABLE discord_activity_tracker_discordmessage DROP CONSTRAINT IF EXISTS discord_activity_tra_author_id_1b8afaa8_fk_discord_a", - reverse_sql="", # handled by reverse data migration - ), - # Step 2: Data migration — create DiscordProfile, remap author_id - migrations.RunPython(migrate_users_forward, migrate_users_reverse), - ] diff --git a/discord_activity_tracker/migrations/0003_alter_discordmessage_author.py b/discord_activity_tracker/migrations/0003_alter_discordmessage_author.py deleted file mode 100644 index 9d016ffe..00000000 --- a/discord_activity_tracker/migrations/0003_alter_discordmessage_author.py +++ /dev/null @@ -1,20 +0,0 @@ -# Generated by Django 4.2.28 on 2026-02-18 19:28 - -from django.db import migrations, models -import django.db.models.deletion - - -class 
Migration(migrations.Migration): - - dependencies = [ - ('cppa_user_tracker', '0003_discordprofile_alter_baseprofile_type'), - ('discord_activity_tracker', '0002_migrate_users_to_discord_profile'), - ] - - operations = [ - migrations.AlterField( - model_name='discordmessage', - name='author', - field=models.ForeignKey(db_column='author_id', on_delete=django.db.models.deletion.CASCADE, related_name='discord_messages', to='cppa_user_tracker.discordprofile'), - ), - ] diff --git a/discord_activity_tracker/migrations/0004_delete_discorduser.py b/discord_activity_tracker/migrations/0004_delete_discorduser.py deleted file mode 100644 index 15f543be..00000000 --- a/discord_activity_tracker/migrations/0004_delete_discorduser.py +++ /dev/null @@ -1,14 +0,0 @@ -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ("discord_activity_tracker", "0003_alter_discordmessage_author"), - ] - - operations = [ - migrations.DeleteModel( - name="DiscordUser", - ), - ] diff --git a/discord_activity_tracker/migrations/0005_channel_category_message_type_is_pinned.py b/discord_activity_tracker/migrations/0005_channel_category_message_type_is_pinned.py deleted file mode 100644 index c3dc21e8..00000000 --- a/discord_activity_tracker/migrations/0005_channel_category_message_type_is_pinned.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Add category_id/category_name to DiscordChannel; add message_type/is_pinned to DiscordMessage.""" - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("discord_activity_tracker", "0004_delete_discorduser"), - ] - - operations = [ - # DiscordChannel: category fields from DiscordChatExporter - migrations.AddField( - model_name="discordchannel", - name="category_id", - field=models.BigIntegerField(blank=True, db_index=True, null=True), - ), - migrations.AddField( - model_name="discordchannel", - name="category_name", - field=models.CharField(blank=True, default="", 
max_length=255), - preserve_default=False, - ), - # DiscordMessage: type and pinned fields from DiscordChatExporter - migrations.AddField( - model_name="discordmessage", - name="message_type", - field=models.CharField(default="Default", db_index=True, max_length=50), - ), - migrations.AddField( - model_name="discordmessage", - name="is_pinned", - field=models.BooleanField(default=False, db_index=True), - ), - migrations.AddIndex( - model_name="discordmessage", - index=models.Index( - fields=["message_type"], - name="discord_act_message_type_idx", - ), - ), - ] diff --git a/discord_activity_tracker/migrations/0006_remove_channel_last_timestamps.py b/discord_activity_tracker/migrations/0006_remove_channel_last_timestamps.py deleted file mode 100644 index f5c503e7..00000000 --- a/discord_activity_tracker/migrations/0006_remove_channel_last_timestamps.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Remove DiscordChannel.last_synced_at and last_activity_at (use DiscordMessage instead).""" - -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ("discord_activity_tracker", "0005_channel_category_message_type_is_pinned"), - ] - - operations = [ - migrations.RemoveIndex( - model_name="discordchannel", - name="discord_act_last_ac_87ebfd_idx", - ), - migrations.RemoveField( - model_name="discordchannel", - name="last_activity_at", - ), - migrations.RemoveField( - model_name="discordchannel", - name="last_synced_at", - ), - ] diff --git a/discord_activity_tracker/migrations/__init__.py b/discord_activity_tracker/migrations/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/discord_activity_tracker/models.py b/discord_activity_tracker/models.py deleted file mode 100644 index fc9cd720..00000000 --- a/discord_activity_tracker/models.py +++ /dev/null @@ -1,161 +0,0 @@ -from django.db import models - -from cppa_user_tracker.models import DiscordProfile - - -class DiscordServer(models.Model): - """Persisted Discord guild 
(server) metadata synced from export or API pipelines. - - One row per Discord guild snowflake ``server_id``. Holds display ``server_name`` - and optional ``icon_url`` for UI or audit. Timestamps ``created_at`` / - ``updated_at`` track row lifecycle. - - Relationships: - Reverse ``channels``: ``DiscordChannel`` rows with FK to this server - (``related_name="channels"`` on ``DiscordChannel``). - """ - - server_id = models.BigIntegerField(unique=True, db_index=True) - server_name = models.CharField(max_length=255, db_index=True) - icon_url = models.URLField(max_length=512, blank=True) - created_at = models.DateTimeField(auto_now_add=True) - updated_at = models.DateTimeField(auto_now=True) - - class Meta: - ordering = ["server_name"] - - def __str__(self): - return f"{self.server_name} ({self.server_id})" - - -class DiscordChannel(models.Model): - """A channel (text thread, category child, etc.) belonging to one ``DiscordServer``. - - Key fields: ``channel_id`` (Discord snowflake, globally unique), ``channel_name``, - ``channel_type`` (e.g. exporter string), ``topic``, ``position``, and optional - ``category_id`` / ``category_name`` for grouping in the guild tree. - - Relationships: - ``server``: FK to ``DiscordServer`` (column ``server_id``). - Reverse ``messages``: ``DiscordMessage`` rows for this channel - (``related_name="messages"`` on ``DiscordMessage``). - """ - - server = models.ForeignKey( - DiscordServer, - on_delete=models.CASCADE, - related_name="channels", - db_column="server_id", - ) - channel_id = models.BigIntegerField(unique=True, db_index=True) - channel_name = models.CharField(max_length=255, db_index=True) - channel_type = models.CharField(max_length=50) # GuildTextChat, text, etc. 
- # Category the channel belongs to (from DiscordChatExporter: categoryId / category) - category_id = models.BigIntegerField(null=True, blank=True, db_index=True) - category_name = models.CharField(max_length=255, blank=True) - topic = models.TextField(blank=True) - position = models.IntegerField(default=0) - created_at = models.DateTimeField(auto_now_add=True) - updated_at = models.DateTimeField(auto_now=True) - - class Meta: - ordering = ["server", "position", "channel_name"] - indexes = [ - models.Index(fields=["server", "channel_name"]), - ] - - def __str__(self): - return f"#{self.channel_name}" - - -class DiscordMessage(models.Model): - """A single Discord message stored for search, export, and Pinecone preprocessing. - - Key fields: ``message_id`` (snowflake, unique), ``content``, ``message_type`` - (e.g. ``Default``, ``Reply``), ``is_pinned``, ``message_created_at`` / - ``message_edited_at``, ``reply_to_message_id``, ``attachment_urls`` (JSON list), - ``has_attachments``, and soft-delete flags ``is_deleted`` / ``deleted_at``. - - Relationships: - ``channel``: FK to ``DiscordChannel`` (column ``channel_id``). - ``author``: FK to ``DiscordProfile`` (``cppa_user_tracker.models``); column - ``author_id``. Reverse on profile: ``discord_messages``. - Reverse ``reactions``: ``DiscordReaction`` rows - (``related_name="reactions"`` on ``DiscordReaction``). - - Indexes on ``(channel, message_created_at)``, ``message_created_at``, - ``is_deleted``, and ``message_type`` support sync windows and queries. - """ - - message_id = models.BigIntegerField(unique=True, db_index=True) - channel = models.ForeignKey( - DiscordChannel, - on_delete=models.CASCADE, - related_name="messages", - db_column="channel_id", - ) - author = models.ForeignKey( - DiscordProfile, - on_delete=models.CASCADE, - related_name="discord_messages", - db_column="author_id", - ) - content = models.TextField(blank=True) - # message_type: "Default", "Reply", "GuildBoost", etc. 
(from DiscordChatExporter type field) - message_type = models.CharField(max_length=50, default="Default", db_index=True) - is_pinned = models.BooleanField(default=False, db_index=True) - message_created_at = models.DateTimeField(db_index=True) - message_edited_at = models.DateTimeField(null=True, blank=True) - is_deleted = models.BooleanField(default=False, db_index=True) - deleted_at = models.DateTimeField(null=True, blank=True) - reply_to_message_id = models.BigIntegerField(null=True, blank=True, db_index=True) - has_attachments = models.BooleanField(default=False) - attachment_urls = models.JSONField(default=list) - created_at = models.DateTimeField(auto_now_add=True) - updated_at = models.DateTimeField(auto_now=True) - - class Meta: - ordering = ["channel", "message_created_at"] - indexes = [ - models.Index(fields=["channel", "message_created_at"]), - models.Index(fields=["message_created_at"]), - models.Index(fields=["is_deleted"]), - models.Index(fields=["message_type"]), - ] - - def __str__(self): - content_preview = self.content[:50] if self.content else "(no content)" - return f"{self.author.username}: {content_preview}" - - -class DiscordReaction(models.Model): - """Aggregated emoji reaction counts on a ``DiscordMessage``. - - One row per (``message``, ``emoji``) pair (enforced by unique constraint). ``count`` - stores the total from the source payload at sync time. - - Relationships: - ``message``: FK to ``DiscordMessage`` (column ``message_id``). 
- """ - - message = models.ForeignKey( - DiscordMessage, - on_delete=models.CASCADE, - related_name="reactions", - db_column="message_id", - ) - emoji = models.CharField(max_length=255, db_index=True) - count = models.IntegerField(default=1) - created_at = models.DateTimeField(auto_now_add=True) - updated_at = models.DateTimeField(auto_now=True) - - class Meta: - constraints = [ - models.UniqueConstraint( - fields=["message", "emoji"], - name="discord_activity_tracker_msg_emoji_uniq", - ) - ] - - def __str__(self): - return f"{self.emoji} ({self.count})" diff --git a/discord_activity_tracker/pinecone_runner.py b/discord_activity_tracker/pinecone_runner.py deleted file mode 100644 index 1d794ce7..00000000 --- a/discord_activity_tracker/pinecone_runner.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Shared Pinecone upsert via ``run_cppa_pinecone_sync`` for Discord commands.""" - -from __future__ import annotations - -import logging - -from django.conf import settings -from django.core.management import call_command - -logger = logging.getLogger(__name__) - -DISCORD_PINECONE_PREPROCESSOR = ( - "discord_activity_tracker.preprocessor.preprocess_discord_for_pinecone" -) - - -def task_discord_pinecone_sync(*, dry_run: bool = False) -> None: - """Upsert Discord messages to Pinecone using settings (mirrors Boost tracker pattern).""" - logger.info("Pinecone upsert (Discord messages)") - if dry_run: - logger.info("dry-run would run Pinecone sync for Discord messages") - return - - app_type = (getattr(settings, "PINECONE_DISCORD_APP_TYPE", "") or "").strip() - namespace = (getattr(settings, "PINECONE_DISCORD_NAMESPACE", "") or "").strip() - if not app_type: - logger.warning( - "Pinecone sync skipped: PINECONE_DISCORD_APP_TYPE is empty (settings/env)." - ) - return - if not namespace: - logger.warning( - "Pinecone sync skipped: PINECONE_DISCORD_NAMESPACE is empty (settings/env)." 
- ) - return - - try: - call_command( - "run_cppa_pinecone_sync", - app_type=app_type, - namespace=namespace, - preprocessor=DISCORD_PINECONE_PREPROCESSOR, - ) - logger.info( - "pinecone sync completed (app_type=%s, namespace=%s)", - app_type, - namespace, - ) - except Exception as exc: # pylint: disable=broad-exception-caught - logger.warning( - "Pinecone sync skipped/failed (run_cppa_pinecone_sync unavailable or errored): %s", - exc, - ) diff --git a/discord_activity_tracker/preprocessor.py b/discord_activity_tracker/preprocessor.py deleted file mode 100644 index fbb074de..00000000 --- a/discord_activity_tracker/preprocessor.py +++ /dev/null @@ -1,257 +0,0 @@ -"""Pinecone preprocess function for discord_activity_tracker. - -Follows the contract defined in docs/Pinecone_preprocess_guideline.md and -mirrors the structure of cppa_slack_tracker.preprocessor. - -Groups DiscordMessage rows by channel, merges reply chains (thread roots with -their direct replies) into single documents (one line per message: -``author: "text"``, joined by newlines), filters short / empty content, and -emits ``{"content": str, "metadata": dict}`` records for cppa_pinecone_sync. -""" - -from __future__ import annotations - -import logging -from datetime import datetime, timezone -from typing import Any, Optional - -from django.conf import settings -from django.db.models import Q - -from core.utils.text_processing import clean_discord_text -from discord_activity_tracker.models import DiscordChannel, DiscordMessage - -logger = logging.getLogger(__name__) - -# Minimum characters a document's plain-text content must have before it is -# sent to Pinecone. Mirrors PINECONE_MIN_TEXT_LENGTH in other preprocessors. 
-_DEFAULT_MIN_TEXT_LENGTH = 20 - - -# --------------------------------------------------------------------------- -# Text helpers -# --------------------------------------------------------------------------- - - -def _min_text_length() -> int: - return int(getattr(settings, "PINECONE_MIN_TEXT_LENGTH", _DEFAULT_MIN_TEXT_LENGTH)) - - -def _is_content_too_short(text: str) -> bool: - return len(text.strip()) < _min_text_length() - - -# --------------------------------------------------------------------------- -# ID normalisation -# --------------------------------------------------------------------------- - - -def _normalize_failed_ids(failed_ids: list[str]) -> list[str]: - seen: set[str] = set() - out: list[str] = [] - for raw in failed_ids or []: - value = (raw or "").strip() - if not value or value in seen: - continue - seen.add(value) - out.append(value) - return out - - -# --------------------------------------------------------------------------- -# Grouping helpers -# --------------------------------------------------------------------------- - - -def _build_reply_chains( - messages: list[DiscordMessage], -) -> list[list[DiscordMessage]]: - """Group messages into reply chains. - - For each root message (not a reply to another message in this batch), the - chain is the root plus every other message in the batch whose - ``reply_to_message_id`` equals the root's ``message_id`` (direct replies - only). Standalone messages become single-item chains. Nested replies - (reply-to-reply) whose parent is not the root are emitted as separate - single-message chains by the orphan pass. - - Long merged ``content`` is split later by ``cppa_pinecone_sync`` ingestion - when ``is_chunked=False`` (see docs/Pinecone_preprocess_guideline.md). 
- """ - by_id: dict[int, DiscordMessage] = {m.message_id: m for m in messages} - assigned: set[int] = set() - chains: list[list[DiscordMessage]] = [] - - for msg in messages: - if msg.message_id in assigned: - continue - if msg.reply_to_message_id and msg.reply_to_message_id in by_id: - continue # Will be picked up as part of a root's chain - # This message is a root (or has no local parent) - chain = [msg] - assigned.add(msg.message_id) - for reply in messages: - if reply.message_id in assigned: - continue - if reply.reply_to_message_id == msg.message_id: - chain.append(reply) - assigned.add(reply.message_id) - chains.append(chain) - - # Any remaining (orphan replies whose root wasn't in this batch) - for msg in messages: - if msg.message_id not in assigned: - chains.append([msg]) - - return chains - - -def _pinecone_channel_display_name(channel: DiscordChannel) -> str: - """Human-readable channel label for Pinecone: ``category - channel`` when category exists.""" - name = (channel.channel_name or "").strip() - cat = (channel.category_name or "").strip() - if cat: - return f"{cat} - {name}" if name else cat - return name or "?" 
- - -def _format_chain_message_line(msg: DiscordMessage, cleaned: str) -> str: - """One line for merged reply-chain content: ``username: "message text"``.""" - username = msg.author.username if getattr(msg, "author_id", None) else "unknown" - escaped = cleaned.replace("\\", "\\\\").replace('"', '\\"') - return f'{username}: "{escaped}"' - - -def _chain_to_document( - chain: list[DiscordMessage], -) -> Optional[dict[str, Any]]: - """Convert a reply chain to a Pinecone document dict, or None if filtered.""" - parts: list[str] = [] - ids: list[str] = [] - - for msg in chain: - raw = (msg.content or "").strip() - if not raw: - continue - cleaned = clean_discord_text(raw) - if not cleaned: - continue - parts.append(_format_chain_message_line(msg, cleaned)) - ids.append(str(msg.message_id)) - - if not parts: - return None - - content = "\n".join(parts) - if _is_content_too_short(content): - return None - - root = chain[0] - channel = root.channel - server = channel.server - - try: - ts = int(root.message_created_at.astimezone(timezone.utc).timestamp()) - except (AttributeError, OSError, OverflowError): - ts = 0 - - return { - "content": content, - "metadata": { - "doc_id": str(root.message_id), - "type": "discord", - "channel_id": str(channel.channel_id), - "channel_name": _pinecone_channel_display_name(channel), - "server_id": str(server.server_id), - "server_name": server.server_name, - "author": ( - root.author.username if getattr(root, "author_id", None) else "unknown" - ), - "timestamp": ts, - "is_reply_chain": len(chain) > 1, - "source_ids": ",".join(ids), - }, - } - - -# --------------------------------------------------------------------------- -# Public preprocess function -# --------------------------------------------------------------------------- - - -def preprocess_discord_for_pinecone( - failed_ids: list[str], - final_sync_at: datetime | None, -) -> tuple[list[dict[str, Any]], bool]: - """Build Pinecone sync documents for Discord messages. 
- - Args: - failed_ids: Source IDs (message snowflakes as strings) that failed in a - previous sync run and should be retried. - final_sync_at: Timestamp of the last successful sync; ``None`` for first sync. - - Returns: - ``(documents, is_chunked)`` where ``documents`` is a list of - ``{"content": str, "metadata": dict}`` records and ``is_chunked`` - is ``False`` (whole documents; the ingestion pipeline may chunk them). - """ - normalized_failed = _normalize_failed_ids(failed_ids) - - qs = ( - DiscordMessage.objects.select_related("channel__server", "author") - .filter(is_deleted=False) - .order_by("message_created_at") - ) - - messages_new: list[DiscordMessage] = [] - messages_failed: list[DiscordMessage] = [] - - if final_sync_at is None and not normalized_failed: - # First sync — index everything - messages_new = list(qs) - logger.info( - "preprocess_discord: first sync, loaded %d messages", len(messages_new) - ) - else: - if final_sync_at is not None: - messages_new = list(qs.filter(updated_at__gt=final_sync_at)) - logger.info( - "preprocess_discord: incremental, loaded %d new messages", - len(messages_new), - ) - if normalized_failed: - messages_failed = list( - qs.filter( - Q( - message_id__in=[ - int(fid) for fid in normalized_failed if fid.isdigit() - ] - ) - ) - ) - logger.info( - "preprocess_discord: retrying %d failed messages", len(messages_failed) - ) - - all_messages = messages_new + messages_failed - if not all_messages: - logger.info("preprocess_discord: nothing to sync") - return [], False - - chains = _build_reply_chains(all_messages) - - docs: list[dict[str, Any]] = [] - seen_doc_ids: set[str] = set() - - for chain in chains: - doc = _chain_to_document(chain) - if doc is None: - continue - doc_id = doc["metadata"]["doc_id"] - if doc_id in seen_doc_ids: - continue - seen_doc_ids.add(doc_id) - docs.append(doc) - - logger.info("preprocess_discord: built %d Pinecone documents", len(docs)) - return docs, False diff --git 
a/discord_activity_tracker/protocol_impl.py b/discord_activity_tracker/protocol_impl.py deleted file mode 100644 index 9e765dd7..00000000 --- a/discord_activity_tracker/protocol_impl.py +++ /dev/null @@ -1,110 +0,0 @@ -"""Frozen DTOs implementing :mod:`core.protocols` for Discord activity sync.""" - -from __future__ import annotations - -from dataclasses import dataclass -from datetime import datetime -from typing import Any, Mapping - -from core.activity_types import ( - ActivityType, - LegacyActivityRecordDict, - SourceSystem, - activity_record_to_legacy_dict, - migrate_legacy_activity_fields, -) -from core.protocol_dto import ( - ActivityRecordDataclass, - IncrementalStateDataclass, - TrackerResultDataclass, -) - - -@dataclass(frozen=True, repr=False) -class DiscordCollectionTrackerResult(TrackerResultDataclass): - """Counts for a Discord collection slice (messages, channels, etc.).""" - - -@dataclass(frozen=True, repr=False) -class DiscordIncrementalState(IncrementalStateDataclass): - """Checkpoint between Discord runs (after-cursor + optional snowflake).""" - - @classmethod - def from_after_date( - cls, - *, - after: datetime | None, - last_message_id: int | None = None, - channel_id: int | None = None, - ) -> DiscordIncrementalState: - marker = after.isoformat() if after is not None else "" - tok_parts = ["discord"] - if channel_id is not None: - tok_parts.append(f"ch:{channel_id}") - if last_message_id is not None: - tok_parts.append(f"msg:{last_message_id}") - checkpoint = ":".join(tok_parts) - return cls( - checkpoint_token=checkpoint, - human_readable_marker=marker or None, - extras={ - "after_iso": marker, - "last_message_id": last_message_id, - "channel_id": channel_id, - }, - ) - - -@dataclass(frozen=True, repr=False) -class DiscordActivityRecord(ActivityRecordDataclass): - """Normalized Discord message as a portable activity row.""" - - def to_legacy_dict(self) -> LegacyActivityRecordDict: - """Tier-C workspace bridge format; prefer :meth:`asdict` for 
canonical protocol JSON.""" - return activity_record_to_legacy_dict( - source_system=self.source_system, - external_id=self.external_id, - occurred_at=self.occurred_at, - activity_type=self.activity_type, - actor_id=self.actor_external_id, - source_url=self.source_url, - summary=self.summary, - ) - - @classmethod - def from_converted_export_dict( - cls, - converted: Mapping[str, Any], - *, - server_id: int, - channel_id: int, - ) -> DiscordActivityRecord: - mid = int(converted.get("id") or 0) - author = converted.get("author") or {} - if isinstance(author, Mapping): - aid = author.get("id") - actor_raw: str | int | None = aid if aid is not None else "" - else: - actor_raw = "" - occurred_raw = converted.get("occurred_at") or converted.get("created_at") or "" - content = str(converted.get("content") or "") - summary = content[:2000] - ext_id = f"{server_id}:{channel_id}:{mid}" - src = converted.get("source_url") - source_url = str(src) if src else None - mtype = str(converted.get("message_type") or "Default") - source, occurred, atype, actor = migrate_legacy_activity_fields( - source_system=SourceSystem.DISCORD.value, - occurred_at=occurred_raw, - activity_type=ActivityType.discord_message(mtype), - actor_external_id_raw=actor_raw, - ) - return cls( - source_system=source, - external_id=ext_id, - occurred_at=occurred, - activity_type=atype, - actor_external_id=actor, - source_url=source_url, - summary=summary, - ) diff --git a/discord_activity_tracker/schemas/discord_staging_v1.json b/discord_activity_tracker/schemas/discord_staging_v1.json deleted file mode 100644 index fcccad41..00000000 --- a/discord_activity_tracker/schemas/discord_staging_v1.json +++ /dev/null @@ -1,337 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "discord_staging_v1", - "description": "Optional JSON Schema bundle for Discord staging data. 
Runtime validation uses Pydantic models in discord_activity_tracker/staging_schema.py.", - "discord_chat_exporter_envelope": { - "$defs": { - "DiscordExporterChannel": { - "additionalProperties": true, - "description": "Channel object inside a DiscordChatExporter JSON file.", - "properties": { - "id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Id" - }, - "name": { - "default": "", - "title": "Name", - "type": "string" - }, - "type": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Type" - }, - "topic": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Topic" - }, - "category": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Category" - }, - "categoryId": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Categoryid" - } - }, - "title": "DiscordExporterChannel", - "type": "object" - }, - "DiscordExporterGuild": { - "additionalProperties": true, - "description": "Guild object inside a DiscordChatExporter JSON file.", - "properties": { - "id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Id" - }, - "name": { - "default": "", - "title": "Name", - "type": "string" - }, - "iconUrl": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Iconurl" - } - }, - "title": "DiscordExporterGuild", - "type": "object" - } - }, - "additionalProperties": true, - "description": "Top-level shape of a DiscordChatExporter ``.json`` export.", - "properties": { - "guild": { - "$ref": "#/discord_chat_exporter_envelope/$defs/DiscordExporterGuild" - }, - "channel": { - "$ref": 
"#/discord_chat_exporter_envelope/$defs/DiscordExporterChannel" - }, - "messages": { - "items": {}, - "title": "Messages", - "type": "array" - } - }, - "title": "DiscordChatExporterEnvelope", - "type": "object" - }, - "normalized_discord_message": { - "$defs": { - "NormalizedAttachment": { - "additionalProperties": true, - "properties": { - "url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Url" - } - }, - "title": "NormalizedAttachment", - "type": "object" - }, - "NormalizedAuthorExport": { - "additionalProperties": true, - "description": "Author block after ``convert_exporter_message_to_dict``.", - "properties": { - "id": { - "default": 0, - "title": "Id", - "type": "integer" - }, - "username": { - "default": "unknown", - "title": "Username", - "type": "string" - }, - "global_name": { - "default": "", - "title": "Global Name", - "type": "string" - }, - "avatar_url": { - "default": "", - "title": "Avatar Url", - "type": "string" - }, - "bot": { - "default": false, - "title": "Bot", - "type": "boolean" - } - }, - "title": "NormalizedAuthorExport", - "type": "object" - }, - "NormalizedReaction": { - "additionalProperties": true, - "properties": { - "emoji": { - "minLength": 1, - "title": "Emoji", - "type": "string" - }, - "count": { - "minimum": 0, - "title": "Count", - "type": "integer" - } - }, - "required": [ - "emoji", - "count" - ], - "title": "NormalizedReaction", - "type": "object" - } - }, - "additionalProperties": false, - "description": "Post-converter message dict (API-shaped + canonical enrichment fields).", - "properties": { - "id": { - "title": "Id", - "type": "integer" - }, - "content": { - "default": "", - "title": "Content", - "type": "string" - }, - "created_at": { - "pattern": "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(?:\\.\\d+)?Z$", - "title": "Created At", - "type": "string" - }, - "edited_at": { - "anyOf": [ - { - "pattern": "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(?:\\.\\d+)?Z$", 
- "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Edited At" - }, - "message_type": { - "default": "Default", - "title": "Message Type", - "type": "string" - }, - "is_pinned": { - "default": false, - "title": "Is Pinned", - "type": "boolean" - }, - "author": { - "$ref": "#/normalized_discord_message/$defs/NormalizedAuthorExport" - }, - "attachments": { - "items": { - "$ref": "#/normalized_discord_message/$defs/NormalizedAttachment" - }, - "title": "Attachments", - "type": "array" - }, - "reactions": { - "items": { - "$ref": "#/normalized_discord_message/$defs/NormalizedReaction" - }, - "title": "Reactions", - "type": "array" - }, - "reference": { - "anyOf": [ - { - "additionalProperties": true, - "type": "object" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Reference" - }, - "occurred_at": { - "anyOf": [ - { - "pattern": "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(?:\\.\\d+)?Z$", - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Occurred At" - }, - "actor_id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Actor Id" - }, - "source_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Source Url" - } - }, - "required": [ - "id", - "created_at", - "author" - ], - "title": "NormalizedDiscordMessage", - "type": "object" - } -} diff --git a/discord_activity_tracker/scripts/__init__.py b/discord_activity_tracker/scripts/__init__.py deleted file mode 100644 index 5afb933f..00000000 --- a/discord_activity_tracker/scripts/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Utility entry points for discord_activity_tracker (e.g. 
schema export).""" diff --git a/discord_activity_tracker/scripts/write_staging_json_schema.py b/discord_activity_tracker/scripts/write_staging_json_schema.py deleted file mode 100644 index a7d39929..00000000 --- a/discord_activity_tracker/scripts/write_staging_json_schema.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Write ``discord_activity_tracker/schemas/discord_staging_v1.json``. - -Run from the repository root:: - - python -m discord_activity_tracker.scripts.write_staging_json_schema - -See ``docs/discord-tracker-schema.md`` (section *JSON Schema artifact vs runtime validation*). -""" - -from __future__ import annotations - -from discord_activity_tracker.staging_schema import write_staging_json_schema - - -def main() -> None: - path = write_staging_json_schema() - print(path) - - -if __name__ == "__main__": - main() diff --git a/discord_activity_tracker/services.py b/discord_activity_tracker/services.py deleted file mode 100644 index 934ed57b..00000000 --- a/discord_activity_tracker/services.py +++ /dev/null @@ -1,727 +0,0 @@ -"""Service layer for Discord Activity Tracker. - -All writes to ``discord_activity_tracker`` models go through this module (single -writer policy). Higher-level API tables and narrative docs live in -``docs/service_api/discord_activity_tracker.md``. - -Bulk ingest expects dicts shaped like the output of -``discord_activity_tracker.sync.messages._prepare_message_data`` or -``discord_activity_tracker.sync.chat_exporter.convert_exporter_message_to_dict`` -(normalized message payloads with ``author``, ``message_id``, ``reactions``, etc.). - -**CollectorFailureCategory:** These functions perform database I/O only; they do -not call Discord HTTP APIs and do not assign ``CollectorFailureCategory`` labels. -Collectors and sync code classify failures via ``core.errors.classify_failure``. -If a caller logs ORM failures through that helper, mapping follows ``core.errors``. 
- -This module does not intentionally raise ``ValueError`` for bad inputs; bulk -paths may skip individual rows and log warnings (see each function's side effects). -""" - -import logging -from datetime import datetime -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union - -from django.db import transaction -from django.db.models import Max, QuerySet -from django.utils import timezone as django_timezone - -from cppa_user_tracker.models import DiscordProfile -from cppa_user_tracker.services import get_or_create_discord_profile -from .api_schemas import ( - DiscordLivePreparedMessage, - DiscordLiveUserPayload, - DiscordReactionPayload, - parse_reaction, -) -from .models import ( - DiscordServer, - DiscordChannel, - DiscordMessage, - DiscordReaction, -) - -logger = logging.getLogger(__name__) - - -def get_or_create_discord_server( - server_id: int, server_name: str, icon_url: str = "" -) -> Tuple[DiscordServer, bool]: - """Get or create a Discord guild (server) row and refresh metadata when it already exists. - - Uses ``get_or_create`` on ``server_id``. When the row already exists, updates - name and icon only if they differ, via ``save(update_fields=...)``. - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. - - Args: - server_id: Discord snowflake for the guild. - server_name: Current guild name. - icon_url: CDN URL for the guild icon; may be empty. - - Returns: - ``(server, created)`` where ``created`` is ``True`` iff a new - ``DiscordServer`` row was inserted on this call (Django ``get_or_create`` - semantics). - - Raises: - None intentionally. Django ORM may raise database-related exceptions - (e.g. ``IntegrityError``, ``OperationalError``) under concurrency or DB faults. - - Side effects: - Reads/writes ``DiscordServer``. May emit ``logger.debug`` on update. 
- """ - server, created = DiscordServer.objects.get_or_create( - server_id=server_id, - defaults={ - "server_name": server_name, - "icon_url": icon_url, - }, - ) - - if not created: - # Update fields if changed - updated = False - if server.server_name != server_name: - server.server_name = server_name - updated = True - if server.icon_url != icon_url: - server.icon_url = icon_url - updated = True - - if updated: - server.save(update_fields=["server_name", "icon_url", "updated_at"]) - logger.debug(f"Updated server: {server_name}") - - return server, created - - -def get_or_create_discord_channel( - server: DiscordServer, - channel_id: int, - channel_name: str, - channel_type: str, - topic: str = "", - position: int = 0, - category_id: Optional[int] = None, - category_name: str = "", -) -> Tuple[DiscordChannel, bool]: - """Get or create a channel row and refresh fields when the row already exists. - - Uses ``get_or_create`` on ``channel_id``. Existing rows are updated when any - of name, type, topic, position, or category fields change (``category_name`` is - only applied when non-empty and different). - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. - - Args: - server: Parent ``DiscordServer`` (guild). - channel_id: Discord snowflake for the channel. - channel_name: Display name (e.g. without leading ``#``). - channel_type: Exporter/discord type string (e.g. ``GuildTextChat``). - topic: Channel topic text. - position: Sort order within the guild. - category_id: Parent category snowflake, or ``None`` if unknown/uncategorized. - category_name: Human-readable category name when known. - - Returns: - ``(channel, created)`` with Django ``get_or_create`` semantics for ``created``. - - Raises: - None intentionally. Django ORM may raise database-related exceptions. - - Side effects: - Reads/writes ``DiscordChannel``. May emit ``logger.debug`` on update. 
- """ - channel, created = DiscordChannel.objects.get_or_create( - channel_id=channel_id, - defaults={ - "server": server, - "channel_name": channel_name, - "channel_type": channel_type, - "topic": topic, - "position": position, - "category_id": category_id, - "category_name": category_name, - }, - ) - - if not created: - updated = False - if channel.channel_name != channel_name: - channel.channel_name = channel_name - updated = True - if channel.channel_type != channel_type: - channel.channel_type = channel_type - updated = True - if channel.topic != topic: - channel.topic = topic - updated = True - if channel.position != position: - channel.position = position - updated = True - if category_id is not None and channel.category_id != category_id: - channel.category_id = category_id - updated = True - if category_name and channel.category_name != category_name: - channel.category_name = category_name - updated = True - - if updated: - channel.save( - update_fields=[ - "channel_name", - "channel_type", - "topic", - "position", - "category_id", - "category_name", - "updated_at", - ] - ) - logger.debug(f"Updated channel: {channel_name}") - - return channel, created - - -def create_or_update_discord_message( - message_id: int, - channel: DiscordChannel, - author: DiscordProfile, - content: str, - message_created_at: datetime, - message_edited_at: Optional[datetime] = None, - reply_to_message_id: Optional[int] = None, - attachment_urls: Optional[list] = None, - message_type: str = "Default", - is_pinned: bool = False, -) -> Tuple[DiscordMessage, bool]: - """Create or update a single message by Discord ``message_id`` (upsert). - - Uses ``update_or_create`` so the row is keyed by ``message_id``; ``defaults`` - refresh channel, author, content, type, pins, timestamps, attachments, and - clears ``is_deleted``. ``has_attachments`` is derived from ``attachment_urls``. - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. 
- - Args: - message_id: Discord snowflake for the message. - channel: Channel the message belongs to. - author: ``DiscordProfile`` for the message author. - content: Message body text. - message_created_at: Original creation time (timezone-aware recommended). - message_edited_at: Last edit time, if any. - reply_to_message_id: Parent message snowflake for replies, or ``None``. - attachment_urls: List of attachment URLs; ``None`` is treated as empty. - message_type: Exporter/discord type string; empty coerces to the string ``Default``. - is_pinned: Whether the message is pinned in the channel. - - Returns: - ``(message, created)`` where ``created`` is ``True`` iff a new - ``DiscordMessage`` row was inserted (Django ``update_or_create`` semantics). - - Raises: - None intentionally. Django ORM may raise database-related exceptions. - - Side effects: - Reads/writes ``DiscordMessage``. - """ - if attachment_urls is None: - attachment_urls = [] - - message, created = DiscordMessage.objects.update_or_create( - message_id=message_id, - defaults={ - "channel": channel, - "author": author, - "content": content, - "message_type": message_type or "Default", - "is_pinned": is_pinned, - "message_created_at": message_created_at, - "message_edited_at": message_edited_at, - "reply_to_message_id": reply_to_message_id, - "has_attachments": len(attachment_urls) > 0, - "attachment_urls": attachment_urls, - "is_deleted": False, - }, - ) - - return message, created - - -def mark_message_deleted( - message: DiscordMessage, deleted_at: Optional[datetime] = None -) -> DiscordMessage: - """Soft-delete a message: set ``is_deleted`` and ``deleted_at``. - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. - - Args: - message: Row to mark deleted (mutated in memory and saved). - deleted_at: Deletion timestamp; defaults to ``django.utils.timezone.now()``. - - Returns: - The same ``DiscordMessage`` instance after ``save(update_fields=...)``. 
- - Raises: - None intentionally. Django ORM may raise database-related exceptions. - - Side effects: - Updates ``DiscordMessage.is_deleted``, ``deleted_at``, ``updated_at``. - Emits ``logger.debug``. - """ - if deleted_at is None: - deleted_at = django_timezone.now() - - message.is_deleted = True - message.deleted_at = deleted_at - message.save(update_fields=["is_deleted", "deleted_at", "updated_at"]) - - logger.debug(f"Marked message {message.message_id} as deleted") - return message - - -def add_or_update_reaction( - message: DiscordMessage, emoji: str, count: int -) -> Tuple[DiscordReaction, bool]: - """Upsert one reaction row per (message, emoji) with the given reaction count. - - Uses ``update_or_create`` on the unique pair ``(message, emoji)``. - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. - - Args: - message: Message the reaction is on. - emoji: Emoji string or custom emoji representation. - count: Aggregated reaction count from the source payload. - - Returns: - ``(reaction, created)`` with Django ``update_or_create`` semantics for ``created``. - - Raises: - None intentionally. Django ORM may raise database-related exceptions. - - Side effects: - Reads/writes ``DiscordReaction``. - """ - reaction, created = DiscordReaction.objects.update_or_create( - message=message, emoji=emoji, defaults={"count": count} - ) - - return reaction, created - - -def get_channel_latest_message_at(channel: DiscordChannel) -> Optional[datetime]: - """Return the latest ``message_created_at`` among non-deleted messages in a channel. - - Read-only aggregate over ``DiscordMessage``; no writes. - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. - - Args: - channel: Channel to scan. - - Returns: - Maximum ``message_created_at`` for rows with ``is_deleted=False``, or - ``None`` if there are no such messages. - - Raises: - None intentionally. Django ORM may raise database-related exceptions. 
- - Side effects: - None (read-only query). - """ - row = DiscordMessage.objects.filter(channel=channel, is_deleted=False).aggregate( - m=Max("message_created_at") - ) - return row["m"] - - -def queryset_channels_with_recent_messages( - server: DiscordServer, - cutoff: datetime, - channel_ids: Optional[List[int]] = None, -) -> QuerySet[DiscordChannel]: - """Channels on ``server`` with at least one non-deleted message at or after ``cutoff``. - - Compares ``message_created_at`` to ``cutoff``; use timezone-aware datetimes for - predictable UTC behavior. When ``channel_ids`` is set, restricts to those - Discord ``channel_id`` values (snowflakes), not internal PKs. - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. - - Args: - server: Guild whose channels are considered. - cutoff: Inclusive lower bound on ``DiscordMessage.message_created_at``. - channel_ids: Optional allowlist of Discord channel snowflakes. - - Returns: - ``QuerySet`` of ``DiscordChannel`` ordered by ``position``, ``channel_name``. - - Raises: - None intentionally. Django ORM may raise database-related exceptions. - - Side effects: - None (read-only query). - """ - pks = ( - DiscordMessage.objects.filter( - channel__server=server, - message_created_at__gte=cutoff, - is_deleted=False, - ) - .values_list("channel_id", flat=True) - .distinct() - ) - qs = DiscordChannel.objects.filter(server=server, pk__in=pks).order_by( - "position", "channel_name" - ) - if channel_ids: - qs = qs.filter(channel_id__in=channel_ids) - return qs - - -def get_active_channels( - server: DiscordServer, - days: int = 30, - channel_ids: Optional[List[int]] = None, -) -> QuerySet[DiscordChannel]: - """Same as ``queryset_channels_with_recent_messages`` with ``cutoff = now - days``. - - ``days`` is calendar-style span from ``django.utils.timezone.now()`` using - ``datetime.timedelta``. - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. 
- - Args: - server: Guild whose channels are considered. - days: Lookback window in days from the current time. - channel_ids: Optional allowlist of Discord channel snowflakes. - - Returns: - ``QuerySet`` of ``DiscordChannel`` with recent activity. - - Raises: - None intentionally. Django ORM may raise database-related exceptions. - - Side effects: - None (read-only query; delegates to ``queryset_channels_with_recent_messages``). - """ - from datetime import timedelta - - cutoff = django_timezone.now() - timedelta(days=days) - return queryset_channels_with_recent_messages(server, cutoff, channel_ids) - - -# --------------------------------------------------------------------------- -# Bulk operations (for high-throughput message sync) -# --------------------------------------------------------------------------- - - -def bulk_upsert_discord_users( - user_data_list: List[Union[DiscordLiveUserPayload, Dict[str, Any]]], -) -> Dict[int, DiscordProfile]: - """Upsert author profiles for a batch of messages. - - Deduplicates by ``user_id`` (last dict wins). Existing ``DiscordProfile`` rows - are fetched in one query and updated in Python when fields differ; missing - users are created via ``get_or_create_discord_profile`` (no - ``bulk_create(update_conflicts=True)`` because ``DiscordProfile`` uses MTI). - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. - - Args: - user_data_list: Dicts with at least ``user_id`` and ``username``; optional - ``display_name``, ``avatar_url``, ``is_bot`` (see sync normalizers). - - Returns: - Map ``discord_user_id -> DiscordProfile`` including database PKs on profiles. - - Raises: - None intentionally. Invalid payloads raise - :class:`~discord_activity_tracker.api_schemas.DiscordLiveSyncValidationError`. - Django ORM may raise database-related exceptions. 
- - Side effects: - Reads/writes ``cppa_user_tracker.DiscordProfile`` via queries and - ``get_or_create_discord_profile``; may call ``profile.save()`` without - ``update_fields`` when updating existing rows. - """ - if not user_data_list: - return {} - - from .api_schemas import parse_live_user - - normalized: list[DiscordLiveUserPayload] = [] - for d in user_data_list: - if isinstance(d, dict): - normalized.append(parse_live_user(d)) - else: - normalized.append(d) - - # Deduplicate by user_id (last-seen wins) - unique = {d.user_id: d for d in normalized} - - # Fetch existing profiles in one query - existing = { - p.discord_user_id: p - for p in DiscordProfile.objects.filter(discord_user_id__in=list(unique.keys())) - } - - result = {} - for uid, d in unique.items(): - if uid in existing: - profile = existing[uid] - username_val = d.username or "" - display_name_val = d.display_name or "" - avatar_url_val = d.avatar_url or "" - updated = False - if username_val and profile.username != username_val: - profile.username = username_val - updated = True - if display_name_val and profile.display_name != display_name_val: - profile.display_name = display_name_val - updated = True - if avatar_url_val and profile.avatar_url != avatar_url_val: - profile.avatar_url = avatar_url_val - updated = True - if profile.is_bot != d.is_bot: - profile.is_bot = d.is_bot - updated = True - if updated: - profile.save() - result[uid] = profile - else: - profile, _ = get_or_create_discord_profile( - discord_user_id=uid, - username=d.username, - display_name=d.display_name, - avatar_url=d.avatar_url, - is_bot=d.is_bot, - ) - result[uid] = profile - - return result - - -def bulk_upsert_discord_messages( - message_data_list: Sequence[Union[DiscordLivePreparedMessage, Dict[str, Any]]], - channel: DiscordChannel, - user_map: Dict[int, DiscordProfile], -) -> Dict[int, DiscordMessage]: - """Bulk upsert messages for one channel using ``bulk_create(update_conflicts=True)``. 
- - Skips a message (with ``logger.warning``) when ``user_map`` has no profile for - the author's ``user_id`` (``d["author"]["user_id"]``). Skips building rows when every message is skipped; - then returns an empty dict. - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. - - Args: - message_data_list: Normalized message dicts (``message_id``, ``author``, etc.). - channel: Target channel for all rows. - user_map: ``discord_user_id -> DiscordProfile`` from ``bulk_upsert_discord_users``. - - Returns: - Map ``message_id -> DiscordMessage`` with PKs loaded (``id``, ``message_id`` only). - - Raises: - None intentionally. Invalid payloads raise - :class:`~discord_activity_tracker.api_schemas.DiscordLiveSyncValidationError`. - Django ORM may raise database-related exceptions. - - Side effects: - Writes ``DiscordMessage`` via ``bulk_create``. May emit ``logger.warning``. - """ - if not message_data_list: - return {} - - from .api_schemas import parse_live_message - - now = django_timezone.now() - instances = [] - for raw in message_data_list: - d = parse_live_message(raw) if isinstance(raw, dict) else raw - author = user_map.get(d.author.user_id) - if author is None: - logger.warning("Skipping message %s: author not in user_map", d.message_id) - continue - attachments = d.attachment_urls or [] - instances.append( - DiscordMessage( - message_id=d.message_id, - channel=channel, - author=author, - content=d.content or "", - message_type=d.message_type or "Default", - is_pinned=bool(d.is_pinned), - message_created_at=d.message_created_at, - message_edited_at=d.message_edited_at, - reply_to_message_id=d.reply_to_message_id, - has_attachments=len(attachments) > 0, - attachment_urls=attachments, - is_deleted=False, - created_at=now, - updated_at=now, - ) - ) - - if not instances: - return {} - - DiscordMessage.objects.bulk_create( - instances, - update_conflicts=True, - unique_fields=["message_id"], - update_fields=[ - "channel", - "author", - 
"content", - "message_type", - "is_pinned", - "message_created_at", - "message_edited_at", - "reply_to_message_id", - "has_attachments", - "attachment_urls", - "is_deleted", - "updated_at", - ], - ) - - msg_ids = [inst.message_id for inst in instances] - db_msgs = DiscordMessage.objects.filter(message_id__in=msg_ids).only( - "id", "message_id" - ) - return {m.message_id: m for m in db_msgs} - - -def bulk_upsert_discord_reactions( - reaction_data_list: Sequence[Union[DiscordReactionPayload, Dict[str, Any]]], - message_map: Dict[int, DiscordMessage], -) -> None: - """Bulk upsert reactions using ``bulk_create(update_conflicts=True)``. - - Entries whose ``discord_message_id`` is missing from ``message_map`` are skipped - silently (no log). Duplicate (message PK, emoji) pairs keep the **last** payload. - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. - - Args: - reaction_data_list: Dicts with ``discord_message_id``, ``emoji``, optional ``count``. - message_map: ``message_id -> DiscordMessage`` from ``bulk_upsert_discord_messages``. - - Returns: - None - - Raises: - None intentionally. Invalid payloads raise - :class:`~discord_activity_tracker.api_schemas.DiscordLiveSyncValidationError`. - Django ORM may raise database-related exceptions. - - Side effects: - Writes ``DiscordReaction``. 
- """ - if not reaction_data_list: - return - - now = django_timezone.now() - # Deduplicate by (message_id, emoji) — keep last - seen = {} - for raw in reaction_data_list: - d = parse_reaction(raw) if isinstance(raw, dict) else raw - msg = message_map.get(d.discord_message_id) - if msg is None: - continue - key = (msg.pk, d.emoji) - seen[key] = DiscordReaction( - message=msg, - emoji=d.emoji, - count=d.count if d.count is not None else 1, - created_at=now, - updated_at=now, - ) - - if not seen: - return - - DiscordReaction.objects.bulk_create( - list(seen.values()), - update_conflicts=True, - unique_fields=["message", "emoji"], - update_fields=["count", "updated_at"], - ) - - -def bulk_process_message_batch( - message_data_list: List[Union[DiscordLivePreparedMessage, Dict[str, Any]]], - channel: DiscordChannel, -) -> int: - """Run user upsert, message upsert, and reaction upsert inside one DB transaction. - - Return value is **always** ``len(message_data_list)`` when the input list is - non-empty, even if some messages were skipped inside ``bulk_upsert_discord_messages`` - (skipped rows do not reduce the returned count). - - Does not perform Discord HTTP calls; does not emit ``CollectorFailureCategory``. - - Args: - message_data_list: Batch of normalized message dicts for one channel. - channel: Target ``DiscordChannel``. - - Returns: - ``0`` if ``message_data_list`` is empty; otherwise ``len(message_data_list)``. - - Raises: - None intentionally. Invalid payloads raise - :class:`~discord_activity_tracker.api_schemas.DiscordLiveSyncValidationError`. - Django ORM may raise database-related exceptions; on failure the whole transaction rolls back. - - Side effects: - One ``transaction.atomic()`` block: writes profiles (via - ``bulk_upsert_discord_users``), messages, and reactions. See those functions - for logging and skip behavior. 
- """ - if not message_data_list: - return 0 - - with transaction.atomic(): - # Phase 1: users - from .api_schemas import parse_live_message - - prepared: list[DiscordLivePreparedMessage] = [ - parse_live_message(m) if isinstance(m, dict) else m - for m in message_data_list - ] - user_data_by_id: dict[int, DiscordLiveUserPayload] = {} - for msg in prepared: - user_data_by_id[msg.author.user_id] = msg.author - user_map = bulk_upsert_discord_users(list(user_data_by_id.values())) - - # Phase 2: messages - message_map = bulk_upsert_discord_messages(prepared, channel, user_map) - - # Phase 3: reactions - reaction_data: list[DiscordReactionPayload] = [] - for msg in prepared: - for reaction in msg.reactions: - if isinstance(reaction, dict): - emoji = reaction.get("emoji") - count = reaction.get("count", 0) - else: - emoji = getattr(reaction, "emoji", None) - count = getattr(reaction, "count", 0) - if emoji: - reaction_data.append( - parse_reaction( - { - "discord_message_id": msg.message_id, - "emoji": emoji, - "count": count, - } - ) - ) - if reaction_data: - bulk_upsert_discord_reactions(reaction_data, message_map) - - return len(message_data_list) diff --git a/discord_activity_tracker/staging_schema.py b/discord_activity_tracker/staging_schema.py deleted file mode 100644 index adc37fc5..00000000 --- a/discord_activity_tracker/staging_schema.py +++ /dev/null @@ -1,195 +0,0 @@ -"""Pydantic validation for Discord staging / ingestion payloads. - -Runtime validation uses the models in this module only. - -Reviewers who prefer raw JSON Schema may read the optional committed copy at -``discord_activity_tracker/schemas/discord_staging_v1.json`` (see generation -notes in ``docs/discord-tracker-schema.md``, section **JSON Schema artifact vs -runtime validation**). That file can drift if models change; regenerate it with -``python -m discord_activity_tracker.scripts.write_staging_json_schema`` (see -script docstring) or by calling ``write_staging_json_schema`` from a REPL. 
- -Human-readable field definitions and cross-tracker alignment notes live in -``docs/discord-tracker-schema.md``. -""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Annotated, Any, NoReturn, Union - -from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator - -from core.utils.datetime_parsing import CANONICAL_INSTANT_UTC_Z_PATTERN - -NormalizedMessageInstantUtcZ = Annotated[ - str, Field(pattern=CANONICAL_INSTANT_UTC_Z_PATTERN) -] - - -class StagingValidationError(ValueError): - """Discord staging payload failed Pydantic validation (envelope or normalized message).""" - - -class DiscordExporterGuild(BaseModel): - """Guild object inside a DiscordChatExporter JSON file.""" - - model_config = ConfigDict(extra="allow", populate_by_name=True) - - id: Union[str, int, None] = None - name: str = "" - iconUrl: str | None = Field(default=None, validation_alias="iconUrl") - - -class DiscordExporterChannel(BaseModel): - """Channel object inside a DiscordChatExporter JSON file.""" - - model_config = ConfigDict(extra="allow", populate_by_name=True) - - id: Union[str, int, None] = None - name: str = "" - type: str | None = None - topic: str | None = None - category: str | None = None - categoryId: Union[str, int, None] = Field( - default=None, validation_alias="categoryId" - ) - - -class DiscordChatExporterEnvelope(BaseModel): - """Top-level shape of a DiscordChatExporter ``.json`` export.""" - - model_config = ConfigDict(extra="allow") - - guild: DiscordExporterGuild = Field(default_factory=DiscordExporterGuild) - channel: DiscordExporterChannel = Field(default_factory=DiscordExporterChannel) - messages: list[Any] = Field(default_factory=list) - - @field_validator("messages", mode="before") - @classmethod - def _messages_must_be_list(cls, v: Any) -> Any: - if v is None: - return [] - if not isinstance(v, list): - raise ValueError("messages must be a JSON array") - return v - - -class 
NormalizedAttachment(BaseModel): - model_config = ConfigDict(extra="allow") - - url: str | None = None - - -class NormalizedAuthorExport(BaseModel): - """Author block after ``convert_exporter_message_to_dict``.""" - - model_config = ConfigDict(extra="allow") - - id: int = 0 - username: str = "unknown" - global_name: str = "" - avatar_url: str = "" - bot: bool = False - - -class NormalizedReaction(BaseModel): - model_config = ConfigDict(extra="allow") - - emoji: str = Field(min_length=1) - count: int = Field(ge=0) - - -class NormalizedDiscordMessage(BaseModel): - """Post-converter message dict (API-shaped + canonical enrichment fields).""" - - model_config = ConfigDict(extra="forbid") - - id: int - content: str = "" - created_at: NormalizedMessageInstantUtcZ - edited_at: NormalizedMessageInstantUtcZ | None = None - message_type: str = "Default" - is_pinned: bool = False - author: NormalizedAuthorExport - attachments: list[NormalizedAttachment] = Field(default_factory=list) - reactions: list[NormalizedReaction] = Field(default_factory=list) - reference: dict[str, Any] | None = None - occurred_at: NormalizedMessageInstantUtcZ | None = None - actor_id: str | None = None - source_url: str | None = None - - @field_validator("edited_at", "occurred_at", mode="before") - @classmethod - def _blank_optional_timestamp_to_none(cls, v: Any) -> Any: - if v is None: - return None - if isinstance(v, str) and not v.strip(): - return None - return v - - -def _validation_error(prefix: str, err: ValidationError) -> NoReturn: - detail = err.errors()[:5] - msg = f"{prefix}: " + "; ".join( - f"{e.get('loc', ())}: {e.get('msg', '')}" for e in detail - ) - if len(err.errors()) > 5: - msg += f" … ({len(err.errors())} errors total)" - raise StagingValidationError(msg) from err - - -def validate_envelope( - data: dict[str, Any], - *, - source: str | None = None, -) -> DiscordChatExporterEnvelope: - """Validate parsed DiscordChatExporter file contents. 
Raises ``StagingValidationError``.""" - prefix = f"Invalid Discord export envelope{f' ({source})' if source else ''}" - try: - return DiscordChatExporterEnvelope.model_validate(data) - except ValidationError as e: - _validation_error(prefix, e) - - -def validate_normalized_message( - obj: dict[str, Any], - *, - source: str | None = None, -) -> NormalizedDiscordMessage: - """Validate one normalized message dict. Raises ``StagingValidationError``.""" - prefix = f"Invalid normalized Discord message{f' ({source})' if source else ''}" - try: - return NormalizedDiscordMessage.model_validate(obj) - except ValidationError as e: - _validation_error(prefix, e) - - -def build_staging_json_schema_bundle() -> dict[str, Any]: - """Build a JSON-serializable object holding JSON Schemas for reviewer use.""" - return { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "discord_staging_v1", - "description": ( - "Optional JSON Schema bundle for Discord staging data. Runtime " - "validation uses Pydantic models in discord_activity_tracker/staging_schema.py." 
- ), - "discord_chat_exporter_envelope": DiscordChatExporterEnvelope.model_json_schema( - ref_template="#/discord_chat_exporter_envelope/$defs/{model}" - ), - "normalized_discord_message": NormalizedDiscordMessage.model_json_schema( - ref_template="#/normalized_discord_message/$defs/{model}" - ), - } - - -def write_staging_json_schema(path: Path | None = None) -> Path: - """Write ``discord_staging_v1.json`` next to this package's ``schemas/`` dir.""" - target = path or ( - Path(__file__).resolve().parent / "schemas" / "discord_staging_v1.json" - ) - target.parent.mkdir(parents=True, exist_ok=True) - bundle = build_staging_json_schema_bundle() - target.write_text(json.dumps(bundle, indent=2) + "\n", encoding="utf-8") - return target diff --git a/discord_activity_tracker/sync/__init__.py b/discord_activity_tracker/sync/__init__.py deleted file mode 100644 index 774de713..00000000 --- a/discord_activity_tracker/sync/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Discord ingest and export helpers (not the DB service layer). - -- ``sync.chat_exporter`` — DiscordChatExporter CLI integration and JSON parsing. -- ``sync.messages`` — Normalized message batches and ``discord.py`` client helpers. -- ``sync.client`` — ``DiscordSyncClient`` wrapper. -- ``sync.exporter_window`` — DB-backed lower bounds for incremental exports. -- ``sync.export`` — Markdown export from ORM data. 
-""" diff --git a/discord_activity_tracker/sync/chat_exporter.py b/discord_activity_tracker/sync/chat_exporter.py deleted file mode 100644 index bfa83409..00000000 --- a/discord_activity_tracker/sync/chat_exporter.py +++ /dev/null @@ -1,897 +0,0 @@ -"""DiscordChatExporter CLI wrapper for configured exporter credentials.""" - -from __future__ import annotations - -import json -import logging -import os -import platform -import re -import shutil -import subprocess -import sys -from dataclasses import dataclass -from datetime import datetime, timezone -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Sequence - -from core.utils.datetime_parsing import ( - CANONICAL_INSTANT_UTC_Z_PATTERN, - format_instant_iso_z, -) - -from discord_activity_tracker.protocol_impl import DiscordActivityRecord - -from .exporter_window import iter_channel_export_days, resolve_channel_export_after -from .utils import format_discord_url -from ..workspace import get_workspace_root - -_SAFE_INT_MAX = 2**63 - 1 # max safe BigIntegerField value - -_INSTANT_Z_RE = re.compile(CANONICAL_INSTANT_UTC_Z_PATTERN) - - -def _coerce_exporter_timestamp(raw: Any, *, optional: bool = False) -> str | None: - """Normalize DiscordChatExporter timestamp strings toward ISO 8601 UTC ``Z``. - - Uses :func:`format_instant_iso_z`. When *optional* is True, missing or blank - values return ``None`` (e.g. ``timestampEdited``). Otherwise empty input is - normalized via ``format_instant_iso_z`` like any other string (typically ``""``). 
- """ - if optional: - if raw is None or (isinstance(raw, str) and not str(raw).strip()): - return None - return format_instant_iso_z(raw if raw is not None else "") - - -def _safe_int(value: object, default: int = 0) -> int: - """Convert a snowflake string or int to int; clamp to BigIntegerField range.""" - try: - result = int(value) # type: ignore[arg-type] - return result if 0 <= result <= _SAFE_INT_MAX else default - except (TypeError, ValueError): - return default - - -logger = logging.getLogger(__name__) - -# Official releases (GUI + CLI); place the CLI binary locally or set DISCORD_CHAT_EXPORTER_CLI. -DISCORD_CHAT_EXPORTER_RELEASES_URL = ( - "https://github.com/Tyrrrz/DiscordChatExporter/releases/latest" -) - - -class DiscordChatExporterError(Exception): - pass - - -@dataclass(frozen=True) -class ChannelDayExport: - """One DiscordChatExporter JSON file for a channel and UTC calendar day.""" - - path: Path - day_str: str - channel_id: int - - -def _default_cli_basename() -> str: - """DiscordChatExporter ships ``.exe`` on Windows and extensionless ``DiscordChatExporter.Cli`` on macOS/Linux.""" - if sys.platform == "win32": - return "DiscordChatExporter.Cli.exe" - return "DiscordChatExporter.Cli" - - -def _get_parallel_workers() -> int: - """DiscordChatExporter ``--parallel``; clamped to reduce OOM (exit -9 / SIGKILL).""" - from django.conf import settings - - raw = int(getattr(settings, "DISCORD_CHAT_EXPORTER_PARALLEL", 1) or 1) - return max(1, min(16, raw)) - - -def _get_cli_path() -> Path: - """Resolve CLI path at call time. - - Prefer ``DISCORD_CHAT_EXPORTER_CLI`` from Django settings (``.env``), otherwise - ``workspace/discord_activity_tracker/script/`` plus the platform default binary name. 
- """ - from django.conf import settings - - configured = getattr(settings, "DISCORD_CHAT_EXPORTER_CLI", None) - if configured: - return Path(configured).expanduser().resolve() - return get_workspace_root() / "script" / _default_cli_basename() - - -def _file_command_brief_description(cli_path: Path) -> Optional[str]: - """Return ``file -b`` output for *cli_path*, or None if unavailable.""" - file_bin = shutil.which("file") - if not file_bin: - return None - try: - proc = subprocess.run( - [file_bin, "-b", str(cli_path)], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - timeout=10, - check=False, - ) - except (OSError, ValueError, subprocess.SubprocessError): - return None - if proc.returncode != 0: - return None - return (proc.stdout or "").strip() or None - - -def validate_discord_chat_exporter_cli_architecture(cli_path: Path) -> None: - """Fail fast if the CLI binary ABI clearly mismatches this machine (e.g. Intel build on Apple Silicon). - - Uses ``file(1)`` on Unix when present. Universal binaries containing both slices pass. - """ - if sys.platform == "win32": - return - - host = platform.machine().lower() - host_is_arm = host in ("arm64", "aarch64") - host_is_intel = host in ("x86_64", "amd64", "i386", "i686") - - desc = _file_command_brief_description(cli_path) - if not desc: - logger.debug( - "DiscordChatExporter arch check skipped (no `file` output for %s)", - cli_path, - ) - return - - d = desc.lower() - has_arm = "arm64" in d or "aarch64" in d - has_intel = "x86_64" in d or "i386" in d or "i686" in d or "amd64" in d - - if host_is_arm and has_intel and not has_arm: - raise DiscordChatExporterError( - f"DiscordChatExporter binary is Intel-only ({desc!r}) but this host is " - f"{platform.machine()} (use the osx-arm64 / linux-arm64 build from " - f"{DISCORD_CHAT_EXPORTER_RELEASES_URL})." 
- ) - if host_is_intel and has_arm and not has_intel: - raise DiscordChatExporterError( - f"DiscordChatExporter binary is arm64-only ({desc!r}) but this host is " - f"{platform.machine()} (use the osx-x64 / linux-x64 build from " - f"{DISCORD_CHAT_EXPORTER_RELEASES_URL})." - ) - - logger.info( - "DiscordChatExporter CLI arch check OK (host=%s, file(1)=%s)", - platform.machine(), - desc, - ) - - -def _utc_wall_clock_for_cli(dt: datetime) -> datetime: - """Normalize to UTC for ``--after`` / ``--before`` strings (DiscordChatExporter uses UTC wall clock).""" - if dt.tzinfo is None: - return dt.replace(tzinfo=timezone.utc) - return dt.astimezone(timezone.utc) - - -def _get_include_voice_channels() -> bool: - from django.conf import settings - - return bool(getattr(settings, "DISCORD_CHAT_EXPORTER_INCLUDE_VC", False)) - - -def _get_sequential_export() -> bool: - from django.conf import settings - - return bool(getattr(settings, "DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT", False)) - - -def _cli_argv_head(cli_path: Path) -> list[str]: - """First argv token(s) for the exporter: native binary, or ``dotnet`` + ``.dll``.""" - from django.conf import settings - - raw = getattr(settings, "DISCORD_CHAT_EXPORTER_DOTNET_DLL", None) - if raw and str(raw).strip(): - dll = Path(str(raw).strip()).expanduser().resolve() - if not dll.exists(): - raise DiscordChatExporterError( - f"DISCORD_CHAT_EXPORTER_DOTNET_DLL points to a missing file: {dll}" - ) - dotnet_raw = getattr(settings, "DISCORD_CHAT_EXPORTER_DOTNET", None) - dotnet_bin = ( - (str(dotnet_raw).strip() if dotnet_raw else "") - or shutil.which("dotnet") - or "" - ) - if not dotnet_bin: - raise DiscordChatExporterError( - "DISCORD_CHAT_EXPORTER_DOTNET_DLL is set but `dotnet` was not found. " - "Install the .NET runtime (e.g. `brew install dotnet`) or set " - "DISCORD_CHAT_EXPORTER_DOTNET to the full path of the `dotnet` executable." 
- ) - resolved_dotnet = Path(dotnet_bin).expanduser().resolve() - return [str(resolved_dotnet), str(dll)] - return [str(cli_path)] - - -def _maybe_macos_clear_quarantine(bundle_dir: Path) -> None: - """Optionally strip extended attributes (e.g. quarantine) from the CLI bundle directory.""" - from django.conf import settings - - if sys.platform != "darwin": - return - if not getattr(settings, "DISCORD_CHAT_EXPORTER_MACOS_CLEAR_QUARANTINE", False): - return - xattr_bin = shutil.which("xattr") - if not xattr_bin: - logger.warning( - "DISCORD_CHAT_EXPORTER_MACOS_CLEAR_QUARANTINE is true but `xattr` was not found" - ) - return - try: - subprocess.run( - [xattr_bin, "-cr", str(bundle_dir)], - stdin=subprocess.DEVNULL, - capture_output=True, - text=True, - timeout=120, - check=False, - ) - except (OSError, subprocess.SubprocessError) as e: - logger.warning("xattr -cr failed for %s: %s", bundle_dir, e) - return - logger.info( - "Ran `xattr -cr` on %s (DISCORD_CHAT_EXPORTER_MACOS_CLEAR_QUARANTINE)", - bundle_dir, - ) - - -def _stderr_macos_hostfxr_hints(stderr: str, *, cli_path: Path | None) -> str: - if sys.platform != "darwin": - return "" - s = stderr.lower() - if not ( - "libhostfxr" in s - or "library load disallowed by system policy" in s - or ("not valid for use in process" in s and "code signature" in s) - ): - return "" - script_dir = cli_path.parent if cli_path is not None else Path(".") - return ( - "\nmacOS blocked the bundled .NET host library (libhostfxr). Typical fixes:\n" - f" • Clear quarantine: xattr -cr {script_dir}\n" - " • If the project lives on an external disk (/Volumes/...), copy " - "workspace/discord_activity_tracker/script/ to your internal SSD and set " - "DISCORD_CHAT_EXPORTER_CLI (and optionally DISCORD_CHAT_EXPORTER_DOTNET_DLL) to that copy.\n" - " • Or install .NET (`brew install dotnet`) and set DISCORD_CHAT_EXPORTER_DOTNET_DLL to the " - "full path of DiscordChatExporter.Cli.dll next to the CLI (runs via system `dotnet`)." 
- ) - - -def _exporter_subprocess_env() -> dict[str, str]: - result = { - k: v - for k, v in os.environ.items() - if k.lower() not in ("http_proxy", "https_proxy") - } - # Reduce .NET runtime memory usage to avoid macOS jetsam SIGKILL (-9). - # These are setdefault so any value already in the environment takes precedence. - # - # DOTNET_GCConserveMemory=9 – most aggressive GC compaction (range 0-9). - # DOTNET_GCHighMemPercent=50 – start aggressive GC when machine RAM hits 50 % - # (default 90 %; lower = earlier pressure-relief). - # DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 – skip loading the ICU data library - # (saves ~50-150 MB at startup; OK for plain ASCII - # channel names used by DiscordChatExporter output). - result.setdefault("DOTNET_GCConserveMemory", "9") - result.setdefault("DOTNET_GCHighMemPercent", "50") - result.setdefault("DOTNET_SYSTEM_GLOBALIZATION_INVARIANT", "1") - return result - - -def _cli_bool(value: bool) -> str: - return "True" if value else "False" - - -def _log_redacted_command(cmd: Sequence[str]) -> None: - safe: List[str] = [] - i = 0 - while i < len(cmd): - if i + 1 < len(cmd) and cmd[i] == "--token": - safe.extend(["--token", ""]) - i += 2 - else: - safe.append(str(cmd[i])) - i += 1 - logger.debug("Command: %s", " ".join(safe)) - - -def _sigkill_suffix() -> str: - return ( - " (SIGKILL: often out-of-memory or macOS memory pressure; " - "set DISCORD_CHAT_EXPORTER_PARALLEL=1, DISCORD_CHAT_EXPORTER_INCLUDE_VC=false; " - "try DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT=true for one channel per CLI process; " - "with sequential export, set DISCORD_CHANNEL_IDS (or --channels) to skip the " - "heavy `channels` listing step; " - "the subprocess already sets DOTNET_GCConserveMemory=9 / DOTNET_GCHighMemPercent=50 / " - "DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 by default — override in your shell env if needed; " - "also run: xattr -d com.apple.quarantine if the binary was downloaded)" - ) - - -def _raise_cli_failure( - *, - op: str, - returncode: 
int | None, - stderr: str, - cli_path: Path | None = None, -) -> None: - error_msg = f"DiscordChatExporter {op} failed with exit code {returncode}" - if returncode == -9: - error_msg += _sigkill_suffix() - if stderr.strip(): - error_msg += f"\nError: {stderr.strip()}" - error_msg += _stderr_macos_hostfxr_hints(stderr, cli_path=cli_path) - logger.error(error_msg) - raise DiscordChatExporterError(error_msg) - - -def parse_channels_command_stdout(text: str) -> List[int]: - """Parse ``channels`` subcommand stdout into channel snowflake IDs (excludes thread lines).""" - ids: List[int] = [] - for raw in text.splitlines(): - line = raw.strip() - if not line or line.startswith("*"): - continue - m = re.match(r"^(\d+)\s+\|", line) - if m: - ids.append(int(m.group(1))) - return ids - - -def _run_channels_listing( - cli_path: Path, - user_token: str, - guild_id: int, - include_threads: str, -) -> List[int]: - cmd = _cli_argv_head(cli_path) + [ - "channels", - "--token", - user_token, - "--guild", - str(guild_id), - "--include-vc", - _cli_bool(_get_include_voice_channels()), - "--include-threads", - include_threads, - ] - _log_redacted_command(cmd) - proc = subprocess.run( - cmd, - stdin=subprocess.DEVNULL, - capture_output=True, - text=True, - encoding="utf-8", - errors="replace", - env=_exporter_subprocess_env(), - check=False, - ) - stderr = proc.stderr or "" - if proc.returncode != 0: - _raise_cli_failure( - op="channels", - returncode=proc.returncode, - stderr=stderr, - cli_path=cli_path, - ) - return parse_channels_command_stdout(proc.stdout or "") - - -def _run_exporter_streaming(cmd: list[str], *, cli_path: Path) -> None: - _log_redacted_command(cmd) - try: - process = subprocess.Popen( - cmd, - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - encoding="utf-8", - errors="replace", - env=_exporter_subprocess_env(), - ) - for line in process.stdout or []: - line = line.rstrip() - if line: - logger.info("[CLI] %s", line) - 
process.wait() - stderr = process.stderr.read() if process.stderr else "" - if process.returncode != 0: - _raise_cli_failure( - op="export", - returncode=process.returncode, - stderr=stderr, - cli_path=cli_path, - ) - except DiscordChatExporterError: - raise - except OSError as e: - if getattr(e, "errno", None) == 8 and sys.platform != "win32": - raise DiscordChatExporterError( - f"Cannot run {cli_path} on {sys.platform} (wrong executable format). " - "Use the macOS or Linux build from " - f"{DISCORD_CHAT_EXPORTER_RELEASES_URL} " - f"(`{_default_cli_basename()}`), or set DISCORD_CHAT_EXPORTER_CLI to that binary." - ) from e - logger.exception("Unexpected error running DiscordChatExporter: %s", e) - raise DiscordChatExporterError(f"Unexpected error: {e}") from e - except Exception as e: - logger.exception("Unexpected error running DiscordChatExporter: %s", e) - raise DiscordChatExporterError(f"Unexpected error: {e}") from e - - -def _append_export_window( - cmd: list[str], - after_date: Optional[datetime], - before_date: Optional[datetime], -) -> None: - if after_date: - after_utc = _utc_wall_clock_for_cli(after_date) - after_str = after_utc.strftime("%Y-%m-%d %H:%M:%S") - cmd.extend(["--after", after_str]) - logger.info( - "Incremental sync: exporting messages after %s UTC (DiscordChatExporter --after)", - after_str, - ) - if before_date: - before_utc = _utc_wall_clock_for_cli(before_date) - before_str = before_utc.strftime("%Y-%m-%d %H:%M:%S") - cmd.extend(["--before", before_str]) - logger.info( - "Exporting messages before %s UTC (DiscordChatExporter --before)", - before_str, - ) - - -def _is_empty_channel_export_error(message: str) -> bool: - """True when DiscordChatExporter reports no messages for the requested window.""" - lower = message.lower() - return ( - "no messages" in lower - or "channel is empty" in lower - or "does not contain" in lower - ) - - -def _resolve_export_channel_ids( - cli_path: Path, - user_token: str, - guild_id: int, - include_threads: 
str, - channel_ids: Optional[Sequence[int]], -) -> List[int]: - if channel_ids: - seen: set[int] = set() - ids: List[int] = [] - for i in channel_ids: - if i not in seen: - seen.add(i) - ids.append(i) - logger.info( - "Skipping DiscordChatExporter `channels` listing (%d explicit id(s)); " - "export runs directly (avoids OOM/SIGKILL on huge guilds)", - len(ids), - ) - return ids - return list( - _run_channels_listing( - cli_path, user_token, guild_id, include_threads=include_threads - ) - ) - - -def export_channel_window_to_json( - cli_path: Path, - user_token: str, - channel_id: int, - output_path: Path, - after_date: Optional[datetime], - before_date: Optional[datetime], -) -> Path: - """Run DiscordChatExporter ``export`` for one channel and time window.""" - output_path.parent.mkdir(parents=True, exist_ok=True) - cmd = _cli_argv_head(cli_path) + [ - "export", - "--token", - user_token, - "--channel", - str(channel_id), - "--output", - str(output_path), - "--format", - "Json", - "--parallel", - "1", - "--markdown", - "True", - ] - _append_export_window(cmd, after_date, before_date) - logger.info( - "Running DiscordChatExporter export for channel %s -> %s", - channel_id, - output_path.name, - ) - _run_exporter_streaming(cmd, cli_path=cli_path) - return output_path - - -def _export_guild_by_channel_day( - cli_path: Path, - user_token: str, - guild_id: int, - output_dir: Path, - after_date: Optional[datetime], - before_date: Optional[datetime], - include_threads: str, - channel_ids: Optional[Sequence[int]], - *, - per_channel_incremental: bool = False, -) -> List[ChannelDayExport]: - """Export each channel for each UTC day in the resolved window.""" - ids = _resolve_export_channel_ids( - cli_path, user_token, guild_id, include_threads, channel_ids - ) - if not ids: - raise DiscordChatExporterError( - "No channels to export after listing the guild (check DISCORD_CHANNEL_IDS / " - "--channels filter, token access, or INCLUDE_VC if you need voice channels)." 
- ) - - explicit_after = after_date - results: List[ChannelDayExport] = [] - for ch_id in ids: - ch_after = resolve_channel_export_after( - guild_id, - ch_id, - explicit_after=explicit_after, - ) - days = iter_channel_export_days(after=ch_after, before=before_date) - if not days: - logger.debug("No UTC day windows for channel %s", ch_id) - continue - logger.info( - "Exporting channel %s x %d UTC day(s) (after=%s)", - ch_id, - len(days), - ch_after.isoformat() if ch_after else "none", - ) - for day_str, window_after, window_before in days: - output_path = output_dir / f"{ch_id}_{day_str}.json" - try: - export_channel_window_to_json( - cli_path, - user_token, - ch_id, - output_path, - window_after, - window_before, - ) - except DiscordChatExporterError as exc: - if _is_empty_channel_export_error(str(exc)): - logger.info( - "No messages for channel %s on %s UTC, skipping", - ch_id, - day_str, - ) - output_path.unlink(missing_ok=True) - continue - raise - if output_path.is_file(): - results.append( - ChannelDayExport( - path=output_path, - day_str=day_str, - channel_id=ch_id, - ) - ) - else: - logger.debug( - "DiscordChatExporter produced no file for channel %s on %s", - ch_id, - day_str, - ) - - logger.info("Per-channel per-day export completed (%d file(s))", len(results)) - return results - - -def filter_discord_export_json_paths(paths: Iterable[Path]) -> List[Path]: - """Exclude macOS AppleDouble resource-fork files (``._*.json``); they are not UTF-8 JSON.""" - return [p for p in paths if not p.name.startswith("._")] - - -def _sorted_discord_export_json_paths(output_dir: Path) -> List[Path]: - """``*.json`` from DiscordChatExporter, excluding macOS AppleDouble sidecars (``._*``).""" - return sorted(filter_discord_export_json_paths(output_dir.glob("*.json"))) - - -def export_guild_to_json( - user_token: str, - guild_id: int, - output_dir: Path, - after_date: Optional[datetime] = None, - before_date: Optional[datetime] = None, - include_threads: str = "None", - 
channel_ids: Optional[Sequence[int]] = None, - *, - per_channel_incremental: bool = False, -) -> List[ChannelDayExport]: - """Export guild channels one UTC day at a time. Returns per-day export descriptors.""" - from django.conf import settings - - cli_path = _get_cli_path() - dotnet_dll_setting = getattr(settings, "DISCORD_CHAT_EXPORTER_DOTNET_DLL", None) - use_dotnet = bool((dotnet_dll_setting or "").strip()) - - if use_dotnet: - dll_path = Path(str(dotnet_dll_setting).strip()).expanduser().resolve() - if not dll_path.exists(): - raise DiscordChatExporterError( - f"DISCORD_CHAT_EXPORTER_DOTNET_DLL points to a missing file: {dll_path}" - ) - _maybe_macos_clear_quarantine(dll_path.parent) - else: - if not cli_path.exists(): - raise DiscordChatExporterError( - f"DiscordChatExporter CLI not found at {cli_path}. " - f"Download from {DISCORD_CHAT_EXPORTER_RELEASES_URL} " - "(e.g. DiscordChatExporter.Cli.osx-arm64.zip or .osx-x64.zip on Mac; " - ".linux-*.zip on Linux; .exe on Windows). " - "Extract next to the executable, put it under " - "workspace/discord_activity_tracker/script/, " - "or set DISCORD_CHAT_EXPORTER_CLI in .env to the full path of the CLI." 
- ) - _maybe_macos_clear_quarantine(cli_path.parent) - validate_discord_chat_exporter_cli_architecture(cli_path) - - output_dir.mkdir(parents=True, exist_ok=True) - - export_results: List[ChannelDayExport] = [] - - def _run_export(active_token: str) -> None: - nonlocal export_results - export_results = _export_guild_by_channel_day( - cli_path, - active_token, - guild_id, - output_dir, - after_date, - before_date, - include_threads, - channel_ids, - per_channel_incremental=per_channel_incremental, - ) - - try: - try: - _run_export(user_token) - except DiscordChatExporterError as exc: - from discord_activity_tracker.utils.discord_internal_tokens_store import ( - DISCORD_TOKENS_RELOGIN_HINT, - extract_and_save_discord_internal_tokens, - ) - from discord_activity_tracker.utils.discord_tokens import ( - is_discord_exporter_auth_error, - ) - - allow_internal = getattr(settings, "ALLOW_INTERNAL_DISCORD_TOKENS", False) - if isinstance(allow_internal, str): - allow_internal = allow_internal.strip().lower() == "true" - if allow_internal and is_discord_exporter_auth_error(str(exc)): - logger.info( - "DiscordChatExporter auth failure; refreshing session credentials" - ) - refreshed = extract_and_save_discord_internal_tokens() - if refreshed and refreshed != user_token: - logger.info( - "Retrying DiscordChatExporter with refreshed credentials" - ) - _run_export(refreshed) - else: - logger.error( - "Discord export auth failed and credential refresh did not help. %s", - DISCORD_TOKENS_RELOGIN_HINT, - ) - raise - else: - raise - except DiscordChatExporterError: - raise - except OSError as e: - if getattr(e, "errno", None) == 8 and sys.platform != "win32": - raise DiscordChatExporterError( - f"Cannot run {cli_path} on {sys.platform} (wrong executable format). " - "Use the macOS or Linux build from " - f"{DISCORD_CHAT_EXPORTER_RELEASES_URL} " - f"(`{_default_cli_basename()}`), or set DISCORD_CHAT_EXPORTER_CLI to that binary." 
- ) from e - logger.exception("Unexpected error running DiscordChatExporter: %s", e) - raise DiscordChatExporterError(f"Unexpected error: {e}") from e - except Exception as e: - logger.exception("Unexpected error running DiscordChatExporter: %s", e) - raise DiscordChatExporterError(f"Unexpected error: {e}") from e - - logger.info("Found %d exported JSON files", len(export_results)) - return export_results - - -def parse_exported_json(json_path: Path) -> Dict[str, Any]: - """Parse a DiscordChatExporter JSON file into a dict with guild, channel, messages.""" - logger.debug(f"Parsing {json_path.name}") - - try: - with open(json_path, "r", encoding="utf-8") as f: - data = json.load(f) - - return data - - except json.JSONDecodeError as e: - logger.error(f"Failed to parse JSON from {json_path}: {e}") - raise - except Exception as e: - logger.exception(f"Error reading {json_path}: {e}") - raise - - -def convert_exporter_message_to_dict( - msg_data: Dict[str, Any], - *, - server_id: Optional[int] = None, - channel_id: Optional[int] = None, -) -> Dict[str, Any]: - """Convert DiscordChatExporter message format to our internal format. - - Key normalizations applied here: - - All snowflake IDs coerced from string → int via _safe_int. - - Reaction emoji extracted from nested {"name": ...} dict to plain string; - reactions with no resolvable emoji are dropped (not persisted). - - Reaction ``count`` coerced via :func:`_safe_int` (malformed values → ``0``). - - Author avatarUrl mapped to avatar_url. - - message_type and is_pinned mapped from DiscordChatExporter fields. - - When ``server_id`` and ``channel_id`` are set, adds ``source_url``; - ``occurred_at`` (ISO 8601 UTC ``Z``) and ``actor_id`` when derivable. - - ``created_at`` / ``edited_at`` are normalized with :func:`format_instant_iso_z` - where possible so values match the canonical ``Z`` instant pattern used at - validation time. 
- """ - author = msg_data.get("author", {}) - - reactions_out: List[Dict[str, Any]] = [] - for reaction in msg_data.get("reactions", []): - emoji_raw = reaction.get("emoji") - if isinstance(emoji_raw, dict): - emoji_name = emoji_raw.get("name") or "" - elif emoji_raw is None: - emoji_name = "" - else: - emoji_name = str(emoji_raw) - emoji_name = (emoji_name or "").strip() - if not emoji_name: - continue - count = _safe_int(reaction.get("count", 0), default=0) - reactions_out.append( - { - "emoji": emoji_name, - "count": max(0, count), - } - ) - - created_at_z = _coerce_exporter_timestamp(msg_data.get("timestamp", "")) - edited_at_z = _coerce_exporter_timestamp( - msg_data.get("timestampEdited"), optional=True - ) - - converted: Dict[str, Any] = { - "id": _safe_int(msg_data.get("id", 0)), - "content": msg_data.get("content", ""), - "created_at": created_at_z, - "edited_at": edited_at_z, - "message_type": msg_data.get("type", "Default") or "Default", - "is_pinned": bool(msg_data.get("isPinned", False)), - "author": { - "id": _safe_int(author.get("id", 0)), - "username": author.get("name", "unknown") or "unknown", - "global_name": author.get("nickname") or author.get("name", "unknown"), - "avatar_url": author.get("avatarUrl", ""), - "bot": bool(author.get("isBot", False)), - }, - "attachments": [ - {"url": att.get("url")} for att in msg_data.get("attachments", []) - ], - "reactions": reactions_out, - "reference": None, - } - - if msg_data.get("reference"): - ref = msg_data["reference"] - ref_id = ref.get("messageId") or ref.get("message_id") - converted["reference"] = {"message_id": _safe_int(ref_id) if ref_id else None} - - if created_at_z and _INSTANT_Z_RE.fullmatch(created_at_z): - converted["occurred_at"] = created_at_z - - author_id = _safe_int(author.get("id", 0)) - if author_id: - converted["actor_id"] = str(author_id) - - mid = converted["id"] - if ( - server_id is not None - and channel_id is not None - and mid - and int(server_id) > 0 - and int(channel_id) > 
0 - ): - converted["source_url"] = format_discord_url( - int(server_id), int(channel_id), int(mid) - ) - - return converted - - -def export_and_parse_guild( - user_token: str, - guild_id: int, - output_dir: Path, - after_date: Optional[datetime] = None, -) -> List[Dict[str, Any]]: - """Export guild via CLI and parse all resulting JSON files.""" - exports = export_guild_to_json( - user_token=user_token, - guild_id=guild_id, - output_dir=output_dir, - after_date=after_date, - ) - - parsed_channels = [] - - for export in exports: - json_path = export.path - try: - data = parse_exported_json(json_path) - - parsed_channels.append( - { - "guild": data.get("guild", {}), - "channel": data.get("channel", {}), - "messages": data.get("messages", []), - "file_path": json_path, - } - ) - - except Exception as e: - logger.error(f"Failed to process {json_path.name}: {e}") - continue - - return parsed_channels - - -def exporter_message_to_activity_record( - msg_data: Dict[str, Any], - *, - server_id: int, - channel_id: int, -) -> DiscordActivityRecord: - """Convert exporter JSON to :class:`~discord_activity_tracker.protocol_impl.DiscordActivityRecord`. - - Thin bridge for :mod:`core.protocols` without changing :func:`convert_exporter_message_to_dict` - return shape for existing callers. 
- """ - converted = convert_exporter_message_to_dict( - msg_data, server_id=server_id, channel_id=channel_id - ) - return DiscordActivityRecord.from_converted_export_dict( - converted, server_id=server_id, channel_id=channel_id - ) diff --git a/discord_activity_tracker/sync/client.py b/discord_activity_tracker/sync/client.py deleted file mode 100644 index 3f543f7c..00000000 --- a/discord_activity_tracker/sync/client.py +++ /dev/null @@ -1,237 +0,0 @@ -"""Discord API client wrapper.""" - -import asyncio -import logging -from datetime import datetime -from typing import Optional, List, Dict, Any - -import discord - -logger = logging.getLogger(__name__) - - -def _message_type_label(message_type: Any) -> str: - """Map discord.MessageType (or duck-typed ``.name``) to exporter-style labels.""" - mt_cls = getattr(discord, "MessageType", None) - if isinstance(mt_cls, type) and isinstance(message_type, mt_cls): - name = message_type.name - else: - name = getattr(message_type, "name", None) - if not isinstance(name, str) or not name: - return "Default" - return "".join(part.capitalize() for part in name.split("_")) - - -def discord_message_to_sync_dict(message: Any) -> Dict[str, Any]: - """Convert a ``discord.Message`` (or duck-typed test double) to sync pipeline dict. - - Module-level so unit tests can validate mapping without constructing - :class:`DiscordSyncClient` (avoids async ``close`` / client lifecycle warnings). 
- """ - return { - "id": message.id, - "content": message.content, - "author": { - "id": message.author.id, - "username": message.author.name, - "display_name": ( - message.author.display_name - if hasattr(message.author, "display_name") - else "" - ), - "avatar_url": ( - str(message.author.avatar.url) if message.author.avatar else "" - ), - "bot": message.author.bot, - }, - "created_at": message.created_at.isoformat(), - "edited_at": message.edited_at.isoformat() if message.edited_at else None, - "message_type": _message_type_label(message.type), - "is_pinned": bool(message.pinned), - "reference": { - "message_id": (message.reference.message_id if message.reference else None), - }, - "attachments": [ - { - "url": attachment.url, - "filename": attachment.filename, - "size": attachment.size, - } - for attachment in message.attachments - ], - "reactions": [ - { - "emoji": str(reaction.emoji), - "count": reaction.count, - } - for reaction in message.reactions - ], - } - - -class DiscordSyncClient: - """Discord client wrapper for syncing messages.""" - - def __init__(self, token: str): - """Initialize client with token.""" - intents = discord.Intents.default() - intents.message_content = True - intents.members = True - intents.guilds = True - - self.client = discord.Client(intents=intents) - self.token = token - self._ready = False - self._asyncio_loop: Optional[asyncio.AbstractEventLoop] = None - - def run(self, coro): - """Run *coro* on this client's dedicated event loop. - - discord.py binds aiohttp to the loop used at login; reuse this loop for - all operations on this client until :meth:`shutdown_sync`. 
- """ - if self._asyncio_loop is None or self._asyncio_loop.is_closed(): - self._asyncio_loop = asyncio.new_event_loop() - return self._asyncio_loop.run_until_complete(coro) - - def shutdown_sync(self) -> None: - """Close the Discord client and tear down the loop (sync ``finally`` helper).""" - loop = self._asyncio_loop - if loop is not None and not loop.is_closed(): - try: - # Always drain ``close()`` on this loop so the coroutine is not left - # un-awaited (RuntimeWarning on Py3.12+); ``close`` no-ops when not _ready. - loop.run_until_complete(self.close()) - except Exception: - logger.exception("Error while closing Discord client") - finally: - loop.close() - self._asyncio_loop = None - return - if self._ready: - run_async(self.close()) - - async def _ensure_ready(self): - """Ensure client is logged in and ready.""" - if not self._ready: - await self.client.login(self.token) - self._ready = True - - async def get_guild(self, guild_id: int) -> Optional[discord.Guild]: - """Get guild by ID.""" - await self._ensure_ready() - try: - guild = await self.client.fetch_guild(guild_id) - except discord.NotFound: - logger.error(f"Guild {guild_id} not found") - return None - except discord.Forbidden: - logger.error(f"No access to guild {guild_id}") - return None - return guild - - async def get_channels(self, guild_id: int) -> List[discord.TextChannel]: - """Get all text channels in guild.""" - guild = await self.get_guild(guild_id) - if guild is None: - return [] - - try: - all_channels = await guild.fetch_channels() - except Exception as e: - logger.error(f"Error fetching channels for guild {guild.name}: {e}") - return [] - - channels = [ - channel - for channel in all_channels - if isinstance(channel, discord.TextChannel) - ] - - logger.info(f"Found {len(channels)} text channels in guild {guild.name}") - return channels - - async def get_channel(self, channel_id: int) -> Optional[discord.TextChannel]: - """Get channel by ID.""" - await self._ensure_ready() - try: - 
channel = await self.client.fetch_channel(channel_id) - if isinstance(channel, discord.TextChannel): - return channel - logger.error(f"Channel {channel_id} is not a text channel") - return None - except discord.NotFound: - logger.error(f"Channel {channel_id} not found") - return None - except discord.Forbidden: - logger.error(f"No access to channel {channel_id}") - return None - - async def fetch_messages_since( - self, - channel: discord.TextChannel, - after: Optional[datetime] = None, - limit: Optional[int] = None, - ) -> List[Dict[str, Any]]: - """Fetch messages from channel since datetime (None = all).""" - messages = [] - count = 0 - - try: - logger.info( - f"Fetching messages from #{channel.name} (after={after}, limit={limit})" - ) - - async for message in channel.history( - limit=limit, after=after, oldest_first=True - ): - msg_data = discord_message_to_sync_dict(message) - messages.append(msg_data) - count += 1 - - if count % 100 == 0: - logger.debug(f"Fetched {count} messages from #{channel.name}") - - logger.info(f"Fetched {len(messages)} messages from #{channel.name}") - - except discord.Forbidden: - logger.error(f"No permission to read messages in #{channel.name}") - except discord.HTTPException as e: - logger.error(f"HTTP error fetching messages from #{channel.name}: {e}") - except Exception as e: - logger.exception( - f"Unexpected error fetching messages from #{channel.name}: {e}" - ) - - return messages - - def _message_to_dict(self, message: discord.Message) -> Dict[str, Any]: - """Convert message to dict (delegates to :func:`discord_message_to_sync_dict`).""" - return discord_message_to_sync_dict(message) - - async def close(self): - """Close the client connection.""" - if self._ready: - await self.client.close() - self._ready = False - - def __enter__(self): - """Context manager entry.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """Context manager exit.""" - self.shutdown_sync() - - -def run_async(coro): - """Run *coro* 
in a fresh event loop and close it. - - Use only for coroutines not tied to a :class:`DiscordSyncClient`. For - client work, use :meth:`DiscordSyncClient.run` and :meth:`DiscordSyncClient.shutdown_sync`. - """ - loop = asyncio.new_event_loop() - try: - return loop.run_until_complete(coro) - finally: - loop.close() diff --git a/discord_activity_tracker/sync/export.py b/discord_activity_tracker/sync/export.py deleted file mode 100644 index 02f54008..00000000 --- a/discord_activity_tracker/sync/export.py +++ /dev/null @@ -1,403 +0,0 @@ -"""Export Discord messages to markdown files.""" - -import logging -import re -import subprocess -from datetime import datetime, timedelta, timezone -from pathlib import Path -from typing import Optional, List -from collections import defaultdict - -from dateutil.relativedelta import relativedelta -from django.utils import timezone as django_timezone - -from ..models import DiscordServer, DiscordChannel, DiscordMessage -from ..services import queryset_channels_with_recent_messages -from .utils import sanitize_channel_name, format_discord_url - -logger = logging.getLogger(__name__) - -DAY_SPLIT_THRESHOLD = 200 - - -def _make_github_anchor(timestamp: str, username: str) -> str: - """Build anchor that matches GitHub's auto-generated one for a heading.""" - safe_ts = timestamp.replace(":", "").replace(".", "") - safe_user = re.sub(r"[^a-z0-9]", "", username.lower()) - return f"{safe_ts}-utc--{safe_user}" - - -_INVISIBLE_UNICODE = re.compile("[\u200b-\u200d\u2060-\u2064\u2066-\u2069\ufeff]+") - - -def _strip_invisible_unicode(text: str) -> str: - """Strip invisible chars (zero-width, isolates, BOM) that mess up markdown.""" - if not text: - return text - return _INVISIBLE_UNICODE.sub("", text) - - -def _sanitize_discord_content(content: str) -> str: - """Convert Discord mentions to plain text, keep code blocks intact.""" - if not content: - return "" - content = _strip_invisible_unicode(content) - - def replace_mentions(text: str) -> str: - 
text = re.sub(r"<@!?(\d+)>", r"@user-\1", text) - text = re.sub(r"<@&(\d+)>", r"@role-\1", text) - text = re.sub(r"<#(\d+)>", r"#channel-\1", text) - text = re.sub(r"", r":\1:", text) - return text - - parts = re.split(r"(```[\s\S]*?```|`[^`]*`)", content) - result = [] - for part in parts: - if part.startswith("`") and part.endswith("`"): - result.append(part) - else: - result.append(replace_mentions(part)) - return "".join(result) - - -def generate_markdown_content( - channel: DiscordChannel, - year_month: str, - messages: List[DiscordMessage], - date_str: Optional[str] = None, - split_by_day: bool = False, -) -> str: - """Build markdown for a channel-month or channel-day.""" - lines = [] - - if messages: - first_msg = messages[0] - last_msg = messages[-1] - message_count = len(messages) - unique_authors = {getattr(msg, "author_id", None) for msg in messages} - unique_authors.discard(None) - active_users = len(unique_authors) - else: - first_msg = last_msg = None - message_count = active_users = 0 - - # YAML frontmatter - lines.append("---") - lines.append(f"channel: {channel.channel_name}") - if date_str: - lines.append(f"date: {date_str}") - else: - lines.append(f"month: {year_month}") - lines.append(f"server: {channel.server.server_name}") - lines.append(f"message_count: {message_count}") - lines.append(f"active_users: {active_users}") - - if first_msg: - first_utc = first_msg.message_created_at.astimezone(timezone.utc) - lines.append(f"first_message: {first_utc.strftime('%Y-%m-%dT%H:%M:%SZ')}") - if last_msg: - last_utc = last_msg.message_created_at.astimezone(timezone.utc) - lines.append(f"last_message: {last_utc.strftime('%Y-%m-%dT%H:%M:%SZ')}") - - discord_url = format_discord_url( - channel.server.server_id, channel.channel_id, 0 - ).rsplit("/", 1)[0] - lines.append(f"discord_channel_url: {discord_url}") - lines.append("---") - lines.append("") - - # Title - if date_str: - lines.append(f"# #{channel.channel_name} - {date_str}") - else: - month_name = 
datetime.strptime(year_month, "%Y-%m").strftime("%B %Y") - lines.append(f"# #{channel.channel_name} - {month_name}") - lines.append("") - - # Group by date (UTC) - messages_by_date = defaultdict(list) - for msg in messages: - utc_time = msg.message_created_at.astimezone(timezone.utc) - d = utc_time.strftime("%Y-%m-%d") - messages_by_date[d].append(msg) - - safe_channel_name = sanitize_channel_name(channel.channel_name) - - for d in sorted(messages_by_date.keys()): - lines.append(f"## {d}") - lines.append("") - - for msg in messages_by_date[d]: - # UTC timestamp (with ms) - utc_time = msg.message_created_at.astimezone(timezone.utc) - timestamp = utc_time.strftime("%H:%M:%S") - if utc_time.microsecond: - timestamp += f".{utc_time.microsecond // 1000:03d}" - - author_label = f"@{msg.author.username}" - if getattr(msg.author, "is_bot", False): - author_label += " (bot)" - lines.append(f"### {timestamp} UTC — {author_label}") - lines.append("") - - msg_url = format_discord_url( - channel.server.server_id, channel.channel_id, msg.message_id - ) - - # Reply, Url (blockquoted) - metadata_lines = [] - if msg.reply_to_message_id: - try: - reply_to = DiscordMessage.objects.get( - message_id=msg.reply_to_message_id - ) - reply_utc = reply_to.message_created_at.astimezone(timezone.utc) - reply_time = reply_utc.strftime("%H:%M:%S") - if reply_utc.microsecond: - reply_time += f".{reply_utc.microsecond // 1000:03d}" - reply_date = reply_utc.strftime("%Y-%m-%d") - reply_anchor = _make_github_anchor( - reply_time, reply_to.author.username - ) - - if reply_date == d: - link_target = f"#{reply_anchor}" - elif split_by_day: - link_target = ( - f"../{reply_date}/{safe_channel_name}.md#{reply_anchor}" - ) - elif reply_date.startswith(year_month): - link_target = f"#{reply_anchor}" - else: - reply_ym = reply_date[:7] - link_target = f"../{reply_ym}/{reply_ym}-{safe_channel_name}.md#{reply_anchor}" - - metadata_lines.append( - f"> Reply to: [@{reply_to.author.username} ({reply_time} 
UTC)]({link_target}) " - ) - if reply_to.content: - preview = _sanitize_discord_content( - reply_to.content.replace("\n", " ").strip()[:80] - ) - if len(reply_to.content.strip()) > 80: - preview += "..." - metadata_lines.append(f"> Original: {preview} ") - except DiscordMessage.DoesNotExist: - pass - metadata_lines.append(f"> Url: {msg_url} ") - for m in metadata_lines: - lines.append(m) - lines.append("") - lines.append("") - - if msg.content: - sanitized = _sanitize_discord_content(msg.content) - if sanitized.strip().startswith("```"): - lines.append("") - lines.append("") - in_code_fence = False - for content_line in sanitized.splitlines(): - if content_line.startswith("```"): - in_code_fence = not in_code_fence - lines.append(content_line) - elif in_code_fence: - lines.append(content_line) - else: - lines.append(content_line + " ") - if in_code_fence: - lines.append("```") # close unclosed block - lines.append("") - - if msg.attachment_urls: - lines.append("> Attachments: ") - for url in msg.attachment_urls: - filename = url.split("/")[-1].split("?")[0] - lines.append(f"> - [{filename}]({url}) ") - lines.append("") - - lines.append("") - - return "\n".join(lines) - - -def export_channel_to_markdown( - channel: DiscordChannel, year_month: str, output_dir: Path -) -> Optional[List[Path]]: - """Export a channel-month to markdown. 
Splits into per-day files.""" - logger.info(f"Exporting #{channel.channel_name} for {year_month}") - - start_date = datetime.strptime(f"{year_month}-01", "%Y-%m-%d") - start_date = django_timezone.make_aware(start_date) - end_date = start_date + relativedelta(months=1) - - messages = ( - DiscordMessage.objects.filter( - channel=channel, - message_created_at__gte=start_date, - message_created_at__lt=end_date, - is_deleted=False, - ) - .select_related("author") - .order_by("message_created_at") - ) - - message_list = list(messages) - - if not message_list: - logger.debug( - f"No messages for #{channel.channel_name} in {year_month}, skipping" - ) - return None - - year = year_month.split("-")[0] - month_dir = output_dir / year / year_month - month_dir.mkdir(parents=True, exist_ok=True) - safe_channel_name = sanitize_channel_name(channel.channel_name) - - # Per-day: yyyy/yyyy-MM/yyyy-MM-DD/channel.md - messages_by_date = defaultdict(list) - for msg in message_list: - utc_time = msg.message_created_at.astimezone(timezone.utc) - d = utc_time.strftime("%Y-%m-%d") - messages_by_date[d].append(msg) - - exported_paths = [] - - for date_str in sorted(messages_by_date.keys()): - day_messages = messages_by_date[date_str] - md_content = generate_markdown_content( - channel, year_month, day_messages, date_str=date_str, split_by_day=True - ) - day_dir = month_dir / date_str - day_dir.mkdir(parents=True, exist_ok=True) - file_path = day_dir / f"{safe_channel_name}.md" - file_path.write_text(md_content, encoding="utf-8") - exported_paths.append(file_path) - logger.info(f"Exported {len(day_messages)} messages to {file_path}") - - return exported_paths - - -def export_all_active_channels( - context_repo_path: Path, - server: DiscordServer, - months_back: int = 12, - active_days: int = 30, -) -> List[Path]: - """Export active channels for the last N months.""" - logger.info(f"Exporting all active channels for last {months_back} months") - - cutoff = django_timezone.now() - 
timedelta(days=active_days) - channels = queryset_channels_with_recent_messages(server, cutoff).select_related( - "server" - ) - - logger.info(f"Found {channels.count()} active channels") - - exported_files = [] - - today = django_timezone.now() - for i in range(months_back): - month_date = today - relativedelta(months=i) - year_month = month_date.strftime("%Y-%m") - - for channel in channels: - try: - result = export_channel_to_markdown( - channel, year_month, context_repo_path - ) - if result: - exported_files.extend(result) - except Exception as e: - logger.error( - f"Failed to export #{channel.channel_name} for {year_month}: {e}" - ) - continue - - logger.info(f"Exported {len(exported_files)} files") - return exported_files - - -def commit_and_push_context_repo( - repo_path: Path, commit_message: Optional[str] = None -) -> bool: - """Commit and push to the context repo.""" - if commit_message is None: - timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC") - commit_message = f"Update Discord archive - {timestamp}" - - logger.info(f"Committing and pushing to {repo_path}") - - try: - result = subprocess.run( - ["git", "add", "."], - cwd=repo_path, - check=True, - capture_output=True, - text=True, - ) - logger.debug(f"git add: {result.stdout}") - - result = subprocess.run( - ["git", "status", "--porcelain"], - cwd=repo_path, - check=True, - capture_output=True, - text=True, - ) - - if not result.stdout.strip(): - logger.info("No changes to commit") - return True - - result = subprocess.run( - ["git", "commit", "-m", commit_message], - cwd=repo_path, - check=True, - capture_output=True, - text=True, - ) - logger.info(f"git commit: {result.stdout}") - - result = subprocess.run( - ["git", "push"], cwd=repo_path, check=True, capture_output=True, text=True - ) - logger.info(f"git push: {result.stdout}") - - logger.info("Successfully committed and pushed changes") - return True - - except subprocess.CalledProcessError as e: - logger.error(f"Git operation 
failed: {e.stderr}") - return False - except Exception as e: - logger.exception(f"Error committing and pushing: {e}") - return False - - -def export_and_push( - context_repo_path: Path, - server: DiscordServer, - months_back: int = 12, - active_days: int = 30, - commit_message: Optional[str] = None, - auto_commit: bool = False, -) -> bool: - """Export channels, optionally commit and push.""" - exported_files = export_all_active_channels( - context_repo_path=context_repo_path, - server=server, - months_back=months_back, - active_days=active_days, - ) - - if not exported_files: - logger.warning("No files exported, skipping git operations") - return False - - if auto_commit: - success = commit_and_push_context_repo(context_repo_path, commit_message) - return success - else: - logger.info(f"Exported {len(exported_files)} files (auto-commit disabled)") - return True diff --git a/discord_activity_tracker/sync/exporter_window.py b/discord_activity_tracker/sync/exporter_window.py deleted file mode 100644 index 0b00085f..00000000 --- a/discord_activity_tracker/sync/exporter_window.py +++ /dev/null @@ -1,156 +0,0 @@ -"""DiscordChatExporter date-window helpers (scheduled sync + backfill).""" - -from __future__ import annotations - -from datetime import datetime, timedelta, timezone - -from django.db.models import Max - -from discord_activity_tracker.models import DiscordMessage - - -def latest_message_created_at_for_guild( - guild_snowflake: int, - *, - channel_ids: list[int] | None, -) -> datetime | None: - """Latest ``message_created_at`` for non-deleted messages (optional channel scope).""" - qs = DiscordMessage.objects.filter( - channel__server__server_id=guild_snowflake, - is_deleted=False, - ) - if channel_ids: - qs = qs.filter(channel__channel_id__in=channel_ids) - return qs.aggregate(m=Max("message_created_at"))["m"] - - -def latest_message_created_at_for_channel( - guild_snowflake: int, - channel_snowflake: int, -) -> datetime | None: - """Latest 
``message_created_at`` for one channel (non-deleted messages).""" - return DiscordMessage.objects.filter( - channel__server__server_id=guild_snowflake, - channel__channel_id=channel_snowflake, - is_deleted=False, - ).aggregate(m=Max("message_created_at"))["m"] - - -def incremental_export_after(latest: datetime) -> datetime: - """Lower bound for the next scheduled export with overlap. - - Returns UTC midnight on the calendar day of *latest* so the full day is - re-exported. Duplicate messages are merged by snowflake id; gaps are not. - """ - return utc_day_start(latest) - - -def resolve_channel_export_after( - guild_snowflake: int, - channel_snowflake: int, - *, - explicit_after: datetime | None, -) -> datetime | None: - """Per-channel ``--after`` for DiscordChatExporter. - - When *explicit_after* is set (``--since``), it applies to every channel. - Otherwise resumes from the UTC day start of that channel's latest stored - message, or ``None`` when the channel has no rows (today-only export). - """ - if explicit_after is not None: - return explicit_after - latest = latest_message_created_at_for_channel( - guild_snowflake, - channel_snowflake, - ) - if latest is None: - return None - return incremental_export_after(latest) - - -def utc_day_start(dt: datetime) -> datetime: - """UTC midnight for the calendar day containing *dt*.""" - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - else: - dt = dt.astimezone(timezone.utc) - return dt.replace(hour=0, minute=0, second=0, microsecond=0) - - -def iter_channel_export_days( - *, - after: datetime | None, - before: datetime | None, - now: datetime | None = None, -) -> list[tuple[str, datetime, datetime]]: - """Build per-day UTC export windows for DiscordChatExporter. - - Returns ``(YYYY-MM-DD, window_after, window_before)`` in chronological order. - - - When *after* is ``None`` (empty DB, no ``--since``): **today only** (UTC). - - Otherwise: from ``floor(after)`` through ``floor(before or now)`` inclusive. 
- - Each window is clipped to ``[max(day_start, after), min(day_end, before or now)]``. - - Skips days where the clipped window is empty (``after >= before``). - """ - if now is None: - now = datetime.now(timezone.utc) - elif now.tzinfo is None: - now = now.replace(tzinfo=timezone.utc) - else: - now = now.astimezone(timezone.utc) - - upper = now - if before is not None: - upper = ( - before.astimezone(timezone.utc) - if before.tzinfo is not None - else before.replace(tzinfo=timezone.utc) - ) - - if after is None: - first_day = utc_day_start(now) - last_day = first_day - else: - after_utc = ( - after.astimezone(timezone.utc) - if after.tzinfo is not None - else after.replace(tzinfo=timezone.utc) - ) - first_day = utc_day_start(after_utc) - last_day = utc_day_start(upper) - - after_utc: datetime | None = None - if after is not None: - after_utc = ( - after.astimezone(timezone.utc) - if after.tzinfo is not None - else after.replace(tzinfo=timezone.utc) - ) - - before_utc: datetime | None = None - if before is not None: - before_utc = ( - before.astimezone(timezone.utc) - if before.tzinfo is not None - else before.replace(tzinfo=timezone.utc) - ) - - result: list[tuple[str, datetime, datetime]] = [] - day = first_day - while day <= last_day: - day_end = day + timedelta(days=1) - window_after = day - window_before = day_end - - if after_utc is not None and day == first_day: - window_after = max(day, after_utc) - if before_utc is not None and day == last_day: - window_before = min(day_end, before_utc) - elif before is None and day == last_day: - window_before = min(day_end, now) - - if window_after < window_before: - result.append((day.strftime("%Y-%m-%d"), window_after, window_before)) - day += timedelta(days=1) - - return result diff --git a/discord_activity_tracker/sync/messages.py b/discord_activity_tracker/sync/messages.py deleted file mode 100644 index f8892c5d..00000000 --- a/discord_activity_tracker/sync/messages.py +++ /dev/null @@ -1,380 +0,0 @@ -"""Message 
sync logic - fetch from Discord and store in DB.""" - -import asyncio -import logging -from datetime import datetime, timedelta -from typing import Any, Dict, List, Optional, Union - -from django.utils import timezone as django_timezone -from asgiref.sync import sync_to_async - -from cppa_user_tracker.services import get_or_create_discord_profile -from core.utils.datetime_parsing import parse_iso_datetime_lenient - -from ..api_schemas import ( - DiscordLivePreparedMessage, - parse_live_message, -) -from ..models import DiscordServer, DiscordChannel, DiscordMessage -from ..services import ( - get_or_create_discord_server, - get_or_create_discord_channel, - create_or_update_discord_message, - add_or_update_reaction, - get_channel_latest_message_at, - bulk_process_message_batch, -) -from .client import DiscordSyncClient -from .utils import parse_discord_user - -logger = logging.getLogger(__name__) - - -async def sync_guild_async(client: DiscordSyncClient, guild_id: int): - """Sync guild/server info.""" - guild = await client.get_guild(guild_id) - if guild is None: - raise ValueError(f"Guild {guild_id} not found or not accessible") - - logger.info(f"Syncing guild: {guild.name} ({guild.id})") - - icon_url = str(guild.icon.url) if guild.icon else "" - server, created = await sync_to_async(get_or_create_discord_server)( - server_id=guild.id, server_name=guild.name, icon_url=icon_url - ) - - if created: - logger.info(f"Created new server: {guild.name}") - else: - logger.debug(f"Server already exists: {guild.name}") - - return server - - -async def sync_channels_async( - client: DiscordSyncClient, server: DiscordServer, guild_id: int -): - """Sync all text channels in guild.""" - channels = await client.get_channels(guild_id) - logger.info(f"Found {len(channels)} text channels to sync") - - synced_channels = [] - - for channel in channels: - logger.debug(f"Syncing channel: #{channel.name}") - - discord_channel, created = await sync_to_async(get_or_create_discord_channel)( - 
server=server, - channel_id=channel.id, - channel_name=channel.name, - channel_type=str(channel.type), - topic=channel.topic or "", - position=channel.position, - ) - - if created: - logger.info(f"Created new channel: #{channel.name}") - - synced_channels.append(discord_channel) - - logger.info(f"Synced {len(synced_channels)} channels") - return synced_channels - - -async def _process_message_data( - channel: DiscordChannel, - message_data: Union[DiscordLivePreparedMessage, Dict[str, Any]], -): - """Process message dict and store in DB.""" - try: - if isinstance(message_data, dict): - prepared = _prepare_message_data(message_data) - if prepared is None: - return - message_data = prepared - author = message_data.author - - profile, _ = await sync_to_async(get_or_create_discord_profile)( - discord_user_id=author.user_id, - username=author.username, - display_name=author.display_name, - avatar_url=author.avatar_url, - is_bot=author.is_bot, - ) - - message, created = await sync_to_async(create_or_update_discord_message)( - message_id=message_data.message_id, - channel=channel, - author=profile, - content=message_data.content, - message_created_at=message_data.message_created_at, - message_edited_at=message_data.message_edited_at, - reply_to_message_id=message_data.reply_to_message_id, - attachment_urls=message_data.attachment_urls, - ) - - for reaction_data in message_data.reactions: - if isinstance(reaction_data, dict): - emoji = reaction_data.get("emoji") - count = reaction_data.get("count", 0) - else: - emoji = getattr(reaction_data, "emoji", None) - count = getattr(reaction_data, "count", 0) - if emoji: - await sync_to_async(add_or_update_reaction)(message, emoji, count) - - if created: - logger.debug( - f"Created message {message.message_id} in #{channel.channel_name}" - ) - - except Exception as e: - mid = ( - message_data.message_id - if isinstance(message_data, DiscordLivePreparedMessage) - else message_data.get("id") - ) - logger.exception("Error processing 
message %s: %s", mid, e) - - -BATCH_SIZE = 500 - - -def _prepare_message_data( - message_data: Dict[str, Any], -) -> Optional[DiscordLivePreparedMessage]: - """Parse raw Discord message dict into normalized format for bulk processing.""" - author_info = parse_discord_user(message_data.get("author", {})) - - created_at = parse_iso_datetime_lenient(message_data.get("created_at")) - edited_at = parse_iso_datetime_lenient(message_data.get("edited_at")) - - if created_at is None: - logger.error("Message %s has no created_at timestamp", message_data.get("id")) - return None - - attachments = message_data.get("attachments", []) - attachment_urls = [att.get("url") for att in attachments if att.get("url")] - - reference = message_data.get("reference", {}) - reply_to_message_id = reference.get("message_id") if reference else None - - raw_id = message_data.get("id") - if raw_id is None: - return None - try: - message_id = int(raw_id) - except (TypeError, ValueError): - return None - - return parse_live_message( - { - "message_id": message_id, - "author": author_info.model_dump(), - "content": message_data.get("content", ""), - "message_type": message_data.get("message_type") or "Default", - "is_pinned": bool(message_data.get("is_pinned", False)), - "message_created_at": created_at, - "message_edited_at": edited_at, - "reply_to_message_id": reply_to_message_id, - "attachment_urls": attachment_urls, - "reactions": message_data.get("reactions", []), - } - ) - - -async def _process_messages_in_batches( - channel: DiscordChannel, - messages: List[Any], - batch_size: int = BATCH_SIZE, -) -> int: - """Process messages in batches using bulk DB operations.""" - total_processed = 0 - - for i in range(0, len(messages), batch_size): - batch_raw = messages[i : i + batch_size] - - batch_prepared = [] - for msg_data in batch_raw: - prepared = _prepare_message_data(msg_data) - if prepared is not None: - batch_prepared.append(prepared) - - if not batch_prepared: - continue - - count = await 
sync_to_async(bulk_process_message_batch)(batch_prepared, channel) - total_processed += count - - logger.info( - f"Batch {i // batch_size + 1}: {total_processed}/{len(messages)} " - f"messages for #{channel.channel_name}" - ) - - return total_processed - - -async def sync_channel_messages_async( - client: DiscordSyncClient, - channel: DiscordChannel, - guild_id: int, - since_date: Optional[datetime] = None, - full_sync: bool = False, -): - """Sync messages from channel (incremental or full).""" - logger.info(f"Syncing messages for channel: #{channel.channel_name}") - - # Determine sync start point - if full_sync: - after = None - logger.info("Full sync mode: fetching all messages") - elif since_date: - after = since_date - logger.info(f"Syncing messages since: {after}") - else: - latest = await sync_to_async(get_channel_latest_message_at)(channel) - if latest: - after = latest - logger.info(f"Syncing messages since last stored message: {after}") - else: - # Default: fetch last 30 days - after = django_timezone.now() - timedelta(days=30) - logger.info(f"First sync: fetching messages from last 30 days ({after})") - - discord_channel = await client.get_channel(channel.channel_id) - if discord_channel is None: - return - - # Fetch messages - try: - messages = await client.fetch_messages_since( - channel=discord_channel, - after=after, - limit=None, # No limit - fetch all messages - ) - - logger.info(f"Fetched {len(messages)} messages from #{channel.channel_name}") - - processed = await _process_messages_in_batches(channel, messages) - logger.info(f"Bulk-processed {processed} messages for #{channel.channel_name}") - - logger.info( - f"Successfully synced {len(messages)} messages for #{channel.channel_name}" - ) - - except Exception as e: - logger.exception(f"Error syncing messages for #{channel.channel_name}: {e}") - raise - - -def sync_guild(token: str, guild_id: int): - """Sync guild/server (sync wrapper).""" - client = DiscordSyncClient(token) - try: - return 
client.run(sync_guild_async(client, guild_id)) - finally: - client.shutdown_sync() - - -def sync_channels(token: str, server: DiscordServer, guild_id: int): - """Sync channels (sync wrapper).""" - client = DiscordSyncClient(token) - try: - return client.run(sync_channels_async(client, server, guild_id)) - finally: - client.shutdown_sync() - - -def sync_channel_messages( - token: str, - channel: DiscordChannel, - guild_id: int, - since_date: Optional[datetime] = None, - full_sync: bool = False, -): - """Sync channel messages (sync wrapper).""" - client = DiscordSyncClient(token) - try: - client.run( - sync_channel_messages_async( - client, channel, guild_id, since_date, full_sync - ) - ) - finally: - client.shutdown_sync() - - -MAX_CONCURRENT_CHANNELS = 5 - - -async def _sync_all_channels_async( - client: DiscordSyncClient, - channels: List[DiscordChannel], - guild_id: int, - since_date: Optional[datetime] = None, - full_sync: bool = False, -): - """Sync multiple channels concurrently with a semaphore.""" - sem = asyncio.Semaphore(MAX_CONCURRENT_CHANNELS) - - async def _sync_one(channel: DiscordChannel): - async with sem: - try: - await sync_channel_messages_async( - client, channel, guild_id, since_date, full_sync - ) - except Exception as e: - logger.error(f"Failed to sync channel #{channel.channel_name}: {e}") - - await asyncio.gather(*[_sync_one(ch) for ch in channels]) - - -def sync_all_channels( - token: str, - guild_id: int, - since_date: Optional[datetime] = None, - full_sync: bool = False, - active_only: bool = True, - active_days: int = 30, -): - """Sync all channels in guild (parallel fetch, single client).""" - logger.info(f"Starting sync for guild {guild_id}") - - client = DiscordSyncClient(token) - try: - # Sync guild - server = client.run(sync_guild_async(client, guild_id)) - - # Sync channels - channels = client.run(sync_channels_async(client, server, guild_id)) - - # Filter for active channels if requested - if active_only and not full_sync: - 
cutoff = django_timezone.now() - timedelta(days=active_days) - recent_pks = set( - DiscordMessage.objects.filter( - channel__server=server, - message_created_at__gte=cutoff, - is_deleted=False, - ) - .values_list("channel_id", flat=True) - .distinct() - ) - channels = [ch for ch in channels if ch.pk in recent_pks] - logger.info( - f"Filtered to {len(channels)} active channels " - f"(last {active_days} days)" - ) - - # Sync messages for channels concurrently (up to MAX_CONCURRENT_CHANNELS) - logger.info( - f"Syncing {len(channels)} channels " - f"(max {MAX_CONCURRENT_CHANNELS} concurrent)" - ) - client.run( - _sync_all_channels_async(client, channels, guild_id, since_date, full_sync) - ) - finally: - client.shutdown_sync() - - logger.info(f"Completed sync for guild {guild_id}") diff --git a/discord_activity_tracker/sync/raw_archive.py b/discord_activity_tracker/sync/raw_archive.py deleted file mode 100644 index 13fa58a5..00000000 --- a/discord_activity_tracker/sync/raw_archive.py +++ /dev/null @@ -1,136 +0,0 @@ -"""Merge DiscordChatExporter JSON into per-day raw archives.""" - -from __future__ import annotations - -import json -import logging -import os -import tempfile -from datetime import datetime, timezone -from pathlib import Path -from typing import Any - -from core.utils.datetime_parsing import parse_iso_datetime_lenient - -logger = logging.getLogger(__name__) - - -def message_utc_date_str(msg: dict[str, Any]) -> str | None: - """Return ``YYYY-MM-DD`` (UTC) for an exporter message dict, or ``None`` if unparseable.""" - raw_ts = msg.get("timestamp") - if not raw_ts: - return None - dt = parse_iso_datetime_lenient(str(raw_ts)) - if dt is None: - return None - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - else: - dt = dt.astimezone(timezone.utc) - return dt.strftime("%Y-%m-%d") - - -def _message_sort_key(msg: dict[str, Any]) -> tuple[str, str]: - ts = str(msg.get("timestamp") or "") - mid = str(msg.get("id") or "") - return (ts, mid) - - -def 
_filter_messages_for_day( - messages: list[dict[str, Any]], day: str -) -> list[dict[str, Any]]: - return [m for m in messages if message_utc_date_str(m) == day] - - -def _merge_message_lists( - existing: list[dict[str, Any]], - incoming: list[dict[str, Any]], -) -> list[dict[str, Any]]: - by_id: dict[str, dict[str, Any]] = {} - for msg in existing: - mid = str(msg.get("id", "")) - if mid: - by_id[mid] = msg - for msg in incoming: - mid = str(msg.get("id", "")) - if mid: - by_id[mid] = msg - return sorted(by_id.values(), key=_message_sort_key) - - -def _refresh_envelope_metadata(merged: dict[str, Any]) -> None: - messages: list[dict[str, Any]] = merged.get("messages") or [] - now_iso = datetime.now(timezone.utc).isoformat() - merged["exportedAt"] = now_iso - - if not messages: - date_range = merged.setdefault("dateRange", {}) - if not isinstance(date_range, dict): - merged["dateRange"] = date_range = {} - return - - timestamps = [] - for msg in messages: - dt = parse_iso_datetime_lenient(str(msg.get("timestamp") or "")) - if dt is not None: - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - else: - dt = dt.astimezone(timezone.utc) - timestamps.append(dt) - - if timestamps: - earliest = min(timestamps) - latest = max(timestamps) - merged["dateRange"] = { - "after": earliest.isoformat(), - "before": latest.isoformat(), - } - - -def merge_exporter_json(dest: Path, incoming: dict[str, Any], *, day: str) -> int: - """Merge *incoming* exporter JSON into *dest* for UTC calendar day *day*. - - Messages are keyed by snowflake ``id``; incoming overwrites existing entries. - Only messages on *day* (UTC) are kept in the archive. - - Returns the number of messages written to the merged file. 
- """ - incoming_msgs = _filter_messages_for_day(incoming.get("messages") or [], day) - - if dest.is_file(): - with open(dest, "r", encoding="utf-8") as f: - existing = json.load(f) - existing_msgs = _filter_messages_for_day(existing.get("messages") or [], day) - merged_msgs = _merge_message_lists(existing_msgs, incoming_msgs) - merged = dict(existing) - merged["guild"] = incoming.get("guild") or existing.get("guild") or {} - merged["channel"] = incoming.get("channel") or existing.get("channel") or {} - else: - merged_msgs = _merge_message_lists([], incoming_msgs) - merged = { - "guild": incoming.get("guild") or {}, - "channel": incoming.get("channel") or {}, - } - - merged["messages"] = merged_msgs - _refresh_envelope_metadata(merged) - - dest.parent.mkdir(parents=True, exist_ok=True) - fd, tmp_path = tempfile.mkstemp( - suffix=".json", dir=dest.parent, prefix=f".{dest.stem}." - ) - try: - with os.fdopen(fd, "w", encoding="utf-8") as f: - json.dump(merged, f, ensure_ascii=False, indent=2) - f.write("\n") - os.replace(tmp_path, dest) - except Exception: - try: - os.unlink(tmp_path) - except OSError: - pass - raise - - logger.debug("Merged %d message(s) into %s", len(merged_msgs), dest) - return len(merged_msgs) diff --git a/discord_activity_tracker/sync/utils.py b/discord_activity_tracker/sync/utils.py deleted file mode 100644 index 0bb7b24d..00000000 --- a/discord_activity_tracker/sync/utils.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Helpers for Discord sync.""" - -from typing import Any, Optional - -from discord_activity_tracker.api_schemas import DiscordLiveUserPayload - - -def parse_discord_user(user_data: Optional[dict[str, Any]]) -> DiscordLiveUserPayload: - """Normalize user dict from Bot API or DiscordChatExporter. - - Handles both sources: - - Bot API: keys ``id`` (int), ``username``, ``display_name``, ``avatar_url``, ``bot`` - - DiscordChatExporter: keys ``id`` (str), ``name``, ``nickname``, ``avatarUrl``, ``isBot`` - All snowflake IDs are coerced to int. 
- """ - if not user_data: - return DiscordLiveUserPayload( - user_id=0, - username="unknown", - display_name="", - avatar_url="", - is_bot=False, - ) - - raw_id = user_data.get("id", 0) - try: - user_id = int(raw_id) if raw_id is not None else 0 - except (TypeError, ValueError): - user_id = 0 - - avatar_url = user_data.get("avatar_url") or user_data.get("avatarUrl") or "" - - return DiscordLiveUserPayload( - user_id=user_id, - username=user_data.get("username") or user_data.get("name") or "unknown", - display_name=( - user_data.get("display_name") - or user_data.get("global_name") - or user_data.get("nickname") - or "" - ), - avatar_url=avatar_url, - is_bot=bool(user_data.get("bot") or user_data.get("isBot", False)), - ) - - -def sanitize_channel_name(channel_name: str) -> str: - """Make channel name safe for use in filenames.""" - safe_name = channel_name.replace("/", "-").replace("\\", "-") - safe_name = safe_name.replace(":", "-").replace("*", "-") - safe_name = safe_name.replace("?", "").replace('"', "") - safe_name = safe_name.replace("<", "").replace(">", "") - safe_name = safe_name.replace("|", "-") - return safe_name.strip() - - -def format_discord_url(server_id: int, channel_id: int, message_id: int) -> str: - """Build Discord message URL.""" - return f"https://discord.com/channels/{server_id}/{channel_id}/{message_id}" diff --git a/discord_activity_tracker/tests/__init__.py b/discord_activity_tracker/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/discord_activity_tracker/tests/conftest.py b/discord_activity_tracker/tests/conftest.py deleted file mode 100644 index 33cdd0c4..00000000 --- a/discord_activity_tracker/tests/conftest.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Discord tracker tests: stub discord.py when optional dependency is missing.""" - -import sys -from unittest.mock import MagicMock - -import pytest - -pytestmark = pytest.mark.filterwarnings( - "ignore:coroutine 'DiscordSyncClient.close' was never 
awaited:RuntimeWarning" -) - -try: - import discord as _discord_check # noqa: F401 -except ImportError: - _stub = MagicMock() - for _exc in ("NotFound", "Forbidden", "HTTPException"): - setattr(_stub, _exc, type(_exc, (Exception,), {})) - _stub.TextChannel = type("TextChannel", (), {}) - _stub.Guild = type("Guild", (), {}) - _stub.Message = type("Message", (), {}) - _stub.Intents.default.return_value = MagicMock() - _stub.Client.return_value = MagicMock() - sys.modules["discord"] = _stub diff --git a/discord_activity_tracker/tests/test_admin.py b/discord_activity_tracker/tests/test_admin.py deleted file mode 100644 index aae1c62b..00000000 --- a/discord_activity_tracker/tests/test_admin.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Smoke tests for Discord admin registrations and list_display fields.""" - - -def test_discord_server_admin_registered(): - from django.contrib import admin - from discord_activity_tracker.models import DiscordServer - - ma = admin.site._registry.get(DiscordServer) - assert ma is not None - - -def test_discord_channel_admin_list_display_includes_category(): - from discord_activity_tracker.admin import DiscordChannelAdmin - - assert "category_name" in DiscordChannelAdmin.list_display - assert "channel_id" in DiscordChannelAdmin.list_display - - -def test_discord_message_admin_list_display_includes_new_fields(): - from discord_activity_tracker.admin import DiscordMessageAdmin - - assert "message_type" in DiscordMessageAdmin.list_display - assert "is_pinned" in DiscordMessageAdmin.list_display - - -def test_discord_message_admin_list_filter_includes_new_fields(): - from discord_activity_tracker.admin import DiscordMessageAdmin - - assert "message_type" in DiscordMessageAdmin.list_filter - assert "is_pinned" in DiscordMessageAdmin.list_filter diff --git a/discord_activity_tracker/tests/test_api_schemas.py b/discord_activity_tracker/tests/test_api_schemas.py deleted file mode 100644 index 808b27b1..00000000 --- 
a/discord_activity_tracker/tests/test_api_schemas.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Tests for discord_activity_tracker.api_schemas.""" - -from datetime import datetime, timezone - -import pytest - -from discord_activity_tracker.api_schemas import ( - DiscordLiveSyncValidationError, - parse_live_message, - parse_live_user, - parse_reaction, -) -from discord_activity_tracker.services import bulk_upsert_discord_users -from discord_activity_tracker.sync.utils import parse_discord_user - - -def test_parse_discord_user_returns_payload(): - user = parse_discord_user( - { - "id": 99, - "username": "bot", - "display_name": "Bot", - "avatar_url": "", - "bot": True, - } - ) - assert user.user_id == 99 - assert user.is_bot is True - - -def test_parse_live_message(): - created = datetime(2024, 1, 1, tzinfo=timezone.utc) - msg = parse_live_message( - { - "message_id": 42, - "author": { - "user_id": 1, - "username": "u", - "display_name": "", - "avatar_url": "", - "is_bot": False, - }, - "message_created_at": created, - "content": "hello", - } - ) - assert msg.message_id == 42 - assert msg.author.user_id == 1 - - -def test_parse_live_message_missing_id_raises(): - with pytest.raises(DiscordLiveSyncValidationError): - parse_live_message( - { - "author": {"user_id": 1, "username": "u"}, - "message_created_at": datetime.now(timezone.utc), - } - ) - - -def test_parse_reaction(): - r = parse_reaction({"discord_message_id": 1, "emoji": "thumbsup", "count": 2}) - assert r.emoji == "thumbsup" - - -@pytest.mark.django_db -def test_bulk_upsert_discord_users_no_keyerror_on_typed_payload(): - users = bulk_upsert_discord_users( - [ - parse_live_user( - { - "user_id": 900001, - "username": "typed", - "display_name": "", - "avatar_url": "", - "is_bot": False, - } - ) - ] - ) - assert 900001 in users diff --git a/discord_activity_tracker/tests/test_backfill_command_extra.py b/discord_activity_tracker/tests/test_backfill_command_extra.py deleted file mode 100644 index cdaded74..00000000 --- 
a/discord_activity_tracker/tests/test_backfill_command_extra.py +++ /dev/null @@ -1,120 +0,0 @@ -"""Extra coverage for backfill_discord_activity_tracker command.""" - -from __future__ import annotations - -import asyncio -from io import StringIO -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest - -from discord_activity_tracker.management.commands.backfill_discord_activity_tracker import ( - Command, - DiscordBackfillCollector, - _json_display_path, -) - - -def test_json_display_path_outside_import_root_returns_basename(): - assert _json_display_path(Path("/a/b"), Path("/x/other.json")) == "other.json" - - -@pytest.mark.django_db -def test_backfill_collector_sync_pinecone_calls_runner(): - style = MagicMock() - style.SUCCESS = lambda x: x - c = DiscordBackfillCollector( - stdout=StringIO(), style=style, dry_run=False, skip_pinecone=False - ) - with patch( - "discord_activity_tracker.management.commands.backfill_discord_activity_tracker.task_discord_pinecone_sync" - ) as t: - c.sync_pinecone() - t.assert_called_once_with(dry_run=False) - - -@pytest.mark.django_db -def test_backfill_collector_sync_pinecone_skipped_when_dry_run(): - style = MagicMock() - c = DiscordBackfillCollector( - stdout=StringIO(), style=style, dry_run=True, skip_pinecone=False - ) - with patch( - "discord_activity_tracker.management.commands.backfill_discord_activity_tracker.task_discord_pinecone_sync" - ) as t: - c.sync_pinecone() - t.assert_not_called() - - -def test_backfill_get_collector_skip_pinecone_none(): - cmd = Command() - cmd.stdout = StringIO() - cmd.style = MagicMock() - c = cmd.get_collector(dry_run=False, skip_pinecone=None) - assert c.skip_pinecone is False - - -@pytest.mark.django_db -def test_backfill_run_handles_bad_json(tmp_path, settings): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - imp = tmp_path / "import_here" - imp.mkdir() - bad = imp / "bad.json" - bad.write_text("{", encoding="utf-8") - 
- style = MagicMock() - style.WARNING = lambda x: x - style.SUCCESS = lambda x: x - style.ERROR = lambda x: x - out = StringIO() - - with patch( - "discord_activity_tracker.management.commands.backfill_discord_activity_tracker.get_cpp_discussion_import_dir", - return_value=imp, - ): - DiscordBackfillCollector( - stdout=out, style=style, dry_run=False, skip_pinecone=True - ).run() - - output = out.getvalue() - assert "bad.json" in output - assert "Failed bad.json:" in output - assert "Import complete: 0 messages from 1 file(s)" in output - assert "(1 failed)" in output - - -@pytest.mark.django_db -def test_backfill_persist_channel_writes(settings, tmp_path): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - gid, cid = 220011, 220022 - guild_info = {"id": gid, "name": "G", "iconUrl": ""} - channel_info = { - "id": cid, - "name": "c", - "type": "GuildTextChat", - "topic": "", - "category": "", - "categoryId": None, - } - messages = [ - { - "id": str(10**12 + 3), - "type": "Default", - "isPinned": False, - "timestamp": "2026-01-15T12:00:00Z", - "content": "hello world example text long enough for validation", - "author": {"id": "1082347485026070548", "name": "u"}, - "attachments": [], - "reactions": [], - } - ] - style = MagicMock() - style.SUCCESS = lambda x: x - c = DiscordBackfillCollector( - stdout=StringIO(), style=style, dry_run=False, skip_pinecone=True - ) - n = asyncio.run(c._persist_channel(guild_info, channel_info, messages)) - assert n >= 1 diff --git a/discord_activity_tracker/tests/test_backfill_discord_activity_tracker_command.py b/discord_activity_tracker/tests/test_backfill_discord_activity_tracker_command.py deleted file mode 100644 index 2780893b..00000000 --- a/discord_activity_tracker/tests/test_backfill_discord_activity_tracker_command.py +++ /dev/null @@ -1,194 +0,0 @@ -"""Tests for backfill_discord_activity_tracker management command.""" - -import json -from io import StringIO -from unittest.mock import 
AsyncMock, MagicMock, patch - -import pytest -from django.core.management import call_command - -from discord_activity_tracker.management.commands.backfill_discord_activity_tracker import ( - Command, - DiscordBackfillCollector, -) - - -def _collector(**overrides): - defaults = { - "stdout": StringIO(), - "style": MagicMock(), - "dry_run": False, - "skip_pinecone": False, - } - defaults.update(overrides) - c = DiscordBackfillCollector(**defaults) - c.style.SUCCESS = lambda x: x - c.style.WARNING = lambda x: x - c.style.ERROR = lambda x: x - return c - - -def _minimal_export_payload(): - return { - "guild": {"id": "900", "name": "G", "iconUrl": ""}, - "channel": { - "id": "851121440425639956", - "name": "discussion", - "type": "GuildTextChat", - "topic": "", - "category": "", - }, - "messages": [ - { - "id": "1399663560723923005", - "type": "Default", - "isPinned": False, - "timestamp": "2026-01-01T12:00:00Z", - "content": "hello world example text long enough", - "author": {"id": "1082347485026070548", "name": "user"}, - "attachments": [], - "reactions": [], - } - ], - } - - -def test_run_removes_json_after_successful_persist(monkeypatch, tmp_path, settings): - """After DB persist succeeds, the source JSON file is deleted.""" - monkeypatch.setattr(settings, "WORKSPACE_DIR", str(tmp_path)) - drop = tmp_path / "discord_activity_tracker" / "Discussion - c-cpp-discussion" - drop.mkdir(parents=True) - j = drop / "batch.json" - j.write_text(json.dumps(_minimal_export_payload()), encoding="utf-8") - - c = _collector(skip_pinecone=True) - with patch.object( - DiscordBackfillCollector, - "_persist_channel", - new_callable=AsyncMock, - return_value=1, - ): - c.run() - - assert not j.exists() - - -def test_run_finds_json_in_nested_subfolders(monkeypatch, tmp_path, settings): - monkeypatch.setattr(settings, "WORKSPACE_DIR", str(tmp_path)) - drop = tmp_path / "discord_activity_tracker" / "Discussion - c-cpp-discussion" - nested = drop / "a" / "b" / "c" - 
nested.mkdir(parents=True) - j = nested / "deep.json" - j.write_text(json.dumps(_minimal_export_payload()), encoding="utf-8") - - c = _collector(skip_pinecone=True) - with patch.object( - DiscordBackfillCollector, - "_persist_channel", - new_callable=AsyncMock, - return_value=1, - ): - c.run() - - assert not j.exists() - - -def test_run_keeps_file_on_invalid_json(monkeypatch, tmp_path, settings): - monkeypatch.setattr(settings, "WORKSPACE_DIR", str(tmp_path)) - drop = tmp_path / "discord_activity_tracker" / "Discussion - c-cpp-discussion" - drop.mkdir(parents=True) - bad = drop / "bad.json" - bad.write_text("{", encoding="utf-8") - - c = _collector(skip_pinecone=True) - c.run() - - assert bad.exists() - result = c.last_result - assert result is not None - assert result.success is False - assert result.counts["failed_files"] == 1 - assert len(result.errors) == 1 - assert "bad.json" in result.errors[0] - - -def test_run_result_success_when_all_files_import(monkeypatch, tmp_path, settings): - monkeypatch.setattr(settings, "WORKSPACE_DIR", str(tmp_path)) - drop = tmp_path / "discord_activity_tracker" / "Discussion - c-cpp-discussion" - drop.mkdir(parents=True) - j = drop / "batch.json" - j.write_text(json.dumps(_minimal_export_payload()), encoding="utf-8") - - c = _collector(skip_pinecone=True) - with patch.object( - DiscordBackfillCollector, - "_persist_channel", - new_callable=AsyncMock, - return_value=1, - ): - c.run() - - result = c.last_result - assert result is not None - assert result.success is True - assert result.counts["failed_files"] == 0 - assert result.errors == () - - -def test_dry_run_lists_files_no_delete(monkeypatch, tmp_path, settings): - monkeypatch.setattr(settings, "WORKSPACE_DIR", str(tmp_path)) - drop = tmp_path / "discord_activity_tracker" / "Discussion - c-cpp-discussion" - drop.mkdir(parents=True) - j = drop / "batch.json" - j.write_text(json.dumps(_minimal_export_payload()), encoding="utf-8") - - out = StringIO() - c = 
DiscordBackfillCollector( - stdout=out, - style=MagicMock(), - dry_run=True, - skip_pinecone=True, - ) - c.style.WARNING = lambda x: x - c.run() - - assert j.exists() - assert "dry-run" in out.getvalue().lower() or "DRY RUN" in out.getvalue() - - -@pytest.mark.django_db -def test_sync_pinecone_skipped_when_skip_pinecone(): - c = _collector(skip_pinecone=True) - c.sync_pinecone() - - -@pytest.mark.django_db -def test_sync_pinecone_skipped_when_dry_run(): - c = _collector(dry_run=True) - c.sync_pinecone() - - -@pytest.mark.django_db -def test_get_collector_returns_backfill_collector(): - cmd = Command() - cmd.stdout = StringIO() - cmd.style = MagicMock() - collector = cmd.get_collector(dry_run=True, skip_pinecone=True) - assert isinstance(collector, DiscordBackfillCollector) - assert collector.dry_run is True - - -@pytest.mark.django_db -def test_call_command_dry_run(monkeypatch, tmp_path, settings): - monkeypatch.setattr(settings, "WORKSPACE_DIR", str(tmp_path)) - tmp_path.joinpath( - "discord_activity_tracker", "Discussion - c-cpp-discussion" - ).mkdir(parents=True) - out = StringIO() - call_command( - "backfill_discord_activity_tracker", - dry_run=True, - stdout=out, - verbosity=0, - ) - assert "DRY RUN" in out.getvalue() or "dry-run" in out.getvalue().lower() diff --git a/discord_activity_tracker/tests/test_bulk_services.py b/discord_activity_tracker/tests/test_bulk_services.py deleted file mode 100644 index ae4964c7..00000000 --- a/discord_activity_tracker/tests/test_bulk_services.py +++ /dev/null @@ -1,364 +0,0 @@ -"""Tests for bulk DB operations in services.py.""" - -import uuid - -import pytest -from datetime import datetime, timezone - -from cppa_user_tracker.models import DiscordProfile -from discord_activity_tracker.models import ( - DiscordServer, - DiscordChannel, - DiscordMessage, - DiscordReaction, -) -from discord_activity_tracker.services import ( - bulk_upsert_discord_users, - bulk_upsert_discord_messages, - bulk_upsert_discord_reactions, - 
bulk_process_message_batch, -) - - -def _user(uid, name, display="", bot=False): - return { - "user_id": uid, - "username": name, - "display_name": display, - "avatar_url": "", - "is_bot": bot, - } - - -def _msg(mid, author_uid, content="", ts=None, **kwargs): - if ts is None: - ts = datetime(2026, 2, 17, 12, 0, 0, tzinfo=timezone.utc) - return { - "message_id": mid, - "author": {"user_id": author_uid, **kwargs.pop("author_extra", {})}, - "content": content, - "message_type": kwargs.get("message_type", "Default"), - "is_pinned": kwargs.get("is_pinned", False), - "message_created_at": ts, - "message_edited_at": kwargs.get("edited_at"), - "reply_to_message_id": kwargs.get("reply_to"), - "attachment_urls": kwargs.get("attachments", []), - "reactions": kwargs.get("reactions", []), - } - - -def _uniq_id() -> int: - return uuid.uuid4().int % (2**50) - - -@pytest.fixture -def server(db): - return DiscordServer.objects.create( - server_id=_uniq_id(), server_name="TestServer", icon_url="" - ) - - -@pytest.fixture -def channel(server): - return DiscordChannel.objects.create( - server=server, - channel_id=_uniq_id(), - channel_name="general", - channel_type="text", - topic="", - position=0, - ) - - -# ------------------------------------------------------------------- -# bulk_upsert_discord_users -# ------------------------------------------------------------------- - - -@pytest.mark.django_db -class TestBulkUpsertUsers: - def test_insert_new_users(self): - before = DiscordProfile.objects.count() - user_data = [ - _user(1001, "alice", display="Alice"), - _user(1002, "bob", display="Bob", bot=True), - ] - result = bulk_upsert_discord_users(user_data) - - assert len(result) == 2 - assert 1001 in result - assert 1002 in result - assert result[1001].discord_user_id == 1001 - assert DiscordProfile.objects.count() == before + 2 - - def test_update_existing_users(self): - DiscordProfile.objects.create( - discord_user_id=1001, - type="discord", - username="alice_old", - 
display_name="Old", - is_bot=False, - ) - - result = bulk_upsert_discord_users( - [_user(1001, "alice_new", display="New Alice")] - ) - - assert len(result) == 1 - refreshed = DiscordProfile.objects.get(discord_user_id=1001) - assert refreshed.username == "alice_new" - assert refreshed.display_name == "New Alice" - - def test_deduplicates_by_user_id(self): - before = DiscordProfile.objects.count() - user_data = [ - _user(1001, "first"), - _user(1001, "second"), - ] - result = bulk_upsert_discord_users(user_data) - - assert len(result) == 1 - assert DiscordProfile.objects.count() == before + 1 - # Last-seen wins - assert DiscordProfile.objects.get(discord_user_id=1001).username == "second" - - def test_empty_input(self): - result = bulk_upsert_discord_users([]) - assert result == {} - - -# ------------------------------------------------------------------- -# bulk_upsert_discord_messages -# ------------------------------------------------------------------- - - -@pytest.mark.django_db -class TestBulkUpsertMessages: - def test_insert_new_messages(self, channel): - user_map = bulk_upsert_discord_users([_user(1001, "alice", display="Alice")]) - - now = datetime(2026, 2, 17, 12, 0, 0, tzinfo=timezone.utc) - msg_data = [ - _msg(5001, 1001, content="Hello world", ts=now), - _msg( - 5002, - 1001, - content="Second message", - ts=now, - attachments=["https://example.com/file.png"], - ), - ] - - mc_before = DiscordMessage.objects.count() - result = bulk_upsert_discord_messages(msg_data, channel, user_map) - assert len(result) == 2 - assert DiscordMessage.objects.count() == mc_before + 2 - - msg1 = DiscordMessage.objects.get(message_id=5001) - assert msg1.content == "Hello world" - assert msg1.has_attachments is False - - msg2 = DiscordMessage.objects.get(message_id=5002) - assert msg2.has_attachments is True - assert msg2.attachment_urls == ["https://example.com/file.png"] - - def test_update_existing_messages(self, channel): - user_map = 
bulk_upsert_discord_users([_user(1001, "alice")]) - now = datetime(2026, 2, 17, 12, 0, 0, tzinfo=timezone.utc) - mc_before = DiscordMessage.objects.count() - - # Insert first - bulk_upsert_discord_messages( - [_msg(5001, 1001, content="Original", ts=now)], - channel, - user_map, - ) - - # Update - edited_at = datetime(2026, 2, 17, 13, 0, 0, tzinfo=timezone.utc) - bulk_upsert_discord_messages( - [ - _msg( - 5001, - 1001, - content="Edited content", - ts=now, - edited_at=edited_at, - ) - ], - channel, - user_map, - ) - - assert DiscordMessage.objects.count() == mc_before + 1 - msg = DiscordMessage.objects.get(message_id=5001) - assert msg.content == "Edited content" - assert msg.message_edited_at == edited_at - - def test_empty_input(self, channel): - result = bulk_upsert_discord_messages([], channel, {}) - assert result == {} - - def test_message_type_and_is_pinned_persisted(self, channel): - """bulk_upsert_discord_messages must persist message_type and is_pinned.""" - user_map = bulk_upsert_discord_users([_user(1001, "alice")]) - now = datetime(2026, 2, 17, 12, 0, 0, tzinfo=timezone.utc) - bulk_upsert_discord_messages( - [ - _msg( - 7001, - 1001, - content="pinned reply", - message_type="Reply", - is_pinned=True, - ts=now, - ) - ], - channel, - user_map, - ) - msg = DiscordMessage.objects.get(message_id=7001) - assert msg.message_type == "Reply" - assert msg.is_pinned is True - - -# ------------------------------------------------------------------- -# bulk_upsert_discord_reactions -# ------------------------------------------------------------------- - - -@pytest.mark.django_db -class TestBulkUpsertReactions: - def test_insert_reactions(self, channel): - mid = _uniq_id() - user_map = bulk_upsert_discord_users([_user(1001, "alice")]) - now = datetime(2026, 2, 17, 12, 0, 0, tzinfo=timezone.utc) - message_map = bulk_upsert_discord_messages( - [_msg(mid, 1001, content="Test", ts=now)], - channel, - user_map, - ) - - reaction_data = [ - {"discord_message_id": mid, 
"emoji": "\U0001f44d", "count": 3}, - {"discord_message_id": mid, "emoji": "\U0001f389", "count": 1}, - ] - rc_before = DiscordReaction.objects.count() - bulk_upsert_discord_reactions(reaction_data, message_map) - - assert DiscordReaction.objects.count() == rc_before + 2 - db_msg = DiscordMessage.objects.get(message_id=mid) - thumbs = DiscordReaction.objects.get(message=db_msg, emoji="\U0001f44d") - assert thumbs.count == 3 - - def test_update_reaction_count(self, channel): - mid = _uniq_id() - user_map = bulk_upsert_discord_users([_user(1001, "alice")]) - now = datetime(2026, 2, 17, 12, 0, 0, tzinfo=timezone.utc) - message_map = bulk_upsert_discord_messages( - [_msg(mid, 1001, content="Test", ts=now)], - channel, - user_map, - ) - - rc_before = DiscordReaction.objects.count() - # Insert - bulk_upsert_discord_reactions( - [{"discord_message_id": mid, "emoji": "\U0001f44d", "count": 1}], - message_map, - ) - # Update - bulk_upsert_discord_reactions( - [{"discord_message_id": mid, "emoji": "\U0001f44d", "count": 5}], - message_map, - ) - - assert DiscordReaction.objects.count() == rc_before + 1 - db_msg = DiscordMessage.objects.get(message_id=mid) - assert ( - DiscordReaction.objects.get(message=db_msg, emoji="\U0001f44d").count == 5 - ) - - -# ------------------------------------------------------------------- -# bulk_process_message_batch (end-to-end orchestrator) -# ------------------------------------------------------------------- - - -@pytest.mark.django_db -class TestBulkProcessMessageBatch: - def test_full_batch(self, channel): - now = datetime(2026, 2, 17, 12, 0, 0, tzinfo=timezone.utc) - messages = [ - { - "message_id": 5001, - "author": _user(1001, "alice", display="Alice"), - "content": "Hello!", - "message_created_at": now, - "message_edited_at": None, - "reply_to_message_id": None, - "attachment_urls": [], - "reactions": [ - {"emoji": "\U0001f44d", "count": 2}, - {"emoji": "\u2764\ufe0f", "count": 1}, - ], - }, - { - "message_id": 5002, - "author": 
_user(1002, "bob", display="Bob"), - "content": "Hi there!", - "message_created_at": now, - "message_edited_at": None, - "reply_to_message_id": 5001, - "attachment_urls": ["https://example.com/img.png"], - "reactions": [], - }, - ] - - pc_before = DiscordProfile.objects.count() - mc_before = DiscordMessage.objects.count() - rc_before = DiscordReaction.objects.count() - count = bulk_process_message_batch(messages, channel) - - assert count == 2 - assert DiscordProfile.objects.count() == pc_before + 2 - assert DiscordMessage.objects.count() == mc_before + 2 - assert DiscordReaction.objects.count() == rc_before + 2 - - msg1 = DiscordMessage.objects.get(message_id=5001) - assert msg1.content == "Hello!" - assert msg1.author.username == "alice" - - msg2 = DiscordMessage.objects.get(message_id=5002) - assert msg2.reply_to_message_id == 5001 - assert msg2.has_attachments is True - - def test_empty_batch(self, channel): - count = bulk_process_message_batch([], channel) - assert count == 0 - - def test_idempotent(self, channel): - """Running same batch twice should not create duplicates.""" - now = datetime(2026, 2, 17, 12, 0, 0, tzinfo=timezone.utc) - messages = [ - { - "message_id": 5001, - "author": _user(1001, "alice"), - "content": "Test", - "message_created_at": now, - "message_edited_at": None, - "reply_to_message_id": None, - "attachment_urls": [], - "reactions": [{"emoji": "\U0001f44d", "count": 1}], - }, - ] - - pc_before = DiscordProfile.objects.count() - mc_before = DiscordMessage.objects.count() - rc_before = DiscordReaction.objects.count() - bulk_process_message_batch(messages, channel) - bulk_process_message_batch(messages, channel) - - assert DiscordProfile.objects.count() == pc_before + 1 - assert DiscordMessage.objects.count() == mc_before + 1 - assert DiscordReaction.objects.count() == rc_before + 1 diff --git a/discord_activity_tracker/tests/test_chat_exporter_branch_coverage.py b/discord_activity_tracker/tests/test_chat_exporter_branch_coverage.py 
deleted file mode 100644 index 50144731..00000000 --- a/discord_activity_tracker/tests/test_chat_exporter_branch_coverage.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Extra branch coverage for sync/chat_exporter.py.""" - -from __future__ import annotations - -import sys -from unittest.mock import MagicMock, patch - -import pytest -from django.conf import settings - -from discord_activity_tracker.sync.chat_exporter import ( - DiscordChatExporterError, - _file_command_brief_description, - _run_channels_listing, - export_guild_to_json, - parse_channels_command_stdout, -) - - -def test_file_command_brief_description_no_file_binary(tmp_path): - with patch( - "discord_activity_tracker.sync.chat_exporter.shutil.which", return_value=None - ): - assert _file_command_brief_description(tmp_path / "x") is None - - -def test_file_command_brief_description_subprocess_error(tmp_path): - with ( - patch( - "discord_activity_tracker.sync.chat_exporter.shutil.which", - return_value="/bin/file", - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.run", - side_effect=OSError("nope"), - ), - ): - assert _file_command_brief_description(tmp_path / "x") is None - - -def test_file_command_brief_description_nonzero_return(tmp_path): - proc = MagicMock(returncode=1, stdout="", stderr="") - with ( - patch( - "discord_activity_tracker.sync.chat_exporter.shutil.which", - return_value="/bin/file", - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.run", - return_value=proc, - ), - ): - assert _file_command_brief_description(tmp_path / "x") is None - - -def test_run_channels_listing_failure_raises(tmp_path, monkeypatch): - cli = tmp_path / "cli" - cli.touch() - monkeypatch.setattr(settings, "DISCORD_CHAT_EXPORTER_DOTNET_DLL", None) - proc = MagicMock(returncode=1, stdout="", stderr="err") - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - 
"discord_activity_tracker.sync.chat_exporter.subprocess.run", - return_value=proc, - ), - ): - with pytest.raises(DiscordChatExporterError, match="channels"): - _run_channels_listing(cli, "tok", 1, "None") - - -def test_run_channels_listing_success(monkeypatch, tmp_path): - cli = tmp_path / "cli" - cli.touch() - monkeypatch.setattr(settings, "DISCORD_CHAT_EXPORTER_DOTNET_DLL", None) - proc = MagicMock(returncode=0, stdout="12345 | #general\n", stderr="") - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.run", - return_value=proc, - ), - ): - ids = _run_channels_listing(cli, "tok", 1, "None") - assert ids == [12345] - - -def test_export_guild_dotnet_dll_missing_raises(tmp_path, monkeypatch): - out = tmp_path / "out" - missing_dll = tmp_path / "nope.dll" - monkeypatch.setattr(settings, "DISCORD_CHAT_EXPORTER_DOTNET_DLL", str(missing_dll)) - with pytest.raises(DiscordChatExporterError, match="missing"): - export_guild_to_json("t", 1, out) - - -def test_export_guild_dotnet_no_dotnet_binary_raises(tmp_path, monkeypatch): - dll = tmp_path / "app.dll" - dll.write_bytes(b"x") - monkeypatch.setattr(settings, "DISCORD_CHAT_EXPORTER_DOTNET_DLL", str(dll)) - monkeypatch.setattr(settings, "DISCORD_CHAT_EXPORTER_DOTNET", "") - with patch( - "discord_activity_tracker.sync.chat_exporter.shutil.which", return_value=None - ): - with pytest.raises(DiscordChatExporterError, match="dotnet"): - export_guild_to_json("t", 1, tmp_path / "o") - - -def test_export_guild_os_error_errno_8_wraps(tmp_path, monkeypatch): - cli = tmp_path / "cli" - cli.touch() - monkeypatch.setattr(settings, "DISCORD_CHAT_EXPORTER_DOTNET_DLL", None) - err = OSError("exec format error") - err.errno = 8 - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - 
"discord_activity_tracker.sync.chat_exporter.validate_discord_chat_exporter_cli_architecture", - ), - patch( - "discord_activity_tracker.sync.chat_exporter._export_guild_by_channel_day", - side_effect=err, - ), - ): - if sys.platform == "win32": - pytest.skip("errno 8 branch is POSIX-only") - with pytest.raises(DiscordChatExporterError, match="wrong executable format"): - export_guild_to_json("t", 1, tmp_path / "o2", after_date=None) - - -def test_parse_channels_skips_thread_banner_lines(): - text = "* thread\n123 | #x\n" - assert parse_channels_command_stdout(text) == [123] diff --git a/discord_activity_tracker/tests/test_discord_internal_tokens_store.py b/discord_activity_tracker/tests/test_discord_internal_tokens_store.py deleted file mode 100644 index d91a2578..00000000 --- a/discord_activity_tracker/tests/test_discord_internal_tokens_store.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Tests for workspace JSON Discord internal token storage.""" - -import json -import logging -from unittest.mock import patch - -import pytest -from django.test import override_settings - -from discord_activity_tracker.utils import discord_internal_tokens_store as store - - -@override_settings( - WORKSPACE_DIR="/tmp/ws", - DISCORD_INTERNAL_TOKENS_JSON="", -) -def test_save_and_load_tokens(tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - path = store.save_discord_internal_tokens( - "discord-tok", user_id="123", username="alice" - ) - assert ( - path == tmp_path / "discord_activity_tracker" / "discord_internal_tokens.json" - ) - data = json.loads(path.read_text(encoding="utf-8")) - assert data["user_token"] == "discord-tok" - assert data["user_id"] == "123" - loaded = store.load_discord_internal_tokens() - assert loaded["user_token"] == "discord-tok" - assert loaded["username"] == "alice" - - -@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=True, WORKSPACE_DIR="/tmp/ws") -def test_get_discord_user_token_from_json(tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) 
- store.save_discord_internal_tokens("tok") - assert store.get_discord_user_token_from_json() == "tok" - - -@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=False, WORKSPACE_DIR="/tmp/ws") -def test_get_token_from_json_disabled(tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - store.save_discord_internal_tokens("tok") - assert store.get_discord_user_token_from_json() is None - - -def test_save_requires_token(): - with pytest.raises(ValueError): - store.save_discord_internal_tokens("") - - -@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=False, DISCORD_USER_TOKEN="env-tok") -def test_get_or_load_uses_env_when_internal_disabled(): - assert store.get_or_load_discord_user_token() == "env-tok" - - -@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=True, WORKSPACE_DIR="/tmp/ws") -@patch( - "discord_activity_tracker.utils.discord_tokens.probe_discord_user_token", - return_value=True, -) -@patch( - "discord_activity_tracker.utils.discord_internal_tokens_store.extract_and_save_discord_internal_tokens", - return_value="fresh-tok", -) -def test_get_or_load_extracts_when_json_missing( - mock_extract, _mock_probe, tmp_path, settings -): - settings.WORKSPACE_DIR = str(tmp_path) - token = store.get_or_load_discord_user_token() - assert token == "fresh-tok" - mock_extract.assert_called_once() - - -@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=True, WORKSPACE_DIR="/tmp/ws") -@patch( - "discord_activity_tracker.utils.discord_tokens.probe_discord_user_token", - side_effect=[False, True], -) -@patch( - "discord_activity_tracker.utils.discord_internal_tokens_store.extract_and_save_discord_internal_tokens", - return_value="new-tok", -) -def test_get_or_load_reextracts_when_json_tokens_stale( - mock_extract, _mock_probe, tmp_path, settings -): - settings.WORKSPACE_DIR = str(tmp_path) - store.save_discord_internal_tokens("old-tok") - token = store.get_or_load_discord_user_token() - assert token == "new-tok" - mock_extract.assert_called_once() - - 
-@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=True, WORKSPACE_DIR="/tmp/ws") -@patch( - "discord_activity_tracker.utils.discord_tokens.probe_discord_user_token", - return_value=False, -) -@patch( - "discord_activity_tracker.utils.discord_internal_tokens_store.extract_and_save_discord_internal_tokens", - return_value="bad-tok", -) -def test_get_or_load_logs_when_reextracted_tokens_still_invalid( - mock_extract, _mock_probe, tmp_path, settings, caplog -): - settings.WORKSPACE_DIR = str(tmp_path) - store.save_discord_internal_tokens("old-tok") - with caplog.at_level(logging.ERROR): - token = store.get_or_load_discord_user_token() - assert token is None - mock_extract.assert_called_once() - assert "still invalid" in caplog.text - assert ".env.example" in caplog.text - - -@override_settings(ALLOW_INTERNAL_DISCORD_TOKENS=True, WORKSPACE_DIR="/tmp/ws") -@patch( - "discord_activity_tracker.utils.discord_tokens.probe_discord_user_token", - return_value=True, -) -def test_get_or_load_keeps_valid_json_tokens(_mock_probe, tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - store.save_discord_internal_tokens("tok") - token = store.get_or_load_discord_user_token() - assert token == "tok" - _mock_probe.assert_called_once_with("tok") diff --git a/discord_activity_tracker/tests/test_discord_tokens.py b/discord_activity_tracker/tests/test_discord_tokens.py deleted file mode 100644 index c32418ff..00000000 --- a/discord_activity_tracker/tests/test_discord_tokens.py +++ /dev/null @@ -1,150 +0,0 @@ -"""Tests for discord_activity_tracker.utils.discord_tokens (no real Chrome profile).""" - -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest -from django.test import override_settings - -from discord_activity_tracker.utils import discord_tokens as dt - - -def test_parse_discord_token_raw_strips_prefix_and_quotes(): - raw = b'\x01"my-discord-token"' - assert dt._parse_discord_token_raw(raw) == "my-discord-token" - - -def 
test_parse_discord_token_raw_plain(): - assert dt._parse_discord_token_raw(b"plain-token") == "plain-token" - - -def test_parse_discord_token_raw_empty_raises(): - with pytest.raises(ValueError): - dt._parse_discord_token_raw(b"") - - -@patch("discord_activity_tracker.utils.discord_tokens.requests.get") -def test_probe_discord_user_token_ok(mock_get): - mock_resp = MagicMock() - mock_resp.status_code = 200 - mock_get.return_value = mock_resp - assert dt.probe_discord_user_token("tok") is True - - -@patch("discord_activity_tracker.utils.discord_tokens.requests.get") -def test_probe_discord_user_token_auth_error(mock_get): - mock_resp = MagicMock() - mock_resp.status_code = 401 - mock_get.return_value = mock_resp - assert dt.probe_discord_user_token("tok") is False - - -def test_probe_discord_user_token_empty(): - assert dt.probe_discord_user_token("") is False - - -@patch("discord_activity_tracker.utils.discord_tokens.requests.get") -def test_probe_discord_user_token_details(mock_get): - mock_resp = MagicMock() - mock_resp.status_code = 200 - mock_resp.json.return_value = {"id": "123", "username": "alice"} - mock_get.return_value = mock_resp - out = dt.probe_discord_user_token_details("tok") - assert out == {"user_id": "123", "username": "alice"} - - -def test_is_discord_exporter_auth_error(): - assert dt.is_discord_exporter_auth_error("HTTP 401 Unauthorized") - assert dt.is_discord_exporter_auth_error("invalid token") - assert not dt.is_discord_exporter_auth_error("channel not found") - - -@override_settings(DISCORD_CHROME_PROFILE_PATH="", WORKSPACE_DIR="/tmp/ws") -def test_resolve_discord_chrome_profile_uses_workspace_default(tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - expected = tmp_path / "discord_activity_tracker" / "chrome_profile" - expected.mkdir(parents=True) - assert dt._resolve_discord_chrome_profile_root() == expected.resolve() - - -def test_resolve_discord_chrome_profile_respects_custom_path(tmp_path): - custom = tmp_path / 
"custom_discord_chrome" - custom.mkdir() - with override_settings( - DISCORD_CHROME_PROFILE_PATH=str(custom), WORKSPACE_DIR="/tmp/ws" - ): - assert dt._resolve_discord_chrome_profile_root() == custom.resolve() - - -@pytest.mark.parametrize("bad", ["", None, "bad\x00path", "???"]) -def test_validate_chrome_profile_path_bad(bad): - with pytest.raises(ValueError): - dt._validate_chrome_profile_path(bad) - - -def test_read_discord_token_from_leveldb_parses(tmp_path): - profile = tmp_path / "chrome_profile" - leveldb_dir = profile / "Default" / "Local Storage" / "leveldb" - leveldb_dir.mkdir(parents=True) - with patch.object( - dt, - "_read_leveldb_value", - return_value=b'\x01"token-from-leveldb"', - ): - assert dt._read_discord_token_from_leveldb(profile) == "token-from-leveldb" - - -def test_read_discord_token_from_leveldb_returns_none_when_no_leveldb(tmp_path): - profile = tmp_path / "empty_profile" - profile.mkdir() - assert dt._read_discord_token_from_leveldb(profile) is None - - -def test_read_discord_token_from_leveldb_falls_back_to_legacy_key(tmp_path): - profile = tmp_path / "chrome_profile" - leveldb_dir = profile / "Default" / "Local Storage" / "leveldb" - leveldb_dir.mkdir(parents=True) - - def read_side_effect(_leveldb_dir, key): - if key == dt.DISCORD_TOKEN_KEY: - return b'\x01""' - return b'\x01"legacy-token"' - - with patch.object(dt, "_read_leveldb_value", side_effect=read_side_effect): - assert dt._read_discord_token_from_leveldb(profile) == "legacy-token" - - -@patch.object(dt, "probe_discord_user_token", return_value=True) -@patch.object(dt, "probe_discord_user_token_details", return_value={"user_id": "1"}) -@patch.object(dt, "_read_discord_token_from_leveldb", return_value="tok") -@patch.object(dt, "_resolve_discord_chrome_profile_root") -def test_extract_discord_token_auto_success( - mock_resolve, mock_read, _mock_details, _mock_probe, tmp_path, settings -): - profile = tmp_path / "profile" - profile.mkdir() - settings.DISCORD_CHROME_PROFILE_PATH 
= str(profile) - mock_resolve.return_value = profile - out = dt.extract_discord_token_auto() - assert out["user_token"] == "tok" - assert out["user_id"] == "1" - - -@patch.object(dt, "_resolve_discord_chrome_profile_root") -def test_extract_discord_token_auto_missing_profile(mock_resolve, settings): - settings.DISCORD_CHROME_PROFILE_PATH = "/nonexistent/profile/path" - mock_resolve.return_value = Path("/nonexistent/profile/path") - assert dt.extract_discord_token_auto() is None - - -@patch.object(dt, "probe_discord_user_token", return_value=False) -@patch.object(dt, "_read_discord_token_from_leveldb", return_value="bad-tok") -@patch.object(dt, "_resolve_discord_chrome_profile_root") -def test_extract_discord_token_auto_probe_fails( - mock_resolve, _mock_read, _mock_probe, tmp_path, settings -): - profile = tmp_path / "profile" - profile.mkdir() - settings.DISCORD_CHROME_PROFILE_PATH = str(profile) - mock_resolve.return_value = profile - assert dt.extract_discord_token_auto() is None diff --git a/discord_activity_tracker/tests/test_export.py b/discord_activity_tracker/tests/test_export.py deleted file mode 100644 index d014ea54..00000000 --- a/discord_activity_tracker/tests/test_export.py +++ /dev/null @@ -1,456 +0,0 @@ -"""Tests for markdown export functions.""" - -from datetime import datetime, timezone -from unittest.mock import MagicMock - -from django.test import TestCase - -from discord_activity_tracker.sync.export import ( - _make_github_anchor, - _sanitize_discord_content, - generate_markdown_content, -) - - -class MakeGitHubAnchorTests(TestCase): - - def test_basic_anchor(self): - result = _make_github_anchor("14:30:25", "alice") - self.assertEqual(result, "143025-utc--alice") - - def test_special_chars_in_username(self): - result = _make_github_anchor("14:30:25", "user.name#1234") - self.assertNotIn(".", result) - self.assertNotIn("#", result) - self.assertIn("username1234", result) - - def test_millisecond_timestamp(self): - result = 
_make_github_anchor("14:30:25.123", "alice") - self.assertEqual(result, "143025123-utc--alice") - - def test_matches_github_format(self): - result = _make_github_anchor("01:32:31.841", "twopic") - self.assertEqual(result, "013231841-utc--twopic") - - -class SanitizeDiscordContentTests(TestCase): - - def test_user_mention(self): - result = _sanitize_discord_content("<@123456789>") - self.assertEqual(result, "@user-123456789") - - def test_user_mention_with_bang(self): - result = _sanitize_discord_content("<@!123456789>") - self.assertEqual(result, "@user-123456789") - - def test_role_mention(self): - result = _sanitize_discord_content("<@&987654>") - self.assertEqual(result, "@role-987654") - - def test_channel_mention(self): - result = _sanitize_discord_content("<#555666>") - self.assertEqual(result, "#channel-555666") - - def test_custom_emoji(self): - result = _sanitize_discord_content("<:thumbsup:123456>") - self.assertEqual(result, ":thumbsup:") - - def test_animated_emoji(self): - result = _sanitize_discord_content("") - self.assertEqual(result, ":partyblob:") - - def test_mixed_content(self): - content = "Hey <@123> check <#456> for <:fire:789>" - result = _sanitize_discord_content(content) - self.assertEqual(result, "Hey @user-123 check #channel-456 for :fire:") - - def test_plain_text_unchanged(self): - content = "Hello world, no special formatting here!" 
- result = _sanitize_discord_content(content) - self.assertEqual(result, content) - - def test_empty_string(self): - result = _sanitize_discord_content("") - self.assertEqual(result, "") - - def test_code_block_preserved(self): - content = "Look at this: ```<@123> should stay```" - result = _sanitize_discord_content(content) - self.assertIn("<@123>", result) - - def test_inline_code_preserved(self): - content = "Use `<@mention>` syntax" - result = _sanitize_discord_content(content) - self.assertIn("<@mention>", result) - - def test_mention_outside_code_block_converted(self): - content = "Hi <@111> ```code here``` and <@222>" - result = _sanitize_discord_content(content) - self.assertIn("@user-111", result) - self.assertIn("@user-222", result) - - def test_invisible_unicode_stripped(self): - content = "\u2068\u2069```cpp\nint x = 1;\n```" - result = _sanitize_discord_content(content) - self.assertNotIn("\u2068", result) - self.assertNotIn("\u2069", result) - self.assertIn("```cpp", result) - - -class GenerateMarkdownContentTests(TestCase): - - def _make_mock_channel( - self, name="general", server_name="TestServer", server_id=111, channel_id=222 - ): - channel = MagicMock() - channel.channel_name = name - channel.channel_id = channel_id - channel.server.server_name = server_name - channel.server.server_id = server_id - return channel - - def _make_mock_message( - self, - message_id, - content, - username, - created_at, - author_id=None, - is_bot=False, - reply_to=None, - reactions=None, - attachments=None, - ): - msg = MagicMock() - msg.message_id = message_id - msg.content = content - msg.author_id = author_id or message_id - msg.author.username = username - msg.author.is_bot = is_bot - msg.message_created_at = created_at - msg.reply_to_message_id = reply_to - msg.attachment_urls = attachments or [] - - # Mock reactions queryset - if reactions: - mock_reactions = [] - for emoji, count in reactions: - r = MagicMock() - r.emoji = emoji - r.count = count - 
mock_reactions.append(r) - msg.reactions.all.return_value = mock_reactions - else: - msg.reactions.all.return_value = [] - - return msg - - def test_empty_messages(self): - channel = self._make_mock_channel() - result = generate_markdown_content(channel, "2026-02", []) - self.assertIn("message_count: 0", result) - self.assertIn("active_users: 0", result) - - def test_frontmatter_has_channel_and_server(self): - channel = self._make_mock_channel(name="dev-chat", server_name="MyServer") - result = generate_markdown_content(channel, "2026-02", []) - self.assertIn("channel: dev-chat", result) - self.assertIn("server: MyServer", result) - - def test_frontmatter_month_mode(self): - channel = self._make_mock_channel() - result = generate_markdown_content(channel, "2026-02", []) - self.assertIn("month: 2026-02", result) - self.assertNotIn("date:", result) - - def test_frontmatter_date_mode(self): - channel = self._make_mock_channel() - result = generate_markdown_content( - channel, "2026-02", [], date_str="2026-02-15" - ) - self.assertIn("date: 2026-02-15", result) - self.assertNotIn("month:", result) - - def test_title_monthly(self): - channel = self._make_mock_channel(name="general") - result = generate_markdown_content(channel, "2026-02", []) - self.assertIn("# #general - February 2026", result) - - def test_title_daily(self): - channel = self._make_mock_channel(name="general") - result = generate_markdown_content( - channel, "2026-02", [], date_str="2026-02-15" - ) - self.assertIn("# #general - 2026-02-15", result) - - def test_message_utc_timestamp(self): - channel = self._make_mock_channel() - msg = self._make_mock_message( - message_id=1001, - content="Hello world", - username="alice", - created_at=datetime(2026, 2, 15, 14, 30, 25, tzinfo=timezone.utc), - ) - result = generate_markdown_content(channel, "2026-02", [msg]) - self.assertIn("14:30:25 UTC", result) - - def test_message_heading_format(self): - channel = self._make_mock_channel() - msg = 
self._make_mock_message( - message_id=1001, - content="Hello", - username="alice", - created_at=datetime(2026, 2, 15, 14, 30, 25, tzinfo=timezone.utc), - ) - result = generate_markdown_content(channel, "2026-02", [msg]) - self.assertIn("### 14:30:25 UTC — @alice", result) - - def test_message_content_sanitized(self): - channel = self._make_mock_channel() - msg = self._make_mock_message( - message_id=1001, - content="Hey <@99999> check this!", - username="alice", - created_at=datetime(2026, 2, 15, 14, 30, 25, tzinfo=timezone.utc), - ) - result = generate_markdown_content(channel, "2026-02", [msg]) - self.assertIn("@user-99999", result) - self.assertNotIn("<@99999>", result) - - def test_metadata_blockquoted_before_message(self): - channel = self._make_mock_channel() - msg = self._make_mock_message( - message_id=1001, - content="Hello", - username="alice", - created_at=datetime(2026, 2, 15, 14, 30, 25, tzinfo=timezone.utc), - ) - result = generate_markdown_content(channel, "2026-02", [msg]) - self.assertIn("> Url: https://discord.com/channels/", result) - # Metadata (Url) should appear before message content - url_pos = result.find("> Url:") - hello_pos = result.find("Hello") - self.assertLess(url_pos, hello_pos) - - def test_bot_label(self): - channel = self._make_mock_channel() - msg = self._make_mock_message( - message_id=1001, - content="Bot message", - username="MEE6", - created_at=datetime(2026, 2, 15, 14, 30, 25, tzinfo=timezone.utc), - is_bot=True, - ) - result = generate_markdown_content(channel, "2026-02", [msg]) - self.assertIn("(bot)", result) - - def test_non_bot_no_label(self): - channel = self._make_mock_channel() - msg = self._make_mock_message( - message_id=1001, - content="User message", - username="alice", - created_at=datetime(2026, 2, 15, 14, 30, 25, tzinfo=timezone.utc), - is_bot=False, - ) - result = generate_markdown_content(channel, "2026-02", [msg]) - self.assertNotIn("(bot)", result) - - def test_reactions_not_in_export(self): - channel = 
self._make_mock_channel() - msg = self._make_mock_message( - message_id=1001, - content="Great idea!", - username="alice", - created_at=datetime(2026, 2, 15, 14, 30, 25, tzinfo=timezone.utc), - reactions=[("👍", 3), ("🎉", 1)], - ) - result = generate_markdown_content(channel, "2026-02", [msg]) - self.assertNotIn("Reactions:", result) - - def test_attachments_blockquoted_with_indent(self): - channel = self._make_mock_channel() - msg = self._make_mock_message( - message_id=1001, - content="Check this file", - username="alice", - created_at=datetime(2026, 2, 15, 14, 30, 25, tzinfo=timezone.utc), - attachments=["https://cdn.discord.com/attachments/1/2/image.png?ex=abc"], - ) - result = generate_markdown_content(channel, "2026-02", [msg]) - self.assertIn("> Attachments:", result) - self.assertIn("> - [image.png]", result) - - def test_multiple_messages_grouped_by_date(self): - channel = self._make_mock_channel() - msg1 = self._make_mock_message( - message_id=1001, - content="Morning", - username="alice", - created_at=datetime(2026, 2, 15, 8, 0, 0, tzinfo=timezone.utc), - ) - msg2 = self._make_mock_message( - message_id=1002, - content="Evening", - username="bob", - created_at=datetime(2026, 2, 16, 20, 0, 0, tzinfo=timezone.utc), - author_id=2002, - ) - result = generate_markdown_content(channel, "2026-02", [msg1, msg2]) - self.assertIn("## 2026-02-15", result) - self.assertIn("## 2026-02-16", result) - - def test_message_count_and_active_users(self): - channel = self._make_mock_channel() - msg1 = self._make_mock_message( - message_id=1001, - content="Hi", - username="alice", - created_at=datetime(2026, 2, 15, 8, 0, 0, tzinfo=timezone.utc), - author_id=1, - ) - msg2 = self._make_mock_message( - message_id=1002, - content="Hello", - username="bob", - created_at=datetime(2026, 2, 15, 9, 0, 0, tzinfo=timezone.utc), - author_id=2, - ) - msg3 = self._make_mock_message( - message_id=1003, - content="Hey", - username="alice", - created_at=datetime(2026, 2, 15, 10, 0, 0, 
tzinfo=timezone.utc), - author_id=1, - ) - result = generate_markdown_content(channel, "2026-02", [msg1, msg2, msg3]) - self.assertIn("message_count: 3", result) - self.assertIn("active_users: 2", result) - - def test_utc_frontmatter_timestamps(self): - channel = self._make_mock_channel() - msg = self._make_mock_message( - message_id=1001, - content="Test", - username="alice", - created_at=datetime(2026, 2, 15, 14, 30, 25, tzinfo=timezone.utc), - ) - result = generate_markdown_content(channel, "2026-02", [msg]) - self.assertIn("first_message: 2026-02-15T14:30:25Z", result) - self.assertIn("last_message: 2026-02-15T14:30:25Z", result) - - def test_discord_channel_url(self): - channel = self._make_mock_channel(server_id=111, channel_id=222) - result = generate_markdown_content(channel, "2026-02", []) - self.assertIn( - "discord_channel_url: https://discord.com/channels/111/222", result - ) - - def test_two_day_split_generates_separate_content(self): - channel = self._make_mock_channel(name="dev-chat") - - # Day 1: 3 messages - day1_msgs = [ - self._make_mock_message( - message_id=1001, - content="Good morning!", - username="alice", - created_at=datetime(2026, 2, 15, 8, 0, 0, tzinfo=timezone.utc), - author_id=1, - ), - self._make_mock_message( - message_id=1002, - content="Hey <@111> check <#222>", - username="bob", - created_at=datetime(2026, 2, 15, 9, 30, 0, tzinfo=timezone.utc), - author_id=2, - ), - self._make_mock_message( - message_id=1003, - content="Thanks!", - username="alice", - created_at=datetime(2026, 2, 15, 10, 0, 0, tzinfo=timezone.utc), - author_id=1, - reactions=[("👍", 2)], - ), - ] - - # Day 2: 2 messages (one bot, one with attachment) - day2_msgs = [ - self._make_mock_message( - message_id=2001, - content="Daily reminder", - username="MEE6", - created_at=datetime(2026, 2, 16, 6, 0, 0, tzinfo=timezone.utc), - author_id=99, - is_bot=True, - ), - self._make_mock_message( - message_id=2002, - content="Here's the doc", - username="charlie", - 
created_at=datetime(2026, 2, 16, 14, 0, 0, tzinfo=timezone.utc), - author_id=3, - attachments=["https://cdn.discord.com/files/report.pdf?token=abc"], - ), - ] - - # Generate per-day file for Day 1 - result_day1 = generate_markdown_content( - channel, "2026-02", day1_msgs, date_str="2026-02-15", split_by_day=True - ) - - # Verify Day 1 output - self.assertIn("date: 2026-02-15", result_day1) - self.assertIn("# #dev-chat - 2026-02-15", result_day1) - self.assertIn("message_count: 3", result_day1) - self.assertIn("active_users: 2", result_day1) - self.assertIn("08:00:00 UTC", result_day1) - self.assertIn("@user-111", result_day1) # Sanitized mention - self.assertIn("#channel-222", result_day1) # Sanitized channel - self.assertNotIn("(bot)", result_day1) # No bots on day 1 - - # Generate per-day file for Day 2 - result_day2 = generate_markdown_content( - channel, "2026-02", day2_msgs, date_str="2026-02-16", split_by_day=True - ) - - # Verify Day 2 output - self.assertIn("date: 2026-02-16", result_day2) - self.assertIn("# #dev-chat - 2026-02-16", result_day2) - self.assertIn("message_count: 2", result_day2) - self.assertIn("(bot)", result_day2) # MEE6 is a bot - self.assertIn("[report.pdf]", result_day2) # Attachment link - self.assertIn("06:00:00 UTC", result_day2) - self.assertIn("14:00:00 UTC", result_day2) - - def test_two_day_combined_monthly(self): - channel = self._make_mock_channel(name="general") - msgs = [ - self._make_mock_message( - message_id=1001, - content="Day 1 msg", - username="alice", - created_at=datetime(2026, 2, 15, 12, 0, 0, tzinfo=timezone.utc), - author_id=1, - ), - self._make_mock_message( - message_id=2001, - content="Day 2 msg", - username="bob", - created_at=datetime(2026, 2, 16, 18, 0, 0, tzinfo=timezone.utc), - author_id=2, - ), - ] - - result = generate_markdown_content(channel, "2026-02", msgs, split_by_day=False) - - # Both days in one file - self.assertIn("month: 2026-02", result) - self.assertIn("# #general - February 2026", result) - 
self.assertIn("## 2026-02-15", result) - self.assertIn("## 2026-02-16", result) - self.assertIn("message_count: 2", result) - self.assertIn("active_users: 2", result) - self.assertIn("first_message: 2026-02-15T12:00:00Z", result) - self.assertIn("last_message: 2026-02-16T18:00:00Z", result) diff --git a/discord_activity_tracker/tests/test_export_sync_coverage.py b/discord_activity_tracker/tests/test_export_sync_coverage.py deleted file mode 100644 index 1343b009..00000000 --- a/discord_activity_tracker/tests/test_export_sync_coverage.py +++ /dev/null @@ -1,381 +0,0 @@ -"""Coverage for sync/export.py (markdown export, git helpers).""" - -from __future__ import annotations - -import uuid -from datetime import datetime, timedelta, timezone -from unittest.mock import MagicMock, patch - -import pytest -from django.utils import timezone as django_timezone - -from cppa_user_tracker.models import DiscordProfile -from discord_activity_tracker.models import ( - DiscordChannel, - DiscordMessage, - DiscordServer, -) -from discord_activity_tracker.sync.export import ( - _strip_invisible_unicode, - commit_and_push_context_repo, - export_all_active_channels, - export_and_push, - export_channel_to_markdown, - generate_markdown_content, -) - - -def _uid() -> int: - return uuid.uuid4().int % (2**50) - - -@pytest.fixture -def export_server(db): - return DiscordServer.objects.create( - server_id=_uid(), server_name="Export Guild", icon_url="" - ) - - -@pytest.fixture -def export_channel(db, export_server): - return DiscordChannel.objects.create( - server=export_server, - channel_id=_uid(), - channel_name="general", - channel_type="text", - ) - - -@pytest.fixture -def export_author(db): - return DiscordProfile.objects.create( - discord_user_id=_uid(), - username="alice", - display_name="Alice", - avatar_url="", - is_bot=False, - ) - - -def test_strip_invisible_unicode_empty_returns_empty(): - assert _strip_invisible_unicode("") == "" - - -@pytest.mark.django_db -def 
test_generate_markdown_microsecond_timestamp(export_channel, export_author): - ts = datetime(2026, 3, 1, 10, 0, 0, 500000, tzinfo=timezone.utc) - msg = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="hi", - message_created_at=ts, - ) - out = generate_markdown_content(export_channel, "2026-03", [msg]) - assert "10:00:00.500" in out - - -@pytest.mark.django_db -def test_generate_markdown_reply_same_day(export_channel, export_author): - root = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="root text here", - message_created_at=datetime(2026, 3, 5, 9, 0, 0, tzinfo=timezone.utc), - ) - reply = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="reply", - message_created_at=datetime(2026, 3, 5, 9, 5, 0, tzinfo=timezone.utc), - reply_to_message_id=root.message_id, - ) - out = generate_markdown_content(export_channel, "2026-03", [root, reply]) - assert "Reply to:" in out - assert "Original:" in out - - -@pytest.mark.django_db -def test_generate_markdown_reply_split_by_day_other_month( - export_channel, export_author -): - root = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="x" * 90, - message_created_at=datetime(2026, 2, 28, 23, 0, 0, tzinfo=timezone.utc), - ) - reply = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="r", - message_created_at=datetime(2026, 3, 1, 1, 0, 0, tzinfo=timezone.utc), - reply_to_message_id=root.message_id, - ) - out = generate_markdown_content( - export_channel, "2026-03", [reply], date_str="2026-03-01", split_by_day=True - ) - assert "../2026-02/" in out or "2026-02" in out - - -@pytest.mark.django_db -def test_generate_markdown_reply_missing_parent_skipped(export_channel, export_author): - msg = DiscordMessage.objects.create( - 
message_id=_uid(), - channel=export_channel, - author=export_author, - content="orphan", - message_created_at=datetime(2026, 3, 1, 12, 0, 0, tzinfo=timezone.utc), - reply_to_message_id=999999999999, - ) - out = generate_markdown_content(export_channel, "2026-03", [msg]) - assert "orphan" in out - assert "Reply to:" not in out - - -@pytest.mark.django_db -def test_generate_markdown_code_fence_and_unclosed(export_channel, export_author): - msg = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="```\nunclosed", - message_created_at=datetime(2026, 3, 2, 8, 0, 0, tzinfo=timezone.utc), - ) - out = generate_markdown_content(export_channel, "2026-03", [msg]) - assert "" in out - assert out.count("```") >= 2 - - -@pytest.mark.django_db -def test_generate_markdown_attachments(export_channel, export_author): - msg = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="see file", - message_created_at=datetime(2026, 3, 3, 8, 0, 0, tzinfo=timezone.utc), - has_attachments=True, - attachment_urls=["https://cdn.discord.com/a/b/file.png?ex=1"], - ) - out = generate_markdown_content(export_channel, "2026-03", [msg]) - assert "Attachments:" in out - assert "file.png" in out - - -@pytest.mark.django_db -def test_generate_markdown_reply_same_month_aggregate_link( - export_channel, export_author -): - """Reply in same calendar month as year_month uses in-page anchor (export.py ~168).""" - root = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="root", - message_created_at=datetime(2026, 3, 1, 8, 0, 0, tzinfo=timezone.utc), - ) - reply = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="later", - message_created_at=datetime(2026, 3, 15, 9, 0, 0, tzinfo=timezone.utc), - reply_to_message_id=root.message_id, - ) - out = 
generate_markdown_content(export_channel, "2026-03", [root, reply]) - assert "Reply to:" in out - assert "](" in out and "#" in out - - -@pytest.mark.django_db -def test_generate_markdown_reply_microsecond_reply_time(export_channel, export_author): - root = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="root", - message_created_at=datetime(2026, 3, 10, 1, 0, 0, tzinfo=timezone.utc), - ) - reply = DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="r", - message_created_at=datetime(2026, 3, 10, 1, 0, 0, 123000, tzinfo=timezone.utc), - reply_to_message_id=root.message_id, - ) - out = generate_markdown_content(export_channel, "2026-03", [root, reply]) - assert "Reply to:" in out - - -@pytest.mark.django_db -def test_export_channel_to_markdown_writes_per_day_files( - export_channel, export_author, tmp_path -): - repo = tmp_path / "ctx" - repo.mkdir() - t0 = datetime(2026, 4, 10, 12, 0, 0, tzinfo=timezone.utc) - DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="day a", - message_created_at=t0, - ) - DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="day b", - message_created_at=t0 + timedelta(days=1), - ) - paths = export_channel_to_markdown(export_channel, "2026-04", repo) - assert paths is not None and len(paths) == 2 - assert all(p.suffix == ".md" for p in paths) - - -@pytest.mark.django_db -def test_export_channel_to_markdown_empty_month_returns_none(export_channel, tmp_path): - assert export_channel_to_markdown(export_channel, "2026-05", tmp_path) is None - - -@pytest.mark.django_db -def test_export_all_active_channels_collects_paths( - export_server, export_channel, export_author, tmp_path, monkeypatch -): - now = django_timezone.now() - DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - 
author=export_author, - content="recent", - message_created_at=now - timedelta(days=1), - ) - ym = now.strftime("%Y-%m") - fake_paths = [tmp_path / f"{ym}-stub.md"] - - def fake_export(ch, year_month, out_dir): - if ch.pk == export_channel.pk and year_month == ym: - return fake_paths - return None - - monkeypatch.setattr( - "discord_activity_tracker.sync.export.export_channel_to_markdown", - fake_export, - ) - paths = export_all_active_channels( - tmp_path, export_server, months_back=1, active_days=30 - ) - assert paths == fake_paths - - -@pytest.mark.django_db -def test_export_all_active_channels_continues_on_channel_error( - export_server, export_channel, export_author, tmp_path, monkeypatch -): - now = django_timezone.now() - DiscordMessage.objects.create( - message_id=_uid(), - channel=export_channel, - author=export_author, - content="recent", - message_created_at=now - timedelta(hours=1), - ) - - def boom(*_a, **_k): - raise RuntimeError("export failed") - - monkeypatch.setattr( - "discord_activity_tracker.sync.export.export_channel_to_markdown", - boom, - ) - paths = export_all_active_channels(tmp_path, export_server, months_back=1) - assert paths == [] - - -def test_commit_and_push_no_changes(tmp_path): - calls: list[list[str]] = [] - - def run_side_effect(cmd, **_kwargs): - calls.append(list(cmd)) - if "status" in cmd: - return MagicMock(returncode=0, stdout="", stderr="") - return MagicMock(returncode=0, stdout="", stderr="") - - with patch("discord_activity_tracker.sync.export.subprocess.run", run_side_effect): - assert commit_and_push_context_repo(tmp_path) is True - assert any("status" in c for c in calls) - - -def test_commit_and_push_full_flow(tmp_path): - seq = iter( - [ - MagicMock(returncode=0, stdout="", stderr=""), - MagicMock(returncode=0, stdout=" M file\n", stderr=""), - MagicMock(returncode=0, stdout="", stderr=""), - MagicMock(returncode=0, stdout="", stderr=""), - ] - ) - - def run_side_effect(cmd, **_kwargs): - return next(seq) - - with 
patch("discord_activity_tracker.sync.export.subprocess.run", run_side_effect): - assert commit_and_push_context_repo(tmp_path, "msg") is True - - -def test_commit_and_push_git_error(tmp_path): - import subprocess as sp - - def run_side_effect(cmd, **_kwargs): - raise sp.CalledProcessError(1, cmd, stderr="err") - - with patch("discord_activity_tracker.sync.export.subprocess.run", run_side_effect): - assert commit_and_push_context_repo(tmp_path) is False - - -def test_commit_and_push_generic_exception(tmp_path): - with patch( - "discord_activity_tracker.sync.export.subprocess.run", - side_effect=OSError("boom"), - ): - assert commit_and_push_context_repo(tmp_path) is False - - -@pytest.mark.django_db -def test_export_and_push_no_files_returns_false(export_server, tmp_path): - with patch( - "discord_activity_tracker.sync.export.export_all_active_channels", - return_value=[], - ): - assert export_and_push(tmp_path, export_server) is False - - -@pytest.mark.django_db -def test_export_and_push_files_no_auto_commit(export_server, tmp_path): - with patch( - "discord_activity_tracker.sync.export.export_all_active_channels", - return_value=[tmp_path / "a.md"], - ): - assert export_and_push(tmp_path, export_server, auto_commit=False) is True - - -@pytest.mark.django_db -def test_export_and_push_auto_commit(export_server, tmp_path): - with ( - patch( - "discord_activity_tracker.sync.export.export_all_active_channels", - return_value=[tmp_path / "a.md"], - ), - patch( - "discord_activity_tracker.sync.export.commit_and_push_context_repo", - return_value=True, - ) as m, - ): - assert export_and_push(tmp_path, export_server, auto_commit=True) is True - m.assert_called_once() diff --git a/discord_activity_tracker/tests/test_exporter_window.py b/discord_activity_tracker/tests/test_exporter_window.py deleted file mode 100644 index 2c1c4428..00000000 --- a/discord_activity_tracker/tests/test_exporter_window.py +++ /dev/null @@ -1,219 +0,0 @@ -"""Tests for sync/exporter_window.py.""" - 
-from __future__ import annotations - -import uuid -from datetime import datetime, timezone - -import pytest - -from cppa_user_tracker.models import DiscordProfile -from discord_activity_tracker.models import ( - DiscordChannel, - DiscordMessage, - DiscordServer, -) -from discord_activity_tracker.sync.exporter_window import ( - incremental_export_after, - iter_channel_export_days, - latest_message_created_at_for_channel, - latest_message_created_at_for_guild, - resolve_channel_export_after, - utc_day_start, -) - - -def _uid() -> int: - return uuid.uuid4().int % (2**50) - - -@pytest.mark.django_db -def test_latest_message_empty_db(): - assert latest_message_created_at_for_guild(999001, channel_ids=None) is None - - -@pytest.mark.django_db -def test_latest_message_ignores_deleted(): - srv = DiscordServer.objects.create(server_id=_uid(), server_name="G", icon_url="") - ch = DiscordChannel.objects.create( - server=srv, channel_id=_uid(), channel_name="c", channel_type="text" - ) - author = DiscordProfile.objects.create( - discord_user_id=_uid(), - username="u", - display_name="U", - avatar_url="", - is_bot=False, - ) - t = datetime(2026, 1, 1, tzinfo=timezone.utc) - DiscordMessage.objects.create( - message_id=_uid(), - channel=ch, - author=author, - content="deleted", - message_created_at=t, - is_deleted=True, - ) - assert latest_message_created_at_for_guild(srv.server_id, channel_ids=None) is None - - -@pytest.mark.django_db -def test_latest_message_respects_channel_allowlist(): - srv = DiscordServer.objects.create(server_id=_uid(), server_name="G", icon_url="") - ch1 = DiscordChannel.objects.create( - server=srv, channel_id=_uid(), channel_name="a", channel_type="text" - ) - ch2 = DiscordChannel.objects.create( - server=srv, channel_id=_uid(), channel_name="b", channel_type="text" - ) - author = DiscordProfile.objects.create( - discord_user_id=_uid(), - username="u", - display_name="U", - avatar_url="", - is_bot=False, - ) - t1 = datetime(2026, 2, 1, 
tzinfo=timezone.utc) - t2 = datetime(2026, 3, 1, tzinfo=timezone.utc) - DiscordMessage.objects.create( - message_id=_uid(), - channel=ch1, - author=author, - content="older", - message_created_at=t1, - ) - DiscordMessage.objects.create( - message_id=_uid(), - channel=ch2, - author=author, - content="newer", - message_created_at=t2, - ) - latest = latest_message_created_at_for_guild( - srv.server_id, channel_ids=[ch1.channel_id] - ) - assert latest == t1 - - -def test_utc_day_start_normalizes_to_midnight(): - dt = datetime(2026, 6, 2, 22, 30, 45, tzinfo=timezone.utc) - assert utc_day_start(dt) == datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc) - - -def test_iter_channel_export_days_empty_after_is_today_only(): - now = datetime(2026, 6, 11, 15, 0, 0, tzinfo=timezone.utc) - days = iter_channel_export_days(after=None, before=None, now=now) - assert len(days) == 1 - assert days[0][0] == "2026-06-11" - assert days[0][1] == datetime(2026, 6, 11, 0, 0, 0, tzinfo=timezone.utc) - assert days[0][2] == now - - -def test_iter_channel_export_days_spans_multiple_days(): - after = datetime(2026, 6, 1, 10, 0, 0, tzinfo=timezone.utc) - before = datetime(2026, 6, 3, 8, 0, 0, tzinfo=timezone.utc) - days = iter_channel_export_days(after=after, before=before, now=before) - assert [d[0] for d in days] == ["2026-06-01", "2026-06-02", "2026-06-03"] - assert days[0][1] == after - assert days[-1][2] == before - - -@pytest.mark.django_db -def test_latest_message_per_channel(): - srv = DiscordServer.objects.create(server_id=_uid(), server_name="G", icon_url="") - ch1 = DiscordChannel.objects.create( - server=srv, channel_id=_uid(), channel_name="a", channel_type="text" - ) - ch2 = DiscordChannel.objects.create( - server=srv, channel_id=_uid(), channel_name="b", channel_type="text" - ) - author = DiscordProfile.objects.create( - discord_user_id=_uid(), - username="u", - display_name="U", - avatar_url="", - is_bot=False, - ) - t1 = datetime(2026, 4, 1, 15, 0, 0, tzinfo=timezone.utc) - t2 = 
datetime(2026, 5, 1, 9, 0, 0, tzinfo=timezone.utc) - DiscordMessage.objects.create( - message_id=_uid(), - channel=ch1, - author=author, - content="a", - message_created_at=t1, - ) - DiscordMessage.objects.create( - message_id=_uid(), - channel=ch2, - author=author, - content="b", - message_created_at=t2, - ) - assert latest_message_created_at_for_channel(srv.server_id, ch1.channel_id) == t1 - assert latest_message_created_at_for_channel(srv.server_id, ch2.channel_id) == t2 - - -def test_incremental_export_after_floors_to_utc_day_start(): - latest = datetime(2026, 6, 10, 22, 45, 0, tzinfo=timezone.utc) - assert incremental_export_after(latest) == datetime( - 2026, 6, 10, 0, 0, 0, tzinfo=timezone.utc - ) - - -@pytest.mark.django_db -def test_resolve_channel_export_after_uses_day_start_without_explicit_since(): - srv = DiscordServer.objects.create(server_id=_uid(), server_name="G", icon_url="") - ch = DiscordChannel.objects.create( - server=srv, channel_id=_uid(), channel_name="c", channel_type="text" - ) - author = DiscordProfile.objects.create( - discord_user_id=_uid(), - username="u", - display_name="U", - avatar_url="", - is_bot=False, - ) - latest = datetime(2026, 6, 10, 18, 0, 0, tzinfo=timezone.utc) - DiscordMessage.objects.create( - message_id=_uid(), - channel=ch, - author=author, - content="msg", - message_created_at=latest, - ) - resolved = resolve_channel_export_after( - srv.server_id, - ch.channel_id, - explicit_after=None, - ) - assert resolved == datetime(2026, 6, 10, 0, 0, 0, tzinfo=timezone.utc) - - -def test_resolve_channel_export_after_honors_explicit_since(): - explicit = datetime(2026, 1, 1, tzinfo=timezone.utc) - assert resolve_channel_export_after(1, 2, explicit_after=explicit) == explicit - - -def test_iter_channel_export_days_naive_before_treated_as_utc(): - after = datetime(2026, 6, 1, 10, 0, 0, tzinfo=timezone.utc) - before_naive = datetime(2026, 6, 3, 8, 0, 0) - before_aware = datetime(2026, 6, 3, 8, 0, 0, tzinfo=timezone.utc) - naive_days 
= iter_channel_export_days( - after=after, before=before_naive, now=before_aware - ) - aware_days = iter_channel_export_days( - after=after, before=before_aware, now=before_aware - ) - assert naive_days == aware_days - assert [d[0] for d in naive_days] == ["2026-06-01", "2026-06-02", "2026-06-03"] - - -def test_iter_channel_export_days_clips_partial_last_day(): - after = datetime(2026, 6, 2, 22, 0, 0, tzinfo=timezone.utc) - now = datetime(2026, 6, 2, 23, 30, 0, tzinfo=timezone.utc) - days = iter_channel_export_days(after=after, before=None, now=now) - assert len(days) == 1 - assert days[0][0] == "2026-06-02" - assert days[0][1] == after - assert days[0][2] == now diff --git a/discord_activity_tracker/tests/test_extract_discord_tokens_command.py b/discord_activity_tracker/tests/test_extract_discord_tokens_command.py deleted file mode 100644 index 8253c719..00000000 --- a/discord_activity_tracker/tests/test_extract_discord_tokens_command.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Tests for extract_discord_tokens management command.""" - -from io import StringIO -from unittest.mock import patch - -import pytest -from django.core.management import call_command -from django.core.management.base import CommandError - - -@patch( - "discord_activity_tracker.management.commands.extract_discord_tokens.extract_and_save_discord_internal_tokens", - return_value="discord-tok", -) -@patch( - "discord_activity_tracker.management.commands.extract_discord_tokens._resolve_discord_chrome_profile_root", -) -def test_extract_discord_tokens_command_success( - mock_resolve_profile, mock_extract_and_save, tmp_path -): - profile = tmp_path / "chrome_profile" - profile.mkdir() - mock_resolve_profile.return_value = profile - out = StringIO() - call_command("extract_discord_tokens", stdout=out) - mock_extract_and_save.assert_called_once() - assert "Saved Discord session credentials" in out.getvalue() - - -@patch( - 
"discord_activity_tracker.management.commands.extract_discord_tokens.extract_and_save_discord_internal_tokens", - return_value=None, -) -@patch( - "discord_activity_tracker.management.commands.extract_discord_tokens._resolve_discord_chrome_profile_root", -) -def test_extract_discord_tokens_command_failure( - mock_resolve_profile, mock_extract_and_save, tmp_path -): - profile = tmp_path / "chrome_profile" - profile.mkdir() - mock_resolve_profile.return_value = profile - with pytest.raises(CommandError, match="Failed to load session credentials"): - call_command("extract_discord_tokens") - mock_extract_and_save.assert_called_once() - - -def test_extract_discord_tokens_command_missing_profile(settings, tmp_path): - settings.DISCORD_CHROME_PROFILE_PATH = str(tmp_path / "missing_profile") - with pytest.raises(CommandError, match="Session storage not found"): - call_command("extract_discord_tokens") diff --git a/discord_activity_tracker/tests/test_failure_classification.py b/discord_activity_tracker/tests/test_failure_classification.py deleted file mode 100644 index 359470ea..00000000 --- a/discord_activity_tracker/tests/test_failure_classification.py +++ /dev/null @@ -1,67 +0,0 @@ -"""Discord-related failure classification for CollectorFailureCategory.""" - -from __future__ import annotations - -from core.errors import CollectorFailureCategory, classify_failure - - -def _make_discord_http_exception(status: int) -> Exception: - cls = type("HTTPException", (Exception,), {}) - cls.__module__ = "discord.errors" - exc = cls() - exc.status = status - return exc - - -def test_discord_http_429_is_rate_limit(): - exc = _make_discord_http_exception(429) - assert classify_failure(exc) is CollectorFailureCategory.RATE_LIMIT - - -def test_discord_http_401_is_auth(): - exc = _make_discord_http_exception(401) - assert classify_failure(exc) is CollectorFailureCategory.AUTH - - -def test_discord_http_403_is_auth(): - exc = _make_discord_http_exception(403) - assert classify_failure(exc) 
is CollectorFailureCategory.AUTH - - -def test_discord_forbidden_subclass_403_is_auth(): - cls = type("Forbidden", (Exception,), {}) - cls.__module__ = "discord.errors" - exc = cls() - exc.status = 403 - assert classify_failure(exc) is CollectorFailureCategory.AUTH - - -def test_discord_not_found_subclass_404_is_unknown(): - cls = type("NotFound", (Exception,), {}) - cls.__module__ = "discord.errors" - exc = cls() - exc.status = 404 - assert classify_failure(exc) is CollectorFailureCategory.UNKNOWN - - -def test_discord_http_502_is_network(): - exc = _make_discord_http_exception(502) - assert classify_failure(exc) is CollectorFailureCategory.NETWORK - - -def test_discord_http_404_is_unknown(): - exc = _make_discord_http_exception(404) - assert classify_failure(exc) is CollectorFailureCategory.UNKNOWN - - -def test_discord_http_no_status_defaults_network(): - cls = type("HTTPException", (Exception,), {}) - cls.__module__ = "discord.errors" - exc = cls() - assert classify_failure(exc) is CollectorFailureCategory.NETWORK - - -def test_discord_login_failure_is_auth(): - cls = type("LoginFailure", (Exception,), {}) - cls.__module__ = "discord.errors" - assert classify_failure(cls()) is CollectorFailureCategory.AUTH diff --git a/discord_activity_tracker/tests/test_messages_more.py b/discord_activity_tracker/tests/test_messages_more.py deleted file mode 100644 index e39b8f51..00000000 --- a/discord_activity_tracker/tests/test_messages_more.py +++ /dev/null @@ -1,165 +0,0 @@ -"""Extra coverage for discord_activity_tracker.sync.messages branches.""" - -import asyncio -from datetime import timedelta -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest -from django.utils import timezone as django_timezone - -from cppa_user_tracker.models import DiscordProfile -from discord_activity_tracker.models import DiscordChannel, DiscordServer -from discord_activity_tracker.services import create_or_update_discord_message -from discord_activity_tracker.sync import 
messages as messages_mod -from discord_activity_tracker.sync.messages import ( - _sync_all_channels_async, - sync_channel_messages_async, -) - - -def _uniq(): - import uuid - - return uuid.uuid4().int % (2**50) - - -@pytest.mark.django_db -def test_sync_all_channels_async_continues_on_channel_failure(): - async def boom(*_a, **_kw): - raise RuntimeError("sync failed") - - server = DiscordServer.objects.create( - server_id=_uniq(), server_name="S", icon_url="" - ) - ch = DiscordChannel.objects.create( - server=server, - channel_id=_uniq(), - channel_name="x", - channel_type="text", - ) - - async def main(): - client = MagicMock() - with patch( - "discord_activity_tracker.sync.messages.sync_channel_messages_async", - new=boom, - ): - await _sync_all_channels_async(client, [ch], server.server_id) - - asyncio.run(main()) - - -@pytest.mark.django_db -def test_sync_channel_messages_async_since_date_branch(): - gid = _uniq() - cid = _uniq() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - since = django_timezone.now() - timedelta(days=1) - - async def main(): - client = MagicMock() - client.get_channel = AsyncMock(return_value=None) - await sync_channel_messages_async(client, channel, gid, since_date=since) - - asyncio.run(main()) - - -@pytest.mark.django_db(transaction=True) -def test_sync_channel_messages_async_uses_latest_stored_message_for_after(): - gid = _uniq() - cid = _uniq() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - author = DiscordProfile.objects.create( - discord_user_id=_uniq(), - username="u", - display_name="", - avatar_url="", - is_bot=False, - ) - stored_ts = django_timezone.now() - timedelta(hours=3) - 
create_or_update_discord_message( - _uniq(), channel, author, "x", message_created_at=stored_ts - ) - - async def main(): - client = MagicMock() - dch = MagicMock() - client.get_channel = AsyncMock(return_value=dch) - client.fetch_messages_since = AsyncMock(return_value=[]) - await sync_channel_messages_async(client, channel, gid) - client.fetch_messages_since.assert_awaited_once() - assert client.fetch_messages_since.await_args.kwargs["after"] == stored_ts - - asyncio.run(main()) - - -@pytest.mark.django_db -def test_sync_channel_messages_async_default_window(monkeypatch): - gid = _uniq() - cid = _uniq() - fixed_now = django_timezone.now() - monkeypatch.setattr(django_timezone, "now", lambda: fixed_now) - - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - - async def main(): - client = MagicMock() - dch = MagicMock() - client.get_channel = AsyncMock(return_value=dch) - client.fetch_messages_since = AsyncMock(return_value=[]) - await sync_channel_messages_async(client, channel, gid) - client.fetch_messages_since.assert_awaited_once() - assert client.fetch_messages_since.await_args.kwargs[ - "after" - ] == fixed_now - timedelta(days=30) - - asyncio.run(main()) - - -@pytest.mark.django_db -def test_sync_channel_messages_async_process_batch_raises(): - gid = _uniq() - cid = _uniq() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - - async def main(): - client = MagicMock() - dch = MagicMock() - dch.name = "c" - client.get_channel = AsyncMock(return_value=dch) - client.fetch_messages_since = AsyncMock(return_value=[{"id": 1}]) - - with patch.object( - messages_mod, - "_process_messages_in_batches", - new_callable=AsyncMock, - 
side_effect=ValueError("bad batch"), - ): - with pytest.raises(ValueError, match="bad batch"): - await sync_channel_messages_async(client, channel, gid, full_sync=True) - - asyncio.run(main()) diff --git a/discord_activity_tracker/tests/test_models_str.py b/discord_activity_tracker/tests/test_models_str.py deleted file mode 100644 index e88d828d..00000000 --- a/discord_activity_tracker/tests/test_models_str.py +++ /dev/null @@ -1,112 +0,0 @@ -"""Coverage for Discord model __str__ methods.""" - -import pytest - -from cppa_user_tracker.models import DiscordProfile -from discord_activity_tracker.models import ( - DiscordChannel, - DiscordMessage, - DiscordReaction, - DiscordServer, -) - - -@pytest.mark.django_db -def test_discord_server_str(): - s = DiscordServer.objects.create(server_id=1, server_name="Guild", icon_url="") - assert "Guild" in str(s) and "1" in str(s) - - -@pytest.mark.django_db -def test_discord_channel_str(): - s = DiscordServer.objects.create(server_id=2, server_name="G", icon_url="") - ch = DiscordChannel.objects.create( - server=s, - channel_id=3, - channel_name="help", - channel_type="text", - ) - assert "#help" == str(ch) - - -@pytest.mark.django_db -def test_discord_channel_str_with_category(): - """Category fields are stored correctly; channel str is still the name.""" - s = DiscordServer.objects.create(server_id=20, server_name="G", icon_url="") - ch = DiscordChannel.objects.create( - server=s, - channel_id=30, - channel_name="c-cpp-discussion", - channel_type="GuildTextChat", - category_id=855220194887335977, - category_name="Discussion", - ) - assert "#c-cpp-discussion" == str(ch) - ch.refresh_from_db() - assert ch.category_id == 855220194887335977 - assert ch.category_name == "Discussion" - - -@pytest.mark.django_db -def test_discord_message_and_reaction_str(): - s = DiscordServer.objects.create(server_id=4, server_name="G", icon_url="") - ch = DiscordChannel.objects.create( - server=s, - channel_id=5, - channel_name="c", - 
channel_type="text", - ) - author = DiscordProfile.objects.create( - discord_user_id=99, - username="alice", - display_name="Alice", - avatar_url="", - is_bot=False, - ) - from django.utils import timezone as dj_tz - - msg = DiscordMessage.objects.create( - message_id=100, - channel=ch, - author=author, - content="hello world example text", - message_created_at=dj_tz.now(), - ) - assert "alice" in str(msg) - assert "hello" in str(msg) - - r = DiscordReaction.objects.create(message=msg, emoji="👍", count=2) - assert "👍" in str(r) and "2" in str(r) - - -@pytest.mark.django_db -def test_discord_message_type_and_is_pinned_fields(): - """New message_type and is_pinned fields persist correctly.""" - from django.utils import timezone as dj_tz - - s = DiscordServer.objects.create(server_id=40, server_name="G", icon_url="") - ch = DiscordChannel.objects.create( - server=s, - channel_id=50, - channel_name="announcements", - channel_type="GuildTextChat", - ) - author = DiscordProfile.objects.create( - discord_user_id=990, - username="bob", - display_name="Bob", - avatar_url="", - is_bot=False, - ) - msg = DiscordMessage.objects.create( - message_id=200, - channel=ch, - author=author, - content="pinned reply", - message_type="Reply", - is_pinned=True, - message_created_at=dj_tz.now(), - ) - msg.refresh_from_db() - assert msg.message_type == "Reply" - assert msg.is_pinned is True diff --git a/discord_activity_tracker/tests/test_pinecone_runner_coverage.py b/discord_activity_tracker/tests/test_pinecone_runner_coverage.py deleted file mode 100644 index b13a4c52..00000000 --- a/discord_activity_tracker/tests/test_pinecone_runner_coverage.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Coverage for pinecone_runner.""" - -from __future__ import annotations - -from unittest.mock import patch - -import pytest - -from discord_activity_tracker.pinecone_runner import task_discord_pinecone_sync - - -def test_task_discord_pinecone_sync_dry_run(): - task_discord_pinecone_sync(dry_run=True) - - 
-@pytest.mark.django_db -def test_task_discord_pinecone_sync_skips_when_app_type_empty(monkeypatch, settings): - monkeypatch.setattr(settings, "PINECONE_DISCORD_APP_TYPE", "") - monkeypatch.setattr(settings, "PINECONE_DISCORD_NAMESPACE", "ns") - task_discord_pinecone_sync(dry_run=False) - - -@pytest.mark.django_db -def test_task_discord_pinecone_sync_skips_when_namespace_empty(monkeypatch, settings): - monkeypatch.setattr(settings, "PINECONE_DISCORD_APP_TYPE", "app") - monkeypatch.setattr(settings, "PINECONE_DISCORD_NAMESPACE", " ") - task_discord_pinecone_sync(dry_run=False) - - -@pytest.mark.django_db -def test_task_discord_pinecone_sync_calls_run_command(monkeypatch, settings): - monkeypatch.setattr(settings, "PINECONE_DISCORD_APP_TYPE", "discord") - monkeypatch.setattr(settings, "PINECONE_DISCORD_NAMESPACE", "ns") - with patch("discord_activity_tracker.pinecone_runner.call_command") as cc: - task_discord_pinecone_sync(dry_run=False) - cc.assert_called_once() - - -@pytest.mark.django_db -def test_task_discord_pinecone_sync_swallows_call_command_error(monkeypatch, settings): - monkeypatch.setattr(settings, "PINECONE_DISCORD_APP_TYPE", "discord") - monkeypatch.setattr(settings, "PINECONE_DISCORD_NAMESPACE", "ns") - with patch( - "discord_activity_tracker.pinecone_runner.call_command", - side_effect=RuntimeError("no command"), - ): - task_discord_pinecone_sync(dry_run=False) diff --git a/discord_activity_tracker/tests/test_preprocessor.py b/discord_activity_tracker/tests/test_preprocessor.py deleted file mode 100644 index f8d84d07..00000000 --- a/discord_activity_tracker/tests/test_preprocessor.py +++ /dev/null @@ -1,374 +0,0 @@ -"""Unit tests for discord_activity_tracker.preprocessor.""" - -from datetime import datetime, timezone - -import pytest - -from core.utils.text_processing import clean_discord_text -from cppa_user_tracker.models import DiscordProfile -from discord_activity_tracker.models import ( - DiscordChannel, - DiscordMessage, - DiscordServer, -) 
-from discord_activity_tracker.preprocessor import ( - _build_reply_chains, - _chain_to_document, - _is_content_too_short, - _normalize_failed_ids, - _pinecone_channel_display_name, - preprocess_discord_for_pinecone, -) - -# Content that passes PINECONE_MIN_TEXT_LENGTH=50 (default in settings.py) -_L = "This is a sample Discord message with enough text for Pinecone indexing purposes." -_L2 = "Another Discord message long enough to pass the Pinecone minimum text length filter." -_L_REPLY = ( - "This is a reply message also long enough to pass the Pinecone minimum text length." -) -_L_RETRY = ( - "This failed message is long enough to be retried by the Pinecone sync pipeline." -) -_L_META = "This message has enough characters to pass the minimum text length check in Pinecone." - - -# --------------------------------------------------------------------------- -# Text helpers -# --------------------------------------------------------------------------- - - -def test_clean_discord_text_removes_user_mentions(): - assert clean_discord_text("discussion <@123456>") == "discussion" - - -def test_clean_discord_text_removes_role_mentions(): - assert "<@&" not in clean_discord_text("readership <@&9876>") - assert clean_discord_text("readership <@&9876>") == "readership" - - -def test_clean_discord_text_removes_channel_refs(): - assert "<#" not in clean_discord_text("see <#5555>") - assert clean_discord_text("see <#5555>") == "see" - - -def test_clean_discord_text_converts_custom_emoji(): - assert ":wave:" in clean_discord_text("<:wave:123456789>") - - -def test_clean_discord_text_preserves_plain_text(): - assert clean_discord_text("alpha beta gamma") == "alpha beta gamma" - - -def test_clean_discord_text_removes_greeting_after_mention(): - assert clean_discord_text("hi <@1>") == "" - - -def test_clean_discord_text_removes_thanks_keeps_substance(): - assert clean_discord_text("thanks <@9> everyone here") == "everyone here" - - -def test_is_content_too_short_below_threshold(): - 
assert _is_content_too_short("hi") is True - - -def test_is_content_too_short_at_or_above_threshold(): - assert _is_content_too_short(_L) is False - - -def test_normalize_failed_ids_deduplicates(): - result = _normalize_failed_ids(["1", "2", "1", "3"]) - assert result.count("1") == 1 - assert len(result) == 3 - - -def test_normalize_failed_ids_strips_whitespace(): - result = _normalize_failed_ids([" 1 ", "2"]) - assert "1" in result - - -def test_normalize_failed_ids_skips_empty(): - result = _normalize_failed_ids(["", None, "5"]) # type: ignore[list-item] - assert "" not in result - assert "5" in result - - -# --------------------------------------------------------------------------- -# DB fixtures -# --------------------------------------------------------------------------- - - -@pytest.fixture -def server(db): - import uuid - - return DiscordServer.objects.create( - server_id=uuid.uuid4().int % (2**50), - server_name="TestGuild", - icon_url="", - ) - - -@pytest.fixture -def channel(server): - import uuid - - return DiscordChannel.objects.create( - server=server, - channel_id=uuid.uuid4().int % (2**50), - channel_name="general", - channel_type="GuildTextChat", - ) - - -@pytest.fixture -def author(db): - import uuid - - return DiscordProfile.objects.create( - discord_user_id=uuid.uuid4().int % (2**50), - username="alice", - display_name="Alice", - avatar_url="", - is_bot=False, - ) - - -@pytest.mark.django_db -def test_pinecone_channel_display_name_with_category(server): - import uuid - - ch = DiscordChannel.objects.create( - server=server, - channel_id=uuid.uuid4().int % (2**50), - channel_name="cpp-discussion", - channel_type="GuildTextChat", - category_name="Together", - ) - assert _pinecone_channel_display_name(ch) == "Together - cpp-discussion" - - -@pytest.mark.django_db -def test_pinecone_channel_display_name_without_category(channel): - assert _pinecone_channel_display_name(channel) == "general" - - -@pytest.mark.django_db -def 
test_pinecone_channel_display_name_whitespace_category(server): - import uuid - - ch = DiscordChannel.objects.create( - server=server, - channel_id=uuid.uuid4().int % (2**50), - channel_name="x", - channel_type="GuildTextChat", - category_name=" ", - ) - assert _pinecone_channel_display_name(ch) == "x" - - -def _make_msg(channel, author, message_id, content, ts=None, reply_to=None): - if ts is None: - ts = datetime(2026, 1, 1, 12, 0, 0, tzinfo=timezone.utc) - return DiscordMessage.objects.create( - message_id=message_id, - channel=channel, - author=author, - content=content, - message_created_at=ts, - reply_to_message_id=reply_to, - ) - - -# --------------------------------------------------------------------------- -# _build_reply_chains -# --------------------------------------------------------------------------- - - -@pytest.mark.django_db -def test_build_reply_chains_standalone_messages(channel, author): - m1 = _make_msg(channel, author, 1001, _L) - m2 = _make_msg(channel, author, 1002, _L2) - chains = _build_reply_chains([m1, m2]) - assert len(chains) == 2 - assert all(len(c) == 1 for c in chains) - - -@pytest.mark.django_db -def test_build_reply_chains_groups_replies(channel, author): - root = _make_msg(channel, author, 2001, _L) - reply1 = _make_msg(channel, author, 2002, _L_REPLY, reply_to=2001) - reply2 = _make_msg(channel, author, 2003, _L2, reply_to=2001) - chains = _build_reply_chains([root, reply1, reply2]) - assert len(chains) == 1 - chain = chains[0] - assert chain[0].message_id == root.message_id - assert len(chain) == 3 - - -@pytest.mark.django_db -def test_build_reply_chains_orphan_reply(channel, author): - """Reply whose root is not in the batch becomes its own single-item chain.""" - orphan = _make_msg(channel, author, 3001, _L, reply_to=9999) - chains = _build_reply_chains([orphan]) - assert len(chains) == 1 - assert chains[0][0].message_id == orphan.message_id - - -# --------------------------------------------------------------------------- 
-# _chain_to_document -# --------------------------------------------------------------------------- - - -@pytest.mark.django_db -def test_chain_to_document_single_message(channel, author): - msg = _make_msg(channel, author, 4001, _L) - doc = _chain_to_document([msg]) - assert doc is not None - assert _L.lower() in doc["content"] - assert doc["content"].startswith('alice: "') - assert doc["content"].endswith('"') - meta = doc["metadata"] - assert meta["doc_id"] == str(msg.message_id) - assert meta["type"] == "discord" - assert meta["channel_name"] == channel.channel_name - assert meta["server_name"] == channel.server.server_name - assert meta["is_reply_chain"] is False - assert meta["source_ids"] == str(msg.message_id) - - -@pytest.mark.django_db -def test_chain_to_document_escapes_internal_double_quotes(channel, author): - body = 'Before "quoted" after and more text so we exceed fifty chars easily fine.' - assert len(body) >= 50 - msg = _make_msg(channel, author, 4004, body) - doc = _chain_to_document([msg]) - assert doc is not None - assert '\\"quoted\\"' in doc["content"] - - -@pytest.mark.django_db -def test_chain_to_document_reply_chain(channel, author): - root = _make_msg(channel, author, 5001, _L) - reply = _make_msg(channel, author, 5002, _L_REPLY) - doc = _chain_to_document([root, reply]) - assert doc is not None - assert doc["metadata"]["is_reply_chain"] is True - assert str(root.message_id) in doc["metadata"]["source_ids"] - assert str(reply.message_id) in doc["metadata"]["source_ids"] - assert "\n" in doc["content"] - assert _L.lower() in doc["content"] and _L_REPLY.lower() in doc["content"] - assert doc["content"].startswith("alice:") - - -@pytest.mark.django_db -def test_chain_to_document_empty_content_returns_none(channel, author): - msg = _make_msg(channel, author, 6001, "") - doc = _chain_to_document([msg]) - assert doc is None - - -@pytest.mark.django_db -def test_chain_to_document_too_short_returns_none(channel, author): - msg = 
_make_msg(channel, author, 6002, "hi") - doc = _chain_to_document([msg]) - assert doc is None - - -# --------------------------------------------------------------------------- -# preprocess_discord_for_pinecone integration tests -# --------------------------------------------------------------------------- - - -@pytest.mark.django_db -def test_first_sync_indexes_all_messages(channel, author): - _make_msg(channel, author, 7001, _L) - _make_msg(channel, author, 7002, _L2) - docs, is_chunked = preprocess_discord_for_pinecone( - failed_ids=[], final_sync_at=None - ) - assert is_chunked is False - doc_ids = {d["metadata"]["doc_id"] for d in docs} - assert "7001" in doc_ids - assert "7002" in doc_ids - - -@pytest.mark.django_db -def test_incremental_sync_only_new_messages(channel, author): - old_ts = datetime(2026, 1, 1, tzinfo=timezone.utc) - new_ts = datetime(2026, 6, 1, tzinfo=timezone.utc) - - _make_msg(channel, author, 8001, _L, ts=old_ts) - _make_msg(channel, author, 8002, _L2, ts=new_ts) - - # Force updated_at on old message to be before cutoff - DiscordMessage.objects.filter(message_id=8001).update(updated_at=old_ts) - - cutoff = datetime(2026, 3, 1, tzinfo=timezone.utc) - docs, _ = preprocess_discord_for_pinecone(failed_ids=[], final_sync_at=cutoff) - doc_ids = {d["metadata"]["doc_id"] for d in docs} - # 8001 updated_at was forced to old_ts, before cutoff → not included - assert "8001" not in doc_ids - - -@pytest.mark.django_db -def test_failed_ids_are_retried(channel, author): - _make_msg(channel, author, 9001, _L_RETRY) - # Simulate that a sync already ran (cutoff in future = no new messages) - cutoff = datetime(2099, 1, 1, tzinfo=timezone.utc) - docs, _ = preprocess_discord_for_pinecone(failed_ids=["9001"], final_sync_at=cutoff) - doc_ids = {d["metadata"]["doc_id"] for d in docs} - assert "9001" in doc_ids - - -@pytest.mark.django_db -def test_empty_db_returns_empty_list(): - DiscordMessage.objects.all().delete() - docs, _ = 
preprocess_discord_for_pinecone(failed_ids=[], final_sync_at=None) - assert docs == [] - - -@pytest.mark.django_db -def test_metadata_shape(channel, author): - _make_msg(channel, author, 10001, _L_META) - docs, _ = preprocess_discord_for_pinecone(failed_ids=[], final_sync_at=None) - doc = next((d for d in docs if d["metadata"]["doc_id"] == "10001"), None) - assert doc is not None - meta = doc["metadata"] - required_keys = { - "doc_id", - "type", - "channel_id", - "channel_name", - "server_id", - "server_name", - "author", - "timestamp", - "is_reply_chain", - "source_ids", - } - assert required_keys.issubset(meta.keys()) - assert meta["type"] == "discord" - assert meta["author"] == "alice" - assert meta["channel_id"] == str(channel.channel_id) - assert meta["server_id"] == str(channel.server.server_id) - assert meta["server_name"] == channel.server.server_name - assert isinstance(meta["timestamp"], int) - - -@pytest.mark.django_db -def test_metadata_channel_name_includes_category(server, author): - import uuid - - ch = DiscordChannel.objects.create( - server=server, - channel_id=uuid.uuid4().int % (2**50), - channel_name="cpp-discussion", - channel_type="GuildTextChat", - category_name="C & C++ Together", - ) - _make_msg(ch, author, 10002, _L_META) - docs, _ = preprocess_discord_for_pinecone(failed_ids=[], final_sync_at=None) - doc = next((d for d in docs if d["metadata"]["doc_id"] == "10002"), None) - assert doc is not None - assert doc["metadata"]["channel_name"] == "C & C++ Together - cpp-discussion" - assert doc["metadata"]["server_name"] == server.server_name diff --git a/discord_activity_tracker/tests/test_preprocessor_extra.py b/discord_activity_tracker/tests/test_preprocessor_extra.py deleted file mode 100644 index 54526c62..00000000 --- a/discord_activity_tracker/tests/test_preprocessor_extra.py +++ /dev/null @@ -1,159 +0,0 @@ -"""Extra coverage for preprocessor reply chains and edge paths.""" - -from __future__ import annotations - -import uuid -from 
datetime import timedelta -from unittest.mock import patch - -import pytest -from django.utils import timezone as django_timezone - -from cppa_user_tracker.models import DiscordProfile -from discord_activity_tracker.models import ( - DiscordChannel, - DiscordMessage, - DiscordServer, -) -from discord_activity_tracker.preprocessor import ( - _build_reply_chains, - _chain_to_document, - preprocess_discord_for_pinecone, -) - - -def _uid() -> int: - return uuid.uuid4().int % (2**50) - - -@pytest.mark.django_db -def test_build_reply_chains_skips_reply_having_parent_in_batch(): - srv = DiscordServer.objects.create(server_id=_uid(), server_name="G", icon_url="") - ch = DiscordChannel.objects.create( - server=srv, channel_id=_uid(), channel_name="c", channel_type="text" - ) - author = DiscordProfile.objects.create( - discord_user_id=_uid(), - username="u", - display_name="U", - avatar_url="", - is_bot=False, - ) - root = DiscordMessage.objects.create( - message_id=_uid(), - channel=ch, - author=author, - content="root msg here long enough for any downstream checks", - message_created_at=django_timezone.now(), - ) - reply = DiscordMessage.objects.create( - message_id=_uid(), - channel=ch, - author=author, - content="reply text here long enough for any downstream checks", - message_created_at=django_timezone.now(), - reply_to_message_id=root.message_id, - ) - chains = _build_reply_chains([root, reply]) - assert len(chains) == 1 - assert {m.message_id for m in chains[0]} == {root.message_id, reply.message_id} - - -@pytest.mark.django_db -def test_build_reply_chains_orphan_at_end(): - srv = DiscordServer.objects.create(server_id=_uid(), server_name="G", icon_url="") - ch = DiscordChannel.objects.create( - server=srv, channel_id=_uid(), channel_name="c", channel_type="text" - ) - author = DiscordProfile.objects.create( - discord_user_id=_uid(), - username="u", - display_name="U", - avatar_url="", - is_bot=False, - ) - orphan = DiscordMessage.objects.create( - message_id=_uid(), 
- channel=ch, - author=author, - content="orphan reply text here long enough for downstream checks", - message_created_at=django_timezone.now(), - reply_to_message_id=999999999999, - ) - chains = _build_reply_chains([orphan]) - assert len(chains) == 1 - assert chains[0] == [orphan] - - -@pytest.mark.django_db -def test_chain_to_document_long_content_returns_document(): - srv = DiscordServer.objects.create(server_id=_uid(), server_name="G", icon_url="") - ch = DiscordChannel.objects.create( - server=srv, channel_id=_uid(), channel_name="c", channel_type="text" - ) - author = DiscordProfile.objects.create( - discord_user_id=_uid(), - username="u", - display_name="U", - avatar_url="", - is_bot=False, - ) - root = DiscordMessage.objects.create( - message_id=_uid(), - channel=ch, - author=author, - content="x" * 80, - message_created_at=django_timezone.now(), - ) - assert _chain_to_document([root]) is not None - - -@pytest.mark.django_db -def test_preprocess_discord_duplicate_doc_ids_skipped_second(): - srv = DiscordServer.objects.create(server_id=_uid(), server_name="G", icon_url="") - ch = DiscordChannel.objects.create( - server=srv, channel_id=_uid(), channel_name="c", channel_type="text" - ) - author = DiscordProfile.objects.create( - discord_user_id=_uid(), - username="u", - display_name="U", - avatar_url="", - is_bot=False, - ) - DiscordMessage.objects.create( - message_id=_uid(), - channel=ch, - author=author, - content="y" * 80, - message_created_at=django_timezone.now(), - ) - DiscordMessage.objects.create( - message_id=_uid(), - channel=ch, - author=author, - content="z" * 80, - message_created_at=django_timezone.now(), - ) - - doc = { - "content": "a" * 80, - "metadata": {"doc_id": "same", "type": "discord"}, - } - - with patch( - "discord_activity_tracker.preprocessor._chain_to_document", - return_value=doc, - ): - docs, _ = preprocess_discord_for_pinecone([], None) - - assert len(docs) == 1 - - -@pytest.mark.django_db -def 
test_preprocess_discord_nothing_to_sync_logs(caplog): - caplog.set_level("INFO") - future = django_timezone.now() + timedelta(days=3650) - docs, _ = preprocess_discord_for_pinecone([], future) - assert docs == [] - assert "nothing to sync" in caplog.text.lower() diff --git a/discord_activity_tracker/tests/test_protocol_impl.py b/discord_activity_tracker/tests/test_protocol_impl.py deleted file mode 100644 index 00a1f4aa..00000000 --- a/discord_activity_tracker/tests/test_protocol_impl.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Tests for :mod:`discord_activity_tracker.protocol_impl` and chat_exporter bridge.""" - -from __future__ import annotations - -from core.activity_types import ActivityType, SourceSystem -from core.protocols import ActivityRecord, IncrementalState - -from discord_activity_tracker.protocol_impl import ( - DiscordActivityRecord, - DiscordIncrementalState, -) -from discord_activity_tracker.sync.chat_exporter import ( - exporter_message_to_activity_record, -) - - -def test_discord_incremental_state_from_after_date(): - st = DiscordIncrementalState.from_after_date( - after=None, last_message_id=100, channel_id=55 - ) - assert isinstance(st, IncrementalState) - assert st.extras["channel_id"] == 55 - - -def test_exporter_message_to_activity_record_matches_protocol(): - msg = { - "id": "12", - "timestamp": "2024-06-01T12:00:00.0000000+00:00", - "content": "hello world", - "type": "Default", - "author": {"id": "99", "name": "user1"}, - "attachments": [], - "reactions": [], - } - rec = exporter_message_to_activity_record(msg, server_id=1, channel_id=2) - assert isinstance(rec, ActivityRecord) - assert rec.external_id == "1:2:12" - assert "hello" in rec.summary - - -def test_discord_activity_record_from_converted_export_dict(): - converted = { - "id": 5, - "created_at": "2024-01-01T00:00:00.0000000Z", - "occurred_at": "2024-01-01T00:00:00.0000000Z", - "message_type": "Reply", - "content": "x", - "author": {"id": 7}, - "source_url": 
"https://discord.com/channels/1/2/5", - } - rec = DiscordActivityRecord.from_converted_export_dict( - converted, server_id=1, channel_id=2 - ) - assert str(rec.actor_external_id) == "7" - assert rec.activity_type == ActivityType.discord_message("Reply") - assert rec.source_system is SourceSystem.DISCORD diff --git a/discord_activity_tracker/tests/test_raw_archive.py b/discord_activity_tracker/tests/test_raw_archive.py deleted file mode 100644 index b58e33a9..00000000 --- a/discord_activity_tracker/tests/test_raw_archive.py +++ /dev/null @@ -1,95 +0,0 @@ -"""Tests for sync/raw_archive.py.""" - -from __future__ import annotations - -import json -from pathlib import Path - -from discord_activity_tracker.sync.raw_archive import ( - merge_exporter_json, - message_utc_date_str, -) - - -def _msg(mid: str, ts: str) -> dict: - return {"id": mid, "timestamp": ts, "content": f"msg-{mid}"} - - -def _envelope(*messages: dict) -> dict: - return { - "guild": {"id": "1", "name": "G"}, - "channel": {"id": "2", "name": "c"}, - "messages": list(messages), - } - - -def test_message_utc_date_str_parses_offset(): - assert message_utc_date_str(_msg("1", "2026-06-02T22:00:00+00:00")) == "2026-06-02" - - -def test_merge_exporter_json_creates_new_file(tmp_path: Path): - dest = tmp_path / "2026-06-02.json" - incoming = _envelope(_msg("100", "2026-06-02T10:00:00Z")) - count = merge_exporter_json(dest, incoming, day="2026-06-02") - assert count == 1 - data = json.loads(dest.read_text(encoding="utf-8")) - assert len(data["messages"]) == 1 - assert data["messages"][0]["id"] == "100" - - -def test_merge_exporter_json_appends_new_message_same_day(tmp_path: Path): - dest = tmp_path / "2026-06-02.json" - first = _envelope(_msg("100", "2026-06-02T10:00:00Z")) - merge_exporter_json(dest, first, day="2026-06-02") - second = _envelope(_msg("101", "2026-06-02T23:00:00Z")) - count = merge_exporter_json(dest, second, day="2026-06-02") - assert count == 2 - data = 
json.loads(dest.read_text(encoding="utf-8")) - ids = [m["id"] for m in data["messages"]] - assert ids == ["100", "101"] - - -def test_merge_exporter_json_updates_same_id(tmp_path: Path): - dest = tmp_path / "2026-06-02.json" - merge_exporter_json( - dest, - _envelope(_msg("100", "2026-06-02T10:00:00Z")), - day="2026-06-02", - ) - merge_exporter_json( - dest, - _envelope({**_msg("100", "2026-06-02T10:00:00Z"), "content": "edited"}), - day="2026-06-02", - ) - data = json.loads(dest.read_text(encoding="utf-8")) - assert len(data["messages"]) == 1 - assert data["messages"][0]["content"] == "edited" - - -def test_merge_exporter_json_filters_wrong_day(tmp_path: Path): - dest = tmp_path / "2026-06-02.json" - incoming = _envelope( - _msg("100", "2026-06-02T10:00:00Z"), - _msg("200", "2026-06-03T01:00:00Z"), - ) - count = merge_exporter_json(dest, incoming, day="2026-06-02") - assert count == 1 - data = json.loads(dest.read_text(encoding="utf-8")) - assert [m["id"] for m in data["messages"]] == ["100"] - - -def test_merge_exporter_json_refreshes_date_range(tmp_path: Path): - dest = tmp_path / "2026-06-02.json" - merge_exporter_json( - dest, - _envelope( - _msg("100", "2026-06-02T10:00:00Z"), - _msg("101", "2026-06-02T23:00:00Z"), - ), - day="2026-06-02", - ) - data = json.loads(dest.read_text(encoding="utf-8")) - assert "dateRange" in data - assert data["dateRange"]["after"].startswith("2026-06-02") - assert data["dateRange"]["before"].startswith("2026-06-02") - assert "exportedAt" in data diff --git a/discord_activity_tracker/tests/test_run_command_coverage.py b/discord_activity_tracker/tests/test_run_command_coverage.py deleted file mode 100644 index f6f5ec71..00000000 --- a/discord_activity_tracker/tests/test_run_command_coverage.py +++ /dev/null @@ -1,298 +0,0 @@ -"""Coverage for run_discord_activity_tracker command _handle_core and helpers.""" - -from __future__ import annotations - -import asyncio -from io import StringIO -from unittest.mock import MagicMock, patch - 
-import pytest -from django.core.management.base import CommandError - -from discord_activity_tracker.management.commands.run_discord_activity_tracker import ( - Command, - DiscordActivityCollector, - _resolve_exporter_date_bounds, - task_preprocess_workspace, -) - - -def _cmd_collector(**opts): - defaults = { - "dry_run": False, - "skip_discord_sync": False, - "skip_markdown_export": False, - "skip_remote_push": False, - "skip_pinecone": False, - "channels": "", - "since": None, - "until": None, - "task": None, - } - defaults.update(opts) - cmd = Command() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.WARNING = lambda x: x - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options=defaults) - return cmd, collector - - -@pytest.mark.django_db -def test_resolve_bounds_since_after_until_resets(monkeypatch, caplog): - """since > until logs warning and falls back so bounds are recomputed.""" - caplog.set_level("WARNING") - after, before, _per_ch = _resolve_exporter_date_bounds( - {"since": "2026-06-10", "until": "2026-06-01"}, - guild_snowflake=1, - channel_ids=[], - ) - assert before is None - assert after is None - assert "invalid date range" in caplog.text - - -def test_resolve_bounds_bad_since_raises_command_error(): - with pytest.raises(CommandError): - _resolve_exporter_date_bounds( - {"since": "not-a-date", "until": None}, - guild_snowflake=1, - channel_ids=[], - ) - - -@pytest.mark.django_db -def test_handle_core_dry_run_all_branches(monkeypatch, settings): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9001) - _, collector = _cmd_collector( - dry_run=True, - skip_discord_sync=False, - skip_markdown_export=False, - skip_remote_push=False, - skip_pinecone=False, - since="2026-01-01", - until="2026-01-31", - ) - with patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_preprocess_workspace" - ) as tp: - 
collector.cmd._handle_core(collector.options, collector) - tp.assert_called_once_with(dry_run=True) - out = collector.stdout.getvalue() - assert "DRY RUN" in out - assert "Lower bound" in out - assert "Upper bound" in out - - -@pytest.mark.django_db -def test_handle_core_dry_run_skip_sync_only(monkeypatch, settings): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9002) - _, collector = _cmd_collector( - dry_run=True, - skip_discord_sync=True, - skip_markdown_export=True, - skip_remote_push=True, - skip_pinecone=True, - ) - with patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_preprocess_workspace" - ): - collector.cmd._handle_core(collector.options, collector) - out = collector.stdout.getvalue() - assert "today" in out.lower() - - -@pytest.mark.django_db -def test_handle_core_task_sync_skips_markdown(monkeypatch, settings): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9003) - _, collector = _cmd_collector(dry_run=False, task="sync") - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_discord_sync" - ) as ts, - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_markdown_export_and_push" - ) as tm, - ): - collector.cmd._handle_core(collector.options, collector) - ts.assert_called_once() - tm.assert_called_once() - assert collector.options["skip_markdown_export"] is True - assert collector.options["skip_remote_push"] is True - - -@pytest.mark.django_db -def test_handle_core_task_export_skips_sync(monkeypatch, settings): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9004) - _, collector = _cmd_collector(dry_run=False, task="export") - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_discord_sync" 
- ) as ts, - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_markdown_export_and_push" - ) as tm, - ): - collector.cmd._handle_core(collector.options, collector) - ts.assert_called_once() - tm.assert_called_once() - assert collector.options["skip_discord_sync"] is True - assert collector.options["skip_pinecone"] is True - - -@pytest.mark.django_db -def test_handle_core_non_dry_calls_sync_and_markdown(monkeypatch, settings): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9005) - _, collector = _cmd_collector(dry_run=False) - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_discord_sync" - ) as ts, - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_markdown_export_and_push" - ) as tm, - ): - collector.cmd._handle_core(collector.options, collector) - ts.assert_called_once() - tm.assert_called_once() - - -@pytest.mark.django_db -def test_handle_core_skip_pinecone_logs(monkeypatch, settings, caplog): - caplog.set_level("INFO") - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9006) - _, collector = _cmd_collector(dry_run=False, skip_pinecone=True) - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_discord_sync" - ) as ts, - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_markdown_export_and_push" - ) as tm, - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_discord_pinecone_sync" - ) as tp, - ): - collector.cmd._handle_core(collector.options, collector) - collector.sync_pinecone() - ts.assert_called_once() - tm.assert_called_once() - tp.assert_not_called() - assert "skipping Pinecone (--skip-pinecone)" in caplog.text - - -@pytest.mark.django_db -def 
test_handle_core_propagates_task_failure(monkeypatch, settings): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9007) - _, collector = _cmd_collector(dry_run=False) - with patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_discord_sync", - side_effect=RuntimeError("fail"), - ): - with pytest.raises(RuntimeError, match="fail"): - collector.cmd._handle_core(collector.options, collector) - - -def test_get_collector_normalizes_skip_pinecone_none(): - cmd = Command() - cmd.stdout = StringIO() - cmd.style = MagicMock() - c = cmd.get_collector( - dry_run=False, - skip_discord_sync=False, - skip_markdown_export=False, - skip_remote_push=False, - skip_pinecone=None, - ) - assert c.options.get("skip_pinecone") is False - - -@pytest.mark.django_db -def test_task_preprocess_workspace_dry_run(tmp_path, settings): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - task_preprocess_workspace(dry_run=True) - - -def test_resolve_bounds_since_naive_becomes_utc(): - after, before, _per_ch = _resolve_exporter_date_bounds( - {"since": "2026-04-01T00:00:00", "until": None}, - guild_snowflake=1, - channel_ids=[], - ) - assert after is not None - assert after.tzinfo is not None - assert before is None - - -@pytest.mark.django_db -def test_handle_core_task_all_runs_both_phases(monkeypatch, settings): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9008) - _, collector = _cmd_collector(dry_run=False, task="all") - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_discord_sync" - ) as ts, - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_markdown_export_and_push" - ) as tm, - ): - collector.cmd._handle_core(collector.options, collector) - ts.assert_called_once() - tm.assert_called_once() - - 
-@pytest.mark.django_db -def test_handle_core_wraps_discord_exporter_error(monkeypatch, settings): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9009) - _, collector = _cmd_collector(dry_run=False) - with patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.task_discord_sync", - side_effect=CommandError("DiscordChatExporter failed: cli missing"), - ): - with pytest.raises(CommandError, match="DiscordChatExporter"): - collector.cmd._handle_core(collector.options, collector) - - -@pytest.mark.django_db -def test_persist_channel_inserts_messages(monkeypatch, settings, tmp_path): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - gid, cid = 330011, 330022 - guild_info = {"id": gid, "name": "Guild", "iconUrl": ""} - channel_info = { - "id": cid, - "name": "chan", - "type": "GuildTextChat", - "topic": "", - "category": "", - "categoryId": None, - } - messages = [ - { - "id": str(10**12 + 7), - "type": "Default", - "isPinned": False, - "timestamp": "2026-01-15T12:00:00Z", - "content": "hello world example text long enough for validation", - "author": {"id": "1082347485026070548", "name": "user"}, - "attachments": [], - "reactions": [], - } - ] - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - collector = DiscordActivityCollector(cmd=cmd, options={}) - count = asyncio.run(collector._persist_channel(guild_info, channel_info, messages)) - assert count >= 1 diff --git a/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py b/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py deleted file mode 100644 index 73eb8a3f..00000000 --- a/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py +++ /dev/null @@ -1,300 +0,0 @@ -"""Tests for run_discord_activity_tracker management command.""" - -from io import StringIO -from unittest.mock import MagicMock, patch 
- -import pytest -from django.conf import settings -from django.core.management import call_command -from django.core.management.base import CommandError - -from discord_activity_tracker.management.commands.run_discord_activity_tracker import ( - Command, - DiscordActivityCollector, - _parse_channel_ids, - _resolve_exporter_date_bounds, -) -from discord_activity_tracker.staging_schema import StagingValidationError - - -def _cmd_and_collector(**opts): - defaults = { - "dry_run": False, - "skip_discord_sync": False, - "skip_markdown_export": False, - "skip_remote_push": False, - "skip_pinecone": False, - "channels": "", - "since": None, - "until": None, - "task": None, - } - defaults.update(opts) - cmd = Command() - cmd.stdout = StringIO() - cmd.style = MagicMock() - collector = DiscordActivityCollector(cmd=cmd, options=defaults) - collector.style.SUCCESS = lambda x: x - collector.style.WARNING = lambda x: x - return cmd, collector - - -def test_staging_validation_error_subclasses_value_error(): - assert issubclass(StagingValidationError, ValueError) - - -# --------------------------------------------------------------------------- -# _parse_channel_ids -# --------------------------------------------------------------------------- - - -def test_parse_channel_ids_basic(): - assert _parse_channel_ids("1,2,3") == [1, 2, 3] - - -def test_parse_channel_ids_strips_whitespace(): - assert _parse_channel_ids(" 10 , 20 ") == [10, 20] - - -def test_parse_channel_ids_skips_non_digits(): - assert _parse_channel_ids("abc,123,!@#") == [123] - - -def test_parse_channel_ids_empty_string(): - assert _parse_channel_ids("") == [] - - -# --------------------------------------------------------------------------- -# _resolve_exporter_date_bounds -# --------------------------------------------------------------------------- - - -@pytest.mark.django_db -def test_resolve_bounds_no_since_empty_db_after_is_none(settings): - settings.USE_TZ = True - after, before, per_ch = 
_resolve_exporter_date_bounds( - {"since": None, "until": None}, - guild_snowflake=888001, - channel_ids=[], - ) - assert before is None - assert after is None - assert per_ch is True - - -def test_resolve_bounds_since_until_only(): - after, before, per_ch = _resolve_exporter_date_bounds( - { - "since": "2026-01-01", - "until": "2026-01-31", - }, - guild_snowflake=1, - channel_ids=[], - ) - assert after is not None and before is not None - assert per_ch is False - - -def test_resolve_bounds_explicit_since_no_until(): - after, before, per_ch = _resolve_exporter_date_bounds( - {"since": "2026-05-01", "until": None}, - guild_snowflake=1, - channel_ids=[], - ) - assert after is not None - assert before is None - assert per_ch is False - - -@pytest.mark.django_db -def test_resolve_bounds_no_since_uses_latest_db_message(): - from datetime import datetime, timezone - - from cppa_user_tracker.models import DiscordProfile - from discord_activity_tracker.models import ( - DiscordChannel, - DiscordMessage, - DiscordServer, - ) - - server = DiscordServer.objects.create(server_id=700, server_name="S", icon_url="") - ch = DiscordChannel.objects.create( - server=server, - channel_id=701, - channel_name="c", - channel_type="text", - ) - author = DiscordProfile.objects.create( - discord_user_id=701001, - username="u", - display_name="U", - avatar_url="", - is_bot=False, - ) - msg_time = datetime(2026, 5, 6, 10, 0, 0, tzinfo=timezone.utc) - DiscordMessage.objects.create( - message_id=701002, - channel=ch, - author=author, - content="hello world example text long enough", - message_created_at=msg_time, - ) - - after, before, per_ch = _resolve_exporter_date_bounds( - {"since": None, "until": None}, - guild_snowflake=700, - channel_ids=[701], - ) - assert before is None - assert after == msg_time - assert per_ch is True - - -# --------------------------------------------------------------------------- -# Channel allowlist from settings vs --channels override -# 
--------------------------------------------------------------------------- - - -def test_collector_uses_settings_channel_ids(monkeypatch): - monkeypatch.setattr(settings, "DISCORD_CHANNEL_IDS", [111, 222]) - _, c = _cmd_and_collector() - assert c.channel_ids == [111, 222] - - -def test_collector_channels_arg_overrides_settings(monkeypatch): - monkeypatch.setattr(settings, "DISCORD_CHANNEL_IDS", [111, 222]) - _, c = _cmd_and_collector(channels="333,444") - assert c.channel_ids == [333, 444] - - -def test_collector_empty_channels_arg_falls_back_to_settings(monkeypatch): - monkeypatch.setattr(settings, "DISCORD_CHANNEL_IDS", [555]) - _, c = _cmd_and_collector(channels="") - assert c.channel_ids == [555] - - -# --------------------------------------------------------------------------- -# Missing credentials validation -# --------------------------------------------------------------------------- - - -@pytest.mark.django_db -def test_handle_core_raises_when_user_token_missing(monkeypatch): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "") - monkeypatch.setattr(settings, "ALLOW_INTERNAL_DISCORD_TOKENS", False) - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9999) - cmd, collector = _cmd_and_collector() - with pytest.raises(CommandError, match="Discord credentials not configured"): - cmd._handle_core(collector.options, collector=collector) - - -@pytest.mark.django_db -def test_handle_core_raises_when_server_id_missing(monkeypatch): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", None) - cmd, collector = _cmd_and_collector() - with pytest.raises(CommandError, match="DISCORD_SERVER_ID"): - cmd._handle_core(collector.options, collector=collector) - - -# --------------------------------------------------------------------------- -# Dry-run mode -# --------------------------------------------------------------------------- - - -@pytest.mark.django_db -def test_dry_run_prints_config(monkeypatch): - 
monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9999) - monkeypatch.setattr(settings, "DISCORD_CHANNEL_IDS", [1, 2, 3]) - out = StringIO() - call_command( - "run_discord_activity_tracker", - dry_run=True, - stdout=out, - verbosity=0, - ) - assert "DRY RUN" in out.getvalue() - - -# --------------------------------------------------------------------------- -# sync_pinecone skipped with --skip-pinecone / --ignore-pinecone -# --------------------------------------------------------------------------- - - -@pytest.mark.django_db -def test_sync_pinecone_skipped_when_skip_flag(monkeypatch): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9999) - _, c = _cmd_and_collector(skip_pinecone=True, dry_run=True) - c.sync_pinecone() - - -@pytest.mark.django_db -def test_sync_pinecone_skipped_when_dry_run(monkeypatch): - _, c = _cmd_and_collector(dry_run=True) - c.sync_pinecone() - - -@pytest.mark.django_db -def test_sync_pinecone_calls_run_cppa_pinecone_sync(monkeypatch): - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 9999) - monkeypatch.setattr(settings, "PINECONE_DISCORD_APP_TYPE", "discord") - monkeypatch.setattr(settings, "PINECONE_DISCORD_NAMESPACE", "discord-messages") - _, c = _cmd_and_collector(skip_pinecone=False, dry_run=False) - with patch( - "discord_activity_tracker.pinecone_runner.call_command", - ) as cc: - c.sync_pinecone() - cc.assert_called_once() - assert cc.call_args[0][0] == "run_cppa_pinecone_sync" - - -@pytest.mark.django_db -def test_sync_pinecone_skipped_when_app_type_empty(monkeypatch): - monkeypatch.setattr(settings, "PINECONE_DISCORD_APP_TYPE", "") - monkeypatch.setattr(settings, "PINECONE_DISCORD_NAMESPACE", "ns") - _, c = _cmd_and_collector(skip_pinecone=False, dry_run=False) - with patch("discord_activity_tracker.pinecone_runner.call_command") as cc: - 
c.sync_pinecone() - cc.assert_not_called() - - -# --------------------------------------------------------------------------- -# DISCORD_SERVER_ID is already int from settings -# --------------------------------------------------------------------------- - - -@pytest.mark.django_db -def test_server_id_is_already_int_in_settings(monkeypatch): - """Large snowflake as int for DISCORD_SERVER_ID must not break _handle_core.""" - monkeypatch.setattr(settings, "DISCORD_USER_TOKEN", "tok") - monkeypatch.setattr(settings, "DISCORD_SERVER_ID", 331718482485837825) - cmd, collector = _cmd_and_collector(dry_run=True) - cmd._handle_core(collector.options, collector=collector) - - -# --------------------------------------------------------------------------- -# command get_collector wiring -# --------------------------------------------------------------------------- - - -def test_get_collector_returns_discord_activity_collector(): - cmd = Command() - cmd.stdout = StringIO() - cmd.style = MagicMock() - collector = cmd.get_collector( - dry_run=True, - channels="999", - skip_pinecone=True, - skip_discord_sync=False, - skip_markdown_export=False, - skip_remote_push=False, - since=None, - until=None, - task=None, - ) - assert isinstance(collector, DiscordActivityCollector) - assert collector.options["dry_run"] is True - assert collector.channel_ids == [999] diff --git a/discord_activity_tracker/tests/test_services_core.py b/discord_activity_tracker/tests/test_services_core.py deleted file mode 100644 index bf403eaf..00000000 --- a/discord_activity_tracker/tests/test_services_core.py +++ /dev/null @@ -1,304 +0,0 @@ -"""Tests for non-bulk discord_activity_tracker.services helpers.""" - -import uuid -from datetime import datetime, timedelta, timezone - -import pytest -from django.utils import timezone as django_timezone - -from cppa_user_tracker.models import DiscordProfile -from discord_activity_tracker.models import DiscordChannel, DiscordServer -from discord_activity_tracker.services 
import ( - add_or_update_reaction, - create_or_update_discord_message, - get_active_channels, - get_channel_latest_message_at, - get_or_create_discord_channel, - get_or_create_discord_server, - mark_message_deleted, -) - - -def _uniq_id() -> int: - return uuid.uuid4().int % (2**50) - - -@pytest.fixture -def server(db): - return DiscordServer.objects.create( - server_id=_uniq_id(), server_name="Guild", icon_url="" - ) - - -@pytest.fixture -def channel(server): - return DiscordChannel.objects.create( - server=server, - channel_id=_uniq_id(), - channel_name="general", - channel_type="text", - topic="", - position=0, - ) - - -@pytest.fixture -def author(db): - return DiscordProfile.objects.create( - discord_user_id=_uniq_id(), - username="writer", - display_name="Writer", - avatar_url="", - is_bot=False, - ) - - -@pytest.mark.django_db -def test_get_or_create_discord_server_create_and_update(): - s1, created = get_or_create_discord_server(777, "Old", icon_url="") - assert created is True - s2, created2 = get_or_create_discord_server(777, "NewName", icon_url="http://i") - assert created2 is False - s2.refresh_from_db() - assert s2.server_name == "NewName" - assert s2.icon_url == "http://i" - - -@pytest.mark.django_db -def test_get_or_create_discord_channel_updates(channel, server): - ch, created = get_or_create_discord_channel( - server, channel.channel_id, "general", "text", topic="", position=0 - ) - assert created is False - ch2, created2 = get_or_create_discord_channel( - server, channel.channel_id, "general-renamed", "forum", topic="t", position=1 - ) - assert created2 is False - ch2.refresh_from_db() - assert ch2.channel_name == "general-renamed" - assert ch2.channel_type == "forum" - - -@pytest.mark.django_db -def test_get_or_create_discord_channel_category_fields(server): - """category_id and category_name are persisted on create and update.""" - ch, created = get_or_create_discord_channel( - server=server, - channel_id=_uniq_id(), - 
channel_name="c-cpp-discussion", - channel_type="GuildTextChat", - category_id=855220194887335977, - category_name="Discussion", - ) - assert created is True - ch.refresh_from_db() - assert ch.category_id == 855220194887335977 - assert ch.category_name == "Discussion" - - # Update category fields - ch2, created2 = get_or_create_discord_channel( - server=server, - channel_id=ch.channel_id, - channel_name="c-cpp-discussion", - channel_type="GuildTextChat", - category_id=999000111222333444, - category_name="NewCategory", - ) - assert created2 is False - ch2.refresh_from_db() - assert ch2.category_id == 999000111222333444 - assert ch2.category_name == "NewCategory" - - -@pytest.mark.django_db -def test_create_or_update_discord_message(channel, author): - mid = _uniq_id() - ts = datetime(2026, 4, 1, 12, 0, 0, tzinfo=timezone.utc) - msg, created = create_or_update_discord_message( - mid, - channel, - author, - "hello", - message_created_at=ts, - attachment_urls=["http://a"], - ) - assert created is True - assert msg.has_attachments is True - msg2, created2 = create_or_update_discord_message( - mid, - channel, - author, - "updated", - message_created_at=ts, - ) - assert created2 is False - msg2.refresh_from_db() - assert msg2.content == "updated" - - -@pytest.mark.django_db -def test_create_or_update_discord_message_type_and_pinned(channel, author): - """message_type and is_pinned are persisted on create and update.""" - mid = _uniq_id() - ts = datetime(2026, 4, 1, tzinfo=timezone.utc) - msg, created = create_or_update_discord_message( - mid, - channel, - author, - "pinned reply", - message_created_at=ts, - message_type="Reply", - is_pinned=True, - ) - assert created is True - msg.refresh_from_db() - assert msg.message_type == "Reply" - assert msg.is_pinned is True - - # Update: unpin and change type - msg2, _ = create_or_update_discord_message( - mid, - channel, - author, - "updated", - message_created_at=ts, - message_type="Default", - is_pinned=False, - ) - 
msg2.refresh_from_db() - assert msg2.message_type == "Default" - assert msg2.is_pinned is False - - -@pytest.mark.django_db -def test_mark_message_deleted(channel, author): - ts = datetime(2026, 4, 1, tzinfo=timezone.utc) - msg, _ = create_or_update_discord_message( - _uniq_id(), channel, author, "x", message_created_at=ts - ) - deleted_at = datetime(2026, 4, 2, tzinfo=timezone.utc) - mark_message_deleted(msg, deleted_at=deleted_at) - msg.refresh_from_db() - assert msg.is_deleted is True - assert msg.deleted_at == deleted_at - - -@pytest.mark.django_db -def test_add_or_update_reaction(channel, author): - ts = datetime(2026, 4, 1, tzinfo=timezone.utc) - msg, _ = create_or_update_discord_message( - _uniq_id(), channel, author, "react", message_created_at=ts - ) - r1, c1 = add_or_update_reaction(msg, "👍", 1) - assert c1 is True - r2, c2 = add_or_update_reaction(msg, "👍", 5) - assert c2 is False - r2.refresh_from_db() - assert r2.count == 5 - - -@pytest.mark.django_db -def test_get_channel_latest_message_at(channel, author): - assert get_channel_latest_message_at(channel) is None - ts = django_timezone.now() - create_or_update_discord_message( - _uniq_id(), channel, author, "x", message_created_at=ts - ) - assert get_channel_latest_message_at(channel) == ts - - -@pytest.mark.django_db -def test_get_channel_latest_message_at_ignores_deleted(channel, author): - t_old = django_timezone.now() - timedelta(hours=1) - t_new = django_timezone.now() - create_or_update_discord_message( - _uniq_id(), channel, author, "old", message_created_at=t_old - ) - msg_new, _ = create_or_update_discord_message( - _uniq_id(), channel, author, "new", message_created_at=t_new - ) - assert get_channel_latest_message_at(channel) == t_new - mark_message_deleted(msg_new) - assert get_channel_latest_message_at(channel) == t_old - - -@pytest.mark.django_db -def test_get_active_channels_filters_by_days(channel, server, author): - create_or_update_discord_message( - _uniq_id(), - channel, - author, - 
"recent", - message_created_at=django_timezone.now(), - ) - stale = DiscordChannel.objects.create( - server=server, - channel_id=_uniq_id(), - channel_name="quiet", - channel_type="text", - topic="", - position=1, - ) - create_or_update_discord_message( - _uniq_id(), - stale, - author, - "old", - message_created_at=django_timezone.now() - timedelta(days=60), - ) - active = get_active_channels(server, days=30) - ids = {c.channel_id for c in active} - assert channel.channel_id in ids - assert stale.channel_id not in ids - - -@pytest.mark.django_db -def test_get_active_channels_allowlist_filter(server, author): - """channel_ids allowlist pre-filters the queryset.""" - now = django_timezone.now() - ch1 = DiscordChannel.objects.create( - server=server, - channel_id=_uniq_id(), - channel_name="allowed", - channel_type="text", - ) - ch2 = DiscordChannel.objects.create( - server=server, - channel_id=_uniq_id(), - channel_name="blocked", - channel_type="text", - ) - for ch in (ch1, ch2): - create_or_update_discord_message( - _uniq_id(), ch, author, "x", message_created_at=now - ) - result = get_active_channels(server, days=30, channel_ids=[ch1.channel_id]) - ids = {c.channel_id for c in result} - assert ch1.channel_id in ids - assert ch2.channel_id not in ids - - -@pytest.mark.django_db -def test_get_active_channels_empty_allowlist_returns_all(server, author): - """Empty channel_ids means no filter — all active channels returned.""" - now = django_timezone.now() - ch1 = DiscordChannel.objects.create( - server=server, - channel_id=_uniq_id(), - channel_name="a", - channel_type="text", - ) - ch2 = DiscordChannel.objects.create( - server=server, - channel_id=_uniq_id(), - channel_name="b", - channel_type="text", - ) - for ch in (ch1, ch2): - create_or_update_discord_message( - _uniq_id(), ch, author, "x", message_created_at=now - ) - result = get_active_channels(server, days=30, channel_ids=None) - ids = {c.channel_id for c in result} - assert ch1.channel_id in ids - assert 
ch2.channel_id in ids diff --git a/discord_activity_tracker/tests/test_services_extras.py b/discord_activity_tracker/tests/test_services_extras.py deleted file mode 100644 index 4a8f205f..00000000 --- a/discord_activity_tracker/tests/test_services_extras.py +++ /dev/null @@ -1,122 +0,0 @@ -"""Coverage for small services.py branches.""" - -from datetime import datetime, timezone - -import pytest -from django.utils import timezone as django_timezone - -from cppa_user_tracker.models import DiscordProfile -from discord_activity_tracker.models import DiscordChannel, DiscordServer -from discord_activity_tracker.services import ( - bulk_process_message_batch, - bulk_upsert_discord_messages, - bulk_upsert_discord_reactions, - bulk_upsert_discord_users, - mark_message_deleted, -) - - -def _uniq(): - import uuid - - return uuid.uuid4().int % (2**50) - - -@pytest.fixture -def channel(db): - s = DiscordServer.objects.create(server_id=_uniq(), server_name="S", icon_url="") - return DiscordChannel.objects.create( - server=s, - channel_id=_uniq(), - channel_name="c", - channel_type="text", - ) - - -@pytest.mark.django_db -def test_mark_message_deleted_default_timestamp(channel): - author = DiscordProfile.objects.create( - discord_user_id=_uniq(), - username="u", - display_name="", - avatar_url="", - is_bot=False, - ) - ts = datetime(2026, 5, 1, 12, 0, 0, tzinfo=timezone.utc) - from discord_activity_tracker.services import create_or_update_discord_message - - msg, _ = create_or_update_discord_message( - _uniq(), channel, author, "x", message_created_at=ts - ) - before = django_timezone.now() - mark_message_deleted(msg) - msg.refresh_from_db() - assert msg.is_deleted is True - assert msg.deleted_at is not None - assert msg.deleted_at >= before - - -@pytest.mark.django_db -def test_bulk_upsert_skips_message_without_author(channel): - """Covers bulk_upsert_discord_messages warning path when author missing.""" - now = datetime(2026, 2, 17, 12, 0, 0, tzinfo=timezone.utc) - out = 
bulk_upsert_discord_messages( - [ - { - "message_id": _uniq(), - "author": {"user_id": _uniq()}, - "content": "orphan", - "message_created_at": now, - "attachment_urls": [], - } - ], - channel, - {}, - ) - assert out == {} - - -@pytest.mark.django_db -def test_bulk_upsert_reactions_skips_unknown_message(): - bulk_upsert_discord_reactions( - [{"discord_message_id": _uniq(), "emoji": "\U0001f44d", "count": 1}], - {}, - ) - - -@pytest.mark.django_db -def test_bulk_process_empty_returns_zero(channel): - assert bulk_process_message_batch([], channel) == 0 - - -def test_bulk_upsert_reactions_empty(): - bulk_upsert_discord_reactions([], {}) - - -@pytest.mark.django_db -def test_bulk_upsert_users_updates_existing_profile_fields(): - uid = _uniq() - DiscordProfile.objects.create( - discord_user_id=uid, - type="discord", - username="old_name", - display_name="Old", - avatar_url="", - is_bot=False, - ) - bulk_upsert_discord_users( - [ - { - "user_id": uid, - "username": "new_name", - "display_name": "New", - "avatar_url": "http://avatar.example/x.png", - "is_bot": True, - } - ] - ) - p = DiscordProfile.objects.get(discord_user_id=uid) - assert p.username == "new_name" - assert p.display_name == "New" - assert "avatar.example" in p.avatar_url - assert p.is_bot is True diff --git a/discord_activity_tracker/tests/test_settings_channel_filter.py b/discord_activity_tracker/tests/test_settings_channel_filter.py deleted file mode 100644 index 11a29733..00000000 --- a/discord_activity_tracker/tests/test_settings_channel_filter.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Tests for DISCORD_CHANNEL_IDS and DISCORD_SERVER_ID parsing in config/settings.py. - -These tests simulate the parsing logic defined in settings.py by re-running the -same code with various env var values, rather than reloading Django settings -(which is not possible at test time). 
-""" - -from __future__ import annotations - - -def _parse_channel_ids(raw: str) -> list[int]: - """Mirror of the parsing logic in config/settings.py.""" - raw = (raw or "").strip() - return [int(c.strip()) for c in raw.split(",") if c.strip().isdigit()] - - -def _parse_server_id(raw: str) -> "int | None": - """Mirror of the parsing logic in config/settings.py.""" - raw = (raw or "").strip() - return int(raw) if raw.isdigit() else None - - -# --------------------------------------------------------------------------- -# DISCORD_CHANNEL_IDS -# --------------------------------------------------------------------------- - - -def test_channel_ids_comma_separated(): - result = _parse_channel_ids("851121440425639956,123456789012345678") - assert result == [851121440425639956, 123456789012345678] - - -def test_channel_ids_single_value(): - assert _parse_channel_ids("9999") == [9999] - - -def test_channel_ids_empty_string(): - assert _parse_channel_ids("") == [] - - -def test_channel_ids_whitespace_only(): - assert _parse_channel_ids(" ") == [] - - -def test_channel_ids_non_digit_values_skipped(): - result = _parse_channel_ids("valid123,abc,!@#,456") - assert result == [456] - - -def test_channel_ids_mixed_valid_and_invalid(): - result = _parse_channel_ids("100,abc,200,,300") - assert result == [100, 200, 300] - - -def test_channel_ids_strips_whitespace_around_each(): - result = _parse_channel_ids(" 100 , 200 ") - assert result == [100, 200] - - -# --------------------------------------------------------------------------- -# DISCORD_SERVER_ID -# --------------------------------------------------------------------------- - - -def test_server_id_valid_number(): - assert _parse_server_id("331718482485837825") == 331718482485837825 - - -def test_server_id_empty_string(): - assert _parse_server_id("") is None - - -def test_server_id_non_numeric(): - assert _parse_server_id("my-guild") is None - - -def test_server_id_whitespace(): - assert _parse_server_id(" ") is None diff 
--git a/discord_activity_tracker/tests/test_staging_schema.py b/discord_activity_tracker/tests/test_staging_schema.py deleted file mode 100644 index 5c847950..00000000 --- a/discord_activity_tracker/tests/test_staging_schema.py +++ /dev/null @@ -1,119 +0,0 @@ -"""Tests for discord_activity_tracker.staging_schema validation.""" - -import pytest - -from discord_activity_tracker.staging_schema import ( - StagingValidationError, - validate_envelope, - validate_normalized_message, -) -from discord_activity_tracker.sync.chat_exporter import convert_exporter_message_to_dict - - -def _minimal_exporter_message(): - return { - "id": "1399663560723923005", - "type": "Default", - "isPinned": False, - "timestamp": "2026-01-01T12:00:00Z", - "content": "hello world example text long enough", - "author": {"id": "1082347485026070548", "name": "user"}, - "attachments": [], - "reactions": [], - } - - -def test_validate_normalized_well_formed_message(): - raw = _minimal_exporter_message() - converted = convert_exporter_message_to_dict( - raw, server_id=900, channel_id=851121440425639956 - ) - model = validate_normalized_message(converted, source="test") - assert model.id == 1399663560723923005 - assert model.source_url.startswith("https://discord.com/channels/") - assert model.actor_id == "1082347485026070548" - assert model.occurred_at.endswith("Z") - - -def test_validate_normalized_well_formed_reactions(): - raw = { - "id": "1", - "timestamp": "2026-01-01T00:00:00Z", - "content": "x", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [ - {"emoji": {"id": None, "name": "thumbsup", "isAnimated": False}, "count": 2} - ], - } - converted = convert_exporter_message_to_dict(raw, server_id=1, channel_id=2) - model = validate_normalized_message(converted) - assert len(model.reactions) == 1 - assert model.reactions[0].emoji == "thumbsup" - assert model.reactions[0].count == 2 - - -def test_validate_normalized_malformed_rejects_with_staging_validation_error(): - bad = 
{ - "id": 1, - "content": "", - "created_at": "", - "edited_at": None, - "message_type": "Default", - "is_pinned": False, - "author": { - "id": 0, - "username": "x", - "global_name": "", - "avatar_url": "", - "bot": False, - }, - "attachments": [], - "reactions": [], - "reference": None, - } - with pytest.raises( - StagingValidationError, match="Invalid normalized Discord message" - ) as excinfo: - validate_normalized_message(bad, source="unit") - assert "pydantic" not in type(excinfo.value).__name__.lower() - err = excinfo.value - assert err.__cause__ is not None - - -def test_validate_normalized_rejects_created_at_without_z_suffix(): - raw = _minimal_exporter_message() - bad = convert_exporter_message_to_dict(raw, server_id=1, channel_id=2) - bad["created_at"] = "2026-01-01T00:00:00+00:00" - with pytest.raises( - StagingValidationError, match="Invalid normalized Discord message" - ): - validate_normalized_message(bad, source="unit") - - -def test_validate_envelope_rejects_non_list_messages(): - with pytest.raises(StagingValidationError, match="Invalid Discord export envelope"): - validate_envelope( - { - "guild": {"id": "1", "name": "G"}, - "channel": {"id": "2", "name": "C"}, - "messages": "nope", - }, - source="x.json", - ) - - -def test_validate_envelope_messages_none_becomes_empty_list(): - env = validate_envelope( - { - "guild": {"id": "1", "name": "G"}, - "channel": {"id": "2", "name": "C"}, - "messages": None, - }, - source="empty.json", - ) - assert env.messages == [] - guild = env.guild.model_dump(by_alias=True) - channel = env.channel.model_dump(by_alias=True) - assert guild.get("id") == "1" - assert channel.get("name") == "C" diff --git a/discord_activity_tracker/tests/test_staging_schema_extra.py b/discord_activity_tracker/tests/test_staging_schema_extra.py deleted file mode 100644 index 566b699a..00000000 --- a/discord_activity_tracker/tests/test_staging_schema_extra.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Extra coverage for staging_schema.""" - -from 
__future__ import annotations - -import pytest - -from discord_activity_tracker.staging_schema import ( - build_staging_json_schema_bundle, - validate_normalized_message, - write_staging_json_schema, -) - - -def test_build_staging_json_schema_bundle_has_models(): - bundle = build_staging_json_schema_bundle() - assert bundle["title"] == "discord_staging_v1" - assert "discord_chat_exporter_envelope" in bundle - assert "normalized_discord_message" in bundle - - -def test_write_staging_json_schema_writes_file(tmp_path): - p = tmp_path / "out.json" - out = write_staging_json_schema(p) - assert out == p - assert p.read_text(encoding="utf-8").startswith("{") - - -@pytest.mark.django_db -def test_validate_normalized_blank_edited_at_becomes_none(): - from discord_activity_tracker.sync.chat_exporter import ( - convert_exporter_message_to_dict, - ) - - raw = { - "id": "1", - "timestamp": "2026-01-01T00:00:00Z", - "timestampEdited": " ", - "content": "hello world example text long enough", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [], - } - d = convert_exporter_message_to_dict(raw, server_id=1, channel_id=2) - m = validate_normalized_message(d, source="t") - assert m.edited_at is None diff --git a/discord_activity_tracker/tests/test_sync_chat_exporter.py b/discord_activity_tracker/tests/test_sync_chat_exporter.py deleted file mode 100644 index 93dea194..00000000 --- a/discord_activity_tracker/tests/test_sync_chat_exporter.py +++ /dev/null @@ -1,777 +0,0 @@ -"""Tests for discord_activity_tracker.sync.chat_exporter.""" - -import json -from datetime import datetime, timezone -from io import StringIO -from unittest.mock import MagicMock, patch - -import pytest - -from discord_activity_tracker.sync.chat_exporter import ( - ChannelDayExport, - DiscordChatExporterError, - _sorted_discord_export_json_paths, - filter_discord_export_json_paths, - _get_cli_path, - convert_exporter_message_to_dict, - export_and_parse_guild, - export_guild_to_json, - 
parse_channels_command_stdout, - parse_exported_json, - validate_discord_chat_exporter_cli_architecture, -) - - -def test_filter_discord_export_json_paths_drops_dot_underscore(tmp_path): - real = tmp_path / "Together.json" - sidecar = tmp_path / "._Together.json" - real.touch() - sidecar.touch() - assert filter_discord_export_json_paths([real, sidecar]) == [real] - - -def test_sorted_discord_export_json_paths_skips_appledouble_sidecars(tmp_path): - d = tmp_path / "staging" - d.mkdir() - (d / "Together.json").write_text("{}", encoding="utf-8") - (d / "._Together.json").write_bytes(b"\xb0not utf8") - assert _sorted_discord_export_json_paths(d) == [d / "Together.json"] - - -def test_get_cli_path_defaults_to_workspace_script_on_windows(tmp_path, settings): - settings.DISCORD_CHAT_EXPORTER_CLI = None - with ( - patch( - "discord_activity_tracker.sync.chat_exporter.get_workspace_root", - return_value=tmp_path, - ), - patch("sys.platform", "win32"), - ): - assert _get_cli_path() == tmp_path / "script" / "DiscordChatExporter.Cli.exe" - - -def test_get_cli_path_defaults_to_workspace_script_on_macos(tmp_path, settings): - settings.DISCORD_CHAT_EXPORTER_CLI = None - with ( - patch( - "discord_activity_tracker.sync.chat_exporter.get_workspace_root", - return_value=tmp_path, - ), - patch("sys.platform", "darwin"), - ): - assert _get_cli_path() == tmp_path / "script" / "DiscordChatExporter.Cli" - - -def test_get_cli_path_respects_discord_chat_exporter_cli_env(settings, tmp_path): - custom = tmp_path / "my-cli.exe" - custom.write_text("fake", encoding="utf-8") - settings.DISCORD_CHAT_EXPORTER_CLI = str(custom) - assert _get_cli_path() == custom.resolve() - - -def test_cli_missing_error_includes_releases_url(tmp_path): - with patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=tmp_path / "missing.exe", - ): - with pytest.raises( - DiscordChatExporterError, match="Tyrrrz/DiscordChatExporter" - ): - export_guild_to_json("tok", 1, tmp_path / "out") - 
- -def test_export_guild_cli_missing_raises(tmp_path): - with patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=tmp_path / "missing.exe", - ): - with pytest.raises(DiscordChatExporterError, match="CLI not found"): - export_guild_to_json("tok", 1, tmp_path / "out") - - -def test_export_guild_success(tmp_path): - cli = tmp_path / "DiscordChatExporter.Cli.exe" - cli.write_text("fake", encoding="utf-8") - out = tmp_path / "exp" - channel_id = 100 - day_str = "2026-01-02" - after = datetime(2026, 1, 2, tzinfo=timezone.utc) - before = datetime(2026, 1, 3, tzinfo=timezone.utc) - - proc = MagicMock() - proc.stdout = StringIO("line1\n\n") - proc.stderr.read.return_value = "" - - def wait(): - proc.returncode = 0 - out.mkdir(parents=True, exist_ok=True) - (out / f"{channel_id}_{day_str}.json").write_text("{}", encoding="utf-8") - - proc.wait = wait - - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - return_value=proc, - ), - ): - exports = export_guild_to_json( - "user-token", - 42, - out, - channel_ids=[channel_id], - after_date=after, - before_date=before, - include_threads="All", - ) - - assert len(exports) == 1 - assert exports[0].path == out / f"{channel_id}_{day_str}.json" - assert exports[0].day_str == day_str - - -def test_export_guild_nonzero_exit_raises(tmp_path): - cli = tmp_path / "DiscordChatExporter.Cli.exe" - cli.write_text("fake", encoding="utf-8") - out = tmp_path / "exp" - - proc = MagicMock() - proc.stdout = StringIO("") - proc.stderr.read.return_value = "boom" - - def wait(): - proc.returncode = 1 - - proc.wait = wait - - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - return_value=proc, - ), - ): - with pytest.raises(DiscordChatExporterError, match="exit 
code"): - export_guild_to_json( - "tok", - 1, - out, - channel_ids=[1], - after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), - before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), - ) - - -def test_export_guild_auth_failure_retries_with_reextracted_token(tmp_path, settings): - settings.ALLOW_INTERNAL_DISCORD_TOKENS = True - cli = tmp_path / "DiscordChatExporter.Cli.exe" - cli.write_text("fake", encoding="utf-8") - out = tmp_path / "exp" - - call_count = {"n": 0} - - def make_proc(): - proc = MagicMock() - proc.stdout = StringIO("line1\n") - if call_count["n"] == 0: - proc.stderr.read.return_value = "HTTP 401 Unauthorized" - proc.wait = lambda: setattr(proc, "returncode", 1) or None - else: - proc.stderr.read.return_value = "" - - def wait_ok(): - proc.returncode = 0 - out.mkdir(parents=True, exist_ok=True) - (out / "100_2026-01-02.json").write_text("{}", encoding="utf-8") - - proc.wait = wait_ok - call_count["n"] += 1 - return proc - - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - side_effect=lambda *a, **k: make_proc(), - ), - patch( - "discord_activity_tracker.utils.discord_internal_tokens_store.extract_and_save_discord_internal_tokens", - return_value="fresh-tok", - ), - ): - exports = export_guild_to_json( - "old-tok", - 42, - out, - channel_ids=[100], - after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), - before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), - ) - - assert call_count["n"] == 2 - assert exports[0].path == out / "100_2026-01-02.json" - - -def test_export_guild_unexpected_wraps(tmp_path): - cli = tmp_path / "DiscordChatExporter.Cli.exe" - cli.write_text("fake", encoding="utf-8") - - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - side_effect=OSError("bad"), - ), - ): - with 
pytest.raises(DiscordChatExporterError, match="Unexpected"): - export_guild_to_json( - "tok", - 1, - tmp_path / "o", - channel_ids=[1], - after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), - before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), - ) - - -def test_export_guild_output_path_is_explicit_json_file(tmp_path): - """Per-day export passes an explicit ``-o`` JSON file path (no directory slash).""" - cli = tmp_path / "DiscordChatExporter.Cli.exe" - cli.write_text("fake", encoding="utf-8") - out = tmp_path / "exp" - captured = {} - - proc = MagicMock() - proc.stdout = StringIO("") - proc.stderr.read.return_value = "" - - def wait(): - proc.returncode = 0 - out.mkdir(parents=True, exist_ok=True) - (out / "1_2026-01-02.json").write_text("{}", encoding="utf-8") - - proc.wait = wait - - def popen(cmd, **_kwargs): - captured["cmd"] = cmd - return proc - - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - side_effect=popen, - ), - ): - export_guild_to_json( - "tok", - 1, - out, - channel_ids=[1], - after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), - before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), - ) - - output_index = captured["cmd"].index("--output") + 1 - output_value = captured["cmd"][output_index] - assert output_value.endswith("1_2026-01-02.json") - - -def test_export_guild_per_channel_parallel_is_one(tmp_path, settings): - settings.DISCORD_CHAT_EXPORTER_PARALLEL = 4 - cli = tmp_path / "DiscordChatExporter.Cli.exe" - cli.write_text("fake", encoding="utf-8") - out = tmp_path / "exp" - captured = {} - - proc = MagicMock() - proc.stdout = StringIO("") - proc.stderr.read.return_value = "" - - def wait(): - proc.returncode = 0 - out.mkdir(parents=True, exist_ok=True) - (out / "1_2026-01-02.json").write_text("{}", encoding="utf-8") - - proc.wait = wait - - def popen(cmd, **_kwargs): - captured["cmd"] = cmd - return proc - - 
with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - side_effect=popen, - ), - ): - export_guild_to_json( - "tok", - 1, - out, - channel_ids=[1], - after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), - before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), - ) - - par_idx = captured["cmd"].index("--parallel") - assert captured["cmd"][par_idx + 1] == "1" - - -def test_parse_channels_command_stdout_skips_threads_and_banner(): - text = ( - "Some banner line\n" - "851121440425639956 | #cpp-discussion\n" - " * 999888777666555444 | Thread / foo | Active\n" - "123456789012345678 | voice-room\n" - ) - assert parse_channels_command_stdout(text) == [ - 851121440425639956, - 123456789012345678, - ] - - -def test_parse_channels_command_stdout_empty(): - assert parse_channels_command_stdout("") == [] - - -def test_validate_cli_rejects_intel_only_on_arm64_mac(tmp_path, monkeypatch): - cli = tmp_path / "DiscordChatExporter.Cli" - cli.write_bytes(b"\x00") - monkeypatch.setattr("sys.platform", "darwin") - monkeypatch.setattr("platform.machine", lambda: "arm64") - monkeypatch.setattr( - "discord_activity_tracker.sync.chat_exporter._file_command_brief_description", - lambda _p: "Mach-O 64-bit executable x86_64", - ) - with pytest.raises(DiscordChatExporterError, match="Intel-only"): - validate_discord_chat_exporter_cli_architecture(cli) - - -def test_validate_cli_accepts_arm64_on_arm64_mac(tmp_path, monkeypatch, caplog): - import logging - - caplog.set_level(logging.INFO) - cli = tmp_path / "DiscordChatExporter.Cli" - cli.write_bytes(b"\x00") - monkeypatch.setattr("sys.platform", "darwin") - monkeypatch.setattr("platform.machine", lambda: "arm64") - monkeypatch.setattr( - "discord_activity_tracker.sync.chat_exporter._file_command_brief_description", - lambda _p: "Mach-O 64-bit executable arm64", - ) - validate_discord_chat_exporter_cli_architecture(cli) - 
assert any("arch check OK" in r.message for r in caplog.records) - - -def test_validate_cli_skips_windows(tmp_path, monkeypatch): - cli = tmp_path / "DiscordChatExporter.Cli.exe" - cli.write_bytes(b"\x00") - monkeypatch.setattr("sys.platform", "win32") - validate_discord_chat_exporter_cli_architecture(cli) # no raise - - -def test_export_guild_sigkill_error_message_hints_parallel(tmp_path): - cli = tmp_path / "DiscordChatExporter.Cli.exe" - cli.write_text("fake", encoding="utf-8") - out = tmp_path / "exp" - - proc = MagicMock() - proc.stdout = StringIO("") - proc.stderr.read.return_value = "" - - def wait(): - proc.returncode = -9 - - proc.wait = wait - - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - return_value=proc, - ), - ): - with pytest.raises(DiscordChatExporterError, match="SIGKILL"): - export_guild_to_json( - "tok", - 1, - out, - channel_ids=[1], - after_date=datetime(2026, 1, 2, tzinfo=timezone.utc), - before_date=datetime(2026, 1, 3, tzinfo=timezone.utc), - ) - - -def test_sequential_export_skips_channels_cli_when_channel_ids_set(tmp_path, settings): - """Explicit allowlist avoids `channels` subprocess (SIGKILL/OOM on huge guilds).""" - settings.DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT = True - cli = tmp_path / "DiscordChatExporter.Cli.exe" - cli.write_text("fake", encoding="utf-8") - out = tmp_path / "exp" - run_calls: list[list[str]] = [] - day_str = "2026-01-01" - after = datetime(2026, 1, 1, tzinfo=timezone.utc) - before = datetime(2026, 1, 2, tzinfo=timezone.utc) - - def capture_run(cmd, **_kwargs): - run_calls.append(list(cmd)) - - class R: - returncode = 1 - - return R() - - def make_popen(cmd, **_kwargs): - assert cmd[1] == "export" - ch = cmd[cmd.index("--channel") + 1] - proc = MagicMock() - proc.stdout = StringIO("") - proc.stderr.read.return_value = "" - - def wait(): - proc.returncode = 0 - 
out.mkdir(parents=True, exist_ok=True) - (out / f"{ch}_{day_str}.json").write_text("{}", encoding="utf-8") - - proc.wait = wait - return proc - - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.validate_discord_chat_exporter_cli_architecture", - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.run", - side_effect=capture_run, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - side_effect=make_popen, - ), - ): - exports = export_guild_to_json( - "tok", - 1, - out, - channel_ids=[222, 111, 222], - after_date=after, - before_date=before, - ) - - assert run_calls == [] - assert len(exports) == 2 - - -def test_export_guild_adds_after_before_flags(tmp_path): - cli = tmp_path / "DiscordChatExporter.Cli.exe" - cli.write_text("fake", encoding="utf-8") - out = tmp_path / "exp" - captured_cmds: list[list[str]] = [] - - proc = MagicMock() - proc.stdout = StringIO("") - proc.stderr.read.return_value = "" - - def wait(): - proc.returncode = 0 - out.mkdir(parents=True, exist_ok=True) - (out / "7_2026-01-02.json").write_text("{}", encoding="utf-8") - - proc.wait = wait - - def popen(cmd, **_kwargs): - captured_cmds.append(list(cmd)) - return proc - - with ( - patch( - "discord_activity_tracker.sync.chat_exporter._get_cli_path", - return_value=cli, - ), - patch( - "discord_activity_tracker.sync.chat_exporter.validate_discord_chat_exporter_cli_architecture", - ), - patch( - "discord_activity_tracker.sync.chat_exporter.subprocess.Popen", - side_effect=popen, - ), - ): - export_guild_to_json( - "tok", - 7, - out, - channel_ids=[7], - after_date=datetime(2026, 1, 2, 3, 4, 5, tzinfo=timezone.utc), - before_date=datetime(2026, 1, 2, 12, 0, 0, tzinfo=timezone.utc), - ) - - assert captured_cmds - assert any("--after" in cmd and "--before" in cmd for cmd in captured_cmds) - - -def test_parse_exported_json_roundtrip(tmp_path): - p 
= tmp_path / "x.json" - data = {"guild": {"id": "1"}, "channel": {}, "messages": []} - p.write_text(json.dumps(data), encoding="utf-8") - assert parse_exported_json(p) == data - - -def test_parse_exported_json_invalid(tmp_path): - p = tmp_path / "bad.json" - p.write_text("{", encoding="utf-8") - with pytest.raises(json.JSONDecodeError): - parse_exported_json(p) - - -def test_parse_exported_json_io_error(tmp_path): - p = tmp_path / "x.json" - p.touch() - with patch( - "discord_activity_tracker.sync.chat_exporter.open", - side_effect=OSError("read failed"), - ): - with pytest.raises(OSError, match="read failed"): - parse_exported_json(p) - - -def test_convert_exporter_message_reference(): - raw = { - "id": "10", - "timestamp": "2026-01-01T00:00:00", - "content": "c", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [], - "reference": {"messageId": "9"}, - } - out = convert_exporter_message_to_dict(raw) - # reference messageId should be coerced to int - assert out["reference"]["message_id"] == 9 - - -# --- new: ID coercion, emoji flattening, avatarUrl, message_type, is_pinned --- - - -def test_convert_exporter_message_ids_are_int(): - """All snowflake IDs must be coerced from string to int.""" - raw = { - "id": "1399663560723923005", - "type": "Default", - "isPinned": False, - "timestamp": "2025-07-29T04:03:17.368-04:00", - "content": "hello", - "author": {"id": "1082347485026070548", "name": "raubtier"}, - "attachments": [], - "reactions": [], - } - out = convert_exporter_message_to_dict(raw) - assert out["id"] == 1399663560723923005 - assert isinstance(out["id"], int) - assert out["author"]["id"] == 1082347485026070548 - assert isinstance(out["author"]["id"], int) - - -def test_convert_exporter_message_canonical_fields_with_server_channel(): - raw = { - "id": "10", - "timestamp": "2026-01-01T00:00:00Z", - "content": "c", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [], - } - out = 
convert_exporter_message_to_dict(raw, server_id=99, channel_id=100) - assert out["actor_id"] == "1" - assert out["occurred_at"] == "2026-01-01T00:00:00Z" - assert out["source_url"] == "https://discord.com/channels/99/100/10" - - -def test_convert_exporter_message_reaction_emoji_flattened(): - """Reaction emoji dict must be flattened to a plain string.""" - raw = { - "id": "1", - "timestamp": "2026-01-01T00:00:00Z", - "content": "", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [ - {"emoji": {"id": None, "name": "👍", "isAnimated": False}, "count": 3} - ], - } - out = convert_exporter_message_to_dict(raw) - assert out["reactions"][0]["emoji"] == "👍" - assert out["reactions"][0]["count"] == 3 - - -def test_convert_exporter_message_reaction_malformed_count_defaults(): - raw = { - "id": "1", - "timestamp": "2026-01-01T00:00:00Z", - "content": "", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [ - {"emoji": {"name": "x"}, "count": "not-a-number"}, - {"emoji": {"name": "y"}, "count": -2}, - ], - } - out = convert_exporter_message_to_dict(raw) - assert len(out["reactions"]) == 2 - assert out["reactions"][0]["count"] == 0 - assert out["reactions"][1]["count"] == 0 - - -def test_convert_exporter_message_reaction_null_emoji_is_dropped(): - raw = { - "id": "1", - "timestamp": "2026-01-01T00:00:00Z", - "content": "", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [{"emoji": None, "count": 1}], - } - out = convert_exporter_message_to_dict(raw) - assert out["reactions"] == [] - - -def test_convert_exporter_message_avatarUrl_mapped(): - """Author avatarUrl must be mapped to avatar_url.""" - raw = { - "id": "2", - "timestamp": "2026-01-01T00:00:00Z", - "content": "hi", - "author": { - "id": "5", - "name": "hero", - "avatarUrl": "https://cdn.discordapp.com/avatars/avatar.png", - }, - "attachments": [], - "reactions": [], - } - out = convert_exporter_message_to_dict(raw) - assert ( - 
out["author"]["avatar_url"] == "https://cdn.discordapp.com/avatars/avatar.png" - ) - - -def test_convert_exporter_message_type_and_is_pinned(): - raw = { - "id": "3", - "type": "Reply", - "isPinned": True, - "timestamp": "2026-01-01T00:00:00Z", - "content": "reply here", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [], - } - out = convert_exporter_message_to_dict(raw) - assert out["message_type"] == "Reply" - assert out["is_pinned"] is True - - -def test_convert_exporter_message_type_defaults_to_default(): - raw = { - "id": "4", - "timestamp": "2026-01-01T00:00:00Z", - "content": "x", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [], - } - out = convert_exporter_message_to_dict(raw) - assert out["message_type"] == "Default" - assert out["is_pinned"] is False - - -def test_convert_exporter_reference_message_id_int(): - """reference.messageId snowflake string must be coerced to int.""" - raw = { - "id": "10", - "timestamp": "2026-01-01T00:00:00Z", - "content": "reply", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [], - "reference": {"messageId": "1399663560723923000"}, - } - out = convert_exporter_message_to_dict(raw) - assert out["reference"]["message_id"] == 1399663560723923000 - assert isinstance(out["reference"]["message_id"], int) - - -def test_export_and_parse_skips_bad_file(tmp_path): - bad = tmp_path / "bad.json" - bad.write_text("{", encoding="utf-8") - - with patch( - "discord_activity_tracker.sync.chat_exporter.export_guild_to_json", - return_value=[ChannelDayExport(path=bad, day_str="2026-01-01", channel_id=1)], - ): - assert export_and_parse_guild("t", 1, tmp_path / "o") == [] - - -def test_export_and_parse_returns_channels(tmp_path): - ok = tmp_path / "ok.json" - ok.write_text( - json.dumps( - {"guild": {"id": "g"}, "channel": {"id": "c"}, "messages": [{"id": "1"}]} - ), - encoding="utf-8", - ) - - with patch( - 
"discord_activity_tracker.sync.chat_exporter.export_guild_to_json", - return_value=[ChannelDayExport(path=ok, day_str="2026-01-01", channel_id=1)], - ): - rows = export_and_parse_guild("t", 1, tmp_path / "o") - - assert len(rows) == 1 - assert rows[0]["guild"] == {"id": "g"} - assert rows[0]["file_path"] == ok - - -def test_convert_exporter_message_with_embeds_key_ignored(): - raw = { - "id": "1", - "timestamp": "2026-01-01T00:00:00Z", - "content": "body", - "author": {"id": "1", "name": "a"}, - "attachments": [], - "reactions": [], - "embeds": [{"title": "E", "description": "d"}], - } - out = convert_exporter_message_to_dict(raw, server_id=1, channel_id=2) - assert out["content"] == "body" - assert "embeds" not in out - from discord_activity_tracker.staging_schema import validate_normalized_message - - validate_normalized_message(out, source="embed-test") diff --git a/discord_activity_tracker/tests/test_sync_client.py b/discord_activity_tracker/tests/test_sync_client.py deleted file mode 100644 index e2f5a5a7..00000000 --- a/discord_activity_tracker/tests/test_sync_client.py +++ /dev/null @@ -1,489 +0,0 @@ -"""Tests for discord_activity_tracker.sync.client.DiscordSyncClient.""" - -import asyncio -from datetime import datetime, timezone -from types import SimpleNamespace -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest - -from discord_activity_tracker.sync.client import ( - DiscordSyncClient, - discord_message_to_sync_dict, - run_async, -) - - -@pytest.fixture -def mock_discord_pkg(): - with patch("discord_activity_tracker.sync.client.discord") as m: - m.NotFound = type("NotFound", (Exception,), {}) - m.Forbidden = type("Forbidden", (Exception,), {}) - m.HTTPException = type("HTTPException", (Exception,), {}) - m.TextChannel = type("TextChannel", (), {}) - m.Intents.default.return_value = MagicMock() - inner = MagicMock() - inner.login = AsyncMock() - inner.close = AsyncMock() - m.Client.return_value = inner - yield m, inner - - -def 
test_init_registers_client(mock_discord_pkg): - _, inner = mock_discord_pkg - c = DiscordSyncClient("tok") - assert c.token == "tok" - assert c.client is inner - assert c._ready is False - - -def test_ensure_ready_logs_in_once(mock_discord_pkg): - _, inner = mock_discord_pkg - inner.login = AsyncMock() - - async def main(): - c = DiscordSyncClient("tok") - await c._ensure_ready() - await c._ensure_ready() - - asyncio.run(main()) - inner.login.assert_called_once_with("tok") - - -def test_get_guild_not_found(mock_discord_pkg): - m, inner = mock_discord_pkg - inner.login = AsyncMock() - inner.fetch_guild = AsyncMock(side_effect=m.NotFound()) - - async def main(): - c = DiscordSyncClient("tok") - return await c.get_guild(99) - - assert asyncio.run(main()) is None - - -def test_get_guild_forbidden(mock_discord_pkg): - m, inner = mock_discord_pkg - inner.login = AsyncMock() - inner.fetch_guild = AsyncMock(side_effect=m.Forbidden()) - - async def main(): - c = DiscordSyncClient("tok") - return await c.get_guild(99) - - assert asyncio.run(main()) is None - - -def test_get_channels_empty_when_no_guild(mock_discord_pkg): - _, inner = mock_discord_pkg - inner.login = AsyncMock() - inner.fetch_guild = AsyncMock(return_value=None) - - async def main(): - c = DiscordSyncClient("tok") - return await c.get_channels(1) - - assert asyncio.run(main()) == [] - - -def test_get_channels_filters_text_only(mock_discord_pkg): - m, inner = mock_discord_pkg - inner.login = AsyncMock() - guild = MagicMock() - guild.name = "G" - tc = MagicMock(spec=m.TextChannel) - vc = MagicMock() - inner.fetch_guild = AsyncMock(return_value=guild) - guild.fetch_channels = AsyncMock(return_value=[tc, vc]) - - async def main(): - c = DiscordSyncClient("tok") - return await c.get_channels(1) - - assert asyncio.run(main()) == [tc] - - -def test_get_channels_fetch_raises(mock_discord_pkg): - _, inner = mock_discord_pkg - inner.login = AsyncMock() - guild = MagicMock() - inner.fetch_guild = 
AsyncMock(return_value=guild) - guild.fetch_channels = AsyncMock(side_effect=RuntimeError("network")) - - async def main(): - c = DiscordSyncClient("tok") - return await c.get_channels(1) - - assert asyncio.run(main()) == [] - - -def test_get_channel_success(mock_discord_pkg): - m, inner = mock_discord_pkg - inner.login = AsyncMock() - tc = MagicMock(spec=m.TextChannel) - inner.fetch_channel = AsyncMock(return_value=tc) - - async def main(): - c = DiscordSyncClient("tok") - return await c.get_channel(10) - - assert asyncio.run(main()) is tc - - -def test_get_channel_wrong_type(mock_discord_pkg): - _, inner = mock_discord_pkg - inner.login = AsyncMock() - inner.fetch_channel = AsyncMock(return_value=MagicMock()) - - async def main(): - c = DiscordSyncClient("tok") - return await c.get_channel(10) - - assert asyncio.run(main()) is None - - -def test_get_channel_not_found(mock_discord_pkg): - m, inner = mock_discord_pkg - inner.login = AsyncMock() - inner.fetch_channel = AsyncMock(side_effect=m.NotFound()) - - async def main(): - c = DiscordSyncClient("tok") - return await c.get_channel(10) - - assert asyncio.run(main()) is None - - -def test_get_channel_forbidden(mock_discord_pkg): - m, inner = mock_discord_pkg - inner.login = AsyncMock() - inner.fetch_channel = AsyncMock(side_effect=m.Forbidden()) - - async def main(): - c = DiscordSyncClient("tok") - return await c.get_channel(10) - - assert asyncio.run(main()) is None - - -def test_fetch_messages_since_collects(mock_discord_pkg): - m, inner = mock_discord_pkg - inner.login = AsyncMock() - - msg = MagicMock() - msg.id = 1 - msg.content = "hi" - msg.author.id = 9 - msg.author.name = "u" - msg.author.display_name = "U" - msg.author.bot = False - msg.author.avatar = None - msg.created_at = datetime(2026, 1, 1, tzinfo=timezone.utc) - msg.edited_at = None - msg.reference = None - msg.attachments = [] - msg.reactions = [] - - ch = MagicMock(spec=m.TextChannel) - ch.name = "general" - - async def hist(**_kwargs): - yield 
msg - - ch.history = hist - - async def main(): - c = DiscordSyncClient("tok") - return await c.fetch_messages_since(ch) - - out = asyncio.run(main()) - assert len(out) == 1 - assert out[0]["id"] == 1 - - -def test_fetch_messages_since_forbidden(mock_discord_pkg): - m, inner = mock_discord_pkg - inner.login = AsyncMock() - ch = MagicMock(spec=m.TextChannel) - ch.name = "x" - - async def hist(**_kwargs): - raise m.Forbidden() - yield # pragma: no cover - - ch.history = hist - - async def main(): - c = DiscordSyncClient("tok") - return await c.fetch_messages_since(ch) - - assert asyncio.run(main()) == [] - - -def test_fetch_messages_since_http_exception(mock_discord_pkg): - m, inner = mock_discord_pkg - inner.login = AsyncMock() - ch = MagicMock(spec=m.TextChannel) - ch.name = "x" - - async def hist(**_kwargs): - raise m.HTTPException() - yield # pragma: no cover - - ch.history = hist - - async def main(): - c = DiscordSyncClient("tok") - return await c.fetch_messages_since(ch) - - assert asyncio.run(main()) == [] - - -def test_fetch_messages_since_logs_every_100_messages(mock_discord_pkg): - m, inner = mock_discord_pkg - ch = MagicMock(spec=m.TextChannel) - ch.name = "logs-here" - - async def hist(**_kwargs): - for i in range(100): - msg = MagicMock() - msg.id = i - msg.content = "" - msg.author = SimpleNamespace( - id=1, name="u", display_name="u", bot=False, avatar=None - ) - msg.created_at = datetime(2026, 1, 1, tzinfo=timezone.utc) - msg.edited_at = None - msg.reference = None - msg.attachments = [] - msg.reactions = [] - yield msg - - ch.history = hist - - async def main(): - c = DiscordSyncClient("tok") - return await c.fetch_messages_since(ch) - - assert len(asyncio.run(main())) == 100 - - -def test_fetch_messages_since_unexpected_error(mock_discord_pkg): - m, inner = mock_discord_pkg - inner.login = AsyncMock() - ch = MagicMock(spec=m.TextChannel) - ch.name = "x" - - async def hist(**_kwargs): - raise ValueError("weird") - yield # pragma: no cover - - 
ch.history = hist - - async def main(): - c = DiscordSyncClient("tok") - return await c.fetch_messages_since(ch) - - assert asyncio.run(main()) == [] - - -def test_close_ready(mock_discord_pkg): - _, inner = mock_discord_pkg - inner.login = AsyncMock() - inner.close = AsyncMock() - - async def main(): - c = DiscordSyncClient("tok") - await c._ensure_ready() - await c.close() - - asyncio.run(main()) - inner.close.assert_awaited_once() - - -def test_context_manager_calls_close(mock_discord_pkg): - _, inner = mock_discord_pkg - inner.login = AsyncMock() - inner.close = AsyncMock() - - # __exit__ calls shutdown_sync(); without .run() there is no dedicated loop, - # so shutdown_sync falls back to run_async(close()). The coroutine must be - # awaited — use wraps=real run_async instead of a bare MagicMock. - import discord_activity_tracker.sync.client as client_module - - real_run_async = client_module.run_async - with patch.object(client_module, "run_async", wraps=real_run_async) as ra: - with DiscordSyncClient("tok") as c: - c._ready = True - ra.assert_called_once() - inner.close.assert_awaited_once() - - -def test_run_async_returns_coroutine_result(): - """run_async must use a fresh event loop (not the deprecated get_event_loop).""" - - async def coro(): - return 42 - - assert run_async(coro()) == 42 - - -def test_run_async_does_not_use_get_event_loop(): - """Ensure run_async never calls the deprecated asyncio.get_event_loop.""" - - async def coro(): - return 99 - - with patch( - "discord_activity_tracker.sync.client.asyncio.get_event_loop" - ) as mock_gel: - result = run_async(coro()) - - # get_event_loop should never be touched - mock_gel.assert_not_called() - assert result == 99 - - -def test_run_async_closes_loop_on_exception(): - """Loop must be closed even if the coroutine raises.""" - - async def failing(): - raise ValueError("boom") - - with pytest.raises(ValueError, match="boom"): - run_async(failing()) - - -def 
test_discord_sync_client_run_reuses_event_loop(mock_discord_pkg): - """All work on one client instance must share one loop (discord.py / aiohttp).""" - loops: list = [] - - async def record_loop(): - loops.append(asyncio.get_running_loop()) - return 1 - - c = DiscordSyncClient("tok") - c.run(record_loop()) - c.run(record_loop()) - assert len(loops) == 2 - assert loops[0] is loops[1] - c.shutdown_sync() - - -def test_message_to_dict_with_attachment_and_reaction(mock_discord_pkg): - msg = MagicMock() - msg.id = 5 - msg.content = "c" - msg.author = SimpleNamespace( - id=1, name="n", display_name="n", bot=False, avatar=None - ) - msg.created_at = datetime(2026, 1, 1, tzinfo=timezone.utc) - msg.edited_at = datetime(2026, 1, 2, tzinfo=timezone.utc) - msg.reference = MagicMock(message_id=99) - - att = MagicMock() - att.url = "http://f" - att.filename = "f.txt" - att.size = 3 - msg.attachments = [att] - - react = MagicMock() - react.emoji = "👍" - react.count = 2 - msg.reactions = [react] - - msg.type = SimpleNamespace(name="default") - msg.pinned = False - - d = discord_message_to_sync_dict(msg) - assert d["author"]["display_name"] == "n" - assert d["message_type"] == "Default" - assert d["is_pinned"] is False - assert d["attachments"][0]["filename"] == "f.txt" - assert d["reactions"][0]["count"] == 2 - - -def test_message_to_dict_reply_and_pinned(mock_discord_pkg): - msg = MagicMock() - msg.id = 1 - msg.content = "" - msg.author = SimpleNamespace( - id=1, name="u", display_name="u", bot=False, avatar=None - ) - msg.created_at = datetime(2026, 1, 1, tzinfo=timezone.utc) - msg.edited_at = None - msg.reference = None - msg.attachments = [] - msg.reactions = [] - msg.type = SimpleNamespace(name="reply") - msg.pinned = True - - d = discord_message_to_sync_dict(msg) - assert d["message_type"] == "Reply" - assert d["is_pinned"] is True - - -def test_fetch_messages_since_http_429_returns_empty(mock_discord_pkg): - m, _ = mock_discord_pkg - exc = m.HTTPException() - exc.status = 
429 - - async def hist(*_a, **_k): - if False: - yield None - raise exc - - ch = MagicMock() - ch.name = "c" - ch.history = hist - - c = DiscordSyncClient("tok") - c._ready = True - - async def run(): - return await c.fetch_messages_since(ch, after=None, limit=None) - - assert asyncio.run(run()) == [] - - -def test_message_type_label_from_typed_message_type(mock_discord_pkg): - m, _ = mock_discord_pkg - MT = type("MessageType", (), {}) - m.MessageType = MT - mt = MT() - mt.name = "thread_created" - from discord_activity_tracker.sync.client import _message_type_label - - assert _message_type_label(mt) == "ThreadCreated" - - -def test_discord_sync_client_message_to_dict_delegates(mock_discord_pkg): - c = DiscordSyncClient("tok") - msg = MagicMock() - msg.id = 1 - msg.content = "x" - msg.author = SimpleNamespace( - id=1, name="a", display_name="a", bot=False, avatar=None - ) - msg.created_at = datetime(2026, 1, 1, tzinfo=timezone.utc) - msg.edited_at = None - msg.reference = None - msg.attachments = [] - msg.reactions = [] - msg.type = SimpleNamespace(name="default") - msg.pinned = False - d = c._message_to_dict(msg) - assert d["id"] == 1 - - -def test_shutdown_sync_logs_when_close_raises(mock_discord_pkg, caplog): - import logging - - caplog.set_level(logging.ERROR) - _, inner = mock_discord_pkg - inner.close = AsyncMock(side_effect=RuntimeError("close failed")) - c = DiscordSyncClient("tok") - c._ready = True - c._asyncio_loop = asyncio.new_event_loop() - c.shutdown_sync() - assert "Error while closing" in caplog.text diff --git a/discord_activity_tracker/tests/test_sync_messages.py b/discord_activity_tracker/tests/test_sync_messages.py deleted file mode 100644 index 5a29c761..00000000 --- a/discord_activity_tracker/tests/test_sync_messages.py +++ /dev/null @@ -1,718 +0,0 @@ -"""Tests for discord_activity_tracker.sync.messages.""" - -import asyncio -import uuid -from datetime import timedelta -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest 
-from django.utils import timezone as django_timezone - -from cppa_user_tracker.services import get_or_create_discord_profile -from cppa_user_tracker.models import DiscordProfile -from discord_activity_tracker.models import DiscordChannel, DiscordServer -from discord_activity_tracker.services import ( - add_or_update_reaction, - bulk_process_message_batch, - create_or_update_discord_message, -) -from asgiref.sync import sync_to_async as asgiref_sync_to_async - -from discord_activity_tracker.services import ( - get_or_create_discord_channel, - get_or_create_discord_server, -) -from discord_activity_tracker.sync import messages as messages_mod -from discord_activity_tracker.sync.messages import ( - _prepare_message_data, - _process_message_data, - _process_messages_in_batches, - sync_all_channels, - sync_channel_messages, - sync_channel_messages_async, - sync_channels, - sync_channels_async, - sync_guild, - sync_guild_async, -) - - -def _uniq_id() -> int: - """Discord-sized positive int; avoids collisions when sync_to_async sees committed rows.""" - return uuid.uuid4().int % (2**50) - - -def _sample_api_message(mid=100, uid=7, ts=None): - if ts is None: - ts = "2026-03-01T12:00:00+00:00" - return { - "id": mid, - "content": "hello", - "author": { - "id": uid, - "username": "alice", - "display_name": "Alice", - "avatar_url": "", - "bot": False, - }, - "created_at": ts, - "edited_at": None, - "reference": {"message_id": None}, - "attachments": [], - "reactions": [{"emoji": "👍", "count": 1}], - } - - -@pytest.mark.django_db -def test_sync_guild_async_creates_server(): - gid = _uniq_id() - - async def main(): - client = MagicMock() - guild = MagicMock() - guild.id = gid - guild.name = "Guild" - guild.icon = None - client.get_guild = AsyncMock(return_value=guild) - with _selective_sync_to_async_guild(): - return await sync_guild_async(client, gid) - - server = asyncio.run(main()) - assert server.server_id == gid - assert server.server_name == "Guild" - - 
-@pytest.mark.django_db -def test_sync_guild_async_existing_server(): - gid = _uniq_id() - - async def main(): - client = MagicMock() - guild = MagicMock() - guild.id = gid - guild.name = "Guild" - guild.icon = None - client.get_guild = AsyncMock(return_value=guild) - - def router(fn): - if fn is get_or_create_discord_server: - - async def fake_get_or_create(**kwargs): - m = MagicMock() - m.server_id = kwargs["server_id"] - m.server_name = kwargs["server_name"] - return m, False - - return fake_get_or_create - - return asgiref_sync_to_async(fn, thread_sensitive=True) - - with patch( - "discord_activity_tracker.sync.messages.sync_to_async", - side_effect=router, - ): - return await sync_guild_async(client, gid) - - server = asyncio.run(main()) - assert server.server_id == gid - - -@pytest.mark.django_db -def test_sync_guild_async_missing_guild_raises(): - async def main(): - client = MagicMock() - client.get_guild = AsyncMock(return_value=None) - await sync_guild_async(client, 999) - - with pytest.raises(ValueError, match="not found"): - asyncio.run(main()) - - -def _selective_sync_to_async_guild(): - """Stub server persistence so guild sync tests do not insert DiscordServer rows.""" - - def router(fn): - if fn is get_or_create_discord_server: - - async def fake_get_or_create(**kwargs): - m = MagicMock() - m.server_id = kwargs["server_id"] - m.server_name = kwargs["server_name"] - return m, True - - return fake_get_or_create - - return asgiref_sync_to_async(fn, thread_sensitive=True) - - return patch( - "discord_activity_tracker.sync.messages.sync_to_async", - side_effect=router, - ) - - -def _selective_sync_to_async_channels(): - """Avoid ORM from thread pool (pytest-django tx); only stub channel creation.""" - - def router(fn): - if fn is get_or_create_discord_channel: - - async def fake_get_or_create(**kwargs): - m = MagicMock() - m.channel_id = kwargs["channel_id"] - m.channel_name = kwargs["channel_name"] - return m, True - - return fake_get_or_create - - 
return asgiref_sync_to_async(fn, thread_sensitive=True) - - return patch( - "discord_activity_tracker.sync.messages.sync_to_async", - side_effect=router, - ) - - -@pytest.mark.django_db -def test_sync_channels_async_creates_channels(): - gid = _uniq_id() - cid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - - async def main(): - client = MagicMock() - ch = MagicMock() - ch.id = cid - ch.name = "general" - ch.type = MagicMock() - ch.type.__str__ = lambda *_: "text" - ch.topic = "t" - ch.position = 1 - client.get_channels = AsyncMock(return_value=[ch]) - - with _selective_sync_to_async_channels(): - return await sync_channels_async(client, server, gid) - - channels = asyncio.run(main()) - assert len(channels) == 1 - assert channels[0].channel_id == cid - - -def test_prepare_message_data_requires_created_at(): - assert _prepare_message_data({"id": 1, "author": {}}) is None - - -def test_prepare_message_data_ok(): - raw = _sample_api_message() - out = _prepare_message_data(raw) - assert out is not None - assert out.message_id == 100 - assert out.author.user_id == 7 - - -@pytest.mark.django_db -def test_process_messages_in_batches_skips_empty_prepared(): - gid = _uniq_id() - cid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - - async def main(): - return await _process_messages_in_batches( - channel, [{"id": 1, "author": {}}], batch_size=10 - ) - - assert asyncio.run(main()) == 0 - - -def _selective_sync_to_async_bulk_batch(): - """Avoid ORM from sync_to_async worker threads (FK visibility across connections).""" - - def router(fn): - if fn is bulk_process_message_batch: - - async def fake_bulk(batch, _channel): - return len(batch) - - return fake_bulk - - return asgiref_sync_to_async(fn, thread_sensitive=True) - - return patch( - 
"discord_activity_tracker.sync.messages.sync_to_async", - side_effect=router, - ) - - -@pytest.mark.django_db -def test_process_messages_in_batches_runs_bulk_for_valid_messages(): - gid = _uniq_id() - cid = _uniq_id() - mid = _uniq_id() - uid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - - async def main(): - with _selective_sync_to_async_bulk_batch(): - return await _process_messages_in_batches( - channel, [_sample_api_message(mid=mid, uid=uid)], batch_size=500 - ) - - assert asyncio.run(main()) == 1 - - -@pytest.mark.django_db -def test_process_message_data_skips_without_created_at(): - gid = _uniq_id() - cid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - - def router(fn): - if fn is get_or_create_discord_profile: - - async def fake_profile(**_kwargs): - return MagicMock(), True - - return fake_profile - - return asgiref_sync_to_async(fn, thread_sensitive=True) - - async def main(): - with patch( - "discord_activity_tracker.sync.messages.sync_to_async", - side_effect=router, - ): - await _process_message_data( - channel, - {"id": _uniq_id(), "author": {}, "created_at": None}, - ) - - asyncio.run(main()) - - -def _selective_sync_to_async_process_message_ok(): - def router(fn): - if fn is get_or_create_discord_profile: - - async def fake_profile(**_kwargs): - return MagicMock(), True - - return fake_profile - if fn is create_or_update_discord_message: - - async def fake_msg(**kwargs): - m = MagicMock() - m.message_id = kwargs["message_id"] - m.channel_name = kwargs["channel"].channel_name - return m, True - - return fake_msg - if fn is add_or_update_reaction: - - async def fake_reaction(*_a, **_kw): - return 
MagicMock(), True - - return fake_reaction - - return asgiref_sync_to_async(fn, thread_sensitive=True) - - return patch( - "discord_activity_tracker.sync.messages.sync_to_async", - side_effect=router, - ) - - -@pytest.mark.django_db -def test_process_message_data_creates_message(): - gid = _uniq_id() - cid = _uniq_id() - mid = _uniq_id() - uid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - - async def main(): - with _selective_sync_to_async_process_message_ok(): - await _process_message_data(channel, _sample_api_message(mid=mid, uid=uid)) - - asyncio.run(main()) - - -@pytest.mark.django_db -def test_process_message_data_swallows_inner_errors(): - gid = _uniq_id() - cid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - - async def main(): - with patch.object( - messages_mod, - "parse_discord_user", - side_effect=RuntimeError("bad author"), - ): - await _process_message_data( - channel, _sample_api_message(mid=_uniq_id(), uid=_uniq_id()) - ) - - asyncio.run(main()) - - -@pytest.mark.django_db -def test_sync_channel_messages_async_full_sync(): - gid = _uniq_id() - cid = _uniq_id() - author_uid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="general", - channel_type="text", - ) - - holder = {} - - async def main(): - client = MagicMock() - dch = MagicMock() - dch.name = "general" - client.get_channel = AsyncMock(return_value=dch) - - msg = _sample_api_message(mid=_uniq_id(), uid=author_uid) - client.fetch_messages_since = AsyncMock(return_value=[msg]) - - with patch.object( - 
messages_mod, - "_process_messages_in_batches", - new_callable=AsyncMock, - return_value=1, - ): - await sync_channel_messages_async(client, channel, gid, full_sync=True) - holder["client"] = client - - asyncio.run(main()) - - assert holder["client"].fetch_messages_since.await_args.kwargs["after"] is None - - -@pytest.mark.django_db -def test_sync_channel_messages_async_no_discord_channel(): - gid = _uniq_id() - cid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="general", - channel_type="text", - ) - - holder = {} - - async def main(): - client = MagicMock() - client.get_channel = AsyncMock(return_value=None) - holder["client"] = client - await sync_channel_messages_async(client, channel, gid) - - asyncio.run(main()) - - assert not holder["client"].fetch_messages_since.called - - -@pytest.mark.django_db -def test_sync_channel_messages_async_raises(): - gid = _uniq_id() - cid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="general", - channel_type="text", - ) - - async def main(): - client = MagicMock() - client.get_channel = AsyncMock(side_effect=RuntimeError("boom")) - await sync_channel_messages_async(client, channel, gid) - - with pytest.raises(RuntimeError, match="boom"): - asyncio.run(main()) - - -@pytest.mark.django_db -def test_sync_guild_wrapper_runs_and_closes(): - async def fake_guild(_client, _gid): - return MagicMock() - - with ( - patch("discord_activity_tracker.sync.messages.DiscordSyncClient") as Cls, - patch( - "discord_activity_tracker.sync.messages.sync_guild_async", - new=fake_guild, - ), - ): - inst = Cls.return_value - inst.close = AsyncMock() - - def client_run(coro): - loop = asyncio.new_event_loop() - try: - return loop.run_until_complete(coro) - finally: - 
loop.close() - - inst.run = MagicMock(side_effect=client_run) - inst.shutdown_sync = MagicMock() - sync_guild("token", 1) - - assert inst.run.call_count == 1 - inst.shutdown_sync.assert_called_once() - - -@pytest.mark.django_db -def test_sync_channels_wrapper(): - gid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - - async def fake_channels(_c, _s, _g): - return [] - - with ( - patch("discord_activity_tracker.sync.messages.DiscordSyncClient") as Cls, - patch( - "discord_activity_tracker.sync.messages.sync_channels_async", - new=fake_channels, - ), - ): - inst = Cls.return_value - inst.close = AsyncMock() - - def client_run(coro): - loop = asyncio.new_event_loop() - try: - return loop.run_until_complete(coro) - finally: - loop.close() - - inst.run = MagicMock(side_effect=client_run) - inst.shutdown_sync = MagicMock() - sync_channels("token", server, gid) - - assert inst.run.call_count == 1 - inst.shutdown_sync.assert_called_once() - - -@pytest.mark.django_db -def test_sync_channel_messages_wrapper(): - gid = _uniq_id() - cid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - channel = DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - - async def fake_sync(*_args, **_kwargs): - return None - - with ( - patch("discord_activity_tracker.sync.messages.DiscordSyncClient") as Cls, - patch( - "discord_activity_tracker.sync.messages.sync_channel_messages_async", - new=fake_sync, - ), - ): - inst = Cls.return_value - inst.close = AsyncMock() - - def client_run(coro): - loop = asyncio.new_event_loop() - try: - return loop.run_until_complete(coro) - finally: - loop.close() - - inst.run = MagicMock(side_effect=client_run) - inst.shutdown_sync = MagicMock() - sync_channel_messages("token", channel, gid, full_sync=True) - - assert inst.run.call_count == 1 - inst.shutdown_sync.assert_called_once() - - 
-@pytest.mark.django_db -def test_sync_all_channels_respects_active_filter(): - now = django_timezone.now() - gid = _uniq_id() - cid_active = _uniq_id() - cid_stale = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - author = DiscordProfile.objects.create( - discord_user_id=_uniq_id(), - username="u", - display_name="", - avatar_url="", - is_bot=False, - ) - active_ch = DiscordChannel.objects.create( - server=server, - channel_id=cid_active, - channel_name="active", - channel_type="text", - ) - stale_ch = DiscordChannel.objects.create( - server=server, - channel_id=cid_stale, - channel_name="stale", - channel_type="text", - ) - create_or_update_discord_message( - _uniq_id(), - active_ch, - author, - "recent", - message_created_at=now, - ) - create_or_update_discord_message( - _uniq_id(), - stale_ch, - author, - "old", - message_created_at=now - timedelta(days=90), - ) - - channels_snapshot = list(DiscordChannel.objects.filter(server=server)) - - async def guild_ok(_c, _gid): - return server - - async def channels_ok(_c, _srv, _gid): - return channels_snapshot - - sync_body = AsyncMock() - - with ( - patch("discord_activity_tracker.sync.messages.DiscordSyncClient") as Cls, - patch( - "discord_activity_tracker.sync.messages.sync_guild_async", - new=guild_ok, - ), - patch( - "discord_activity_tracker.sync.messages.sync_channels_async", - new=channels_ok, - ), - patch( - "discord_activity_tracker.sync.messages._sync_all_channels_async", - new=sync_body, - ), - ): - inst = Cls.return_value - inst.close = AsyncMock() - - def client_run(coro): - loop = asyncio.new_event_loop() - try: - return loop.run_until_complete(coro) - finally: - loop.close() - - inst.run = MagicMock(side_effect=client_run) - inst.shutdown_sync = MagicMock() - sync_all_channels("tok", gid, active_only=True, active_days=30, full_sync=False) - - assert inst.run.call_count == 3 - inst.shutdown_sync.assert_called_once() - sync_body.assert_awaited_once() - 
args, _kwargs = sync_body.call_args - passed_channels = args[1] - assert len(passed_channels) == 1 - assert passed_channels[0].pk == active_ch.pk - - -@pytest.mark.django_db -def test_sync_all_channels_full_sync_no_active_filter(): - gid = _uniq_id() - cid = _uniq_id() - server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") - DiscordChannel.objects.create( - server=server, - channel_id=cid, - channel_name="c", - channel_type="text", - ) - - channels_snapshot = list(DiscordChannel.objects.filter(server=server)) - - async def guild_ok(_c, _gid): - return server - - async def channels_ok(_c, _srv, _gid): - return channels_snapshot - - sync_body = AsyncMock() - - with ( - patch("discord_activity_tracker.sync.messages.DiscordSyncClient") as Cls, - patch( - "discord_activity_tracker.sync.messages.sync_guild_async", - new=guild_ok, - ), - patch( - "discord_activity_tracker.sync.messages.sync_channels_async", - new=channels_ok, - ), - patch( - "discord_activity_tracker.sync.messages._sync_all_channels_async", - new=sync_body, - ), - ): - inst = Cls.return_value - inst.close = AsyncMock() - - def client_run(coro): - loop = asyncio.new_event_loop() - try: - return loop.run_until_complete(coro) - finally: - loop.close() - - inst.run = MagicMock(side_effect=client_run) - inst.shutdown_sync = MagicMock() - sync_all_channels("tok", gid, full_sync=True) - - assert inst.run.call_count == 3 - inst.shutdown_sync.assert_called_once() - sync_body.assert_awaited_once() - args, _kwargs = sync_body.call_args - assert len(args[1]) == 1 diff --git a/discord_activity_tracker/tests/test_sync_utils.py b/discord_activity_tracker/tests/test_sync_utils.py deleted file mode 100644 index 311903c1..00000000 --- a/discord_activity_tracker/tests/test_sync_utils.py +++ /dev/null @@ -1,119 +0,0 @@ -"""Tests for discord_activity_tracker.sync.utils. 
- -ISO datetime parsing is implemented and tested in ``core.utils.datetime_parsing`` -(see ``core/tests/test_datetime_parsing.py``). - -``truncate_content`` lives in ``core.utils.text_processing``; see -``core/tests/test_text_processing.py``. -""" - -from discord_activity_tracker.sync.utils import ( - format_discord_url, - parse_discord_user, - sanitize_channel_name, -) - - -def test_parse_discord_user_empty_dict(): - out = parse_discord_user(None) - assert out.user_id == 0 - assert out.username == "unknown" - assert out.is_bot is False - - -def test_parse_discord_user_bot_api_shape(): - out = parse_discord_user( - { - "id": 123456789012345678, - "username": "alice", - "display_name": "Alice", - "avatar_url": "https://cdn.example/a.png", - "bot": True, - } - ) - assert out.user_id == 123456789012345678 - assert out.username == "alice" - assert out.display_name == "Alice" - assert out.avatar_url == "https://cdn.example/a.png" - assert out.is_bot is True - - -def test_parse_discord_user_exporter_name_fallback(): - out = parse_discord_user({"id": 1, "name": "bob"}) - assert out.username == "bob" - - -# --- new: DiscordChatExporter shape --- - - -def test_parse_discord_user_string_id_coerced_to_int(): - """Exporter provides id as string; must become int.""" - out = parse_discord_user({"id": "1082347485026070548", "name": "raubtier"}) - assert out.user_id == 1082347485026070548 - assert isinstance(out.user_id, int) - - -def test_parse_discord_user_avatarUrl_camelCase(): - """Exporter uses camelCase avatarUrl; parse_discord_user should pick it up.""" - out = parse_discord_user( - { - "id": "99", - "name": "Raubtier-Asyl", - "avatarUrl": "https://cdn.discordapp.com/avatar.png", - } - ) - assert out.avatar_url == "https://cdn.discordapp.com/avatar.png" - - -def test_parse_discord_user_avatar_url_takes_priority_over_avatarUrl(): - """When both keys exist, avatar_url (Bot API key) wins.""" - out = parse_discord_user( - { - "id": "1", - "name": "x", - "avatar_url": 
"https://cdn/a", - "avatarUrl": "https://cdn/b", - } - ) - assert out.avatar_url == "https://cdn/a" - - -def test_parse_discord_user_isBot_camelCase(): - """Exporter uses isBot; must be interpreted as a boolean.""" - out = parse_discord_user({"id": "1", "name": "mybot", "isBot": True}) - assert out.is_bot is True - - -def test_parse_discord_user_nickname_as_display_name(): - """Exporter uses nickname; should be captured in display_name.""" - out = parse_discord_user( - { - "id": "5", - "name": "user5", - "nickname": "Cool User", - } - ) - assert out.display_name == "Cool User" - - -def test_parse_discord_user_invalid_id_defaults_to_zero(): - out = parse_discord_user({"id": "not-a-number", "name": "x"}) - assert out.user_id == 0 - - -def test_parse_discord_user_none_id_defaults_to_zero(): - out = parse_discord_user({"id": None, "name": "x"}) - assert out.user_id == 0 - - -# --- unchanged helpers --- - - -def test_sanitize_channel_name_strips_unsafe_chars(): - assert "/" not in sanitize_channel_name("a/b") - assert "*" not in sanitize_channel_name("x*y") - assert "?" 
not in sanitize_channel_name("help?") - - -def test_format_discord_url(): - assert format_discord_url(1, 2, 3) == "https://discord.com/channels/1/2/3" diff --git a/discord_activity_tracker/tests/test_task_discord_sync_coverage.py b/discord_activity_tracker/tests/test_task_discord_sync_coverage.py deleted file mode 100644 index 186b6713..00000000 --- a/discord_activity_tracker/tests/test_task_discord_sync_coverage.py +++ /dev/null @@ -1,466 +0,0 @@ -"""Coverage for task_discord_sync (fetch → persist → raw archive).""" - -from __future__ import annotations - -import json -import secrets -from datetime import datetime, timezone -from io import StringIO -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest - -from discord_activity_tracker.management.commands.run_discord_activity_tracker import ( - DiscordActivityCollector, - task_discord_sync, -) -from discord_activity_tracker.sync.chat_exporter import ChannelDayExport - - -def _phony_token() -> str: - return secrets.token_hex(16) - - -def _channel_day_export( - path, - *, - day_str: str = "2026-01-15", - channel_id: int = 0, -) -> ChannelDayExport: - return ChannelDayExport(path=path, day_str=day_str, channel_id=channel_id) - - -def _minimal_envelope(guild_id: int, channel_id: int): - msg = { - "id": str(10**12 + guild_id + channel_id), - "type": "Default", - "isPinned": False, - "timestamp": "2026-01-15T12:00:00Z", - "content": "hello world example text long enough for validation", - "author": {"id": "1082347485026070548", "name": "user"}, - "attachments": [], - "reactions": [], - } - return { - "guild": {"id": str(guild_id), "name": "G"}, - "channel": {"id": str(channel_id), "name": "c", "type": "GuildTextChat"}, - "messages": [msg], - } - - -@pytest.mark.django_db -def test_task_discord_sync_skip_returns_early(settings): - tok = _phony_token() - settings.DISCORD_USER_TOKEN = tok - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.SUCCESS = lambda x: x - collector = 
DiscordActivityCollector(cmd=cmd, options={}) - task_discord_sync( - dry_run=False, - skip_discord_sync=True, - user_token=tok, - guild_id=1, - channel_ids=[], - after_date=None, - before_date=None, - per_channel_incremental=False, - collector=collector, - ) - - -@pytest.mark.django_db -def test_task_discord_sync_dry_run_returns_early(settings): - tok = _phony_token() - settings.DISCORD_USER_TOKEN = tok - cmd = MagicMock() - cmd.stdout = StringIO() - collector = DiscordActivityCollector(cmd=cmd, options={}) - task_discord_sync( - dry_run=True, - skip_discord_sync=False, - user_token=tok, - guild_id=1, - channel_ids=[], - after_date=None, - before_date=None, - per_channel_incremental=False, - collector=collector, - ) - - -@pytest.mark.django_db -def test_task_discord_sync_happy_path_rename_raw(settings, tmp_path, monkeypatch): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - tok = _phony_token() - settings.DISCORD_USER_TOKEN = tok - - gid, cid = 880011, 880022 - staging = tmp_path / "staging" - staging.mkdir() - raw_ch = tmp_path / "raw" / str(gid) / str(cid) - raw_ch.mkdir(parents=True) - - jpath = staging / "c.json" - jpath.write_text(json.dumps(_minimal_envelope(gid, cid)), encoding="utf-8") - - def fake_export(**_kwargs): - return [_channel_day_export(jpath, day_str="2026-01-15", channel_id=cid)] - - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - collector._persist_channel = AsyncMock(return_value=1) - - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - side_effect=fake_export, - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", - return_value=staging, - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.clear_exporter_staging_dir", - 
), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_channel_raw_dir", - return_value=raw_ch, - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_raw_dir", - return_value=tmp_path / "raw", - ), - ): - task_discord_sync( - dry_run=False, - skip_discord_sync=False, - user_token=tok, - guild_id=gid, - channel_ids=[], - after_date=datetime(2026, 1, 1, tzinfo=timezone.utc), - before_date=None, - per_channel_incremental=False, - collector=collector, - ) - - dest = raw_ch / "2026-01-15.json" - assert dest.is_file() - assert not jpath.exists() - - -@pytest.mark.django_db -def test_task_discord_sync_skips_channel_not_in_allowlist(settings, tmp_path): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - tok = _phony_token() - - gid, cid = 770011, 770022 - staging = tmp_path / "st2" - staging.mkdir() - jpath = staging / "x.json" - jpath.write_text(json.dumps(_minimal_envelope(gid, cid)), encoding="utf-8") - - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - collector._persist_channel = AsyncMock(return_value=0) - - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - return_value=[ - _channel_day_export(jpath, day_str="2026-01-15", channel_id=cid) - ], - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", - return_value=staging, - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.clear_exporter_staging_dir", - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_channel_raw_dir", - return_value=tmp_path / "raw" / str(gid) / str(cid), - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_raw_dir", - 
return_value=tmp_path / "raw", - ), - ): - task_discord_sync( - dry_run=False, - skip_discord_sync=False, - user_token=tok, - guild_id=gid, - channel_ids=[999999], - after_date=None, - before_date=None, - per_channel_incremental=False, - collector=collector, - ) - - assert not jpath.exists() - - -@pytest.mark.django_db -def test_task_discord_sync_staging_validation_error_keeps_file( - settings, tmp_path, monkeypatch -): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - tok = _phony_token() - gid, cid = 660011, 660022 - staging = tmp_path / "st3" - staging.mkdir() - jpath = staging / "bad.json" - jpath.write_text( - json.dumps({"guild": {}, "channel": {}, "messages": "bad"}), encoding="utf-8" - ) - - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - return_value=[ - _channel_day_export(jpath, day_str="2026-01-15", channel_id=cid) - ], - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", - return_value=staging, - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.clear_exporter_staging_dir", - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_channel_raw_dir", - return_value=tmp_path / "raw" / str(gid) / str(cid), - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_raw_dir", - return_value=tmp_path / "raw", - ), - ): - task_discord_sync( - dry_run=False, - skip_discord_sync=False, - user_token=tok, - guild_id=gid, - channel_ids=[], - after_date=None, - before_date=None, - per_channel_incremental=False, - collector=collector, - ) - assert jpath.is_file() - - -@pytest.mark.django_db -def 
test_task_discord_sync_value_error_unlinks(settings, tmp_path): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - tok = _phony_token() - gid, cid = 550011, 550022 - staging = tmp_path / "st4" - staging.mkdir() - jpath = staging / "v.json" - jpath.write_text("{", encoding="utf-8") - - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - return_value=[ - _channel_day_export(jpath, day_str="2026-01-15", channel_id=cid) - ], - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", - return_value=staging, - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.clear_exporter_staging_dir", - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_channel_raw_dir", - return_value=tmp_path / "raw" / str(gid) / str(cid), - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_raw_dir", - return_value=tmp_path / "raw", - ), - ): - task_discord_sync( - dry_run=False, - skip_discord_sync=False, - user_token=tok, - guild_id=gid, - channel_ids=[], - after_date=None, - before_date=None, - per_channel_incremental=False, - collector=collector, - ) - assert not jpath.exists() - - -@pytest.mark.django_db -def test_task_discord_sync_exporter_error_becomes_command_error(settings, tmp_path): - from django.core.management.base import CommandError - - from discord_activity_tracker.sync.chat_exporter import DiscordChatExporterError - - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - tok = _phony_token() - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.SUCCESS = lambda x: x - 
collector = DiscordActivityCollector(cmd=cmd, options={}) - with patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - side_effect=DiscordChatExporterError("boom"), - ): - with pytest.raises(CommandError, match="DiscordChatExporter"): - task_discord_sync( - dry_run=False, - skip_discord_sync=False, - user_token=tok, - guild_id=1, - channel_ids=[], - after_date=None, - before_date=None, - per_channel_incremental=False, - collector=collector, - ) - - -@pytest.mark.django_db -def test_task_discord_sync_persist_raises_unlinks(settings, tmp_path): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - tok = _phony_token() - gid, cid = 440011, 440022 - staging = tmp_path / "st5" - staging.mkdir() - jpath = staging / "ok.json" - jpath.write_text(json.dumps(_minimal_envelope(gid, cid)), encoding="utf-8") - - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - collector._persist_channel = AsyncMock(side_effect=RuntimeError("db")) - - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - return_value=[ - _channel_day_export(jpath, day_str="2026-01-15", channel_id=cid) - ], - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", - return_value=staging, - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.clear_exporter_staging_dir", - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_channel_raw_dir", - return_value=tmp_path / "raw" / str(gid) / str(cid), - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_raw_dir", - return_value=tmp_path / "raw", - ), - ): - task_discord_sync( - dry_run=False, - skip_discord_sync=False, - 
user_token=tok, - guild_id=gid, - channel_ids=[], - after_date=None, - before_date=None, - per_channel_incremental=False, - collector=collector, - ) - assert not jpath.exists() - - -@pytest.mark.django_db -def test_task_discord_sync_stdout_includes_before_date(settings, tmp_path): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - tok = _phony_token() - gid, cid = 410011, 410022 - staging = tmp_path / "st6" - staging.mkdir() - jpath = staging / "bd.json" - jpath.write_text(json.dumps(_minimal_envelope(gid, cid)), encoding="utf-8") - - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - collector._persist_channel = AsyncMock(return_value=0) - before = datetime(2026, 12, 31, tzinfo=timezone.utc) - with ( - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.export_guild_to_json", - return_value=[ - _channel_day_export(jpath, day_str="2026-01-15", channel_id=cid) - ], - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_exporter_staging_dir", - return_value=staging, - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.clear_exporter_staging_dir", - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_channel_raw_dir", - return_value=tmp_path / "raw" / str(gid) / str(cid), - ), - patch( - "discord_activity_tracker.management.commands.run_discord_activity_tracker.get_raw_dir", - return_value=tmp_path / "raw", - ), - ): - task_discord_sync( - dry_run=False, - skip_discord_sync=False, - user_token=tok, - guild_id=gid, - channel_ids=[], - after_date=None, - before_date=before, - per_channel_incremental=False, - collector=collector, - ) - out = cmd.stdout.getvalue() - assert "Upper bound" in out diff --git a/discord_activity_tracker/tests/test_task_markdown_coverage.py 
b/discord_activity_tracker/tests/test_task_markdown_coverage.py deleted file mode 100644 index 60600b7e..00000000 --- a/discord_activity_tracker/tests/test_task_markdown_coverage.py +++ /dev/null @@ -1,163 +0,0 @@ -"""Coverage for task_markdown_export_and_push.""" - -from __future__ import annotations - -from io import StringIO -from unittest.mock import MagicMock, patch - -import pytest - -from discord_activity_tracker.management.commands.run_discord_activity_tracker import ( - DiscordActivityCollector, - task_markdown_export_and_push, -) -from discord_activity_tracker.models import DiscordServer - - -@pytest.mark.django_db -def test_task_markdown_skip_export(): - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.WARNING = lambda x: x - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - task_markdown_export_and_push( - dry_run=False, - skip_markdown_export=True, - skip_remote_push=False, - guild_id=1, - collector=collector, - ) - - -@pytest.mark.django_db -def test_task_markdown_no_context_path(monkeypatch, tmp_path, settings): - monkeypatch.setattr(settings, "DISCORD_CONTEXT_REPO_PATH", "") - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.WARNING = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - task_markdown_export_and_push( - dry_run=False, - skip_markdown_export=False, - skip_remote_push=False, - guild_id=1, - collector=collector, - ) - - -@pytest.mark.django_db -def test_task_markdown_dry_run(tmp_path, settings): - p = tmp_path / "ctx" - p.mkdir() - settings.DISCORD_CONTEXT_REPO_PATH = str(p) - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.WARNING = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - task_markdown_export_and_push( - dry_run=True, - skip_markdown_export=False, - skip_remote_push=False, - guild_id=1, - collector=collector, - ) - assert "ctx" in 
cmd.stdout.getvalue() or "dry-run" in cmd.stdout.getvalue().lower() - - -@pytest.mark.django_db -def test_task_markdown_server_not_in_db(tmp_path, settings): - settings.DISCORD_CONTEXT_REPO_PATH = str(tmp_path / "ctx") - (tmp_path / "ctx").mkdir() - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.WARNING = lambda x: x - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - task_markdown_export_and_push( - dry_run=False, - skip_markdown_export=False, - skip_remote_push=True, - guild_id=999888777666, - collector=collector, - ) - assert "not in DB" in cmd.stdout.getvalue() or "Server" in cmd.stdout.getvalue() - - -@pytest.mark.django_db -def test_task_markdown_export_success(tmp_path, settings): - settings.DISCORD_CONTEXT_REPO_PATH = str(tmp_path / "ctx") - (tmp_path / "ctx").mkdir() - srv = DiscordServer.objects.create(server_id=424242, server_name="S", icon_url="") - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.WARNING = lambda x: x - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - with patch( - "discord_activity_tracker.sync.export.export_and_push", - return_value=True, - ): - task_markdown_export_and_push( - dry_run=False, - skip_markdown_export=False, - skip_remote_push=True, - guild_id=srv.server_id, - collector=collector, - ) - assert "Exported" in cmd.stdout.getvalue() - - -@pytest.mark.django_db -def test_task_markdown_export_warns_on_false(tmp_path, settings): - settings.DISCORD_CONTEXT_REPO_PATH = str(tmp_path / "ctx") - (tmp_path / "ctx").mkdir() - srv = DiscordServer.objects.create(server_id=424243, server_name="S2", icon_url="") - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.WARNING = lambda x: x - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - with patch( - "discord_activity_tracker.sync.export.export_and_push", - 
return_value=False, - ): - task_markdown_export_and_push( - dry_run=False, - skip_markdown_export=False, - skip_remote_push=True, - guild_id=srv.server_id, - collector=collector, - ) - assert "No markdown" in cmd.stdout.getvalue() - - -@pytest.mark.django_db -def test_task_markdown_auto_commit_path(tmp_path, settings): - settings.DISCORD_CONTEXT_REPO_PATH = str(tmp_path / "ctx") - (tmp_path / "ctx").mkdir() - settings.DISCORD_CONTEXT_AUTO_COMMIT = True - srv = DiscordServer.objects.create(server_id=424245, server_name="S4", icon_url="") - cmd = MagicMock() - cmd.stdout = StringIO() - cmd.style = MagicMock() - cmd.style.WARNING = lambda x: x - cmd.style.SUCCESS = lambda x: x - collector = DiscordActivityCollector(cmd=cmd, options={}) - with patch( - "discord_activity_tracker.sync.export.export_and_push", - return_value=True, - ): - task_markdown_export_and_push( - dry_run=False, - skip_markdown_export=False, - skip_remote_push=False, - guild_id=srv.server_id, - collector=collector, - ) diff --git a/discord_activity_tracker/tests/test_workspace.py b/discord_activity_tracker/tests/test_workspace.py deleted file mode 100644 index 328ce68b..00000000 --- a/discord_activity_tracker/tests/test_workspace.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Tests for discord_activity_tracker.workspace.""" - -from unittest.mock import patch - -import pytest - -from discord_activity_tracker.workspace import ( - get_channel_json_path, - get_channel_raw_dir, - get_exporter_staging_dir, - get_messages_json_path, - get_raw_dir, - get_server_dir, - get_workspace_root, - iter_existing_message_jsons, -) - - -@pytest.fixture -def mock_discord_workspace(tmp_path): - root = tmp_path / "discord_activity_tracker" - root.mkdir(parents=True) - return root - - -def test_get_workspace_root(mock_discord_workspace): - with patch( - "discord_activity_tracker.workspace.get_workspace_path", - return_value=mock_discord_workspace, - ): - assert get_workspace_root() == mock_discord_workspace - - -def 
test_get_raw_dir_matches_boost_style_layout(settings, tmp_path): - """Raw JSON lives under WORKSPACE_DIR/raw/discord_activity_tracker/.""" - settings.WORKSPACE_DIR = str(tmp_path) - raw = get_raw_dir() - assert raw == tmp_path / "raw" / "discord_activity_tracker" - assert raw.is_dir() - - -def test_get_channel_raw_dir_nested(settings, tmp_path): - settings.WORKSPACE_DIR = str(tmp_path) - p = get_channel_raw_dir(4242, 9001) - assert p == tmp_path / "raw" / "discord_activity_tracker" / "4242" / "9001" - assert p.is_dir() - - -def test_get_exporter_staging_dir_under_workspace_root(mock_discord_workspace): - with patch( - "discord_activity_tracker.workspace.get_workspace_path", - return_value=mock_discord_workspace, - ): - staging = get_exporter_staging_dir() - assert staging == mock_discord_workspace / "_exporter_staging" - assert staging.is_dir() - - -def test_get_server_dir(mock_discord_workspace): - with patch( - "discord_activity_tracker.workspace.get_workspace_path", - return_value=mock_discord_workspace, - ): - p = get_server_dir(4242) - assert p == mock_discord_workspace / "4242" - assert p.is_dir() - - -def test_get_channel_json_path(mock_discord_workspace): - with patch( - "discord_activity_tracker.workspace.get_workspace_path", - return_value=mock_discord_workspace, - ): - path = get_channel_json_path(1, 999) - assert path.name == "999.json" - assert path.parent.name == "channels" - - -def test_get_messages_json_path(mock_discord_workspace): - with patch( - "discord_activity_tracker.workspace.get_workspace_path", - return_value=mock_discord_workspace, - ): - path = get_messages_json_path(1, 2, "2026-05-01") - assert path.name == "2026-05-01.json" - - -def test_iter_existing_message_jsons_yields_files(mock_discord_workspace): - with patch( - "discord_activity_tracker.workspace.get_workspace_path", - return_value=mock_discord_workspace, - ): - msg_dir = mock_discord_workspace / "7" / "messages" / "8" - msg_dir.mkdir(parents=True) - (msg_dir / 
"day.json").write_text("{}", encoding="utf-8") - paths = list(iter_existing_message_jsons(7, 8)) - assert len(paths) == 1 - assert paths[0].name == "day.json" - - -def test_iter_existing_message_jsons_skips_appledouble_sidecars(mock_discord_workspace): - with patch( - "discord_activity_tracker.workspace.get_workspace_path", - return_value=mock_discord_workspace, - ): - msg_dir = mock_discord_workspace / "7" / "messages" / "8" - msg_dir.mkdir(parents=True) - (msg_dir / "day.json").write_text("{}", encoding="utf-8") - (msg_dir / "._day.json").write_bytes(b"\xb0") - paths = list(iter_existing_message_jsons(7, 8)) - assert len(paths) == 1 - assert paths[0].name == "day.json" - - -def test_iter_existing_message_jsons_empty_when_missing(mock_discord_workspace): - with patch( - "discord_activity_tracker.workspace.get_workspace_path", - return_value=mock_discord_workspace, - ): - assert list(iter_existing_message_jsons(99, 99)) == [] - - -def test_iter_existing_message_jsons_sorted_by_filename(mock_discord_workspace): - """Paths must be yielded in sorted order for deterministic incremental reads.""" - with patch( - "discord_activity_tracker.workspace.get_workspace_path", - return_value=mock_discord_workspace, - ): - msg_dir = mock_discord_workspace / "7" / "messages" / "8" - msg_dir.mkdir(parents=True) - (msg_dir / "2026-01-02.json").write_text("{}", encoding="utf-8") - (msg_dir / "2026-01-01.json").write_text("{}", encoding="utf-8") - names = [p.name for p in iter_existing_message_jsons(7, 8)] - assert names == ["2026-01-01.json", "2026-01-02.json"] diff --git a/discord_activity_tracker/tests/test_workspace_clear_staging.py b/discord_activity_tracker/tests/test_workspace_clear_staging.py deleted file mode 100644 index 1a2914f0..00000000 --- a/discord_activity_tracker/tests/test_workspace_clear_staging.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Coverage for workspace.clear_exporter_staging_dir.""" - -from __future__ import annotations - -import pytest - -from 
discord_activity_tracker.workspace import ( - clear_exporter_staging_dir, - get_exporter_staging_dir, -) - - -@pytest.mark.django_db -def test_clear_exporter_staging_dir_removes_children(tmp_path, settings): - settings.WORKSPACE_DIR = tmp_path / "ws" - settings.WORKSPACE_DIR.mkdir(parents=True) - staging = get_exporter_staging_dir() - (staging / "a.json").write_text("{}", encoding="utf-8") - sub = staging / "sub" - sub.mkdir() - (sub / "x.txt").write_text("x", encoding="utf-8") - clear_exporter_staging_dir() - assert list(staging.iterdir()) == [] diff --git a/discord_activity_tracker/tests/test_write_staging_json_schema_script.py b/discord_activity_tracker/tests/test_write_staging_json_schema_script.py deleted file mode 100644 index 724f03e9..00000000 --- a/discord_activity_tracker/tests/test_write_staging_json_schema_script.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Coverage for write_staging_json_schema script entrypoint.""" - -from __future__ import annotations - -from unittest.mock import patch - -import discord_activity_tracker.scripts.write_staging_json_schema as mod - - -def test_write_staging_json_schema_main_prints_path(tmp_path, capsys): - target = tmp_path / "schema.json" - with patch.object(mod, "write_staging_json_schema", return_value=target): - mod.main() - out = capsys.readouterr().out.strip() - assert str(target) in out diff --git a/discord_activity_tracker/utils/__init__.py b/discord_activity_tracker/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/discord_activity_tracker/utils/discord_internal_tokens_store.py b/discord_activity_tracker/utils/discord_internal_tokens_store.py deleted file mode 100644 index 21a481c5..00000000 --- a/discord_activity_tracker/utils/discord_internal_tokens_store.py +++ /dev/null @@ -1,186 +0,0 @@ -"""Persist Discord session credentials as JSON under workspace/discord_activity_tracker/.""" - -from __future__ import annotations - -import json -import logging -import os -from datetime import 
datetime, timezone -from pathlib import Path -from typing import Any - -from django.conf import settings - -from discord_activity_tracker.workspace import get_discord_internal_tokens_json_path - -logger = logging.getLogger(__name__) - -DISCORD_TOKENS_RELOGIN_HINT = "Session credentials invalid or unavailable. Check workspace configuration per .env.example." - - -def discord_internal_tokens_json_path() -> Path: - """Resolved path to the tokens JSON file.""" - override = (getattr(settings, "DISCORD_INTERNAL_TOKENS_JSON", "") or "").strip() - if override: - path = Path(override).expanduser() - if not path.is_absolute(): - path = Path.cwd() / path - return path.resolve() - return get_discord_internal_tokens_json_path().resolve() - - -def _read_document(path: Path) -> dict[str, Any]: - if not path.is_file(): - return {} - raw = path.read_text(encoding="utf-8") - if not raw.strip(): - return {} - data = json.loads(raw) - if not isinstance(data, dict): - raise ValueError(f"Invalid tokens file (expected object): {path}") - return data - - -def _write_document(path: Path, data: dict[str, Any]) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - tmp = path.with_suffix(".json.tmp") - payload = json.dumps(data, indent=2, sort_keys=True) - tmp.write_text(payload + "\n", encoding="utf-8") - os.replace(tmp, path) - try: - os.chmod(path, 0o600) - except OSError: - pass - logger.debug("Saved Discord session credentials to %s", path) - - -def save_discord_internal_tokens( - user_token: str, - *, - user_id: str | None = None, - username: str | None = None, -) -> Path: - """Write session credential into workspace JSON. 
Returns path written.""" - user_token = (user_token or "").strip() - if not user_token: - raise ValueError("user_token is required") - - path = discord_internal_tokens_json_path() - entry: dict[str, Any] = { - "user_token": user_token, - "updated_at": datetime.now(timezone.utc).isoformat(), - } - if user_id: - entry["user_id"] = user_id - if username: - entry["username"] = username - _write_document(path, entry) - return path - - -def load_discord_internal_tokens() -> dict[str, str] | None: - """Load credential record, or None if missing.""" - path = discord_internal_tokens_json_path() - try: - doc = _read_document(path) - except (OSError, json.JSONDecodeError, ValueError) as e: - logger.warning( - "Could not read Discord session credentials from %s: %s", path, e - ) - return None - user_token = (doc.get("user_token") or "").strip() - if not user_token: - return None - out: dict[str, str] = {"user_token": user_token} - if doc.get("user_id"): - out["user_id"] = str(doc["user_id"]) - if doc.get("username"): - out["username"] = str(doc["username"]) - return out - - -def extract_and_save_discord_internal_tokens() -> str | None: - """Load credentials from workspace storage and persist to workspace JSON.""" - from discord_activity_tracker.utils.discord_tokens import extract_discord_token_auto - - tokens = extract_discord_token_auto() - if not tokens or "user_token" not in tokens: - return None - save_discord_internal_tokens( - tokens["user_token"], - user_id=tokens.get("user_id"), - username=tokens.get("username"), - ) - return tokens["user_token"] - - -def _allow_internal_discord_tokens() -> bool: - allow = getattr(settings, "ALLOW_INTERNAL_DISCORD_TOKENS", False) - if isinstance(allow, str): - return allow.strip().lower() == "true" - return bool(allow) - - -def get_discord_user_token_from_json() -> str | None: - """Return session credential from workspace JSON when internal mode is enabled.""" - if not _allow_internal_discord_tokens(): - return None - record = 
load_discord_internal_tokens() - if not record: - return None - return record["user_token"] - - -def log_discord_internal_tokens_still_invalid() -> None: - """Log when session credentials remain invalid after refresh.""" - logger.error( - "Discord session credentials still invalid. %s", - DISCORD_TOKENS_RELOGIN_HINT, - ) - - -def log_discord_internal_tokens_extract_failed() -> None: - """Log when session credentials could not be loaded from workspace storage.""" - logger.error( - "Failed to load Discord session credentials. %s", - DISCORD_TOKENS_RELOGIN_HINT, - ) - - -def _extract_validate_and_return() -> str | None: - """Refresh credentials from workspace storage; return token only if auth probe passes.""" - from discord_activity_tracker.utils.discord_tokens import probe_discord_user_token - - token = extract_and_save_discord_internal_tokens() - if not token: - log_discord_internal_tokens_extract_failed() - return None - if probe_discord_user_token(token): - return token - log_discord_internal_tokens_still_invalid() - return None - - -def get_or_load_discord_user_token() -> str | None: - """ - Return Discord credential for DiscordChatExporter. - - Reads workspace JSON when internal mode is enabled and refreshes when stale. - Otherwise returns credential from settings (.env). 
- """ - if not _allow_internal_discord_tokens(): - return (getattr(settings, "DISCORD_USER_TOKEN", "") or "").strip() or None - - from discord_activity_tracker.utils.discord_tokens import probe_discord_user_token - - token = get_discord_user_token_from_json() - if token: - if probe_discord_user_token(token): - return token - logger.info("Discord session credentials in JSON are stale; refreshing") - return _extract_validate_and_return() - - logger.info( - "Discord session credentials not in JSON; loading from workspace storage" - ) - return _extract_validate_and_return() diff --git a/discord_activity_tracker/utils/discord_tokens.py b/discord_activity_tracker/utils/discord_tokens.py deleted file mode 100644 index f47f01f4..00000000 --- a/discord_activity_tracker/utils/discord_tokens.py +++ /dev/null @@ -1,253 +0,0 @@ -"""Discord session credential helpers for DiscordChatExporter flows.""" - -from __future__ import annotations - -import logging -import re -import shutil -import tempfile -from pathlib import Path - -import requests -from django.conf import settings - -logger = logging.getLogger(__name__) - -DISCORD_USERS_ME_URL = "https://discord.com/api/v9/users/@me" - -# Local storage keys for Discord session credentials. -DISCORD_TOKEN_KEY = b"_https://discord.com\x00\x01token" -DISCORD_TOKEN_KEY_LEGACY = b"_https://discordapp.com\x00\x01token" -DISCORD_TOKEN_MARKER = b"\x01token" - -CHROME_PROFILE_PATH_PATTERN = re.compile(r"^[a-zA-Z0-9/_. \-:]+$") - -# Substrings in DiscordChatExporter stderr that indicate auth failure. -DISCORD_EXPORTER_AUTH_MARKERS = ( - "401", - "403", - "unauthorized", - "Unauthorized", - "invalid token", - "Invalid token", - "not authorized", - "Not authorized", -) - - -def _validate_chrome_profile_path(path: str) -> str: - """Validate DISCORD_CHROME_PROFILE_PATH format. 
Raises ValueError if invalid.""" - if not path or not isinstance(path, str): - raise ValueError("DISCORD_CHROME_PROFILE_PATH must be a non-empty string") - path = path.strip() - if "\x00" in path: - raise ValueError("DISCORD_CHROME_PROFILE_PATH must not contain null bytes") - normalized = Path(path).as_posix() - if not CHROME_PROFILE_PATH_PATTERN.match(normalized): - raise ValueError( - "DISCORD_CHROME_PROFILE_PATH must contain only path characters " - "(letters, digits, /, _, ., -, space, :), got: %s" % (path[:100],) - ) - return path - - -def _resolve_discord_chrome_profile_root() -> Path: - """Return validated session storage directory for Discord credentials.""" - from discord_activity_tracker.workspace import get_chrome_profile_path - - raw = (getattr(settings, "DISCORD_CHROME_PROFILE_PATH", "") or "").strip() - if not raw: - return get_chrome_profile_path() - validated = _validate_chrome_profile_path(raw) - root = Path(validated).expanduser() - if not root.is_absolute(): - root = Path.cwd() / root - return root.resolve() - - -def _leveldb_path(profile_root: Path) -> Path: - return profile_root / "Default" / "Local Storage" / "leveldb" - - -def _parse_discord_token_raw(raw: bytes) -> str: - """Parse credential value from local storage (strip prefix byte + JSON quotes).""" - if not raw: - raise ValueError("Discord token value is empty") - if raw[0:1] in (b"\x00", b"\x01"): - text = raw[1:].decode("utf-8", errors="replace") - else: - text = raw.decode("utf-8", errors="replace") - text = text.strip() - if len(text) >= 2 and text[0] == '"' and text[-1] == '"': - text = text[1:-1] - token = text.strip() - if not token: - raise ValueError("Discord token value is empty after parsing") - return token - - -def _read_leveldb_value(leveldb_dir: Path, key: bytes) -> bytes | None: - """Read a single key from local storage; copy to temp dir if locked.""" - try: - import plyvel - except ImportError: - logger.warning( - "plyvel is not installed; cannot read session storage at 
%s. " - "See .env.example for supported environments.", - leveldb_dir, - ) - return None - - keys_to_try = (key,) - - def _get_from_db(db_path: str) -> bytes | None: - db = plyvel.DB(db_path, create_if_missing=False) - try: - for k in keys_to_try: - value = db.get(k) - if value is not None: - return value - for db_key, db_value in db.iterator(): - if DISCORD_TOKEN_MARKER in db_key and db_key.endswith(b"token"): - return db_value - return None - finally: - db.close() - - try: - return _get_from_db(str(leveldb_dir)) - except plyvel.Error as e: - err = str(e).lower() - if "lock" not in err and "resource temporarily unavailable" not in err: - raise - logger.debug("LevelDB locked at %s, copying to temp dir", leveldb_dir) - with tempfile.TemporaryDirectory(prefix="leveldb-") as tmp: - shutil.copytree(leveldb_dir, Path(tmp) / "leveldb", dirs_exist_ok=True) - return _get_from_db(str(Path(tmp) / "leveldb")) - - -def _read_discord_token_from_leveldb(profile_root: Path) -> str | None: - """Load Discord credential from configured session storage.""" - leveldb_dir = _leveldb_path(profile_root) - if not leveldb_dir.is_dir(): - logger.warning("LevelDB not found at %s", leveldb_dir) - return None - for key in (DISCORD_TOKEN_KEY, DISCORD_TOKEN_KEY_LEGACY): - try: - raw = _read_leveldb_value(leveldb_dir, key) - if raw: - return _parse_discord_token_raw(raw) - except ValueError as e: - logger.warning( - "Error parsing Discord credential from session storage: %s", e - ) - continue - except Exception as e: - logger.warning( - "Error reading Discord credential from session storage: %s", e - ) - continue - logger.warning("Discord credential not found in %s", leveldb_dir) - return None - - -def probe_discord_user_token(token: str) -> bool: - """Return True if credential authenticates against Discord GET /users/@me.""" - token = (token or "").strip() - if not token: - return False - try: - response = requests.get( - DISCORD_USERS_ME_URL, - headers={"Authorization": token}, - timeout=30, - 
) - if response.status_code == 200: - return True - if response.status_code in (401, 403): - logger.debug( - "Discord token probe auth error: HTTP %s", response.status_code - ) - return False - logger.debug( - "Discord token probe unexpected status %s (treating as invalid)", - response.status_code, - ) - return False - except Exception as e: - logger.debug("Discord token probe request failed: %s", e) - return False - - -def probe_discord_user_token_details(token: str) -> dict | None: - """Return user details from GET /users/@me when credential is valid, else None.""" - token = (token or "").strip() - if not token: - return None - try: - response = requests.get( - DISCORD_USERS_ME_URL, - headers={"Authorization": token}, - timeout=30, - ) - if response.status_code != 200: - return None - data = response.json() - if not isinstance(data, dict): - return None - user_id = str(data.get("id") or "").strip() - username = str(data.get("username") or "").strip() - out: dict[str, str] = {} - if user_id: - out["user_id"] = user_id - if username: - out["username"] = username - return out or None - except Exception as e: - logger.debug("Discord token probe details failed: %s", e) - return None - - -def is_discord_exporter_auth_error(message: str) -> bool: - """True if DiscordChatExporter stderr/message indicates auth failure.""" - text = (message or "").lower() - if not text: - return False - if "401" in message or "403" in message: - return True - for marker in DISCORD_EXPORTER_AUTH_MARKERS: - if marker.lower() in text: - return True - return False - - -def extract_discord_token_auto() -> dict | None: - """Load Discord session credentials from configured workspace paths.""" - logger.debug("Loading Discord session credentials") - try: - profile_root = _resolve_discord_chrome_profile_root() - except ValueError as e: - logger.error("%s", e) - return None - if not profile_root.is_dir(): - logger.error( - "Session storage not found at %s. 
See .env.example.", - profile_root, - ) - return None - user_token = _read_discord_token_from_leveldb(profile_root) - if not user_token: - logger.error( - "Failed to read Discord credentials from workspace storage. See .env.example." - ) - return None - if not probe_discord_user_token(user_token): - logger.error( - "Discord credentials failed auth probe. Session may be expired or invalid." - ) - return None - result: dict[str, str] = {"user_token": user_token} - details = probe_discord_user_token_details(user_token) - if details: - result.update(details) - return result diff --git a/discord_activity_tracker/workspace.py b/discord_activity_tracker/workspace.py deleted file mode 100644 index c3335db7..00000000 --- a/discord_activity_tracker/workspace.py +++ /dev/null @@ -1,110 +0,0 @@ -""" -Workspace utilities - path helpers for raw export JSON and per-server data. - -Layout: workspace/discord_activity_tracker/ - - chrome_profile/ (session storage for exporter credentials) - - discord_internal_tokens.json (session credentials, not .env) - - _exporter_staging/ (temporary DiscordChatExporter output; cleared each run) -""" - -from pathlib import Path - -from django.conf import settings - -from config.workspace import get_workspace_path - -_APP_SLUG = "discord_activity_tracker" - -# Pre-exported DiscordChatExporter JSON dropped here for DB import (see backfill command). 
-CPP_DISCUSSION_IMPORT_SUBDIR = "Discussion - c-cpp-discussion" -CHROME_PROFILE_DIRNAME = "chrome_profile" -DISCORD_INTERNAL_TOKENS_FILENAME = "discord_internal_tokens.json" - - -def get_workspace_root() -> Path: - """Return workspace/discord_activity_tracker/.""" - return get_workspace_path(_APP_SLUG) - - -def get_chrome_profile_path() -> Path: - """Session storage directory for Discord exporter credentials.""" - path = get_workspace_root() / CHROME_PROFILE_DIRNAME - path.mkdir(parents=True, exist_ok=True) - return path - - -def get_discord_internal_tokens_json_path() -> Path: - """JSON file storing Discord session credentials.""" - return get_workspace_root() / DISCORD_INTERNAL_TOKENS_FILENAME - - -def get_cpp_discussion_import_dir() -> Path: - """Return workspace/discord_activity_tracker/Discussion - c-cpp-discussion/ (creates if missing).""" - path = get_workspace_root() / CPP_DISCUSSION_IMPORT_SUBDIR - path.mkdir(parents=True, exist_ok=True) - return path - - -def get_raw_dir() -> Path: - """Return WORKSPACE_DIR/raw/discord_activity_tracker/ for archived JSON (Boost-style layout).""" - path = Path(settings.WORKSPACE_DIR) / "raw" / _APP_SLUG - path.mkdir(parents=True, exist_ok=True) - return path - - -def get_exporter_staging_dir() -> Path: - """Temporary directory for DiscordChatExporter output before per-day archival.""" - path = get_workspace_root() / "_exporter_staging" - path.mkdir(parents=True, exist_ok=True) - return path - - -def get_channel_raw_dir(server_id: int, channel_id: int) -> Path: - """Return raw/discord_activity_tracker/// for saved exports.""" - path = get_raw_dir() / str(server_id) / str(channel_id) - path.mkdir(parents=True, exist_ok=True) - return path - - -def clear_exporter_staging_dir() -> None: - """Remove all files and subdirectories under the exporter staging directory.""" - import shutil - - staging = get_exporter_staging_dir() - for child in staging.iterdir(): - if child.is_file(): - child.unlink(missing_ok=True) - elif 
child.is_dir(): - shutil.rmtree(child, ignore_errors=True) - - -def get_server_dir(server_id: int) -> Path: - """Return workspace/discord_activity_tracker// (creates if needed).""" - path = get_workspace_root() / str(server_id) - path.mkdir(parents=True, exist_ok=True) - return path - - -def get_channel_json_path(server_id: int, channel_id: int) -> Path: - """Path for /channels/.json""" - path = get_server_dir(server_id) / "channels" - path.mkdir(parents=True, exist_ok=True) - return path / f"{channel_id}.json" - - -def get_messages_json_path(server_id: int, channel_id: int, date_str: str) -> Path: - """Path for /messages//.json""" - path = get_server_dir(server_id) / "messages" / str(channel_id) - path.mkdir(parents=True, exist_ok=True) - return path / f"{date_str}.json" - - -def iter_existing_message_jsons(server_id: int, channel_id: int): - """Yield paths for messages//*.json""" - messages_dir = get_server_dir(server_id) / "messages" / str(channel_id) - if not messages_dir.is_dir(): - return - for path in sorted(messages_dir.glob("*.json")): - if path.name.startswith("._"): - continue - yield path diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 8f2a1f11..de16b493 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -57,6 +57,3 @@ services: limits: cpus: "0.5" memory: 256M - - slack-chromium: - profiles: ["slack-session"] diff --git a/docker-compose.yml b/docker-compose.yml index 1d19806e..03c2fb4b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,5 @@ # Boost Data Collector - Docker Compose # Runs: Redis, Django (gunicorn), Celery worker, Celery beat. -# Optional: slack-chromium (profile slack-session) / discord-chromium (discord-session) for noVNC login. # DATABASE_URL must be set in .env (host Postgres, or postgres://...@db:5432/... if you enable db). 
services: @@ -30,35 +29,6 @@ services: timeout: 3s retries: 5 - slack-chromium: - profiles: ["slack-session"] - image: selenium/standalone-chrome:145.0-chromedriver-145.0-20260222 - platform: linux/amd64 - shm_size: "2g" - ports: - - "127.0.0.1:7900:7900" - volumes: - # Chrome profile only (no full workspace mount — least privilege for noVNC login). - - ./workspace/slack_event_handler/chrome_profile:/home/seluser/.config/google-chrome - environment: - SE_BROWSER_ARGS_USER_DATA_DIR: "--user-data-dir=/home/seluser/.config/google-chrome" - SE_BROWSER_ARGS_NO_SANDBOX: "--no-sandbox" - SE_BROWSER_ARGS_DISABLE_DEV_SHM: "--disable-dev-shm-usage" - - discord-chromium: - profiles: ["discord-session"] - image: selenium/standalone-chrome:145.0-chromedriver-145.0-20260222 - platform: linux/amd64 - shm_size: "2g" - ports: - - "127.0.0.1:7901:7900" - volumes: - - ./workspace/discord_activity_tracker/chrome_profile:/home/seluser/.config/google-chrome - environment: - SE_BROWSER_ARGS_USER_DATA_DIR: "--user-data-dir=/home/seluser/.config/google-chrome" - SE_BROWSER_ARGS_NO_SANDBOX: "--no-sandbox" - SE_BROWSER_ARGS_DISABLE_DEV_SHM: "--disable-dev-shm-usage" - web: build: . restart: unless-stopped @@ -74,8 +44,6 @@ services: CELERY_BROKER_URL: redis://redis:6379/0 CELERY_RESULT_BACKEND: redis://redis:6379/0 ALLOWED_HOSTS: ${ALLOWED_HOSTS:-localhost,127.0.0.1,web,0.0.0.0} - CHROME_PROFILE_PATH: /app/workspace/slack_event_handler/chrome_profile - DISCORD_CHROME_PROFILE_PATH: /app/workspace/discord_activity_tracker/chrome_profile volumes: - ./workspace:/app/workspace - ./logs:/app/logs @@ -108,8 +76,6 @@ services: DATABASE_URL: ${DATABASE_URL:?Set DATABASE_URL in .env (e.g. 
host.docker.internal or db:5432 with db service enabled)} CELERY_BROKER_URL: redis://redis:6379/0 CELERY_RESULT_BACKEND: redis://redis:6379/0 - CHROME_PROFILE_PATH: /app/workspace/slack_event_handler/chrome_profile - DISCORD_CHROME_PROFILE_PATH: /app/workspace/discord_activity_tracker/chrome_profile volumes: - ./workspace:/app/workspace - ./logs:/app/logs diff --git a/docs/Architecture_data_flow.md b/docs/Architecture_data_flow.md index f04075cd..f7f658ab 100644 --- a/docs/Architecture_data_flow.md +++ b/docs/Architecture_data_flow.md @@ -85,10 +85,8 @@ flowchart TB | `boost_mailing_list_tracker` | Mailing list archives | PostgreSQL, `WORKSPACE_DIR` (raw / message JSON) | | `clang_github_tracker` | LLVM/Clang GitHub activity | PostgreSQL, `WORKSPACE_DIR` | | `cppa_slack_tracker` | Slack messages and channels | PostgreSQL, `WORKSPACE_DIR` (per-channel JSON, raw) | -| `discord_activity_tracker` | Discord server activity | PostgreSQL, `WORKSPACE_DIR` | | `cppa_youtube_script_tracker` | YouTube transcript collection | PostgreSQL, `WORKSPACE_DIR` (metadata, raw VTT) | | `wg21_paper_tracker` | WG21 committee papers pipeline | PostgreSQL, `WORKSPACE_DIR` | | `cppa_pinecone_sync` | Hybrid vector upserts / namespaces | PostgreSQL (sync status, fail lists), Pinecone | -| `slack_event_handler` | Slack Bolt (Socket Mode): huddles / PR bot — **long-running process**, not the same as the YAML nightly batch | `WORKSPACE_DIR` (JSON state / queue; no ORM models), GitHub (optional MD uploads) | **Pinecone paths:** Many collectors write rows first; **`cppa_pinecone_sync`** (and some commands’ built-in sync phases) read from PostgreSQL and/or files and upsert into Pinecone. Namespace and field conventions vary by source; see [Pinecone_preprocess_guideline.md](Pinecone_preprocess_guideline.md) and per-app docs under [service_api/](service_api/). 
diff --git a/docs/Architecture_overview.md b/docs/Architecture_overview.md index 21162931..720d5302 100644 --- a/docs/Architecture_overview.md +++ b/docs/Architecture_overview.md @@ -46,10 +46,8 @@ Columns: **persistence** (usual durable stores), **coupling** (one-line upstream | **`cppa_pinecone_sync`** | Vector upserts, fail lists, sync status | Yes | PostgreSQL, Pinecone | **Upstream:** doc/GitHub/mailing collectors. **Downstream:** Pinecone index | [README](../cppa_pinecone_sync/README.md), [service_api](service_api/cppa_pinecone_sync.md), [Pinecone_preprocess_guideline](Pinecone_preprocess_guideline.md) | | **`clang_github_tracker`** | LLVM/Clang GitHub activity | Yes | PostgreSQL, workspace | **Upstream:** `github_activity_tracker` (via `sync_api`), `cppa_user_tracker` | [README](../clang_github_tracker/README.md), [service_api](service_api/clang_github_tracker.md) | | **`cppa_slack_tracker`** | Slack teams, channels, messages | Yes | PostgreSQL, workspace | **Upstream:** `cppa_user_tracker` | [README](../cppa_slack_tracker/README.md), [service_api](service_api/cppa_slack_tracker.md) | -| **`discord_activity_tracker`** | Discord servers, channels, messages | Yes | PostgreSQL, workspace | **Upstream:** `cppa_user_tracker` | [README](../discord_activity_tracker/README.md), [service_api](service_api/discord_activity_tracker.md) | | **`wg21_paper_tracker`** | WG21 papers and authors | Yes | PostgreSQL, workspace | **Upstream:** `cppa_user_tracker` | [README](../wg21_paper_tracker/README.md), [service_api](service_api/wg21_paper_tracker.md) | | **`cppa_youtube_script_tracker`** | YouTube metadata and transcripts | Yes | PostgreSQL, workspace | **Upstream:** `cppa_user_tracker` | [README](../cppa_youtube_script_tracker/README.md), [service_api](service_api/cppa_youtube_script_tracker.md) | -| **`slack_event_handler`** | Slack Socket Mode listener (PR bot / huddles) — **long-running**, not YAML batch | **No ORM** / no `services.py` | Workspace JSON, GitHub optional 
| **Upstream:** Slack events. **Downstream:** GitHub MD via operations | [README](../slack_event_handler/README.md) — *no [service_api](service_api/) page* | **Primary scheduled commands** (YAML / Celery batch via `config/boost_collector_schedule.yaml`; non-exhaustive — see [Workflow.md](Workflow.md)): @@ -66,16 +64,9 @@ Columns: **persistence** (usual durable stores), **coupling** (one-line upstream | `cppa_pinecone_sync` | `run_cppa_pinecone_sync` | | `clang_github_tracker` | `run_clang_github_tracker` | | `cppa_slack_tracker` | `run_cppa_slack_tracker` | -| `discord_activity_tracker` | `run_discord_activity_tracker` | | `wg21_paper_tracker` | `run_wg21_paper_tracker` | | `cppa_youtube_script_tracker` | `run_cppa_youtube_script_tracker` | -**Long-running entrypoint services** (not in the YAML schedule; run as a persistent process, e.g. Compose / `runserver` integration): - -| App | Entry command | Notes | -|-----|---------------|-------| -| `slack_event_handler` | `run_slack_event_handler` | Slack Socket Mode listener (PR bot / huddles) | - --- ## 4. 
Vertical slices diff --git a/docs/CONCURRENCY.md b/docs/CONCURRENCY.md index 5a0e8872..6890bd0f 100644 --- a/docs/CONCURRENCY.md +++ b/docs/CONCURRENCY.md @@ -17,25 +17,12 @@ For isolating external SDK state behind explicit types, see [`core/adapters/prot | `_ChannelJoinCoordinator._stop_event` | same | Signal background join thread to exit | Process-global | Used with `_check_lock`, not nested | | `_CloneRegistry._lock` | `github_activity_tracker/workspace.py` | Set of clone paths for end-of-run cleanup | Process-global | After per-repo lock when nested (see below) | | `_RepoLockRegistry._guard` + per-repo locks | `github_activity_tracker/big_commit.py` | Concurrent clone/fetch for same repo | Per (owner, repo) | Before clone registry lock when nested | -| `_TeamThreadLockRegistry._guard` + per-path locks | `slack_event_handler/utils/state.py` | In-process mutex paired with file advisory lock | Per state file path | Before file lock (see below) | -| Advisory file lock | `slack_event_handler/utils/state.py` | Per-team JSON state read-modify-write | Per team / file | After in-process team lock | -| `_JobQueueRuntime._apps_lock` | `slack_event_handler/utils/job_queue.py` | Per-team Bolt app registry | Per team | Independent of busy lock and state locks | -| `_JobQueueRuntime._busy_lock` | same | Per-team “waiting for rate slot” flag | Per team | Independent of apps lock and state locks | --- ## Acquisition-order rules -Only two places nest locks within a subsystem. **No cross-module lock nesting** exists. - -### Slack PR-bot state (`slack_event_handler`) - -1. `_TeamThreadLockRegistry` in-process lock (per state file path) -2. Advisory file lock (`fcntl` on Unix, `portalocker` on Windows) - -Always this order inside `state_file_lock()`. The registry guard is held only briefly to create/lookup per-path locks; it is never held while waiting on the file lock. 
- -`_JobQueueRuntime._apps_lock` and `_JobQueueRuntime._busy_lock` are never held together and never nest inside `state_file_lock`. +Only one place nests locks within a subsystem. **No cross-module lock nesting** exists. ### GitHub big commits (`github_activity_tracker`) @@ -51,8 +38,6 @@ Always this order inside `state_file_lock()`. The registry guard is held only br | Component | Location | Mechanism | |-----------|----------|-----------| | `PineconeIngestion` | `cppa_pinecone_sync/ingestion.py` | `ThreadPoolExecutor` scoped to `update_documents` batches | -| `DiscordSyncClient` | `discord_activity_tracker/sync/client.py` | Dedicated `_asyncio_loop` per client instance | -| Huddle dedup cache | `slack_event_handler/utils/slack_listener.py` | Instance `_processed_file_ids_lock` | | Log handler emit | `config/logging_handlers.py` | Instance `_emit_lock` on handler class | --- diff --git a/docs/Core_public_API.md b/docs/Core_public_API.md index 7fa8b878..3caebe93 100644 --- a/docs/Core_public_API.md +++ b/docs/Core_public_API.md @@ -33,8 +33,6 @@ All **application** collectors listed below subclass **`AbstractCollector`** (`n | `run_cppa_slack_tracker` | `CppaSlackTrackerCollector` | `cppa_slack_tracker.management.commands.run_cppa_slack_tracker` | | `run_cppa_youtube_script_tracker` | `CppaYoutubeScriptTrackerCollector` | `cppa_youtube_script_tracker.management.commands.run_cppa_youtube_script_tracker` | | `run_wg21_paper_tracker` | `Wg21PaperTrackerCollector` | `wg21_paper_tracker.collectors` | -| `run_discord_activity_tracker` | `DiscordActivityCollector` | `discord_activity_tracker.management.commands.run_discord_activity_tracker` | -| `backfill_discord_activity_tracker` | `DiscordBackfillCollector` | `discord_activity_tracker.management.commands.backfill_discord_activity_tracker` | ## Failure classification @@ -57,13 +55,13 @@ Structural contracts for **data** that crosses tracker layers (sync outcomes, ac | `core.protocols.IncrementalState` | 
`@runtime_checkable` protocol: `checkpoint_token`, `human_readable_marker`, `extras`. | | `core.protocols.require_tracker_result` / `require_activity_record` / `require_incremental_state` | Runtime guards raising `TypeError` when an object does not satisfy the protocol. | -Implementations are frozen dataclasses in each tracker app's `protocol_impl.py` (for example `github_activity_tracker.protocol_impl`, `discord_activity_tracker.protocol_impl`, `boost_library_tracker.protocol_impl`). They subclass shared bases in **`core.protocol_dto`** (`TrackerResultDataclass`, `IncrementalStateDataclass`, `ActivityRecordDataclass`) which provide canonical `asdict()`, `to_json()`, `from_dict()`, and log-friendly `__repr__`. Simple collectors may return `GenericTrackerResult` directly. Prefer dataclasses over plain `dict` for reliable `isinstance` checks with `@runtime_checkable`. +Implementations are frozen dataclasses in each tracker app's `protocol_impl.py` (for example `github_activity_tracker.protocol_impl`, `boost_library_tracker.protocol_impl`). They subclass shared bases in **`core.protocol_dto`** (`TrackerResultDataclass`, `IncrementalStateDataclass`, `ActivityRecordDataclass`) which provide canonical `asdict()`, `to_json()`, `from_dict()`, and log-friendly `__repr__`. Simple collectors may return `GenericTrackerResult` directly. Prefer dataclasses over plain `dict` for reliable `isinstance` checks with `@runtime_checkable`. `BaseCollectorCommand` structured logs include `result_repr` and `result_json` in `extra` when the collector returns a `TrackerResultDataclass` subclass. `AbstractCollector.collect()` must return a `TrackerResult`. Override `load_incremental_state()` / `persist_incremental_state()` when a collector needs checkpoint read/write between runs (default hooks are no-ops). 
-**Local static check:** with dev dependencies installed (`requirements-dev.lock`), from the repo root run **`uv run pyright`** (same as the **`pyright`** job in [`.github/workflows/actions.yml`](../.github/workflows/actions.yml)). Root **`pyrightconfig.json`** scopes analysis to `core`, `github_activity_tracker`, `discord_activity_tracker`, `cppa_slack_tracker`, `cppa_user_tracker`, and `cppa_pinecone_sync`, and excludes **`core/pyright_samples/**`** from that run; **`core/tests/test_protocols.py`** still exercises positive/negative protocol assignment snippets via subprocess. +**Local static check:** with dev dependencies installed (`requirements-dev.lock`), from the repo root run **`uv run pyright`** (same as the **`pyright`** job in [`.github/workflows/actions.yml`](../.github/workflows/actions.yml)). Root **`pyrightconfig.json`** scopes analysis to `core`, `github_activity_tracker`, `cppa_slack_tracker`, `cppa_user_tracker`, and `cppa_pinecone_sync`, and excludes **`core/pyright_samples/**`** from that run; **`core/tests/test_protocols.py`** still exercises positive/negative protocol assignment snippets via subprocess. ## External adapters diff --git a/docs/Deployment.md b/docs/Deployment.md index 0b127839..453227f5 100644 --- a/docs/Deployment.md +++ b/docs/Deployment.md @@ -172,7 +172,7 @@ Add the private key content (`~/.ssh/deploy_key`) as the **`SSH_PRIVATE_KEY`** s This matches a common production/staging layout for this repo: - **On the host:** PostgreSQL (package install), **nginx** (TLS + reverse proxy). -- **In Docker Compose:** `web` (Gunicorn), `celery_worker`, `celery_beat`, `redis`. Optional profile **`slack-session`** (`slack-chromium` noVNC) for Slack login on headless hosts. The bundled **`db` service is commented out** in `docker-compose.yml`; the app uses **`DATABASE_URL`** to reach PostgreSQL on the host. +- **In Docker Compose:** `web` (Gunicorn), `celery_worker`, `celery_beat`, `redis`. 
The bundled **`db` service is commented out** in `docker-compose.yml`; the app uses **`DATABASE_URL`** to reach PostgreSQL on the host. Compose already sets `extra_hosts: host.docker.internal:host-gateway` on app containers so `DATABASE_URL` can use host `host.docker.internal` (see `.env.example`). **`DATABASE_URL` is required** in `.env` for `docker compose` (there is no default to a bundled `db` service while that service stays commented out). @@ -476,7 +476,7 @@ cd /opt/boost-data-collector && make down && make up ## Production Compose overlay -For VM production, use the prod overlay (resource limits, `LOG_FORMAT=json`, `slack-session` off by default): +For VM production, use the prod overlay (resource limits, `LOG_FORMAT=json`): ```bash docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d diff --git a/docs/Development_guideline.md b/docs/Development_guideline.md index 1f778d9e..af2f37ff 100644 --- a/docs/Development_guideline.md +++ b/docs/Development_guideline.md @@ -105,7 +105,7 @@ When adding or changing in-process locks, semaphores, or shared mutable state ac Run tests often so you catch problems early. - **PostgreSQL for pytest:** `config.test_settings` requires `DATABASE_URL` pointing at PostgreSQL (see [README.md](../README.md#running-tests): `docker compose -f docker-compose.test.yml up -d`, then export `DATABASE_URL` / `SECRET_KEY`). This matches CI and avoids SQLite-only passes that fail in production. -- **Pyright:** Install dev dependencies (`requirements-dev.lock`), then from the project root run **`uv run pyright`**. Configuration lives in **`pyrightconfig.json`** at the repo root (typed paths: `core`, `github_activity_tracker`, `discord_activity_tracker`, `cppa_slack_tracker`, `cppa_user_tracker`, `cppa_pinecone_sync`; `core/pyright_samples/**` is excluded from the default run—see **`core/tests/test_protocols.py`** for protocol assignment checks). 
The **`pyright`** job in [`.github/workflows/actions.yml`](../.github/workflows/actions.yml) runs the same check in CI. +- **Pyright:** Install dev dependencies (`requirements-dev.lock`), then from the project root run **`uv run pyright`**. Configuration lives in **`pyrightconfig.json`** at the repo root (typed paths: `core`, `github_activity_tracker`, `cppa_slack_tracker`, `cppa_user_tracker`, `cppa_pinecone_sync`; `core/pyright_samples/**` is excluded from the default run—see **`core/tests/test_protocols.py`** for protocol assignment checks). The **`pyright`** job in [`.github/workflows/actions.yml`](../.github/workflows/actions.yml) runs the same check in CI. - **Before each commit:** run the test suite for the code you changed (`python -m pytest` or a subset). - **For app commands:** ensure the command runs successfully (e.g. `python manage.py run_boost_github_activity_tracker` exits with 0 and does the expected work). - **Full workflow:** run `python manage.py run_scheduled_collectors --schedule default --group ` / `--schedule interval --interval-minutes ` when testing the YAML-driven path (matches how Celery Beat invokes it). diff --git a/docs/GCP_Production_Checklist.md b/docs/GCP_Production_Checklist.md index c9239114..cf5da824 100644 --- a/docs/GCP_Production_Checklist.md +++ b/docs/GCP_Production_Checklist.md @@ -41,7 +41,7 @@ Mirror [`.env.example`](../.env.example) groups; inject via Secret Manager → e ## Collectors in Beat schedule -Configured in [`config/boost_collector_schedule.yaml`](../config/boost_collector_schedule.yaml): `github`, `boost_library_docs`, `slack`, `discord`, `mailing_list`. +Configured in [`config/boost_collector_schedule.yaml`](../config/boost_collector_schedule.yaml): `github`, `boost_library_docs`, `slack`, `mailing_list`, `reddit`. **Not** on Beat yet (manual / future): WG21, YouTube, Clang — `/health/` shows `last_success_at: null` until scheduled or `record_group_success` is updated. 
@@ -52,7 +52,6 @@ Configured in [`config/boost_collector_schedule.yaml`](../config/boost_collector | `web` | Gunicorn `gthread`; resource limits in `docker-compose.prod.yml` | | `celery_worker` | `--max-tasks-per-child` (default 50) | | `celery_beat` | Persistent `celerybeat` volume | -| `slack-chromium` | Profile `slack-session` — off by default in prod (noVNC Slack login only) | ## Ingress diff --git a/docs/Onboarding.md b/docs/Onboarding.md index b6f3f364..1f6f3c2e 100644 --- a/docs/Onboarding.md +++ b/docs/Onboarding.md @@ -53,10 +53,8 @@ These are the Django apps under **`INSTALLED_APPS`** (excluding `django.contrib. | **cppa_pinecone_sync** | Vector index | Pinecone upsert / failure tracking; used by doc and GitHub pipelines. | | **clang_github_tracker** | LLVM/clang mirror | **`run_clang_github_tracker`**; heavy workspace/raw patterns. | | **cppa_slack_tracker** | Slack messages | **`run_cppa_slack_tracker`**. | -| **discord_activity_tracker** | Discord | **`run_discord_activity_tracker`**, **`run_discord_exporter`**. | | **wg21_paper_tracker** | WG21 papers | **`run_wg21_paper_tracker`**. | | **cppa_youtube_script_tracker** | YouTube scripts | **`run_cppa_youtube_script_tracker`**. | -| **slack_event_handler** | Slack events | **`run_slack_event_handler`** (webhook/event path differs from tracker sync). | **Finding the real command names:** Run `python manage.py help` or list `/management/commands/*.py`. **`config/boost_collector_schedule.yaml`** lists what production *schedules*; names must match actual Django commands (if something fails with “Unknown command”, the YAML or docs may be ahead of or behind the repo). diff --git a/docs/README.md b/docs/README.md index 2ffae68f..7a710ccb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -21,7 +21,6 @@ Documentation is organized **by topic**, not by app. Each doc covers one cross-c | **Core API** | [Core_public_API.md](Core_public_API.md) | Stable `core` imports: collectors, error classification. 
| | **Operations** | [operations/](operations/README.md) | **Group:** shared I/O (GitHub, Discord, etc.) used by multiple apps. Index in [operations/README.md](operations/README.md). | | → GitHub | [operations/github.md](operations/github.md) | Clone, push, fetch file, create PR/issue/comment; token use. | -| → DiscordChatExporter | [operations/discord_chat_exporter.md](operations/discord_chat_exporter.md) | Install CLI, workspace path, `.env` for Tyrrrz exporter used by Discord ingestion. | | **Workspace** | [Workspace.md](Workspace.md) | Workspace layout and usage for file processing (`workspace//...`). | | **Schema** | [Schema.md](Schema.md) | Database schema and table relationships. | | **Development** | [Development_guideline.md](Development_guideline.md) | Development setup, app requirements, and step-by-step workflow. | @@ -37,12 +36,11 @@ Documentation is organized **by topic**, not by app. Each doc covers one cross-c **Operations** = external integrations used by many apps (not the same as **Service API**, which is for DB writes). See **[operations/README.md](operations/README.md)** for the full list and when to add one. - **GitHub:** [operations/github.md](operations/github.md) — `core.operations.github_ops` (clone, push, PR, issue, comment). -- **Discord (ingestion):** [operations/discord_chat_exporter.md](operations/discord_chat_exporter.md) — DiscordChatExporter CLI; [service_api/discord_activity_tracker.md](service_api/discord_activity_tracker.md) — commands, sync layout, Pinecone. *(Notifications / webhooks: add an operations doc when implemented.)* ## Finding app-specific info - **Service layer (create/update/delete):** [service_api/](service_api/) → e.g. [github_activity_tracker.md](service_api/github_activity_tracker.md). -- **Operations (GitHub, Discord, …):** [operations/README.md](operations/README.md) and the docs in [operations/](operations/). 
+- **Operations (GitHub, …):** [operations/README.md](operations/README.md) and the docs in [operations/](operations/). - **Workspace (file paths, JSON cache):** [Workspace.md](Workspace.md) — which apps use workspace and the folder layout. - **Schema (models):** [Schema.md](Schema.md). - **Workflow (when an app runs):** [Workflow.md](Workflow.md). diff --git a/docs/Schema.md b/docs/Schema.md index fcf973b5..e0d9f05c 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -21,7 +21,7 @@ erDiagram BaseProfile ||--o| MailingListProfile : "extends" BaseProfile ||--o| WG21PaperAuthorProfile : "extends" BaseProfile ||--o| YoutubeSpeaker : "extends" - BaseProfile ||--o| DiscordProfile : "extends" + BaseProfile ||--o| RedditUser : "extends" Identity }o--|| BaseProfile : "has" TempProfileIdentityRelation ||--o{ BaseProfile : "has" TmpIdentity ||--o{ TempProfileIdentityRelation : "has" @@ -81,12 +81,10 @@ erDiagram datetime updated_at } - DiscordProfile { - bigint discord_user_id "UK IX" - string username "IX" + RedditUser { + string reddit_user_id "UK IX" + string username "UK IX" string display_name "IX" - string avatar_url - boolean is_bot datetime created_at datetime updated_at } @@ -116,7 +114,7 @@ erDiagram } ``` -**Note:** Each extended table has `id` as primary key and foreign key to `BaseProfile.id`. The value is the same: one auto-increment in BaseProfile, and that same id is stored in exactly one extended profile row. Other tables (e.g. GitCommit, Issue) reference the profile via this single `id`. **DiscordProfile** (in `cppa_user_tracker`) is the author profile for **DiscordMessage** rows in `discord_activity_tracker` (`author_id` → `DiscordProfile.id`). +**Note:** Each extended table has `id` as primary key and foreign key to `BaseProfile.id`. The value is the same: one auto-increment in BaseProfile, and that same id is stored in exactly one extended profile row. Other tables (e.g. GitCommit, Issue) reference the profile via this single `id`. 
**Note:** The **Email** table references BaseProfile via `base_profile_id` (FK to `BaseProfile.id`). One profile can have multiple email addresses; `is_primary` marks the primary email; `is_active` indicates whether the email is currently active. Other tables (e.g. MailingListMessage) can link to a profile via Email. **Note:** The `email` field is **not unique**; the same email address may appear in multiple rows (e.g. for different profiles or over time). @@ -869,84 +867,7 @@ erDiagram --- -### 11. Discord Activity Tracker (`discord_activity_tracker`) - -Guilds, channels, messages, and reactions ingested from **DiscordChatExporter** JSON (see [service_api/discord_activity_tracker.md](service_api/discord_activity_tracker.md)). **Discord user rows** live in **`cppa_user_tracker.DiscordProfile`** (extends `BaseProfile`, section 1); this app only stores server/channel/message/reaction tables. - -```mermaid -erDiagram - direction LR - DiscordServer ||--o{ DiscordChannel : "has" - DiscordChannel ||--o{ DiscordMessage : "contains" - DiscordProfile ||--o{ DiscordMessage : "author" - DiscordMessage ||--o{ DiscordReaction : "has" - - DiscordServer { - bigint server_id "UK IX" - string server_name "IX" - string icon_url - datetime created_at - datetime updated_at - } - - DiscordChannel { - int id PK - int server_id FK - bigint channel_id "UK IX" - string channel_name "IX" - string channel_type - bigint category_id "IX nullable" - string category_name - text topic - int position - datetime created_at - datetime updated_at - } - - DiscordMessage { - int id PK - int channel_id FK - int author_id FK - bigint message_id "UK IX" - text content - string message_type "IX default Default" - boolean is_pinned "IX" - datetime message_created_at "IX" - datetime message_edited_at - boolean is_deleted "IX" - datetime deleted_at - bigint reply_to_message_id "IX nullable" - boolean has_attachments - json attachment_urls - datetime created_at - datetime updated_at - } - - DiscordReaction { - 
int id PK - int message_id FK - string emoji "IX" - int count - datetime created_at - datetime updated_at - } - - DiscordProfile { - int baseprofile_ptr_id PK "FK BaseProfile" - bigint discord_user_id "UK IX" - string username "IX" - } -``` - -**Note:** **DiscordServer** is keyed by Discord guild snowflake `server_id` (unique). **DiscordChannel** is keyed by `channel_id` (unique); `server_id` FK uses `db_column="server_id"` to the parent server’s PK `id` (Django default), not the snowflake — join in ORM via `channel.server.server_id` when you need the guild snowflake. - -**Note:** **DiscordMessage** is keyed by `message_id` (Discord snowflake, unique). `author_id` → **DiscordProfile** (`cppa_user_tracker`). `reply_to_message_id` stores the parent message snowflake when the message is a reply (no FK to another row). `message_type` and `is_pinned` mirror exporter metadata (migration `0005`). - -**Note:** **DiscordReaction** has a unique constraint on `(message, emoji)` (`discord_activity_tracker_msg_emoji_uniq`). - ---- - -### 12. Reddit Activity Tracker (`reddit_activity_tracker`) +### 11. Reddit Activity Tracker (`reddit_activity_tracker`) Subreddit posts and comments ingested from the Reddit OAuth API. Workspace JSON uses LangChain Document format (`page_content` + `metadata`); see PR2 workspace layout under `workspace/reddit_activity_tracker/{YYYY-MM}/`. No cross-app FKs — author identity is stored as plain strings (`author`, `author_id`). @@ -1004,7 +925,7 @@ erDiagram | **SlackUser** | Profile for Slack; extends BaseProfile. | 1 | | **MailingListProfile** | Profile for mailing list; extends BaseProfile. | 1 | | **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. | 1 | -| **DiscordProfile** | Discord user profile (`cppa_user_tracker`); extends BaseProfile. `discord_user_id` UK; used as `DiscordMessage.author`. | 1, 11 | +| **RedditUser** | Profile for Reddit; extends BaseProfile. `reddit_user_id` UK; `username` UK. 
| 1 | | **TmpIdentity** | Temporary identity for staging (CPPA User Tracker). | 1 | | **TempProfileIdentityRelation** | Staging table: base_profile_id -> target_identity_id (CPPA User Tracker). | 1 | | **GitHubRepository** | Repository metadata (owner, repo_name, stars, forks, etc.). Base table for repo subtypes. | 2 | @@ -1052,17 +973,13 @@ erDiagram | **WebsiteVisitCount** | Per-date, per-country visit count. | 8 | | **WebsiteWordCount** | Per-date, per-word count. | 8 | | **PineconeFailList** | Failed sync records (failed_id, type) for retry/audit. | 9 | -| **PineconeSyncStatus** | Last sync per type (`app_type`, `final_sync_at`, …); includes Discord when `PINECONE_DISCORD_APP_TYPE` is set. | 9 | +| **PineconeSyncStatus** | Last sync per type (`app_type`, `final_sync_at`, …). | 9 | | **YoutubeSpeaker** | Profile for YouTube speakers; extends BaseProfile. Identified by `display_name`. | 1, 10 | | **YouTubeChannel** | Publisher channel; `channel_id` is PK (no auto-increment id). | 10 | | **YouTubeVideo** | Video metadata, transcript state, and channel FK; `video_id` is PK (no auto-increment id). | 10 | | **YouTubeVideoSpeaker** | M2M join between YouTubeVideo and YoutubeSpeaker (video_id, speaker_id). | 10 | | **CppaTags** | C++ community tag vocabulary (tag_name, unique/lowercase). | 10 | | **YouTubeVideoTags** | M2M join between YouTubeVideo and CppaTags (youtube_video_id, cppa_tag_id). | 10 | -| **DiscordServer** | Discord guild (`server_id` snowflake UK, name, icon). | 11 | -| **DiscordChannel** | Channel in a guild (channel_id UK, type, category, topic, sync/activity timestamps). | 11 | -| **DiscordMessage** | Message (`message_id` UK, content, type, pin, reply_to, attachments JSON, soft-delete flags). | 11 | -| **DiscordReaction** | Emoji aggregate per message (unique on message + emoji). | 11 | | **RedditSubmission** | Reddit post (`reddit_id` t3_* UK, subreddit, title, selftext, score, created_utc). 
| 12 | | **RedditComment** | Reddit comment (`reddit_id` t1_* UK, submission FK, parent_id, body, score, created_utc). | 12 | | **BoostDocContent** | Globally unique scraped page by content hash (url, content_hash UK, first_version_id, last_version_id, is_upserted, scraped_at). One row per unique content hash across all versions. | 10 | @@ -1074,7 +991,7 @@ erDiagram | --------------------------- | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------------- | | Identity | BaseProfile | One identity has many profiles | | BaseProfile | Email | One profile has many emails | -| BaseProfile | GitHubAccount, SlackUser, MailingListProfile, WG21PaperAuthorProfile, DiscordProfile, YoutubeSpeaker | Extends (1:1 subtype) | +| BaseProfile | GitHubAccount, SlackUser, MailingListProfile, WG21PaperAuthorProfile, RedditUser, YoutubeSpeaker | Extends (1:1 subtype) | | TmpIdentity | TempProfileIdentityRelation | Has many (target) | | TempProfileIdentityRelation | BaseProfile | Has many (base_profile_id) | | GitHubAccount | GitHubRepository | Owns many | @@ -1111,9 +1028,5 @@ erDiagram | YouTubeVideo | YouTubeVideoSpeaker | Has many speakers | | YouTubeVideo | YouTubeVideoTags | Has many tags | | CppaTags | YouTubeVideoTags | Tagged in many videos | -| DiscordServer | DiscordChannel | Has many channels | -| DiscordChannel | DiscordMessage | Contains many messages | -| DiscordProfile | DiscordMessage | Author (has many messages) | -| DiscordMessage | DiscordReaction | Has many reactions | | BoostLibraryVersion | BoostLibraryDocumentation | Has many (boost_library_version_id) | | BoostDocContent | BoostLibraryDocumentation | Used in many (boost_doc_content_id) | diff --git a/docs/Service_API.md b/docs/Service_API.md index d913cf67..6832069a 100644 --- a/docs/Service_API.md +++ b/docs/Service_API.md @@ -14,7 +14,6 @@ All writes to app models must go through the service layer. 
The API is documente | **boost_library_tracker** | `boost_library_tracker.services` | Boost libraries, versions, dependencies, categories, maintainers/authors. | | **boost_library_docs_tracker** | `boost_library_docs_tracker.services` | Globally unique doc content (BoostDocContent) and (library-version, page) relation tracking (BoostLibraryDocumentation). | | **boost_usage_tracker** | `boost_usage_tracker.services` | External repos, Boost usage, missing-header tmp. | -| **discord_activity_tracker** | `discord_activity_tracker.services` | Discord servers, channels, messages, reactions (authors: `cppa_user_tracker.DiscordProfile`). | | **cppa_youtube_script_tracker** | `cppa_youtube_script_tracker.services` | YouTube channels, videos, tags, transcript state; speaker links. | | **clang_github_tracker** | `clang_github_tracker.services` | Upsert llvm issue/PR/commit rows; fetch watermarks. | | **boost_mailing_list_tracker** | `boost_mailing_list_tracker.services` | Mailing list messages and names. | @@ -33,7 +32,6 @@ All writes to app models must go through the service layer. The API is documente - **[service_api/boost_library_docs_tracker.md](service_api/boost_library_docs_tracker.md)** – API for `boost_library_docs_tracker.services`. - **[service_api/cppa_pinecone_sync.md](service_api/cppa_pinecone_sync.md)** – API for `cppa_pinecone_sync.services`. - **[service_api/boost_usage_tracker.md](service_api/boost_usage_tracker.md)** – API for `boost_usage_tracker.services`. -- **[service_api/discord_activity_tracker.md](service_api/discord_activity_tracker.md)** – API for `discord_activity_tracker.services`; management commands, sync modules, and Pinecone notes. - **[service_api/cppa_youtube_script_tracker.md](service_api/cppa_youtube_script_tracker.md)** – API for `cppa_youtube_script_tracker.services`; preprocessor, fetcher, workspace, and transcript helpers. 
- **[service_api/clang_github_tracker.md](service_api/clang_github_tracker.md)** – API for `clang_github_tracker.services`. - **[service_api/boost_mailing_list_tracker.md](service_api/boost_mailing_list_tracker.md)** – API for `boost_mailing_list_tracker.services`. @@ -57,8 +55,6 @@ Some service functions validate arguments and raise before writing: - `get_or_create_boost_library(repo, name)`, `get_or_create_boost_version(version)`, `get_or_create_boost_library_category(name)` – Raise **`ValueError`** if name/version is empty or whitespace-only. - **boost_library_docs_tracker.services** - `get_or_create_doc_content(url, ...)` – Raises **`ValueError`** if `url` is empty or whitespace-only. -- **discord_activity_tracker.services** - - No intentional **`ValueError`** on invalid inputs; bulk helpers may **skip** rows and log warnings (see [discord_activity_tracker.md](service_api/discord_activity_tracker.md#raises-and-edge-behavior)). **`CollectorFailureCategory`** is not set in this module; see [discord_activity_tracker.md](service_api/discord_activity_tracker.md#collectorfailurecategory). See each app’s doc in [service_api/](service_api/) for parameter types, return types, and any **Raises** section. 
diff --git a/docs/Workspace.md b/docs/Workspace.md index b9f64c08..34a63ed3 100644 --- a/docs/Workspace.md +++ b/docs/Workspace.md @@ -20,16 +20,10 @@ workspace/ # WORKSPACE_DIR (configurable via │ │ └── prs/.json │ └── boost_mailing_list_tracker/ # Raw API responses (kept, not removed) │ └── /.json -│ └── discord_activity_tracker/ # DiscordChatExporter output (see below) -│ └── // # Archived JSON after DB import (YYYY-MM-DD.json) ├── clang_github_tracker/ # Markdown export for clang_github_tracker (md_export/) ├── boost_mailing_list_tracker/ # Mailing list messages (see below) │ └── / │ └── messages/.json # Formatted cache (processed then removed) -├── discord_activity_tracker/ # CLI install + backfill drop folder -│ ├── _exporter_staging/ # Temporary per-day export (cleared each run) -│ ├── script/ # DiscordChatExporter.Cli (default layout; optional) -│ └── Discussion - c-cpp-discussion/ # Pre-exported JSON for backfill (removed after import) └── shared/ # Temp files used by more than one app ``` @@ -49,13 +43,6 @@ So the workspace acts as a short-lived cache: files are deleted once they are in So: **raw/** = permanent archive of scraped API responses; **messages/** = short-lived cache (removed after DB persist). -### discord_activity_tracker paths - -1. **`run_discord_activity_tracker`** — DiscordChatExporter runs **per channel per UTC day**, writing scratch JSON under `discord_activity_tracker/_exporter_staging/`. Each file is parsed, upserted into the DB, then **merged** into `raw/discord_activity_tracker///YYYY-MM-DD.json` (same-day re-runs append/update by message id). -2. **`backfill_discord_activity_tracker`** — Place DiscordChatExporter JSON under `discord_activity_tracker/Discussion - c-cpp-discussion/` (any depth). Each file is imported, then **deleted** so it is not processed twice. - -See [service_api/discord_activity_tracker.md](service_api/discord_activity_tracker.md) and [operations/discord_chat_exporter.md](operations/discord_chat_exporter.md). 
- ## Configuration - **Setting:** `settings.WORKSPACE_DIR` (default: project root `workspace/`). @@ -122,33 +109,6 @@ for json_path in iter_existing_message_jsons("boost@lists.boost.org"): ... ``` -**discord_activity_tracker** (raw archive, staging, backfill folder): - -```python -from discord_activity_tracker.workspace import ( - get_workspace_root, - get_raw_dir, - get_exporter_staging_dir, - get_channel_raw_dir, - get_cpp_discussion_import_dir, -) - -# workspace/discord_activity_tracker/ (CLI script/, backfill drop folder) -app_root = get_workspace_root() - -# workspace/raw/discord_activity_tracker/ (archived per-channel JSON) -raw_root = get_raw_dir() - -# Staging dir used before per-channel archival -staging = get_exporter_staging_dir() - -# workspace/raw/discord_activity_tracker/// -channel_dir = get_channel_raw_dir(server_id=123, channel_id=456) - -# Backfill import root: .../Discussion - c-cpp-discussion/ -drop = get_cpp_discussion_import_dir() -``` - **Generic (any app):** ```python @@ -212,7 +172,6 @@ Related settings (see `config/settings.py`): - **github_activity_tracker:** JSON cache for commits, issues, and PRs; files are removed after being saved to the DB. - **boost_mailing_list_tracker:** JSON cache for mailing list messages; files are removed after being saved to the DB. -- **discord_activity_tracker:** Exporter CLI under `discord_activity_tracker/script/` (or `DISCORD_CHAT_EXPORTER_CLI`); **raw/** subtree keeps archived exports; backfill JSON in `Discussion - c-cpp-discussion/` is deleted after import. - **boost_library_tracker:** Downloaded PDFs, converted documents. - **shared:** Files that multiple apps read or write; clean up when no longer needed. diff --git a/docs/adr/README.md b/docs/adr/README.md index 2f01b7a1..9b59ea4b 100644 --- a/docs/adr/README.md +++ b/docs/adr/README.md @@ -7,7 +7,7 @@ ADRs capture significant architectural choices, context, and consequences. 
They | ADR | Summary | Status | |-----|---------|--------| | [identity-hub-decoupling.md](identity-hub-decoupling.md) | Identity hub data-layer decoupling (soft profile IDs) | Accepted (pilot: `boost_mailing_list_tracker`) | -| [paradigm-unification.md](paradigm-unification.md) | Batch (YAML/Celery) vs event-driven (Slack Socket Mode) paradigms, target swim-lane deployables, app mapping, migration path | See document | +| [paradigm-unification.md](paradigm-unification.md) | Batch (YAML/Celery) vs event-driven collection paradigms, target swim-lane deployables, app mapping, migration path | See document | ## Format diff --git a/docs/adr/paradigm-unification.md b/docs/adr/paradigm-unification.md index 3a33906e..358e018e 100644 --- a/docs/adr/paradigm-unification.md +++ b/docs/adr/paradigm-unification.md @@ -1,245 +1,60 @@ # ADR: Unify batch and event-driven collection paradigms **Date:** 2026-06-02 +**Updated:** 2026-06-18 — This public repository contains batch collectors only. ## Context -Boost Data Collector runs two collection paradigms in one Django project without explicit architectural separation: +Boost Data Collector originally ran two collection paradigms in one Django project: 1. **Batch (scheduled) collectors** — YAML-driven schedules executed by Celery Beat → `run_scheduled_collectors_task` → `run_scheduled_collectors`, which invokes `run_*` management commands sequentially within each group batch. -2. **Event-driven (real-time) services** — Slack Socket Mode in `slack_event_handler`, handling WebSocket callbacks and background worker threads concurrently. +2. **Event-driven (real-time) services** — long-running Socket Mode listeners and similar processes that react to external events as they arrive. -Both paradigms share the same process model at deploy time (one monorepo, one PostgreSQL database, one `workspace/` tree, shared `INSTALLED_APPS` and settings). 
The batch path’s sequential guarantee applies only inside a single `run_scheduled_collectors` invocation; it does not extend to the event-driven path. Yet both can touch the same database and filesystem state. +**Current state (public repo):** This repository contains **batch collectors only**. Real-time listeners and related long-running entrypoints are not part of this tree. -Production scheduling is defined in [`config/boost_collector_schedule.yaml`](../../config/boost_collector_schedule.yaml). As of this ADR, that file defines **five groups** and **ten tasks** (UTC `default_time` per group): +Production scheduling for the public repo is defined in [`config/boost_collector_schedule.yaml`](../../config/boost_collector_schedule.yaml). See [Architecture_overview.md](../Architecture_overview.md) for the current app inventory. -| Group | `default_time` (UTC) | Tasks (count) | -|-------|----------------------|---------------| -| `github` | 00:05 | 6 (daily, monthly, on_release) | -| `boost_library_docs` | 16:20 | 1 (on_release) | -| `slack` | 16:30 | 1 (daily) | -| `discord` | 16:40 | 1 (daily) | -| `mailing_list` | 00:10 | 1 (daily) | +Cross-app coupling—especially Foreign Keys into [`cppa_user_tracker`](../cross-app-dependencies.md) as the identity hub—makes module boundaries expensive to move. The sequential batch model remains the primary paradigm **in this repo**. -Several collectors have `run_*` commands but are **not** in the production YAML (see [App classification](#app-classification)). [`config/boost_collector_schedule.yaml.example`](../../config/boost_collector_schedule.yaml.example) shows additional patterns (e.g. `run_clang_github_tracker` weekly, `run_wg21_paper_tracker` interval) that production may adopt later. - -Cross-app coupling—especially Foreign Keys into [`cppa_user_tracker`](../cross-app-dependencies.md) as the identity hub—makes module boundaries expensive to move. 
Untested apps and the absence of versioned deprecation paths ([`STABILITY.md`](../../STABILITY.md)) increase the cost of delaying paradigm separation. - -The sequential batch model is simple and reliable for nightly runs across many sources today, but it is a poor fit for planned growth: more sources, more organizations, and higher-frequency collection. Without documented swim lanes, every release that mixes paradigms in one deployable increases operational and correctness risk. - -For system context, see [Architecture overview](../Architecture_overview.md), [Architecture data flow](../Architecture_data_flow.md), [Workflow](../Workflow.md), and [Cross-app dependencies](../cross-app-dependencies.md). Bus-factor and onboarding material: [BUS_FACTOR_DELIVERABLES.md](../BUS_FACTOR_DELIVERABLES.md), [onboarding walkthroughs](../onboarding/README.md). +For system context, see [Architecture overview](../Architecture_overview.md), [Architecture data flow](../Architecture_data_flow.md), [Workflow](../Workflow.md), and [Cross-app dependencies](../cross-app-dependencies.md). ## Decision drivers -- **Correctness** — Concurrency and state-sharing bugs (e.g. file-queue races) must not be masked by “it works on the nightly schedule.” -- **Operability** — Operators need to know which process runs batch work vs real-time listeners, and what fails independently. -- **Scale** — More collectors, schedules, and orgs imply more parallel batch groups and possibly more event-driven entry points. -- **Maintainability** — Cross-app FKs and import-linter contracts ([`cross-app-dependencies.md`](../cross-app-dependencies.md)) favor incremental separation over a big-bang split. -- **Safety net** — Weak test coverage on some apps increases the value of clear boundaries before refactors. +- **Correctness** — Batch collectors must not assume global sequential guarantees across Celery groups. +- **Operability** — Operators need to know which process runs batch work vs optional realtime listeners. 
+- **Maintainability** — Cross-app FKs and import-linter contracts favor incremental separation over a big-bang split. ## Paradigm definitions -### Batch (scheduled) paradigm +### Batch (scheduled) paradigm — **in this repo** - **Trigger:** Celery Beat entries built from [`boost_collector_runner/schedule_config.py`](../../boost_collector_runner/schedule_config.py) reading the YAML schedule. -- **Entry point:** [`run_scheduled_collectors_task`](../../boost_collector_runner/tasks.py) → management command [`run_scheduled_collectors`](../../boost_collector_runner/management/commands/run_scheduled_collectors.py). -- **Execution model:** Within one batch, commands run **one after another** (`call_command` in a loop). **Different YAML groups** get **separate Beat entries** and may run **in parallel** on different Celery workers ([`tasks.py`](../../boost_collector_runner/tasks.py) passes `group_id` so each Beat run executes one group’s task list). +- **Entry point:** [`run_scheduled_collectors_task`](../../boost_collector_runner/tasks.py) → [`run_scheduled_collectors`](../../boost_collector_runner/management/commands/run_scheduled_collectors.py). +- **Execution model:** Within one batch, commands run **one after another**. **Different YAML groups** get **separate Beat entries** and may run **in parallel** on different Celery workers. - **State:** PostgreSQL via each app’s `services.py`; optional files under `workspace/<app>/`. -- **Sub-mode — interval batch:** Tasks with `schedule: interval` and `minutes: N` are still batch collectors (management commands), but Beat fires them every N minutes independently of group `default_time` ([Workflow](../Workflow.md)). 
- -```mermaid -flowchart TB - subgraph batch [Batch paradigm] - Beat[Celery Beat] - Task[run_scheduled_collectors_task] - Cmd[run_scheduled_collectors] - Trackers["run_* collector commands"] - PG[(PostgreSQL)] - WS1[workspace per tracker] - Beat --> Task --> Cmd --> Trackers - Trackers --> PG - Trackers --> WS1 - end -``` - -### Event-driven (real-time) paradigm - -- **Trigger:** Slack events over Socket Mode (and related Bolt handlers), not the YAML schedule. -- **Entry point:** [`run_slack_event_handler`](../../slack_event_handler/) (long-running process). -- **Execution model:** Concurrent event callbacks; per-team FIFO job queue with daemon worker threads ([`job_queue.py`](../../slack_event_handler/utils/job_queue.py)). -- **State:** JSON files under `workspace/slack_event_handler/` ([`state.py`](../../slack_event_handler/utils/state.py)); optional GitHub writes via `core.operations`. **No ORM models** in this app. - -```mermaid -flowchart TB - subgraph realtime [Event-driven paradigm] - Socket[Slack Socket Mode] - Handler[slack_event_handler] - Queue[JSON state and job_queue] - WS2[workspace/slack_event_handler] - Socket --> Handler --> Queue - Handler --> WS2 - end -``` - -### Platform layer (not a collection paradigm) - -- **`core`** — Collector contracts, errors, shared operations (no domain DB). -- **`cppa_user_tracker`** — Identity hub; batch collectors write through it via FKs and service calls. It is shared infrastructure, not “batch” or “event-driven” by itself. 
- -## Current boundaries - -| Separated today | Not separated today | -|-----------------|---------------------| -| Batch orchestration isolated in `boost_collector_runner` | Single Django project and `config/settings.py` | -| Celery worker + Beat run batch tasks ([`docker-compose.yml`](../../docker-compose.yml)) | `slack_event_handler` not a first-class Compose service; often run manually or alongside `web` ([Deployment](../Deployment.md)) | -| Sequential order **within** one `run_scheduled_collectors` run | Parallel **across** YAML groups on multiple workers | -| Event-driven code lives in its own app package | Same PostgreSQL and often same `workspace/` volume mount as batch collectors | -| File locks for Slack PR-bot state **within** `slack_event_handler` | No process-level fence between batch Celery work and Socket Mode in production layout | - -```mermaid -flowchart LR - subgraph deploy [Current production layout] - BatchSvc[celery_worker and celery_beat] - WebSvc[web gunicorn] - end - DB[(PostgreSQL)] - Redis[(Redis)] - WS[workspace volume] - BatchSvc --> DB - BatchSvc --> WS - WebSvc --> DB - WebSvc -.->|may host| Realtime[run_slack_event_handler] - Realtime --> WS -``` - -## App classification - -| Paradigm | Django app | Primary entry | Scheduled in production YAML? 
| -|----------|------------|---------------|-------------------------------| -| **Platform** | `core` | (library) | N/A | -| **Platform** | `cppa_user_tracker` | `run_cppa_user_tracker` | No | -| **Batch orchestration** | `boost_collector_runner` | `run_scheduled_collectors` | N/A (runner) | -| **Batch collector** | `github_activity_tracker` | via `run_boost_github_activity_tracker` | Yes (via `github` group) | -| **Batch collector** | `boost_library_tracker` | `run_boost_github_activity_tracker`, `collect_boost_libraries`, … | Yes | -| **Batch collector** | `boost_library_docs_tracker` | `run_boost_library_docs_tracker` | Yes (`on_release`) | -| **Batch collector** | `boost_library_usage_dashboard` | `run_boost_library_usage_dashboard` | Yes | -| **Batch collector** | `boost_usage_tracker` | `run_boost_usage_tracker`, `run_update_created_repos_by_language` | Yes | -| **Batch collector** | `boost_mailing_list_tracker` | `run_boost_mailing_list_tracker` | Yes | -| **Batch collector** | `cppa_slack_tracker` | `run_cppa_slack_tracker` | Yes | -| **Batch collector** | `discord_activity_tracker` | `run_discord_activity_tracker` | Yes | -| **Batch collector** | `cppa_pinecone_sync` | `run_cppa_pinecone_sync` | No | -| **Batch collector** | `clang_github_tracker` | `run_clang_github_tracker` | No (in `.example` only) | -| **Batch collector** | `wg21_paper_tracker` | `run_wg21_paper_tracker` | No (in `.example` as interval) | -| **Batch collector** | `cppa_youtube_script_tracker` | `run_cppa_youtube_script_tracker` | No | -| **Event-driven** | `slack_event_handler` | `run_slack_event_handler` | No (by design) | - -**Future event-driven candidates** (document only; no commitment): live Discord gateway ingestion, webhooks, or push-based pipelines that cannot be expressed as periodic `run_*` commands. Any such service should follow the same swim lane as `slack_event_handler`, not the YAML batch runner. 
- -## Example: `enqueue_job` race and paradigm boundaries - -The Slack PR comment bot illustrates why **in-paradigm** locking matters, and why **process boundaries** help but do not remove **shared-resource** risk. - -| Code path | Locking | -|-----------|---------| -| [`enqueue_job`](../../slack_event_handler/utils/job_queue.py) | Mutates queue under [`modify_state`](../../slack_event_handler/utils/state.py) (advisory file lock + in-process mutex) | -| [`estimated_delay_sec`](../../slack_event_handler/utils/job_queue.py) | Calls [`load_state`](../../slack_event_handler/utils/state.py) **without** the file lock | -| Worker loop [`_worker`](../../slack_event_handler/utils/job_queue.py) | Peeks `load_state(...)["queue"]` **without** lock, then dequeues under `modify_state` | - -That asymmetry is a time-of-check/time-of-use (TOCTOU) race **inside** the event-driven paradigm: delay estimates and empty-queue sleeps can disagree with concurrent enqueue/dequeue. - -**Intended fix (separate work):** Run read-only queue inspection and delay simulation under `state_file_lock` / `modify_state`, or otherwise align all queue readers with the same critical section as writers. - -**Why a process boundary helps:** Running `run_slack_event_handler` in a dedicated process (not mixed with Celery worker threads or ad hoc `web` usage) isolates failure domains and makes it obvious that batch sequential guarantees do not apply. It does **not** automatically fix the file-lock bug above. - -**What a process boundary does not fix:** Batch collectors and the Slack handler still share **PostgreSQL** (including `cppa_user_tracker` FK writes) and may share **`workspace/`** mounts. Cross-paradigm contention (long-running batch job vs realtime DB load) remains a scheduling and schema-design concern—see [cross-app-dependencies.md](../cross-app-dependencies.md). 
- -## Options considered - -### Option A — Monolith, documented swim lanes only - -Keep one deployable; document paradigms (this ADR) and coding rules. Lowest effort; weakest isolation. - -### Option B — Same monorepo, separate deployable processes (recommended) - -- **`collector-batch`:** `celery_worker` + `celery_beat` (unchanged responsibility). -- **`collector-realtime`:** dedicated service running only `run_slack_event_handler`. -- **`web`:** Gunicorn/admin; do not run Socket Mode in the web container. - -Shared: PostgreSQL, Redis, `workspace/`, Django settings. No identity-hub DB split in the first phase. - -### Option C — Separate services with API boundary for identity - -Extract `cppa_user_tracker` behind a versioned API or message bus. Maximum flexibility; highest cost given MTI/FK graph and import-linter contracts. Defer unless scale forces it. - -## Decision - -Adopt **Option B in phases** (see [Migration path](#migration-path)): -1. Document paradigms and app mapping (this ADR). -2. Harden event-driven state handling (`enqueue_job` / `estimated_delay_sec` locking). -3. Add a first-class realtime deployable in Compose/systemd docs. -4. Improve batch schedule coverage and cross-paradigm DB scheduling discipline. -5. Optionally tighten cross-app contracts via [`core.protocols`](../Core_public_API.md) and [`STABILITY.md`](../../STABILITY.md). +### Event-driven (real-time) paradigm — **out of scope for this repo** -Stay in **one repository and one database** until operational swim lanes prove insufficient. Do not plan a multi-repo split until import/schema coupling is reduced. +Long-running Socket Mode listeners and similar services are **not maintained in this public repository**. When such collectors are deployed alongside Boost Data Collector (BDC), run them in a **dedicated process** separate from Celery batch workers. 
-**Target deployables (same repo):** +## App classification (public repo) -```mermaid -flowchart LR - subgraph target [Target deployables] - BatchSvc[celery_worker and beat] - RealtimeSvc[slack_event_handler service] - WebSvc[gunicorn web] - end - DB[(PostgreSQL)] - Redis[(Redis)] - WS[workspace volume] - BatchSvc --> DB - BatchSvc --> WS - RealtimeSvc --> WS - WebSvc --> DB -``` +| Paradigm | Examples in this repo | +|----------|----------------------| +| **Batch collector** | `github_activity_tracker`, `cppa_slack_tracker`, `boost_mailing_list_tracker`, … | +| **Batch orchestration** | `boost_collector_runner` | +| **Platform** | `core`, `cppa_user_tracker` | -## Consequences - -### Positive - -- Clear vocabulary for reviews and onboarding ([Architecture overview](../Architecture_overview.md) can link here). -- Safer path to more collectors and higher-frequency batch (`interval`) without overloading one mental model. -- Realtime failures and restarts do not require redeploying the entire batch stack once processes are split. - -### Negative / retained coupling - -- **`cppa_user_tracker` FK hub** remains; batch and future realtime DB writers still coordinate through PostgreSQL. -- **Import-linter and cross-app imports** still constrain refactors ([cross-app-dependencies.md](../cross-app-dependencies.md)). -- **Two operational surfaces** (Beat schedule + long-running listener) require monitoring and runbooks. -- Phased work does not by itself improve test coverage on untested apps. 
- -## Migration path - -| Phase | Scope | Outcome | -|-------|--------|---------| -| **0** | Publish this ADR; link from [docs/README.md](../README.md) | Shared terminology and app→paradigm map | -| **1** | Fix `enqueue_job` / `estimated_delay_sec` / worker peek locking; extend [`test_job_queue.py`](../../slack_event_handler/tests/test_job_queue.py) | Event-driven lane internally consistent under concurrency | -| **2** | Add `slack_event_handler` service to Compose/systemd; update [Docker.md](../Docker.md) and [Deployment.md](../Deployment.md) | Process boundary without repo split | -| **3** | Schedule hygiene: add or document unscheduled `run_*` commands; align prod YAML with `.example` where intended | Batch lane completeness | -| **4** | Operational: stagger heavy batch groups vs realtime peaks; document shared-DB contention mitigations | Reduced cross-paradigm interference | -| **5** (optional) | Expand [`core.protocols`](../Core_public_API.md) DTOs at app edges; follow [`STABILITY.md`](../../STABILITY.md) for deprecations | Safer future extraction | +## Decision -Phase 0 is satisfied by this document. Phases 1–5 are **future implementation**; they are not part of the ADR file change itself. +Keep **batch collectors in this public repo**. Stay in **one database** until operational swim lanes prove insufficient. -## Related work +## Migration path (historical) -- **`enqueue_job` race fix** — Phase 1; see [Example: `enqueue_job` race](#example-enqueue_job-race-and-paradigm-boundaries). -- **Schedule gaps** — `run_cppa_user_tracker`, `run_cppa_pinecone_sync`, `run_clang_github_tracker`, `run_wg21_paper_tracker`, `run_cppa_youtube_script_tracker`. -- **Import boundaries** — `lint-imports`, [`scripts/list_cross_app_imports.py`](../../scripts/list_cross_app_imports.py). 
-- **Workspace orphan grace** — Separate file-writer races documented in [Workspace.md](../Workspace.md) (`WORKSPACE_ORPHAN_INVALID_JSON_GRACE_SECONDS`); analogous “don’t read partial state without coordination” lesson. +Phases 1–2 in the original ADR (Compose service for realtime Slack listeners, queue locking fixes) applied to collectors that are no longer in this tree. ## References - [Workflow.md](../Workflow.md) — batch execution order and Celery Beat behavior - [boost_collector_runner/README.md](../../boost_collector_runner/README.md) -- [slack_event_handler/README.md](../../slack_event_handler/README.md) -- [Architecture_data_flow.md](../Architecture_data_flow.md) — batch vs long-running in persistence table -- [BUS_FACTOR_DELIVERABLES.md](../BUS_FACTOR_DELIVERABLES.md) +- [Architecture_data_flow.md](../Architecture_data_flow.md) diff --git a/docs/cross-app-dependencies.md b/docs/cross-app-dependencies.md index 9ae50957..7f1e7174 100644 --- a/docs/cross-app-dependencies.md +++ b/docs/cross-app-dependencies.md @@ -25,7 +25,7 @@ document. `core` is excluded because it is shared infrastructure, not a peer tr | App | Role | Has models? | | --- | --- | --- | -| `cppa_user_tracker` | Identity hub — GitHub, Discord, Slack, WG21, mailing-list, and YouTube speaker profiles | Yes | +| `cppa_user_tracker` | Identity hub — GitHub, Slack, Reddit, WG21, mailing-list, and YouTube speaker profiles | Yes | | `github_activity_tracker` | GitHub repos, files, commits, issues, pull requests | Yes | | `boost_library_tracker` | Boost libraries, versions, files, dependencies, maintainer roles | Yes | | `boost_library_docs_tracker` | Boost documentation content and sync status | Yes | @@ -35,11 +35,9 @@ document. 
`core` is excluded because it is shared infrastructure, not a peer tr | `cppa_pinecone_sync` | Pinecone vector sync status | Yes | | `clang_github_tracker` | Clang/LLVM GitHub activity | Yes | | `cppa_slack_tracker` | Slack teams, channels, messages | Yes | -| `discord_activity_tracker` | Discord servers, channels, messages | Yes | | `reddit_activity_tracker` | Reddit subreddit submissions and comments | Yes | | `wg21_paper_tracker` | WG21 paper tracking | Yes | | `cppa_youtube_script_tracker` | YouTube video metadata and transcripts | Yes | -| `slack_event_handler` | Slack event listener | No (no domain models) | | `boost_collector_runner` | YAML-driven schedule orchestration | No (no domain models) | --- @@ -74,7 +72,6 @@ These are hard database-level dependencies. They cannot be removed without migr | `wg21_paper_tracker` | `cppa_user_tracker` | FK | `WG21PaperAuthor.profile` → `WG21PaperAuthorProfile` | Intentional — paper author identity | | `cppa_youtube_script_tracker` | `cppa_user_tracker` | FK | `YouTubeVideoSpeaker.speaker` → `YoutubeSpeaker` | Intentional — speaker identity | | `cppa_slack_tracker` | `cppa_user_tracker` | Direct import + FK | `SlackChannel.creator`, `SlackMessage.user`, `SlackChannelMembership.user`, `SlackChannelMembershipChangeLog.user` → `SlackUser` | Intentional — channel/message author identity | -| `discord_activity_tracker` | `cppa_user_tracker` | Direct import + FK | `DiscordMessage.author` → `DiscordProfile` | Intentional — message author identity | --- @@ -89,7 +86,6 @@ queries a foreign app's model via `.objects`: | Source file | Foreign model queried | Query pattern | Intentional or tech debt | | --- | --- | --- | --- | -| `discord_activity_tracker/services.py` | `cppa_user_tracker.DiscordProfile` | `.objects.filter(discord_user_id__in=...)` | Intentional — bulk prefetch before delegating to `cppa_user_tracker.services`; avoids N+1 | | `boost_usage_tracker/post_process.py` | `boost_library_tracker.BoostFile` | 
`.objects.filter(github_file__filename__endswith=...)` | Intentional — resolves Boost header path to BoostFile; no service wrapper exists yet | | `boost_usage_tracker/update_boostusage_from_csv.py` | `github_activity_tracker.GitHubFile` | `.objects.filter(repo_id=..., filename=...)` | Intentional — CSV import resolves FK targets by field values | | `boost_usage_tracker/update_boostusage_from_csv.py` | `boost_library_tracker.BoostFile` | `.objects.filter(github_file__filename=...)` | Intentional — CSV import resolves Boost header FK | @@ -163,9 +159,6 @@ The **Kind** column classifies the imported symbol: | `cppa_slack_tracker` | `…/services.py` | `cppa_user_tracker` | `SlackUser`, `get_or_create_slack_user` | model + service | Intentional — correctly delegates user upsert | | `cppa_slack_tracker` | `…/sync/sync_user.py` | `cppa_user_tracker` | `get_or_create_slack_user` | service | Intentional — correctly delegates | | `cppa_slack_tracker` | `…/run_cppa_slack_tracker.py` | `cppa_pinecone_sync` | `sync_to_pinecone` | sync_api / lazy | Intentional — Pinecone upsert via `cppa_pinecone_sync.sync_api` from collector `sync_pinecone()` | -| `discord_activity_tracker` | `…/models.py` | `cppa_user_tracker` | `DiscordProfile` | model | Intentional — FK base class (see schema coupling §1) | -| `discord_activity_tracker` | `…/services.py` | `cppa_user_tracker` | `DiscordProfile`, `get_or_create_discord_profile` | model + service | Intentional — services reference FK target and delegate upsert | -| `discord_activity_tracker` | `…/sync/messages.py` | `cppa_user_tracker` | `get_or_create_discord_profile` | service | Intentional — correctly delegates | | `wg21_paper_tracker` | `…/services.py` | `cppa_user_tracker` | `WG21PaperAuthorProfile`, `get_or_create_wg21_paper_author_profile` | model + service | Intentional — correctly delegates author identity | | `wg21_paper_tracker` | `…/import_wg21_metadata_from_csv.py` | `cppa_user_tracker` | `get_or_create_wg21_paper_author_profile` | 
service / lazy | Intentional — CSV import delegates author upsert | | `cppa_youtube_script_tracker` | `…/run_cppa_youtube_script_tracker.py` | `cppa_user_tracker` | `get_or_create_youtube_speaker` | service | Intentional — correctly delegates speaker upsert | @@ -200,7 +193,6 @@ flowchart LR cppa_pinecone[cppa_pinecone_sync] clang_github[clang_github_tracker] cppa_slack[cppa_slack_tracker] - discord_act[discord_activity_tracker] wg21_paper[wg21_paper_tracker] cppa_youtube[cppa_youtube_script_tracker] boost_runner[boost_collector_runner] @@ -221,7 +213,6 @@ flowchart LR cppa_youtube -->|"ORM + import"| cppa_user cppa_slack -->|"ORM + import"| cppa_user cppa_slack -.->|"services (lazy)"| cppa_pinecone - discord_act -->|"ORM + import"| cppa_user clang_github -->|"sync_api"| github_act boost_runner -.->|"import (lazy)"| boost_lib ``` diff --git a/docs/discord-tracker-schema.md b/docs/discord-tracker-schema.md deleted file mode 100644 index d1817ef4..00000000 --- a/docs/discord-tracker-schema.md +++ /dev/null @@ -1,127 +0,0 @@ -# Discord activity tracker — staging JSON schema - -This document describes the JSON shapes used when Discord data is staged on disk or normalized immediately before database writes in `discord_activity_tracker`. Runtime validation is implemented with **Pydantic** in [`discord_activity_tracker/staging_schema.py`](../discord_activity_tracker/staging_schema.py) (`validate_envelope`, `validate_normalized_message`). - -## 1. Envelope (DiscordChatExporter file) - -A single exported channel file is one JSON object with three top-level keys: - -| Key | Type | Description | -| --- | --- | --- | -| `guild` | object | Guild metadata from DiscordChatExporter. | -| `channel` | object | Channel metadata. | -| `messages` | array | Message objects in export order. | - -Common **guild** keys (camelCase as emitted by the exporter; `extra` fields are allowed and ignored by validation): - -- `id` — guild snowflake (string or number in JSON). -- `name` — guild name. 
-- `iconUrl` — optional guild icon URL. - -Common **channel** keys: - -- `id`, `name`, `type`, `topic`, `category`, `categoryId` — as provided by the exporter. - -**Normalization contract:** After `json.load`, ingestion validates the envelope with `validate_envelope`, then converts each raw message with `convert_exporter_message_to_dict` in [`discord_activity_tracker/sync/chat_exporter.py`](../discord_activity_tracker/sync/chat_exporter.py) before bulk DB upsert. - -## 2. Normalized message record - -The dict returned by `convert_exporter_message_to_dict` (and consumed by `_prepare_message_data` in [`discord_activity_tracker/sync/messages.py`](../discord_activity_tracker/sync/messages.py), which **drops unknown keys** before ORM bulk write) uses **snake_case** for nested author fields aligned with the Discord bot API shape. - -| Field | Type | Notes | -| --- | --- | --- | -| `id` | integer | Message snowflake. | -| `content` | string | Message body; may be empty. | -| `created_at` | string | ISO 8601 timestamp (from exporter `timestamp`). Required non-empty for validation. | -| `edited_at` | string or null | ISO 8601 if edited; otherwise JSON `null` or omitted when absent. | -| `message_type` | string | Exporter/API `type` string (e.g. `Default`, `Reply`). **Opaque passthrough** — see [Limitations](#6-limitations--out-of-scope). | -| `is_pinned` | boolean | | -| `author` | object | `id`, `username`, `global_name`, `avatar_url`, `bot`. | -| `attachments` | array | Objects with optional `url`. | -| `reactions` | array | Only entries with a non-empty resolved emoji; `{ "emoji": string, "count": integer >= 0 }`. | -| `reference` | object or null | When present: `{ "message_id": integer or null }`. | - -### Canonical cross-tracker fields (additive) - -These are set by `convert_exporter_message_to_dict` when enough context exists. They are **not** persisted as separate ORM columns; they exist on the normalized dict for validation, logs, and downstream consumers. 
- -| Field | Type | When set | -| --- | --- | --- | -| `occurred_at` | string | ISO 8601 instant in UTC with `Z` suffix, from `created_at` when non-empty (implementation: `core.utils.datetime_parsing.format_instant_iso_z`). | -| `actor_id` | string | Discord user snowflake as decimal string when author `id` is non-zero. | -| `source_url` | string | When `server_id` and `channel_id` are passed into the converter and message id is non-zero: `https://discord.com/channels/{server}/{channel}/{message}` via `format_discord_url`. | - -### Null vs omitted - -- Prefer JSON **`null`** for nullable scalars when serializing (e.g. `edited_at`, `reference`) to match common REST-style workspace files elsewhere in the monorepo. -- Omit optional keys when the exporter does not provide them (e.g. `edited_at` absent vs `null`). - -## 3. Reactions - -Each reaction in the normalized message: - -- `emoji` — non-empty string (custom emoji name or Unicode). -- `count` — integer `>= 0`. - -Exporter rows with no resolvable emoji are **dropped** during conversion (they are not stored). - -## 4. `message_type` - -Treated as an **opaque string** from DiscordChatExporter or the Discord API. The app stores it on `DiscordMessage.message_type` without interpreting join/leave semantics from this field alone. See [Limitations](#6-limitations--out-of-scope). - -## 5. Channel activity summary (derived) - -Not materialized as a separate JSON file in this iteration. For a given export envelope, a logical summary can be computed as: - -- `server_id` / `channel_id` from `guild.id` / `channel.id`. -- `message_count` — `len(messages)`. -- `first_message_at` / `last_message_at` — from the first and last message `timestamp` / `created_at` after conversion, in UTC, if messages are non-empty. - -## 6. Limitations / out of scope - -The collector’s primary path fetches **per-channel** message history (DiscordChatExporter export or bot API sync). 
Therefore: - -- **`message_type` is not a membership lifecycle log.** System or non-default types may appear when the exporter includes them, but rows are **not** a complete or authoritative log of users joining or leaving the **server** or a **channel**. -- **Single-channel export/fetch** cannot infer server-wide join/leave; Discord does not guarantee join system messages appear in every text channel, and leaves often have no built-in chat message. Authoritative membership tracking would require gateway events, audit log (where permitted), multi-channel export including the guild system channel, or dedicated bot logging — outside the current design. - -Do not document join/leave **detection** as a capability of this schema. - -## 7. JSON Schema artifact vs runtime validation - -The committed file [`discord_activity_tracker/schemas/discord_staging_v1.json`](../discord_activity_tracker/schemas/discord_staging_v1.json) is an **optional** JSON document for reviewers who prefer raw [JSON Schema](https://json-schema.org/). It bundles `model_json_schema()` output for: - -- `DiscordChatExporterEnvelope` -- `NormalizedDiscordMessage` - -**Single source of truth at runtime:** the Pydantic models in [`discord_activity_tracker/staging_schema.py`](../discord_activity_tracker/staging_schema.py). The `.json` file can **drift** if models change and the file is not regenerated. - -**Regenerate** (from repository root, with `discord_activity_tracker` importable, e.g. 
`PYTHONPATH=.`): - -```bash -python -m discord_activity_tracker.scripts.write_staging_json_schema -``` - -or: - -```bash -python -c "from discord_activity_tracker.staging_schema import write_staging_json_schema; write_staging_json_schema()" -``` - -## Alignment with other trackers (conventions) - -| Concern | `github_activity_tracker` | `cppa_slack_tracker` | `discord_activity_tracker` (this doc) | -| --- | --- | --- | --- | -| Workspace layout | Per-owner/repo trees; JSON per commit/issue/PR under [`github_activity_tracker/workspace.py`](../github_activity_tracker/workspace.py). | Per team/channel; daily `YYYY-MM-DD.json`; iterators **sorted** by path. | Per-server under `workspace/discord_activity_tracker/`; raw archive under `WORKSPACE_DIR/raw/discord_activity_tracker/`; `iter_existing_message_jsons` yields **sorted** paths. | -| Field naming | Mostly GitHub REST / snake_case in cached JSON. | Slack API native keys in daily lists (`ts`, `text`, `user`, …). | Exporter camelCase in file → **normalized** snake_case + ISO timestamps on message dict. | -| Links | e.g. `html_url` on GitHub entities. | Slack permalinks vary by payload. | Canonical `source_url` on normalized message when guild/channel ids are known. | - -### Shared conceptual fields (mapping) - -| Concept | Discord (normalized dict) | Slack (workspace message) | GitHub (example) | -| --- | --- | --- | --- | -| When | `created_at`, `occurred_at` | `ts` (Unix fractional string) | `created_at`, `commit.author.date`, … | -| Actor | `author.id` + `actor_id` string | `user` | `author.login`, `user.login`, … | -| Body | `content` | `text` | `body`, commit `message`, … | -| Link | `source_url` | (construct from team/channel/ts) | `html_url` | - -Discord ingestion keeps legacy keys (`created_at`, `id`, …) for `_prepare_message_data` compatibility and adds **parallel** canonical fields above rather than renaming bulk keys. 
diff --git a/docs/operations/README.md b/docs/operations/README.md index 0929d69d..cc18ea22 100644 --- a/docs/operations/README.md +++ b/docs/operations/README.md @@ -1,11 +1,10 @@ # Operations -**Operations** are shared, app-agnostic helpers under **`core/operations/`** for talking to the outside world (GitHub, Discord, CLIs, and similar). They are **not** the per-app **service API**, which is reserved for **database** writes. +**Operations** are shared, app-agnostic helpers under **`core/operations/`** for talking to the outside world (GitHub, CLIs, and similar). They are **not** the per-app **service API**, which is reserved for **database** writes. | Name | Where it lives | Doc | Role | |------|----------------|-----|------| | GitHub | `core.operations.github_ops` | [github.md](github.md) | Clone, fetch, PR/issue/comment helpers; tokens. | -| DiscordChatExporter | External CLI | [discord_chat_exporter.md](discord_chat_exporter.md) | Install, paths, and `.env` for the exporter used by `discord_activity_tracker`. | **Adding a new operation:** Put shared integration code in **`core/operations/`**, document it here, and add a row to the table (use a new Django app only if you need models). diff --git a/docs/operations/discord_chat_exporter.md b/docs/operations/discord_chat_exporter.md deleted file mode 100644 index 0b0f1719..00000000 --- a/docs/operations/discord_chat_exporter.md +++ /dev/null @@ -1,92 +0,0 @@ -# DiscordChatExporter (CLI setup) - -This project uses **[DiscordChatExporter](https://github.com/Tyrrrz/DiscordChatExporter)** (CLI), not a separate product named “DiscordExpert.” The GUI and CLI come from the same Tyrrrz releases; ingestion here runs the **CLI** only (`export`, `exportguild`, `channels`), driven by `discord_activity_tracker/sync/chat_exporter.py` and `manage.py run_discord_activity_tracker`. - -Exporter credentials and Discord server/channel IDs are configured via `.env` (see `.env.example`). 
User-account automation may violate Discord’s Terms of Service; prefer official APIs and bots when possible. - ---- - -## 1. Download a release - -1. Open **[DiscordChatExporter releases](https://github.com/Tyrrrz/DiscordChatExporter/releases/latest)**. -2. Download the archive for your OS: - - **Windows:** e.g. `DiscordChatExporter.win-x64.zip` (contains `DiscordChatExporter.Cli.exe` and dependencies). - - **macOS Apple Silicon:** e.g. `DiscordChatExporter.osx-arm64.zip`. - - **macOS Intel:** e.g. `DiscordChatExporter.osx-x64.zip`. - - **Linux:** pick the matching `linux-*` zip. - -Official overview: [Tyrrrz/DiscordChatExporter](https://github.com/Tyrrrz/DiscordChatExporter). - ---- - -## 2. Where to install (this repo) - -Default layout (no `DISCORD_CHAT_EXPORTER_CLI` in `.env`): - -| Piece | Path | -|-------|------| -| Workspace root | `{WORKSPACE_DIR}/discord_activity_tracker/` (see [Workspace.md](../Workspace.md)) | -| CLI directory | `{WORKSPACE_DIR}/discord_activity_tracker/script/` | -| Binary (Windows) | `DiscordChatExporter.Cli.exe` | -| Binary (macOS / Linux) | `DiscordChatExporter.Cli` (no extension) | - -Create `script/` if it does not exist, extract the CLI **and** any bundled files from the zip into that folder, then ensure the binary is executable on Unix (`chmod +x DiscordChatExporter.Cli`). - -Alternatively, install the CLI anywhere and set **`DISCORD_CHAT_EXPORTER_CLI`** in `.env` to the **absolute path** of the executable. - ---- - -## 3. Configure environment variables - -All variables live in `.env` (see `.env.example` in the repo root). The ones that matter for the CLI: - -| Variable | Purpose | -|----------|---------| -| `DISCORD_SERVER_ID` | Guild snowflake to export. | -| `DISCORD_CHANNEL_IDS` | Optional comma-separated channel IDs; empty often means “all text channels” depending on exporter mode. | -| `DISCORD_CHAT_EXPORTER_CLI` | Optional absolute path to `DiscordChatExporter.Cli` / `.exe` if not using `workspace/.../script/`. 
| -| `DISCORD_CHAT_EXPORTER_DOTNET_DLL` | Optional path to `DiscordChatExporter.Cli.dll` — use with system `dotnet` on macOS when the bundled host fails (external disks / quarantine). | -| `DISCORD_CHAT_EXPORTER_DOTNET` | Optional explicit `dotnet` binary if not on `PATH`. | -| `DISCORD_CHAT_EXPORTER_MACOS_CLEAR_QUARANTINE` | If `true`, runs `xattr` cleanup on the CLI folder before export (only if you trust the files). | -| `DISCORD_CHAT_EXPORTER_PARALLEL` | Parallelism for `exportguild` (keep low if you hit OOM / SIGKILL). | -| `DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT` | When `true`, exports channels one-by-one (safer on huge guilds). | -| `DISCORD_CHAT_EXPORTER_INCLUDE_VC` | Whether to include voice channels in listings where applicable. | - -Optional **.NET GC** env vars (`DOTNET_GCConserveMemory`, etc.) are documented in `.env.example`; they are forwarded into the exporter subprocess to reduce memory spikes. - ---- - -## 4. macOS tips - -- **Architecture:** Use an **arm64** build on Apple Silicon and **x64** on Intel. The code validates the binary with `file(1)` where possible and errors with a hint if the ABI is wrong. -- **External volumes / Gatekeeper:** If the native CLI fails to start, use **`DISCORD_CHAT_EXPORTER_DOTNET_DLL`** plus a system-installed **`dotnet`** SDK/runtime (`brew install dotnet`), pointing at `DiscordChatExporter.Cli.dll` next to your extracted CLI files. -- **Quarantine:** Downloaded zips may carry quarantine flags; `DISCORD_CHAT_EXPORTER_MACOS_CLEAR_QUARANTINE` or manual `xattr -cr` on the `script/` folder can help (only for trusted binaries). - ---- - -## 5. How the project invokes it - -- **`manage.py run_discord_activity_tracker`** — Runs DiscordChatExporter → parses JSON → DB → archives under `{WORKSPACE_DIR}/raw/discord_activity_tracker///`, then optional Markdown export and Pinecone sync. 
-- **`manage.py backfill_discord_activity_tracker`** — Does **not** call the CLI by default in the current design: it imports JSON already placed under - `{WORKSPACE_DIR}/discord_activity_tracker/Discussion - c-cpp-discussion/` (recursive), then deletes each file after a successful DB import. Use the CLI manually or elsewhere to produce those JSON files if needed. - -For command-line flags on the Django side, see [service_api/discord_activity_tracker.md](../service_api/discord_activity_tracker.md). - ---- - -## 6. Quick sanity check - -After placing the CLI: - -```bash -# Replace with your actual binary path if needed -/path/to/DiscordChatExporter.Cli --help -``` - -Then a dry run (no writes): - -```bash -python manage.py run_discord_activity_tracker --dry-run -``` - -If Django reports a missing CLI or wrong architecture, follow the error text — it usually points at the releases page and expected binary name. diff --git a/docs/service_api/README.md b/docs/service_api/README.md index df113631..9cff2566 100644 --- a/docs/service_api/README.md +++ b/docs/service_api/README.md @@ -13,7 +13,6 @@ Index of all app service modules. All writes to app models must go through the s | [boost_library_docs_tracker.services](boost_library_docs_tracker.md) | boost_library_docs_tracker | BoostDocContent (per-content metadata and sync state: is_upserted, first/last_version); BoostLibraryDocumentation (join row linking library-version to doc content only). | | [cppa_pinecone_sync.services](cppa_pinecone_sync.md) | cppa_pinecone_sync | Pinecone fail list and sync status (failure tracking, last-sync bookkeeping). | | [boost_usage_tracker.services](boost_usage_tracker.md) | boost_usage_tracker | External repos, Boost usage, missing-header tmp. | -| [discord_activity_tracker.services](discord_activity_tracker.md) | discord_activity_tracker | Servers, channels, messages, reactions (user profiles in cppa_user_tracker). 
| | [cppa_youtube_script_tracker.services](cppa_youtube_script_tracker.md) | cppa_youtube_script_tracker | YouTube channels, videos, transcript state, and speaker links for C++ conference talks. | | [clang_github_tracker.services](clang_github_tracker.md) | clang_github_tracker | Upsert llvm issue/PR/commit rows; DB watermarks for API fetch windows. | | [boost_mailing_list_tracker.services](boost_mailing_list_tracker.md) | boost_mailing_list_tracker | Mailing list messages and list names. | @@ -32,7 +31,6 @@ Index of all app service modules. All writes to app models must go through the s - **boost_library_tracker** – Get-or-create BoostLibraryRepository, BoostLibrary, BoostVersion, BoostLibraryVersion; add dependencies, categories, and role relationships. - **boost_library_docs_tracker** – Get-or-create BoostDocContent (by content_hash; holds url, first/last_version, is_upserted); link to BoostLibraryVersion via BoostLibraryDocumentation (join row only); Pinecone sync driven by BoostDocContent.is_upserted. - **boost_usage_tracker** – Get-or-create BoostExternalRepository, create/update BoostUsage, record missing headers (BoostMissingHeaderTmp). -- **discord_activity_tracker** – Get-or-create DiscordServer, DiscordChannel; create/update DiscordMessage, DiscordReaction. Discord user profiles in cppa_user_tracker. - **cppa_youtube_script_tracker** – Get-or-create YouTubeChannel, YouTubeVideo; update transcript state; link speakers to videos. Speaker profiles (`YoutubeSpeaker`) in cppa_user_tracker. - **cppa_pinecone_sync** – Get/clear/record failed IDs in PineconeFailList; get/update PineconeSyncStatus. - **clang_github_tracker** – Upsert `ClangGithubIssueItem` / `ClangGithubCommit` during sync or backfill; read `Max(github_updated_at)` / `Max(github_committed_at)` for fetch cursors. 
diff --git a/docs/service_api/cppa_user_tracker.md b/docs/service_api/cppa_user_tracker.md index 2d921687..159b154a 100644 --- a/docs/service_api/cppa_user_tracker.md +++ b/docs/service_api/cppa_user_tracker.md @@ -1,7 +1,7 @@ # cppa_user_tracker.services **Module path:** `cppa_user_tracker.services` -**Description:** Identity, profiles (GitHubAccount, SlackUser, MailingListProfile, DiscordProfile, etc.), emails, and staging (TmpIdentity, TempProfileIdentityRelation). Single place for all writes to cppa_user_tracker models. +**Description:** Identity, profiles (GitHubAccount, SlackUser, MailingListProfile, etc.), emails, and staging (TmpIdentity, TempProfileIdentityRelation). Single place for all writes to cppa_user_tracker models. **Type notation:** Model types refer to `cppa_user_tracker.models` (e.g. `Identity`, `BaseProfile`, `Email`). @@ -19,7 +19,6 @@ | `get_github_account_by_username` | username: str | GitHubAccount \| None | Return GitHubAccount for username, or None if not found (read-only lookup). | | `get_mailing_list_profile_by_id` | profile_id: int | MailingListProfile \| None | Return MailingListProfile for profile_id, or None if not found (read-only lookup). | | `get_mailing_list_profiles_by_ids` | profile_ids: list[int] | dict[int, MailingListProfile] | Return mailing-list profiles keyed by pk for the given ids (read-only bulk lookup). | -| `get_or_create_discord_profile` | discord_user_id: int, username: str = '', display_name: str = '', avatar_url: str = '', is_bot: bool = False, identity: Identity \| None = None | tuple[DiscordProfile, bool] | Get or create a DiscordProfile by discord_user_id. Returns (profile, created). | | `get_or_create_github_account` | github_account_id: int, username: str = '', display_name: str = '', avatar_url: str = '', account_type: str = GitHubAccountType.USER, identity: Identity \| None = None | tuple[GitHubAccount, bool] | Get or create a GitHubAccount by github_account_id. Returns (account, created). 
| | `get_or_create_identity` | display_name: str = '', description: str = '', defaults: dict[str, Any] \| None = None | tuple[Identity, bool] | Get or create an Identity by display_name. If exists, updates description from defaults. | | `get_or_create_mailing_list_profile` | display_name: str = '', email: str = '' | tuple[MailingListProfile, bool] | Get or create a MailingListProfile by display_name and email. Returns (profile, created). | diff --git a/docs/service_api/discord_activity_tracker.md b/docs/service_api/discord_activity_tracker.md deleted file mode 100644 index 37a08947..00000000 --- a/docs/service_api/discord_activity_tracker.md +++ /dev/null @@ -1,151 +0,0 @@ -# discord_activity_tracker.services - -**Module path:** `discord_activity_tracker.services` -**Description:** Discord servers, channels, messages, and reactions. Single place for all writes to discord_activity_tracker models. Discord user profiles live in `cppa_user_tracker.DiscordProfile`. - -**Type notation:** Model types refer to `discord_activity_tracker.models` unless noted. `DiscordProfile` refers to `cppa_user_tracker.models.DiscordProfile`. - ---- - - -## Public API (generated) - -| Function | Parameters | Return type | Summary | -| --- | --- | --- | --- | -| `add_or_update_reaction` | message: DiscordMessage, emoji: str, count: int | Tuple[DiscordReaction, bool] | Upsert one reaction row per (message, emoji) with the given reaction count. | -| `bulk_process_message_batch` | message_data_list: List[Union[DiscordLivePreparedMessage, Dict[str, Any]]], channel: DiscordChannel | int | Run user upsert, message upsert, and reaction upsert inside one DB transaction. | -| `bulk_upsert_discord_messages` | message_data_list: Sequence[Union[DiscordLivePreparedMessage, Dict[str, Any]]], channel: DiscordChannel, user_map: Dict[int, DiscordProfile] | Dict[int, DiscordMessage] | Bulk upsert messages for one channel using ``bulk_create(update_conflicts=True)``. 
| -| `bulk_upsert_discord_reactions` | reaction_data_list: Sequence[Union[DiscordReactionPayload, Dict[str, Any]]], message_map: Dict[int, DiscordMessage] | None | Bulk upsert reactions using ``bulk_create(update_conflicts=True)``. | -| `bulk_upsert_discord_users` | user_data_list: List[Union[DiscordLiveUserPayload, Dict[str, Any]]] | Dict[int, DiscordProfile] | Upsert author profiles for a batch of messages. | -| `create_or_update_discord_message` | message_id: int, channel: DiscordChannel, author: DiscordProfile, content: str, message_created_at: datetime, message_edited_at: Optional[datetime] = None, reply_to_message_id: Optional[int] = None, attachment_urls: Optional[list] = None, message_type: str = 'Default', is_pinned: bool = False | Tuple[DiscordMessage, bool] | Create or update a single message by Discord ``message_id`` (upsert). | -| `get_active_channels` | server: DiscordServer, days: int = 30, channel_ids: Optional[List[int]] = None | QuerySet[DiscordChannel] | Same as ``queryset_channels_with_recent_messages`` with ``cutoff = now - days``. | -| `get_channel_latest_message_at` | channel: DiscordChannel | Optional[datetime] | Return the latest ``message_created_at`` among non-deleted messages in a channel. | -| `get_or_create_discord_channel` | server: DiscordServer, channel_id: int, channel_name: str, channel_type: str, topic: str = '', position: int = 0, category_id: Optional[int] = None, category_name: str = '' | Tuple[DiscordChannel, bool] | Get or create a channel row and refresh fields when the row already exists. | -| `get_or_create_discord_server` | server_id: int, server_name: str, icon_url: str = '' | Tuple[DiscordServer, bool] | Get or create a Discord guild (server) row and refresh metadata when it already exists. | -| `mark_message_deleted` | message: DiscordMessage, deleted_at: Optional[datetime] = None | DiscordMessage | Soft-delete a message: set ``is_deleted`` and ``deleted_at``. 
| -| `queryset_channels_with_recent_messages` | server: DiscordServer, cutoff: datetime, channel_ids: Optional[List[int]] = None | QuerySet[DiscordChannel] | Channels on ``server`` with at least one non-deleted message at or after ``cutoff``. | - - - -## Service contract - -- **get_or_create pattern:** `get_or_create_discord_server` and `get_or_create_discord_channel` return `tuple[Model, bool]` where the `bool` is Django's `created` flag (a new row was inserted on this call). -- **update_or_create pattern:** `create_or_update_discord_message` and `add_or_update_reaction` return `tuple[Model, bool]` with Django `update_or_create` semantics for `created`. -- **Partial updates:** On existing rows, server and channel helpers use `save(update_fields=[...])` when metadata changed; `mark_message_deleted` updates `is_deleted`, `deleted_at`, and `updated_at` via `update_fields`. -- **Bulk upsert:** `bulk_upsert_discord_messages` and `bulk_upsert_discord_reactions` use `bulk_create(..., update_conflicts=True, unique_fields=..., update_fields=...)`. **`bulk_upsert_discord_users`** uses per-row queries and `get_or_create_discord_profile` because `DiscordProfile` uses multi-table inheritance (no `bulk_create(update_conflicts=True)`). -- **Transactions:** `bulk_process_message_batch` wraps user → message → reaction upserts in a single `transaction.atomic()`; an unhandled exception rolls back all phases. -- **`bulk_process_message_batch` return value:** Returns `len(message_data_list)` when the input list is non-empty, **not** the count of rows successfully written. Individual messages may still be skipped inside `bulk_upsert_discord_messages` (see below). - ---- - -## Raises and edge behavior - -- **`discord_activity_tracker.services` does not intentionally raise `ValueError`** for invalid arguments; validate inputs at sync/staging boundaries where appropriate. 
-- **`bulk_upsert_discord_users`:** Each dict must include `user_id` (and keys used in the loop); malformed payloads can raise **`KeyError`**. -- **`bulk_upsert_discord_messages`:** If `user_map` has no profile for `message_data["author"]["user_id"]`, that message is **skipped** and a **warning** is logged (no exception). If every message in the batch is skipped, no bulk insert runs and `{}` is returned. -- **`bulk_upsert_discord_reactions`:** If `message_map` has no message for `discord_message_id`, that reaction is skipped **silently**. Duplicate `(message, emoji)` pairs in one batch keep the **last** entry. -- **ORM:** Functions may propagate Django database exceptions (e.g. `IntegrityError`, `OperationalError`) under concurrency or infrastructure faults. - ---- - -## CollectorFailureCategory - -`discord_activity_tracker.services` performs **database I/O only**. It does not call Discord HTTP APIs and does **not** assign [`CollectorFailureCategory`](../../core/errors.py) values. - -Collectors, management commands, and sync layers classify failures with [`classify_failure`](../../core/errors.py) when handling exceptions (e.g. DiscordChatExporter subprocess failures wrapped in `CommandError`, discord.py HTTP errors, rate limits). If ORM errors are passed through `classify_failure`, mapping follows **`core/errors.py`** (for example `django.core.exceptions.ValidationError` may map to **`VALIDATION`** in typical paths). - ---- - -## Sync package (`discord_activity_tracker.sync`) - -| Module / symbol | Role | -| --------------- | ---- | -| `sync/chat_exporter.py` | Runs **DiscordChatExporter** per channel per UTC day (`export`), date bounds in UTC. Used by **`run_discord_activity_tracker`**. | -| `sync/raw_archive.py` | `merge_exporter_json` — merge daily JSON archives by message id under `raw/discord_activity_tracker/`. | -| `sync/messages.py` | `_prepare_message_data`, `_process_messages_in_batches` (calls `bulk_process_message_batch`). 
Also exposes **discord.py** helpers (`DiscordSyncClient`, `sync_all_channels`, …) for Bot API–style sync; those entry points are **not** wired to `run_discord_activity_tracker` today (that command uses the DiscordChatExporter CLI only). | -| `sync/client.py` | `DiscordSyncClient` — discord.py wrapper (intents, fetch guild/channel/messages). | -| `sync/exporter_window.py` | `latest_message_created_at_for_guild`, `iter_channel_export_days` — DB lower bound and UTC day windows for exporter runs. | -| `sync/utils.py` | Parsing helpers shared by exporter and message pipelines. | -| `sync/export.py` | Markdown export from DB (used downstream of sync; see command help for `DISCORD_CONTEXT_*` settings). | - ---- - -## Ingestion commands - -Two management commands handle message ingestion. Both use **`AbstractCollector`** via **`BaseCollectorCommand`**, with four phases: **fetch → db_sync → save_raw → pinecone_sync**. - -### `run_discord_activity_tracker` — incremental / scheduled - -Uses **DiscordChatExporter** CLI with configured exporter credentials. Setup (download, install path, env vars): [DiscordChatExporter operations doc](../operations/discord_chat_exporter.md). - -Fetches into a staging directory, persists to the database, then archives JSON under: - -`{WORKSPACE_DIR}/raw/discord_activity_tracker///` - -DiscordChatExporter runs **once per channel per UTC calendar day** in the resolved window. Date bounds use **UTC** (see `sync/chat_exporter.py` and `sync/exporter_window.py`). When `--since` is omitted, the lower bound is the latest stored message time for this guild (and channel allowlist). If the database has no matching rows, only **today (UTC)** is exported. When `--until` is omitted, there is no upper bound (export through the present). Raw archives are stored as `YYYY-MM-DD.json` per channel; later runs **merge** new messages into the same file by message id. 
If `--since` and `--until` are both set but **since is after until**, the command logs a warning and treats both as unset, then recomputes bounds from the rules above. - -``` -python manage.py run_discord_activity_tracker [options] - -Options: - --dry-run No fetch, export, push, or Pinecone writes; planned steps logged at INFO - --skip-discord-sync Skip DiscordChatExporter, DB upserts, and raw JSON - --skip-markdown-export Skip writing Markdown from DB to DISCORD_CONTEXT_REPO_PATH - --skip-remote-push Skip git commit/push after export (see DISCORD_CONTEXT_AUTO_COMMIT) - --skip-pinecone Skip run_cppa_pinecone_sync - --ignore-pinecone Deprecated alias for --skip-pinecone - --since, --until ISO or YYYY-MM-DD window (UTC; aliases: --from-date, --to-date, --start-time, --end-time). Omit `--since` to continue from latest DB message; omit `--until` for no upper bound. - --channels IDS Comma-separated channel ID override - --task {sync,export,all} Deprecated: maps to the skip flags (prefer --skip-*) -``` - -### `backfill_discord_activity_tracker` — import JSON from workspace - -Imports **existing** DiscordChatExporter JSON files from: - -`{WORKSPACE_DIR}/discord_activity_tracker/Discussion - c-cpp-discussion/` - -(recursively; skips macOS `._*.json` sidecars). Each file is parsed, upserted into the database, then **deleted** after a successful import so it is not processed again. Does **not** invoke DiscordChatExporter itself — export JSON elsewhere or manually, then drop it into that folder. - -``` -python manage.py backfill_discord_activity_tracker [options] - -Options: - --skip-pinecone Skip Pinecone sync after import - --ignore-pinecone Deprecated alias for --skip-pinecone - --dry-run List files that would be imported; no DB writes or deletes -``` - -### Channel allowlist - -`run_discord_activity_tracker` respects `DISCORD_CHANNEL_IDS` in `settings.py` (from the `DISCORD_CHANNEL_IDS` env var, comma-separated snowflake IDs). 
The `--channels` CLI argument overrides the setting for a single run. - -`backfill_discord_activity_tracker` imports every JSON file under the drop folder; it does not filter by `DISCORD_CHANNEL_IDS`. - ---- - -## Pinecone integration - -`discord_activity_tracker/preprocessor.py` exposes `preprocess_discord_for_pinecone(failed_ids, final_sync_at)` which: - -1. Queries `DiscordMessage` rows (incremental: `updated_at` after `final_sync_at`, plus any `failed_ids` retry; first run with no watermark indexes all non-deleted messages). -2. Groups messages into reply chains (`reply_to_message_id` linking). -3. Filters documents with fewer than `PINECONE_MIN_TEXT_LENGTH` (default 20) characters. -4. Emits `{"content": str, "metadata": {...}}` dicts with metadata keys: `doc_id`, `type`, `channel_id`, `channel_name`, `server_id`, `server_name`, `author`, `timestamp`, `is_reply_chain`, `source_ids`. - -Settings: - -| Setting | Default | Description | -| ----------------------------- | ------------------- | ---------------------------------------- | -| `PINECONE_DISCORD_APP_TYPE` | (empty skips sync) | Passed to `run_cppa_pinecone_sync` as `--app-type`. If unset/empty, Pinecone sync is skipped. | -| `PINECONE_DISCORD_NAMESPACE` | (empty skips sync) | Pinecone namespace. If unset/empty, Pinecone sync is skipped. 
| - ---- - -## Related - -- [DiscordChatExporter setup](../operations/discord_chat_exporter.md) — download, install, `.env` -- [Service API index](README.md) -- [CONTRIBUTING](../../CONTRIBUTING.md) -- [Schema](../Schema.md) -- [Workspace](../Workspace.md) – raw archives under `{WORKSPACE_DIR}/raw/discord_activity_tracker///`; app folder `{WORKSPACE_DIR}/discord_activity_tracker/` (CLI `script/`, backfill drop `Discussion - c-cpp-discussion/`) diff --git a/pyrightconfig.json b/pyrightconfig.json index f5bf5eb7..b7ed3fe0 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -2,7 +2,6 @@ "include": [ "core", "github_activity_tracker", - "discord_activity_tracker", "cppa_slack_tracker", "cppa_user_tracker", "cppa_pinecone_sync" diff --git a/reddit_activity_tracker/tests/test_fetcher_helpers.py b/reddit_activity_tracker/tests/test_fetcher_helpers.py new file mode 100644 index 00000000..06f14055 --- /dev/null +++ b/reddit_activity_tracker/tests/test_fetcher_helpers.py @@ -0,0 +1,181 @@ +"""Unit tests for reddit_activity_tracker.fetcher helpers (no live Reddit API).""" + +from __future__ import annotations + +import base64 +import json +import time +from unittest.mock import MagicMock, patch + +import pytest +import requests +from django.test import override_settings + +from reddit_activity_tracker.fetcher import ( + RedditSession, + _credentials_configured, + _is_bearer_expired, + _jwt_expiry, + _normalize_bearer, + build_session, +) + + +def _make_jwt(exp: float | None) -> str: + header = ( + base64.urlsafe_b64encode(b'{"alg":"none","typ":"JWT"}').decode().rstrip("=") + ) + payload_dict: dict[str, float] = {} + if exp is not None: + payload_dict["exp"] = exp + payload = ( + base64.urlsafe_b64encode(json.dumps(payload_dict).encode()).decode().rstrip("=") + ) + return f"{header}.{payload}.sig" + + +def test_normalize_bearer_strips_prefix(): + assert _normalize_bearer("Bearer abc.def.ghi") == "abc.def.ghi" + assert _normalize_bearer(" bearer token ") == "token" + + 
+def test_jwt_expiry_reads_exp_claim(): + exp = time.time() + 3600 + token = _make_jwt(exp) + assert _jwt_expiry(token) == pytest.approx(exp) + + +def test_jwt_expiry_returns_none_for_invalid_token(): + assert _jwt_expiry("not-a-jwt") is None + assert _jwt_expiry(_make_jwt(None)) is None + + +def test_is_bearer_expired_respects_leeway(): + past = time.time() - 120 + future = time.time() + 3600 + assert _is_bearer_expired(_make_jwt(past)) is True + assert _is_bearer_expired(_make_jwt(future)) is False + assert _is_bearer_expired("opaque-token") is False + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + (None, None), + ("", None), + (" ", None), + ("your_client_id", None), + ("real-id", "real-id"), + ], +) +def test_credentials_configured(value, expected): + assert _credentials_configured(value) == expected + + +@override_settings( + REDDIT_USER_AGENT="test-agent", + REDDIT_CLIENT_ID="cid", + REDDIT_CLIENT_SECRET="secret", + REDDIT_BEARER_TOKEN="", + REDDIT_SESSION_COOKIE="", +) +def test_build_session_uses_client_credentials(): + session = build_session() + assert isinstance(session, RedditSession) + assert session._client_id == "cid" + assert session._client_secret == "secret" + + +@override_settings( + REDDIT_USER_AGENT="test-agent", + REDDIT_CLIENT_ID="", + REDDIT_CLIENT_SECRET="", + REDDIT_BEARER_TOKEN="", + REDDIT_SESSION_COOKIE="", +) +def test_build_session_raises_when_no_credentials(): + with pytest.raises(EnvironmentError, match="No Reddit credentials"): + build_session() + + +@override_settings( + REDDIT_USER_AGENT="", + REDDIT_CLIENT_ID="cid", + REDDIT_CLIENT_SECRET="secret", +) +def test_build_session_requires_user_agent(): + with pytest.raises(EnvironmentError, match="REDDIT_USER_AGENT"): + build_session() + + +@override_settings( + REDDIT_USER_AGENT="test-agent", + REDDIT_CLIENT_ID="", + REDDIT_CLIENT_SECRET="", + REDDIT_BEARER_TOKEN="", + REDDIT_SESSION_COOKIE="", +) +def test_build_session_raises_when_bearer_expired(): + expired = 
_make_jwt(time.time() - 120) + with override_settings(REDDIT_BEARER_TOKEN=expired): + with pytest.raises(EnvironmentError, match="expired"): + build_session() + + +@override_settings( + REDDIT_USER_AGENT="test-agent", + REDDIT_CLIENT_ID="", + REDDIT_CLIENT_SECRET="", + REDDIT_BEARER_TOKEN="", + REDDIT_SESSION_COOKIE="cookie", + REDDIT_CSRF_TOKEN="csrf", +) +@patch( + "reddit_activity_tracker.fetcher.mint_bearer_from_session", + return_value="fresh.jwt.sig", +) +def test_build_session_mints_from_session_cookie(mock_mint): + session = build_session() + mock_mint.assert_called_once_with("cookie", "test-agent", "csrf") + assert session._session_cookie == "cookie" + + +def test_reddit_session_apply_bearer_sets_authorization_header(): + token = _make_jwt(time.time() + 3600) + session = RedditSession(None, None, "agent", bearer_token=token) + assert ( + session._session.headers["Authorization"] + == f"Bearer {_normalize_bearer(token)}" + ) + + +def test_reddit_session_backoff_honors_retry_after(): + session = RedditSession("id", "secret", "agent") + resp = MagicMock() + resp.headers = {"Retry-After": "3"} + with patch("reddit_activity_tracker.fetcher.random.uniform", return_value=0.5): + assert session._backoff_seconds(resp, 1.0) == 3.5 + + +def test_reddit_session_backoff_uses_rate_limit_reset(): + session = RedditSession("id", "secret", "agent") + resp = MagicMock() + resp.headers = {"X-Ratelimit-Reset": "2"} + with patch("reddit_activity_tracker.fetcher.random.uniform", return_value=1.0): + assert session._backoff_seconds(resp, 1.0) == 3.0 + + +def test_reddit_session_backoff_default_delay_with_jitter(): + session = RedditSession("id", "secret", "agent") + with patch("reddit_activity_tracker.fetcher.random.uniform", return_value=0.25): + assert session._backoff_seconds(None, 2.0) == 2.25 + + +def test_reddit_session_update_rate_limit_state(): + session = RedditSession("id", "secret", "agent") + resp = requests.Response() + resp.headers["X-Ratelimit-Remaining"] = "4.5" 
+ resp.headers["X-Ratelimit-Reset"] = "12" + session._update_rate_limit_state(resp) + assert session._remaining == 4.5 + assert session._reset == 12.0 diff --git a/requirements.in b/requirements.in index e3b49770..c0fc1e40 100644 --- a/requirements.in +++ b/requirements.in @@ -2,8 +2,7 @@ # unless pinning is required for a known breakage. Regenerate locks after edits (Linux/CI/Docker): # python -m uv pip compile requirements.in -o requirements.lock --python-version 3.13 --python-platform linux # python -m uv pip compile requirements-dev.in -o requirements-dev.lock --python-version 3.13 --python-platform linux -# Do not compile on Windows alone: browser-cookie3 pulls pywin32 into the lock and breaks Linux CI. -# After recompiling, confirm plyvel keeps `; sys_platform != "win32"` in both lock files (uv may omit it). +# Compile locks on Linux/CI/Docker (not Windows alone) so platform-specific markers stay consistent. # --- Core web / config --- Django>=4.2,<5 @@ -18,8 +17,6 @@ idna>=3.15,<4 PyJWT>=2.13.0,<3 # GHSA-537c-gmf6-5ccf: fixed in cryptography 48.0.1 (transitive via google-auth, etc.). cryptography>=48.0.1,<49 -discord.py>=2.3.0,<3 -# CVE-2026-34993, CVE-2026-47265: fixed in aiohttp 3.14.0; CVE-2026-54273..54280: fixed in 3.14.1. aiohttp>=3.14.1,<4 python-dateutil>=2.8.0,<3 celery[redis]>=5.3,<6 @@ -43,18 +40,7 @@ python-json-logger>=2.0,<4 slack-bolt>=1.18,<2 pytz>=2024.1,<2026 -# --- slack_event_handler: Chrome profile read for Slack xoxc/xoxd token extraction --- -# (slack_event_handler.utils.slack_tokens; used from core.operations.slack_ops.fetcher) -# plyvel builds from source: install LevelDB headers (Debian: libleveldb-dev; macOS: brew install leveldb). -# Omitted on Windows (no reliable LevelDB C++ build); cookie/SQLite paths still work there. 
-plyvel>=1.5,<2; sys_platform != "win32" -browser-cookie3>=0.19,<1 - # --- wg21_paper_tracker / cppa_youtube_script_tracker --- google-api-python-client>=2.100,<3 # CVE-2026-50019, CVE-2026-50023, CVE-2026-50574, GHSA-69qj-pvh9-c5wg: fixed in 2026.6.9. yt-dlp>=2026.6.9,<2027 - -# --- slack_event_handler (GitHub PR comments) --- -PyGithub>=2.0,<3 -portalocker>=2.8,<3 diff --git a/requirements.lock b/requirements.lock index a7329ac2..853f6650 100644 --- a/requirements.lock +++ b/requirements.lock @@ -3,9 +3,7 @@ aiohappyeyeballs==2.6.1 # via aiohttp aiohttp==3.14.1 - # via - # -r requirements.in - # discord-py + # via -r requirements.in aiosignal==1.4.0 # via aiohttp amqp==5.3.1 @@ -16,14 +14,10 @@ asgiref==3.11.1 # via django attrs==26.1.0 # via aiohttp -audioop-lts==0.2.2 - # via discord-py beautifulsoup4==4.14.3 # via -r requirements.in billiard==4.2.4 # via celery -browser-cookie3==0.20.1 - # via -r requirements.in celery==5.6.3 # via -r requirements.in certifi==2026.4.22 @@ -31,9 +25,7 @@ certifi==2026.4.22 # pinecone # requests cffi==2.0.0 - # via - # cryptography - # pynacl + # via cryptography charset-normalizer==3.4.7 # via requests click==8.3.3 @@ -52,9 +44,6 @@ cryptography==48.0.1 # via # -r requirements.in # google-auth - # pyjwt -discord-py==2.7.1 - # via -r requirements.in django==4.2.30 # via -r requirements.in django-environ==0.13.0 @@ -87,14 +76,10 @@ idna==3.16 # -r requirements.in # requests # yarl -jeepney==0.9.0 - # via browser-cookie3 kombu==5.6.2 # via celery lxml==6.1.0 # via -r requirements.in -lz4==4.4.5 - # via browser-cookie3 multidict==6.7.1 # via # aiohttp @@ -107,10 +92,6 @@ pinecone==6.0.2 # via -r requirements.in pinecone-plugin-interface==0.0.7 # via pinecone -plyvel==1.5.1 ; sys_platform != "win32" - # via -r requirements.in -portalocker==2.10.1 - # via -r requirements.in prompt-toolkit==3.0.52 # via click-repl propcache==0.5.2 @@ -134,21 +115,14 @@ pyasn1-modules==0.4.2 # via google-auth pycparser==3.0 # via cffi 
-pycryptodomex==3.23.0 - # via browser-cookie3 pydantic==2.13.4 # via -r requirements.in pydantic-core==2.46.4 # via pydantic -pygithub==2.9.1 - # via -r requirements.in pyjwt==2.13.0 # via # -r requirements.in - # pygithub # redis -pynacl==1.6.2 - # via pygithub pypandoc==1.17 # via -r requirements.in pyparsing==3.3.2 @@ -172,7 +146,6 @@ requests==2.33.1 # via # -r requirements.in # google-api-core - # pygithub six==1.17.0 # via python-dateutil slack-bolt==1.28.0 @@ -189,7 +162,6 @@ typing-extensions==4.15.0 # pinecone # pydantic # pydantic-core - # pygithub # typing-inspection typing-inspection==0.4.2 # via pydantic @@ -203,7 +175,6 @@ urllib3==2.7.0 # via # -r requirements.in # pinecone - # pygithub # requests vine==5.1.0 # via diff --git a/scripts/check_service_layer_writes.py b/scripts/check_service_layer_writes.py index 6b74fff3..7cbcd3d7 100644 --- a/scripts/check_service_layer_writes.py +++ b/scripts/check_service_layer_writes.py @@ -37,10 +37,8 @@ "cppa_pinecone_sync", "clang_github_tracker", "cppa_slack_tracker", - "discord_activity_tracker", "wg21_paper_tracker", "cppa_youtube_script_tracker", - "slack_event_handler", ] TRACKER_APP_SET = set(TRACKER_APPS) diff --git a/scripts/list_cross_app_imports.py b/scripts/list_cross_app_imports.py index c0cc96c4..2001406a 100644 --- a/scripts/list_cross_app_imports.py +++ b/scripts/list_cross_app_imports.py @@ -39,10 +39,8 @@ "cppa_pinecone_sync", "clang_github_tracker", "cppa_slack_tracker", - "discord_activity_tracker", "wg21_paper_tracker", "cppa_youtube_script_tracker", - "slack_event_handler", ] TRACKER_APP_SET = set(TRACKER_APPS) diff --git a/scripts/wait_discord_chrome_profile.sh b/scripts/wait_discord_chrome_profile.sh deleted file mode 100644 index 88e1cc64..00000000 --- a/scripts/wait_discord_chrome_profile.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -# Wait until discord-chromium has written a usable Chrome profile (Cookies + LevelDB). 
-set -euo pipefail - -PROFILE_ROOT="${1:-workspace/discord_activity_tracker/chrome_profile}" -COOKIES="${PROFILE_ROOT}/Default/Cookies" -LEVELDB="${PROFILE_ROOT}/Default/Local Storage/leveldb" -TIMEOUT_SEC="${DISCORD_PROFILE_WAIT_TIMEOUT:-600}" -INTERVAL_SEC="${DISCORD_PROFILE_WAIT_INTERVAL:-5}" - -if ! [[ "${TIMEOUT_SEC}" =~ ^[0-9]+$ ]] || ! [[ "${INTERVAL_SEC}" =~ ^[0-9]+$ ]]; then - echo "DISCORD_PROFILE_WAIT_TIMEOUT and DISCORD_PROFILE_WAIT_INTERVAL must be non-negative integers." >&2 - exit 1 -fi -if (( TIMEOUT_SEC <= 0 || INTERVAL_SEC <= 0 )); then - echo "DISCORD_PROFILE_WAIT_TIMEOUT and DISCORD_PROFILE_WAIT_INTERVAL must be > 0." >&2 - exit 1 -fi - -deadline=$((SECONDS + TIMEOUT_SEC)) -echo "Waiting for Discord Chrome profile under ${PROFILE_ROOT}" -echo " Sign in at http://127.0.0.1:7901 → https://discord.com" -echo " Timeout: ${TIMEOUT_SEC}s (override with DISCORD_PROFILE_WAIT_TIMEOUT)" - -while (( SECONDS < deadline )); do - if [[ -f "${COOKIES}" && -s "${COOKIES}" && -d "${LEVELDB}" ]]; then - if compgen -G "${LEVELDB}/*" > /dev/null; then - echo "Profile ready (${COOKIES}, ${LEVELDB})." - exit 0 - fi - fi - sleep "${INTERVAL_SEC}" -done - -echo "Timed out waiting for Chrome profile. Check noVNC login and discord-chromium logs." >&2 -exit 1 diff --git a/scripts/wait_slack_chrome_profile.sh b/scripts/wait_slack_chrome_profile.sh deleted file mode 100644 index cf03ed65..00000000 --- a/scripts/wait_slack_chrome_profile.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -# Wait until slack-chromium has written a usable Chrome profile (Cookies + LevelDB). -set -euo pipefail - -PROFILE_ROOT="${1:-workspace/slack_event_handler/chrome_profile}" -COOKIES="${PROFILE_ROOT}/Default/Cookies" -LEVELDB="${PROFILE_ROOT}/Default/Local Storage/leveldb" -TIMEOUT_SEC="${SLACK_PROFILE_WAIT_TIMEOUT:-600}" -INTERVAL_SEC="${SLACK_PROFILE_WAIT_INTERVAL:-5}" - -if ! [[ "${TIMEOUT_SEC}" =~ ^[0-9]+$ ]] || ! 
[[ "${INTERVAL_SEC}" =~ ^[0-9]+$ ]]; then - echo "SLACK_PROFILE_WAIT_TIMEOUT and SLACK_PROFILE_WAIT_INTERVAL must be non-negative integers." >&2 - exit 1 -fi -if (( TIMEOUT_SEC <= 0 || INTERVAL_SEC <= 0 )); then - echo "SLACK_PROFILE_WAIT_TIMEOUT and SLACK_PROFILE_WAIT_INTERVAL must be > 0." >&2 - exit 1 -fi - -deadline=$((SECONDS + TIMEOUT_SEC)) -echo "Waiting for Slack Chrome profile under ${PROFILE_ROOT}" -echo " Sign in at http://127.0.0.1:7900 → https://app.slack.com" -echo " Timeout: ${TIMEOUT_SEC}s (override with SLACK_PROFILE_WAIT_TIMEOUT)" - -while (( SECONDS < deadline )); do - if [[ -f "${COOKIES}" && -s "${COOKIES}" && -d "${LEVELDB}" ]]; then - if compgen -G "${LEVELDB}/*" > /dev/null; then - echo "Profile ready (${COOKIES}, ${LEVELDB})." - exit 0 - fi - fi - sleep "${INTERVAL_SEC}" -done - -echo "Timed out waiting for Chrome profile. Check noVNC login and slack-chromium logs." >&2 -exit 1 diff --git a/slack_event_handler/README.md b/slack_event_handler/README.md deleted file mode 100644 index 2f082b03..00000000 --- a/slack_event_handler/README.md +++ /dev/null @@ -1,55 +0,0 @@ -# Slack Event Handler - -## Overview - -Django app that runs a **Slack Socket Mode** listener during **`runserver`** so inbound Slack events can be handled in-process. Production-style deployments typically use a different entrypoint; see module docstrings in [`runner.py`](runner.py) and [`apps.py`](apps.py) for startup behavior. - -## Data workflow - -This app is **event-driven**, not YAML-scheduled like the batch collectors. It reacts to Slack events (for example huddle canvases), writes lightweight **workspace JSON/HTML**, and can upload generated Markdown to GitHub. Per-app service APIs: [docs/service_api/README.md](../docs/service_api/README.md). It **does not** define ORM models for long-term analytics—that work belongs to [`cppa_slack_tracker`](../cppa_slack_tracker/README.md). 
- -### Where we fetch data - -**Slack Web API / Socket Mode** events (bot tokens per configured workspace). Huddle flows download private HTML/transcript payloads Slack exposes for a file/canvas id. - -### How data is saved to the database - -**No Django ORM persistence in this app.** Working state, downloaded JSON, and HTML live under the **workspace** (`slack_event_handler` helpers in [`workspace.py`](workspace.py)). For long-lived Slack rows, see [`cppa_slack_tracker`](../cppa_slack_tracker/README.md) and [docs/Schema.md, section 6 — Slack Activity Tracker](../docs/Schema.md#6-slack-activity-tracker). - -### How content is published to GitHub - -[`utils/huddle_processor.py`](utils/huddle_processor.py) renders Markdown, then **`core.operations.github_ops.upload_file`** commits it to **`GITHUB_SLACK_HUDDLE_REPO_OWNER` / `GITHUB_SLACK_HUDDLE_REPO_NAME`** (default branch from `GITHUB_DEFAULT_BRANCH`). Requires a token with contents write access to that repository. - -### How vectors sync to Pinecone - -**Not applicable.** Huddle transcripts are not upserted by this listener; use batch pipelines + [`cppa_pinecone_sync`](../cppa_pinecone_sync/README.md) if Slack text should also live in the vector index. See [docs/Pinecone_preprocess_guideline.md](../docs/Pinecone_preprocess_guideline.md). - -## Common tasks - -- Local dev with events: `python manage.py runserver` (listener starts in the reloader child only). -- Run the collector-style command directly: `python manage.py run_slack_event_handler --help`. -- Cross-cutting docs: [docs/service_api/README.md](../docs/service_api/README.md) (per-app service API index). - -## Main command: `run_slack_event_handler` - -Starts the unified **Socket Mode** listener (huddle AI note / transcript tracking and Slack PR-comment bot). Requires Slack app tokens configured in Django settings (see command module and `core.operations.slack_ops`). 
- -| Option | Description | -| --- | --- | -| `--dry-run` | Validate `SLACK_BOT_TOKEN_` / `SLACK_APP_TOKEN_` per configured team; **do not** start the listener. | - -## Management commands - -| Command | Purpose | -| --- | --- | -| `run_slack_event_handler` | Long-running Slack event handling entrypoint (see module docstring and `--help`). | - -Run `python manage.py COMMAND --help` for options. - -## Tests - -```bash -python -m pytest slack_event_handler/tests/ -v -``` - -(from repo root; see root [README](../README.md#running-tests) for `DATABASE_URL` and prerequisites.) diff --git a/slack_event_handler/__init__.py b/slack_event_handler/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/slack_event_handler/admin.py b/slack_event_handler/admin.py deleted file mode 100644 index 1faa983c..00000000 --- a/slack_event_handler/admin.py +++ /dev/null @@ -1 +0,0 @@ -# No models to register — state is stored in JSON files. diff --git a/slack_event_handler/apps.py b/slack_event_handler/apps.py deleted file mode 100644 index 8cee4651..00000000 --- a/slack_event_handler/apps.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import sys -import threading - -from django.apps import AppConfig - - -class SlackEventHandlerConfig(AppConfig): - default_auto_field = "django.db.models.BigAutoField" - name = "slack_event_handler" - verbose_name = "Slack Event Handler" - - def ready(self): - if "runserver" not in sys.argv: - return - # Runserver reloader: parent watches files, child runs the server. Only start the - # listener in the child so we don't open two Socket Mode connections to Slack. 
- if os.environ.get("RUN_MAIN") != "true": - return - - def start_listener(): - from slack_event_handler.runner import run_slack_event_handler - - run_slack_event_handler() - - t = threading.Thread( - target=start_listener, daemon=True, name="slack-event-handler" - ) - t.start() diff --git a/slack_event_handler/management/__init__.py b/slack_event_handler/management/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/slack_event_handler/management/commands/__init__.py b/slack_event_handler/management/commands/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/slack_event_handler/management/commands/extract_slack_tokens.py b/slack_event_handler/management/commands/extract_slack_tokens.py deleted file mode 100644 index 8ef0726a..00000000 --- a/slack_event_handler/management/commands/extract_slack_tokens.py +++ /dev/null @@ -1,78 +0,0 @@ -""" -Management command: extract_slack_tokens - -Persist Slack session credentials to workspace JSON. -""" - -import logging - -from django.conf import settings -from django.core.management.base import BaseCommand, CommandError - -from core.operations.slack_ops.tokens import get_default_team_key -from slack_event_handler.utils.slack_internal_tokens_store import ( - extract_and_save_slack_internal_tokens, - slack_internal_tokens_json_path, -) -from slack_event_handler.utils.slack_tokens import _resolve_chrome_profile_root -from slack_event_handler.workspace import get_chrome_profile_path - -logger = logging.getLogger(__name__) - - -class Command(BaseCommand): - help = ( - "Persist Slack session credentials to " - "workspace/slack_event_handler/slack_internal_tokens.json." 
- ) - - def add_arguments(self, parser): - parser.add_argument( - "--team-id", - dest="team_id", - default=None, - help="Slack team ID (default: first team from SLACK_TEAM_IDS).", - ) - - def handle(self, *args, **options): - team_id = (options.get("team_id") or "").strip() or get_default_team_key() - if not team_id: - raise CommandError( - "No team id. Pass --team-id or set SLACK_TEAM_IDS in .env." - ) - - allow_raw = getattr(settings, "ALLOW_INTERNAL_SLACK_TOKENS", "") or "" - if isinstance(allow_raw, bool): - allow = allow_raw - else: - allow = str(allow_raw).strip().lower() == "true" - if not allow: - self.stderr.write( - self.style.WARNING( - "Internal Slack session mode is not enabled: credentials will be saved to " - "workspace JSON but ignored by Django until enabled. " - "Restart web/celery after enabling. See .env.example." - ) - ) - - try: - profile = _resolve_chrome_profile_root() - except ValueError as e: - raise CommandError(str(e)) from e - profile_path = str(profile) - if not profile.is_dir(): - raise CommandError( - "Session storage not found " - f"({profile_path}). Expected: {get_chrome_profile_path()}. " - "See .env.example." - ) - - pair = extract_and_save_slack_internal_tokens(team_id) - if not pair: - raise CommandError("Failed to load session credentials. See .env.example.") - out_path = slack_internal_tokens_json_path() - self.stdout.write( - self.style.SUCCESS( - f"Saved session credentials for team {team_id} to {out_path}." - ) - ) diff --git a/slack_event_handler/management/commands/run_slack_event_handler.py b/slack_event_handler/management/commands/run_slack_event_handler.py deleted file mode 100644 index 3686325e..00000000 --- a/slack_event_handler/management/commands/run_slack_event_handler.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Management command: run_slack_event_handler - -Runs the unified Slack Event Handler: huddle AI note transcript tracking and -Slack PR comment bot, both in a single Socket Mode listener. 
-""" - -import logging - -from django.conf import settings -from django.core.management.base import BaseCommand - -from core.operations.slack_ops import ( - get_slack_app_token, - get_slack_bot_token, -) - -logger = logging.getLogger(__name__) - - -class Command(BaseCommand): - help = ( - "Run the unified Slack Event Handler: listens for huddle AI note events " - "(transcript tracking) and GitHub PR URL messages (Slack PR comment bot)." - ) - - def add_arguments(self, parser): - parser.add_argument( - "--dry-run", - action="store_true", - help=( - "Only validate that SLACK_BOT_TOKEN_ and SLACK_APP_TOKEN_ are set per team; " - "do not start the listener." - ), - ) - - def handle(self, *args, **options): - tokens_map = getattr(settings, "SLACK_BOT_TOKEN", None) or {} - if not isinstance(tokens_map, dict): - tokens_map = {} - team_ids = list(tokens_map.keys()) if tokens_map else [] - - bot_results = [] - app_results = [] - for tid in team_ids: - try: - bot_token = get_slack_bot_token(team_id=tid) - bot_results.append((tid, bool(bot_token))) - except ValueError: - bot_results.append((tid, False)) - try: - app_token = get_slack_app_token(team_id=tid) - app_results.append((tid, bool(app_token))) - except ValueError: - app_results.append((tid, False)) - - if options["dry_run"]: - for tid in team_ids: - bot_ok = next((r for t, r in bot_results if t == tid), False) - app_ok = next((r for t, r in app_results if t == tid), False) - if bot_ok: - logger.info("SLACK_BOT_TOKEN_%s is set", tid) - else: - logger.warning("SLACK_BOT_TOKEN_%s is not set", tid) - if app_ok: - logger.info("SLACK_APP_TOKEN_%s is set", tid) - else: - logger.warning("SLACK_APP_TOKEN_%s is not set", tid) - if not team_ids: - logger.warning( - "No teams configured (set SLACK_TEAM_IDS and SLACK_BOT_TOKEN_)" - ) - logger.info("Would start unified Slack Event Handler (Socket Mode).") - return - - logger.info("Starting unified Slack Event Handler...") - try: - from slack_event_handler.runner import 
run_slack_event_handler - - run_slack_event_handler() - except KeyboardInterrupt: - logger.info("Stopped by user (Ctrl+C).") - except Exception as e: - logger.exception("run_slack_event_handler: %s", e) - raise diff --git a/slack_event_handler/migrations/__init__.py b/slack_event_handler/migrations/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/slack_event_handler/models.py b/slack_event_handler/models.py deleted file mode 100644 index 29e4e4a7..00000000 --- a/slack_event_handler/models.py +++ /dev/null @@ -1 +0,0 @@ -# No ORM models — all state is stored as flat JSON files in the workspace directory. diff --git a/slack_event_handler/runner.py b/slack_event_handler/runner.py deleted file mode 100644 index b9e3fb6a..00000000 --- a/slack_event_handler/runner.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -Slack Event Handler runner. -Runs the unified Slack listener (huddle transcript tracking + Slack PR comment bot). -Supports multiple teams: one listener per team in SLACK_BOT_TOKEN, each in its own thread. -""" - -import logging -import threading - -from django.conf import settings - -from config.workspace import WORKSPACE_PATH_SETUP_ERRORS -from slack_event_handler.workspace import get_workspace_root -from core.operations.slack_ops import get_slack_app_token - -logger = logging.getLogger(__name__) - - -def run_slack_event_handler(bot_token=None, app_token=None): - """ - Main entry point for the unified Slack Event Handler. - If multiple teams are configured (SLACK_TEAM_IDS + SLACK_BOT_TOKEN_), starts one - listener per team in a separate thread. Otherwise uses default team key (single/first in SLACK_TEAM_IDS). - """ - # Best-effort debug log only; matches exceptions from config.workspace.get_workspace_path. 
- try: - root = get_workspace_root() - logger.debug("Slack Event Handler workspace root: %s", root) - except WORKSPACE_PATH_SETUP_ERRORS as e: - logger.exception("Failed to resolve workspace root: %s", e) - - tokens_map = getattr(settings, "SLACK_BOT_TOKEN", None) or {} - if not isinstance(tokens_map, dict): - tokens_map = {} - - if tokens_map: - # Multiple (or single) teams from SLACK_TEAM_IDS + SLACK_BOT_TOKEN_ and SLACK_APP_TOKEN_ - from slack_event_handler.utils.slack_listener import start_slack_listener - - listeners = [] - started = 0 - for team_id, token in tokens_map.items(): - token = (token or "").strip() - if not token: - continue - try: - team_app_token = (app_token or "").strip() or get_slack_app_token( - team_id - ) - except ValueError: - logger.warning( - "Skipping team %s: SLACK_APP_TOKEN_%s not set in .env", - team_id, - team_id, - ) - continue - logger.info("Starting Slack Event Listener for team=%s", team_id) - t = threading.Thread( - target=start_slack_listener, - kwargs={ - "bot_token": token, - "app_token": team_app_token, - "team_id": team_id, - }, - daemon=True, - name=f"slack-listener-{team_id}", - ) - t.start() - listeners.append(t) - started += 1 - if started == 0: - logger.error( - "No valid team with both SLACK_BOT_TOKEN_ and SLACK_APP_TOKEN_ in .env" - ) - else: - for t in listeners: - t.join() - else: - msg = ( - "No teams configured. Set SLACK_TEAM_IDS and SLACK_BOT_TOKEN_ " - "(and SLACK_APP_TOKEN_) in .env." 
- ) - logger.error("%s", msg) - return diff --git a/slack_event_handler/tests/__init__.py b/slack_event_handler/tests/__init__.py deleted file mode 100644 index 694b7f15..00000000 --- a/slack_event_handler/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Tests for slack_event_handler diff --git a/slack_event_handler/tests/conftest.py b/slack_event_handler/tests/conftest.py deleted file mode 100644 index d11da96f..00000000 --- a/slack_event_handler/tests/conftest.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Slack event handler test fixtures.""" - -import sys -from types import ModuleType -from unittest.mock import patch - -import pytest - - -class ImmediateThread: - """Drop-in threading.Thread replacement that runs target synchronously on start().""" - - def __init__( - self, - group=None, - target=None, - name=None, - args=(), - kwargs=None, - *, - daemon=None, - ): - self._target = target - self._args = args - - def start(self): - if self._target: - self._target(*self._args) - - -@pytest.fixture -def fake_slack_bolt(): - """Minimal slack_bolt package in sys.modules so slack_listener can import.""" - for key in list(sys.modules): - if key == "slack_event_handler.utils.slack_listener" or key.startswith( - "slack_event_handler.utils.slack_listener." 
- ): - sys.modules.pop(key, None) - - socket_mode = ModuleType("slack_bolt.adapter.socket_mode") - adapter = ModuleType("slack_bolt.adapter") - bolt = ModuleType("slack_bolt") - - class _DummyApp: - def __init__(self, *args, **kwargs): - pass - - socket_mode.SocketModeHandler = lambda *a, **k: None - adapter.socket_mode = socket_mode - bolt.App = _DummyApp - - with patch.dict( - sys.modules, - { - "slack_bolt": bolt, - "slack_bolt.adapter": adapter, - "slack_bolt.adapter.socket_mode": socket_mode, - }, - ): - yield diff --git a/slack_event_handler/tests/test_apps.py b/slack_event_handler/tests/test_apps.py deleted file mode 100644 index c0b3eb33..00000000 --- a/slack_event_handler/tests/test_apps.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Tests for slack_event_handler.apps.""" - -from __future__ import annotations - -from importlib import import_module -from unittest.mock import MagicMock, patch - -from slack_event_handler.apps import SlackEventHandlerConfig - - -def _config() -> SlackEventHandlerConfig: - mod = import_module("slack_event_handler") - return SlackEventHandlerConfig("slack_event_handler", mod) - - -def test_ready_returns_early_when_not_runserver(): - cfg = _config() - with patch("slack_event_handler.apps.sys.argv", ["manage.py", "migrate"]): - cfg.ready() - - -def test_ready_returns_early_when_run_main_not_true(monkeypatch): - monkeypatch.delenv("RUN_MAIN", raising=False) - cfg = _config() - with patch("slack_event_handler.apps.sys.argv", ["manage.py", "runserver"]): - cfg.ready() - - -def test_ready_starts_daemon_thread_when_runserver_child(monkeypatch): - monkeypatch.setenv("RUN_MAIN", "true") - cfg = _config() - mock_thread = MagicMock() - with patch("slack_event_handler.apps.sys.argv", ["manage.py", "runserver"]): - with patch( - "slack_event_handler.apps.threading.Thread", return_value=mock_thread - ) as mock_tc: - cfg.ready() - mock_tc.assert_called_once() - assert mock_tc.call_args.kwargs["daemon"] is True - assert mock_tc.call_args.kwargs["name"] 
== "slack-event-handler" - mock_thread.start.assert_called_once() - - inner_target = mock_tc.call_args.kwargs["target"] - with patch("slack_event_handler.runner.run_slack_event_handler") as run_handler: - inner_target() - run_handler.assert_called_once() diff --git a/slack_event_handler/tests/test_extract_slack_tokens_command.py b/slack_event_handler/tests/test_extract_slack_tokens_command.py deleted file mode 100644 index 5fe65dc0..00000000 --- a/slack_event_handler/tests/test_extract_slack_tokens_command.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Tests for extract_slack_tokens management command.""" - -from io import StringIO -from unittest.mock import patch - -import pytest -from django.core.management import call_command -from django.core.management.base import CommandError - - -@patch( - "slack_event_handler.management.commands.extract_slack_tokens.extract_and_save_slack_internal_tokens", - return_value=("xc", "xd"), -) -@patch( - "slack_event_handler.management.commands.extract_slack_tokens._resolve_chrome_profile_root", -) -def test_extract_slack_tokens_command_success( - mock_resolve_profile, mock_extract_and_save, tmp_path -): - profile = tmp_path / "chrome_profile" - profile.mkdir() - mock_resolve_profile.return_value = profile - out = StringIO() - call_command("extract_slack_tokens", "--team-id=T1", stdout=out) - mock_extract_and_save.assert_called_once_with("T1") - assert "Saved session credentials" in out.getvalue() - - -@patch( - "slack_event_handler.management.commands.extract_slack_tokens.extract_and_save_slack_internal_tokens", - return_value=None, -) -@patch( - "slack_event_handler.management.commands.extract_slack_tokens._resolve_chrome_profile_root", -) -def test_extract_slack_tokens_command_failure( - mock_resolve_profile, mock_extract_and_save, tmp_path -): - profile = tmp_path / "chrome_profile" - profile.mkdir() - mock_resolve_profile.return_value = profile - with pytest.raises(CommandError, match="Failed to load session credentials"): - 
call_command("extract_slack_tokens", "--team-id=T1") - mock_extract_and_save.assert_called_once_with("T1") - - -def test_extract_slack_tokens_command_missing_profile(settings, tmp_path): - settings.CHROME_PROFILE_PATH = str(tmp_path / "missing_profile") - with pytest.raises(CommandError, match="Session storage not found"): - call_command("extract_slack_tokens", "--team-id=T21Q22G66") diff --git a/slack_event_handler/tests/test_github_pr_client.py b/slack_event_handler/tests/test_github_pr_client.py deleted file mode 100644 index d8c6d93e..00000000 --- a/slack_event_handler/tests/test_github_pr_client.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Tests for slack_event_handler.utils.github_pr_client.""" - -from unittest.mock import MagicMock, patch - -import pytest - -import slack_event_handler.utils.github_pr_client as gh_client - - -@pytest.fixture(autouse=True) -def reset_github_singleton(): - gh_client._gh = None - yield - gh_client._gh = None - - -@pytest.mark.django_db -def test_get_client_requires_token(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "" - gh_client._gh = None - with pytest.raises(ValueError, match="SLACK_PR_BOT_GITHUB_TOKEN"): - gh_client._get_client() - - -@pytest.mark.django_db -def test_post_pr_comment_uses_template_and_returns(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "ghp_secret" - settings.SLACK_PR_BOT_COMMENT_TEMPLATE = "Hello from test" - - mock_pull = MagicMock() - mock_repo = MagicMock() - mock_repo.get_pull.return_value = mock_pull - mock_gh = MagicMock() - mock_gh.get_repo.return_value = mock_repo - - with patch.object(gh_client, "Github", return_value=mock_gh): - gh_client._gh = None - gh_client.post_pr_comment("o", "r", 99) - - mock_repo.get_pull.assert_called_once_with(99) - mock_pull.create_issue_comment.assert_called_once_with("Hello from test") - - -@pytest.mark.django_db -def test_post_pr_comment_retries_then_raises(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "tok" - from github.GithubException import GithubException 
- - mock_pull = MagicMock() - mock_pull.create_issue_comment.side_effect = GithubException(500, "fail", {}) - mock_repo = MagicMock() - mock_repo.get_pull.return_value = mock_pull - mock_gh = MagicMock() - mock_gh.get_repo.return_value = mock_repo - - with patch.object(gh_client, "Github", return_value=mock_gh): - gh_client._gh = None - with patch.object(gh_client.time, "sleep"): - with pytest.raises(GithubException): - gh_client.post_pr_comment("a", "b", 1) - - assert mock_pull.create_issue_comment.call_count == gh_client.MAX_RETRIES diff --git a/slack_event_handler/tests/test_huddle_markdown.py b/slack_event_handler/tests/test_huddle_markdown.py deleted file mode 100644 index e0f02bf7..00000000 --- a/slack_event_handler/tests/test_huddle_markdown.py +++ /dev/null @@ -1,90 +0,0 @@ -"""Tests for slack_event_handler.utils.huddle_markdown.""" - -from pathlib import Path -from unittest.mock import MagicMock, patch - -from slack_event_handler.utils.huddle_markdown import generate_huddle_markdown - - -def test_generate_huddle_markdown_missing_html_returns_none(tmp_path): - missing_html = tmp_path / "nope.html" - json_path = tmp_path / "t.json" - json_path.write_text("{}", encoding="utf-8") - assert ( - generate_huddle_markdown(str(missing_html), str(json_path), str(tmp_path)) - is None - ) - - -def test_generate_huddle_markdown_invalid_json_returns_none(tmp_path): - html = tmp_path / "s.html" - html.write_text("x", encoding="utf-8") - bad_json = tmp_path / "bad.json" - bad_json.write_text("{not json", encoding="utf-8") - assert generate_huddle_markdown(str(html), str(bad_json), str(tmp_path)) is None - - -@patch("slack_event_handler.utils.huddle_markdown.write_huddle_transcript_md") -@patch("slack_event_handler.utils.huddle_markdown.replace_channel_ids_with_names") -@patch("slack_event_handler.utils.huddle_markdown.replace_user_ids_with_usernames") -@patch("slack_event_handler.utils.huddle_markdown.html_to_markdown") 
-@patch("slack_event_handler.utils.huddle_markdown.generate_transcript_from_json") -@patch("slack_event_handler.utils.huddle_markdown.SlackFetcher") -@patch("slack_event_handler.utils.huddle_markdown.parse_html_summary") -def test_generate_huddle_markdown_success_path( - mock_parse, - mock_fetcher_cls, - mock_gen_tx, - mock_html_md, - mock_replace_u, - mock_replace_c, - mock_write, - tmp_path, -): - html = tmp_path / "p.html" - html.write_text("@UZZZTOP", encoding="utf-8") - js = tmp_path / "p.json" - js.write_text('{"messages": []}', encoding="utf-8") - - mock_parse.return_value = {"channel_id": "C9", "attendee_ids": ["UA"]} - fetcher = MagicMock() - fetcher.get_channel_info.return_value = "general" - fetcher.get_user_info.return_value = {"display_name": "Someone"} - mock_fetcher_cls.return_value = fetcher - mock_gen_tx.return_value = [{"user_id": "UB", "text": "hi"}] - mock_html_md.return_value = "## Hi\n# Title\n" - # replace_* must return str; default MagicMock breaks re.sub below. - mock_replace_u.return_value = "## Hi\n# Title\n" - mock_replace_c.return_value = "## Hi\n# Title\n" - mock_write.return_value = Path(tmp_path / "out.md") - - out = generate_huddle_markdown(str(html), str(js), str(tmp_path), bot_token="x") - - assert out is not None - assert Path(out).resolve() == (tmp_path / "out.md").resolve() - mock_replace_u.assert_called_once() - mock_replace_c.assert_called_once() - - -@patch( - "slack_event_handler.utils.huddle_markdown.SlackFetcher", - side_effect=ValueError("bad"), -) -def test_generate_huddle_markdown_fetcher_init_fails(_mock_sf, tmp_path): - html = tmp_path / "a.html" - html.write_text("", encoding="utf-8") - js = tmp_path / "a.json" - js.write_text("{}", encoding="utf-8") - assert generate_huddle_markdown(str(html), str(js), str(tmp_path)) is None - - -@patch("slack_event_handler.utils.huddle_markdown.open", side_effect=OSError("no")) -def test_generate_huddle_markdown_html_read_error(_mock_open, tmp_path): - assert ( - 
generate_huddle_markdown( - str(tmp_path / "missing.html"), - str(tmp_path / "x.json"), - str(tmp_path), - ) - is None - ) diff --git a/slack_event_handler/tests/test_huddle_processor.py b/slack_event_handler/tests/test_huddle_processor.py deleted file mode 100644 index 395c8e55..00000000 --- a/slack_event_handler/tests/test_huddle_processor.py +++ /dev/null @@ -1,231 +0,0 @@ -"""Tests for slack_event_handler.utils.huddle_processor.""" - -from unittest.mock import MagicMock, patch - -import pytest - -from slack_event_handler.utils import huddle_processor - - -@pytest.mark.django_db -def test_process_huddle_canvas_fetch_fails(): - with patch( - "slack_event_handler.utils.huddle_processor.fetch_huddle_transcript", - return_value=None, - ): - out = huddle_processor.process_huddle_canvas("F123") - assert out == {"success": False} - - -@pytest.mark.django_db -def test_process_huddle_canvas_not_ok(): - with patch( - "slack_event_handler.utils.huddle_processor.fetch_huddle_transcript", - return_value={"ok": False}, - ): - out = huddle_processor.process_huddle_canvas("F123") - assert out == {"success": False} - - -@pytest.mark.django_db -def test_process_huddle_canvas_no_download_url(): - with patch( - "slack_event_handler.utils.huddle_processor.fetch_huddle_transcript", - return_value={"ok": True, "file": {}}, - ): - out = huddle_processor.process_huddle_canvas("F123") - assert out == {"success": False} - - -@pytest.mark.django_db -def test_process_huddle_canvas_fetcher_init_fails(tmp_path): - with patch( - "slack_event_handler.utils.huddle_processor.fetch_huddle_transcript", - return_value={ - "ok": True, - "file": {"url_private_download": "https://x", "name": "a.html"}, - }, - ): - with patch( - "slack_event_handler.utils.huddle_processor.get_data_dir", - return_value=tmp_path, - ): - with patch( - "slack_event_handler.utils.huddle_processor.SlackFetcher", - side_effect=ValueError("no token"), - ): - out = huddle_processor.process_huddle_canvas("F999") - assert out == 
{"success": False} - - -@pytest.mark.django_db -def test_process_huddle_canvas_download_fails(tmp_path): - mock_fetcher = MagicMock() - mock_fetcher.download_file.return_value = None - with patch( - "slack_event_handler.utils.huddle_processor.fetch_huddle_transcript", - return_value={ - "ok": True, - "file": {"url_private_download": "https://x", "name": "a.html"}, - }, - ): - with patch( - "slack_event_handler.utils.huddle_processor.get_data_dir", - return_value=tmp_path, - ): - with patch( - "slack_event_handler.utils.huddle_processor.SlackFetcher", - return_value=mock_fetcher, - ): - out = huddle_processor.process_huddle_canvas("F1") - assert out == {"success": False} - - -@pytest.mark.django_db -def test_process_huddle_canvas_markdown_fails(tmp_path): - mock_fetcher = MagicMock() - mock_fetcher.download_file.return_value = str(tmp_path / "a.html") - (tmp_path / "a.html").write_text("x", encoding="utf-8") - - with patch( - "slack_event_handler.utils.huddle_processor.fetch_huddle_transcript", - return_value={ - "ok": True, - "file": {"url_private_download": "https://x", "name": "a.html"}, - }, - ): - with patch( - "slack_event_handler.utils.huddle_processor.get_data_dir", - return_value=tmp_path, - ): - with patch( - "slack_event_handler.utils.huddle_processor.SlackFetcher", - return_value=mock_fetcher, - ): - with patch( - "slack_event_handler.utils.huddle_processor.generate_huddle_markdown", - return_value=None, - ): - out = huddle_processor.process_huddle_canvas("F1") - assert out == {"success": False} - - -@pytest.mark.django_db -def test_process_huddle_canvas_missing_github_repo_settings(tmp_path, settings): - settings.GITHUB_SLACK_HUDDLE_REPO_OWNER = "" - settings.GITHUB_SLACK_HUDDLE_REPO_NAME = "" - - mock_fetcher = MagicMock() - html = tmp_path / "F2" / "a.html" - html.parent.mkdir(parents=True) - html.write_text("", encoding="utf-8") - mock_fetcher.download_file.return_value = str(html) - - with patch( - 
"slack_event_handler.utils.huddle_processor.fetch_huddle_transcript", - return_value={ - "ok": True, - "file": {"url_private_download": "https://x", "name": "a.html"}, - }, - ): - with patch( - "slack_event_handler.utils.huddle_processor.get_data_dir", - return_value=tmp_path, - ): - with patch( - "slack_event_handler.utils.huddle_processor.SlackFetcher", - return_value=mock_fetcher, - ): - with patch( - "slack_event_handler.utils.huddle_processor.generate_huddle_markdown", - return_value=str(tmp_path / "out.md"), - ): - (tmp_path / "out.md").write_text("md", encoding="utf-8") - out = huddle_processor.process_huddle_canvas("F2") - assert out == {"success": False} - - -@pytest.mark.django_db -def test_process_huddle_canvas_upload_fail(tmp_path, settings): - settings.GITHUB_SLACK_HUDDLE_REPO_OWNER = "o" - settings.GITHUB_SLACK_HUDDLE_REPO_NAME = "r" - settings.GITHUB_DEFAULT_BRANCH = "main" - - mock_fetcher = MagicMock() - work = tmp_path / "F3" - work.mkdir(parents=True) - html = work / "a.html" - html.write_text("", encoding="utf-8") - mock_fetcher.download_file.return_value = str(html) - md_path = work / "t.md" - md_path.write_text("md", encoding="utf-8") - - with patch( - "slack_event_handler.utils.huddle_processor.fetch_huddle_transcript", - return_value={ - "ok": True, - "file": {"url_private_download": "https://x", "name": "a.html"}, - }, - ): - with patch( - "slack_event_handler.utils.huddle_processor.get_data_dir", - return_value=tmp_path, - ): - with patch( - "slack_event_handler.utils.huddle_processor.SlackFetcher", - return_value=mock_fetcher, - ): - with patch( - "slack_event_handler.utils.huddle_processor.generate_huddle_markdown", - return_value=str(md_path), - ): - with patch( - "slack_event_handler.utils.huddle_processor.upload_file", - return_value=False, - ): - out = huddle_processor.process_huddle_canvas("F3") - assert out == {"success": False} - - -@pytest.mark.django_db -def test_process_huddle_canvas_success(tmp_path, settings): - 
settings.GITHUB_SLACK_HUDDLE_REPO_OWNER = "acme" - settings.GITHUB_SLACK_HUDDLE_REPO_NAME = "repo" - settings.GITHUB_DEFAULT_BRANCH = "develop" - - mock_fetcher = MagicMock() - work = tmp_path / "F4" - work.mkdir(parents=True) - html = work / "h.html" - html.write_text("", encoding="utf-8") - mock_fetcher.download_file.return_value = str(html) - md_path = work / "doc.md" - md_path.write_text("md", encoding="utf-8") - - with patch( - "slack_event_handler.utils.huddle_processor.fetch_huddle_transcript", - return_value={ - "ok": True, - "file": {"url_private_download": "https://x", "name": "h.html"}, - }, - ): - with patch( - "slack_event_handler.utils.huddle_processor.get_data_dir", - return_value=tmp_path, - ): - with patch( - "slack_event_handler.utils.huddle_processor.SlackFetcher", - return_value=mock_fetcher, - ): - with patch( - "slack_event_handler.utils.huddle_processor.generate_huddle_markdown", - return_value=str(md_path), - ): - with patch( - "slack_event_handler.utils.huddle_processor.upload_file", - return_value=True, - ): - out = huddle_processor.process_huddle_canvas("F4") - - assert out["success"] is True - assert "github.com/acme/repo/blob/develop/" in out["github_url"] diff --git a/slack_event_handler/tests/test_job_queue.py b/slack_event_handler/tests/test_job_queue.py deleted file mode 100644 index b4f1897f..00000000 --- a/slack_event_handler/tests/test_job_queue.py +++ /dev/null @@ -1,494 +0,0 @@ -"""Tests for slack_event_handler.utils.job_queue.""" - -import threading -from contextlib import contextmanager -from unittest.mock import MagicMock, patch - -import pytest - -from slack_event_handler.utils import job_queue -from slack_event_handler.utils.rate_limiter import record_posted -from slack_event_handler.utils.state import load_state - - -@pytest.fixture(autouse=True) -def reset_job_queue_globals(): - job_queue._runtime.clear() - yield - job_queue._runtime.clear() - - -@pytest.mark.django_db -def test_enqueue_job_persists_to_state(settings, 
tmp_path): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 5 - path = tmp_path / "state.json" - with patch( - "slack_event_handler.utils.state._get_state_file_path", - return_value=str(path), - ): - job = job_queue.enqueue_job( - owner="o", - repo="r", - pull_number=1, - channel="C1", - message_ts="1.0", - user_id="U1", - is_dm=False, - team_id="T9", - ) - assert job[job_queue.KEY_OWNER] == "o" - assert job[job_queue.KEY_TEAM_ID] == "T9" - - -@pytest.mark.django_db -def test_estimated_delay_sec_zero_when_empty_queue(settings, tmp_path): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 5 - settings.SLACK_PR_BOT_COMMENTS_WINDOW_SECONDS = 3600 - path = tmp_path / "state.json" - empty = {"postedAt": [], "queue": []} - import json - - path.write_text(json.dumps(empty)) - - with patch( - "slack_event_handler.utils.state._get_state_file_path", - return_value=str(path), - ): - assert job_queue.estimated_delay_sec("T1") == 0 - - -@pytest.mark.django_db -def test_set_slack_app_registers_team(): - app = MagicMock() - job_queue.set_slack_app(app, "T1") - assert job_queue._runtime.get_app("T1") is app - - -@pytest.mark.django_db -def test_send_reply_no_app_no_crash(): - job_queue._send_reply("T1", "C", "1.0", False, "hi") - - -@pytest.mark.django_db -def test_process_job_posts_and_replies(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "tok" - - mock_app = MagicMock() - job_queue.set_slack_app(mock_app, "T1") - - job = { - job_queue.KEY_JOB_ID: "jid", - job_queue.KEY_TEAM_ID: "T1", - job_queue.KEY_OWNER: "o", - job_queue.KEY_REPO: "r", - job_queue.KEY_PULL_NUMBER: 3, - job_queue.KEY_CHANNEL: "C1", - job_queue.KEY_MESSAGE_TS: "9.9", - job_queue.KEY_USER_ID: "U1", - job_queue.KEY_IS_DM: False, - } - - with patch("slack_event_handler.utils.job_queue.wait_and_reserve_slot"): - with patch("slack_event_handler.utils.job_queue.post_pr_comment"): - job_queue._process_job(job) - - mock_app.client.chat_postMessage.assert_called_once() - 
mock_app.client.reactions_add.assert_called_once() - - -@pytest.mark.django_db -def test_process_job_reactions_already_reacted_swallows(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "tok" - mock_app = MagicMock() - mock_app.client.reactions_add.side_effect = Exception( - "error already_reacted something" - ) - job_queue.set_slack_app(mock_app, "T1") - - job = { - job_queue.KEY_JOB_ID: "j", - job_queue.KEY_TEAM_ID: "T1", - job_queue.KEY_OWNER: "o", - job_queue.KEY_REPO: "r", - job_queue.KEY_PULL_NUMBER: 1, - job_queue.KEY_CHANNEL: "C", - job_queue.KEY_MESSAGE_TS: "t", - job_queue.KEY_USER_ID: "U", - job_queue.KEY_IS_DM: False, - } - - with patch("slack_event_handler.utils.job_queue.wait_and_reserve_slot"): - with patch("slack_event_handler.utils.job_queue.post_pr_comment"): - job_queue._process_job(job) - - -@pytest.mark.django_db -def test_process_job_reactions_other_error_raises(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "tok" - mock_app = MagicMock() - mock_app.client.reactions_add.side_effect = RuntimeError("boom") - job_queue.set_slack_app(mock_app, "T1") - - job = { - job_queue.KEY_JOB_ID: "j", - job_queue.KEY_TEAM_ID: "T1", - job_queue.KEY_OWNER: "o", - job_queue.KEY_REPO: "r", - job_queue.KEY_PULL_NUMBER: 1, - job_queue.KEY_CHANNEL: "C", - job_queue.KEY_MESSAGE_TS: "t", - job_queue.KEY_USER_ID: "U", - job_queue.KEY_IS_DM: False, - } - - with patch("slack_event_handler.utils.job_queue.wait_and_reserve_slot"): - with patch("slack_event_handler.utils.job_queue.post_pr_comment"): - with pytest.raises(RuntimeError, match="boom"): - job_queue._process_job(job) - - -@pytest.mark.django_db -def test_process_job_clears_busy_after_slot_reserved(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "tok" - job_queue.set_slack_app(MagicMock(), "T1") - job = { - job_queue.KEY_JOB_ID: "jid", - job_queue.KEY_TEAM_ID: "T1", - job_queue.KEY_OWNER: "o", - job_queue.KEY_REPO: "r", - job_queue.KEY_PULL_NUMBER: 1, - job_queue.KEY_CHANNEL: "C1", - job_queue.KEY_MESSAGE_TS: 
"9.9", - job_queue.KEY_USER_ID: "U1", - job_queue.KEY_IS_DM: False, - } - - with patch("slack_event_handler.utils.job_queue.wait_and_reserve_slot"): - with patch("slack_event_handler.utils.job_queue.post_pr_comment"): - job_queue._process_job(job) - - assert not job_queue._runtime.is_busy("T1") - - -@pytest.mark.django_db -def test_estimated_delay_sec_nonzero_with_jobs_ahead(settings, tmp_path): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 2 - settings.SLACK_PR_BOT_COMMENTS_WINDOW_SECONDS = 100 - path = tmp_path / "state.json" - import json - - now = 1000.0 - state = { - "postedAt": [now - 10, now - 5], - "queue": [{"id": "a"}, {"id": "b"}, {"id": "c"}], - } - path.write_text(json.dumps(state)) - - with patch( - "slack_event_handler.utils.state._get_state_file_path", - return_value=str(path), - ): - with patch("slack_event_handler.utils.job_queue.time.time", return_value=now): - d = job_queue.estimated_delay_sec("T9") - assert d >= 0 - - -@pytest.mark.django_db -def test_send_reply_chat_post_message_failure_logs(settings): - mock_app = MagicMock() - mock_app.client.chat_postMessage.side_effect = OSError("network") - job_queue.set_slack_app(mock_app, "T1") - job_queue._send_reply("T1", "C1", "9.9", False, "hi") - mock_app.client.chat_postMessage.assert_called_once() - - -@pytest.mark.django_db -def test_process_job_dm_uses_chat_post_message_without_thread_ts(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "tok" - mock_app = MagicMock() - job_queue.set_slack_app(mock_app, "T1") - - job = { - job_queue.KEY_JOB_ID: "jid", - job_queue.KEY_TEAM_ID: "T1", - job_queue.KEY_OWNER: "o", - job_queue.KEY_REPO: "r", - job_queue.KEY_PULL_NUMBER: 3, - job_queue.KEY_CHANNEL: "D1", - job_queue.KEY_MESSAGE_TS: "9.9", - job_queue.KEY_USER_ID: "U1", - job_queue.KEY_IS_DM: True, - } - - with patch("slack_event_handler.utils.job_queue.wait_and_reserve_slot"): - with patch("slack_event_handler.utils.job_queue.post_pr_comment"): - job_queue._process_job(job) - - kwargs = 
mock_app.client.chat_postMessage.call_args.kwargs - assert "thread_ts" not in kwargs - - -@pytest.mark.django_db -def test_process_job_skips_reaction_when_team_id_none(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "tok" - mock_app = MagicMock() - job_queue.set_slack_app(mock_app, "TX") - - job = { - job_queue.KEY_JOB_ID: "jid", - job_queue.KEY_TEAM_ID: None, - job_queue.KEY_OWNER: "o", - job_queue.KEY_REPO: "r", - job_queue.KEY_PULL_NUMBER: 3, - job_queue.KEY_CHANNEL: "C1", - job_queue.KEY_MESSAGE_TS: "9.9", - job_queue.KEY_USER_ID: "U1", - job_queue.KEY_IS_DM: False, - } - - with patch("slack_event_handler.utils.job_queue.wait_and_reserve_slot"): - with patch("slack_event_handler.utils.job_queue.post_pr_comment"): - job_queue._process_job(job) - - mock_app.client.reactions_add.assert_not_called() - - -@pytest.mark.django_db -def test_process_job_logs_when_rate_limited(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "tok" - mock_app = MagicMock() - job_queue.set_slack_app(mock_app, "T1") - - job = { - job_queue.KEY_JOB_ID: "jid", - job_queue.KEY_TEAM_ID: "T1", - job_queue.KEY_OWNER: "o", - job_queue.KEY_REPO: "r", - job_queue.KEY_PULL_NUMBER: 3, - job_queue.KEY_CHANNEL: "C1", - job_queue.KEY_MESSAGE_TS: "9.9", - job_queue.KEY_USER_ID: "U1", - job_queue.KEY_IS_DM: False, - } - - with patch("slack_event_handler.utils.job_queue.compute_delay", return_value=5.0): - with patch("slack_event_handler.utils.job_queue.wait_and_reserve_slot"): - with patch("slack_event_handler.utils.job_queue.post_pr_comment"): - with patch("slack_event_handler.utils.job_queue.logger") as log: - job_queue._process_job(job) - assert log.debug.called - - -@pytest.mark.django_db -def test_worker_processes_job_then_exits_on_sleep(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "tok" - job_queue.set_slack_app(MagicMock(), "T1") - - job = { - job_queue.KEY_JOB_ID: "jid", - job_queue.KEY_TEAM_ID: "T1", - job_queue.KEY_OWNER: "o", - job_queue.KEY_REPO: "r", - job_queue.KEY_PULL_NUMBER: 3, - 
job_queue.KEY_CHANNEL: "C1", - job_queue.KEY_MESSAGE_TS: "9.9", - job_queue.KEY_USER_ID: "U1", - job_queue.KEY_IS_DM: False, - } - - loads = [ - {"queue": [job], "postedAt": []}, - {"queue": [], "postedAt": []}, - ] - - @contextmanager - def fake_modify(team_id=None): - state = loads.pop(0) if loads else {"queue": [], "postedAt": []} - yield state - - def sleep_side_effect(_sec): - raise RuntimeError("stop_worker_loop") - - load_peeks = [ - {"queue": [job], "postedAt": []}, - {"queue": [], "postedAt": []}, - ] - - def load_side_effect(team_id=None): - if load_peeks: - return load_peeks.pop(0) - return {"queue": [], "postedAt": []} - - with patch.object(job_queue, "modify_state", fake_modify): - with patch.object(job_queue, "load_state", side_effect=load_side_effect): - with patch.object(job_queue, "wait_and_reserve_slot"): - with patch.object(job_queue, "post_pr_comment"): - with patch.object( - job_queue.time, "sleep", side_effect=sleep_side_effect - ): - with pytest.raises(RuntimeError, match="stop_worker_loop"): - job_queue._worker("T1") - - -@pytest.mark.django_db -def test_worker_process_job_failure_sends_error_reply(settings): - settings.SLACK_PR_BOT_GITHUB_TOKEN = "tok" - mock_app = MagicMock() - job_queue.set_slack_app(mock_app, "T1") - - job = { - job_queue.KEY_JOB_ID: "jid", - job_queue.KEY_TEAM_ID: "T1", - job_queue.KEY_OWNER: "o", - job_queue.KEY_REPO: "r", - job_queue.KEY_PULL_NUMBER: 3, - job_queue.KEY_CHANNEL: "C1", - job_queue.KEY_MESSAGE_TS: "9.9", - job_queue.KEY_USER_ID: "U1", - job_queue.KEY_IS_DM: False, - } - - loads = [ - {"queue": [job], "postedAt": []}, - {"queue": [], "postedAt": []}, - ] - - @contextmanager - def fake_modify(team_id=None): - state = loads.pop(0) if loads else {"queue": [], "postedAt": []} - yield state - - def sleep_side_effect(_sec): - raise RuntimeError("stop_worker_loop") - - load_peeks = [ - {"queue": [job], "postedAt": []}, - {"queue": [], "postedAt": []}, - ] - - def load_side_effect(team_id=None): - if load_peeks: 
- return load_peeks.pop(0) - return {"queue": [], "postedAt": []} - - with patch.object(job_queue, "modify_state", fake_modify): - with patch.object(job_queue, "load_state", side_effect=load_side_effect): - with patch.object( - job_queue, "post_pr_comment", side_effect=RuntimeError("gh") - ): - with patch.object(job_queue, "wait_and_reserve_slot"): - with patch.object( - job_queue.time, "sleep", side_effect=sleep_side_effect - ): - with pytest.raises(RuntimeError, match="stop_worker_loop"): - job_queue._worker("T1") - - texts = [ - (ca.kwargs.get("text") or "") - for ca in mock_app.client.chat_postMessage.call_args_list - ] - assert any("Could not post" in t for t in texts) - - -@pytest.mark.django_db -def test_concurrent_enqueue_preserves_all_jobs(settings, tmp_path): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 5 - path = tmp_path / "state_T9.json" - n = 30 - barrier = threading.Barrier(n) - job_ids: list[str] = [] - errors: list[BaseException] = [] - lock = threading.Lock() - - def worker(): - try: - barrier.wait(timeout=10) - job = job_queue.enqueue_job( - owner="o", - repo="r", - pull_number=1, - channel="C1", - message_ts="1.0", - user_id="U1", - is_dm=False, - team_id="T9", - ) - with lock: - job_ids.append(job[job_queue.KEY_JOB_ID]) - except BaseException as e: - with lock: - errors.append(e) - - with patch( - "slack_event_handler.utils.state._get_state_file_path", - return_value=str(path), - ): - threads = [threading.Thread(target=worker) for _ in range(n)] - for t in threads: - t.start() - for t in threads: - t.join(timeout=30) - assert all(not t.is_alive() for t in threads) - - assert not errors - loaded = load_state("T9") - assert len(loaded["queue"]) == n - assert len(set(job_ids)) == n - - -@pytest.mark.django_db -def test_concurrent_enqueue_and_record_posted(settings, tmp_path): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 10 - path = tmp_path / "state_T9.json" - n_enqueue = 15 - n_posted = 15 - n_total = n_enqueue + n_posted - barrier = 
threading.Barrier(n_total) - errors: list[BaseException] = [] - lock = threading.Lock() - - def enqueue_worker(): - try: - barrier.wait(timeout=10) - job_queue.enqueue_job( - owner="o", - repo="r", - pull_number=1, - channel="C1", - message_ts="1.0", - user_id="U1", - team_id="T9", - ) - except BaseException as e: - with lock: - errors.append(e) - - def posted_worker(): - try: - barrier.wait(timeout=10) - record_posted("T9") - except BaseException as e: - with lock: - errors.append(e) - - with patch( - "slack_event_handler.utils.state._get_state_file_path", - return_value=str(path), - ): - with patch( - "slack_event_handler.utils.rate_limiter.time.time", return_value=42.0 - ): - threads = [ - threading.Thread(target=enqueue_worker) for _ in range(n_enqueue) - ] - threads += [threading.Thread(target=posted_worker) for _ in range(n_posted)] - for t in threads: - t.start() - for t in threads: - t.join(timeout=30) - assert all(not t.is_alive() for t in threads) - - assert not errors - loaded = load_state("T9") - assert len(loaded["queue"]) == n_enqueue - assert len(loaded["postedAt"]) == n_posted diff --git a/slack_event_handler/tests/test_management_command.py b/slack_event_handler/tests/test_management_command.py deleted file mode 100644 index 44b21d68..00000000 --- a/slack_event_handler/tests/test_management_command.py +++ /dev/null @@ -1,86 +0,0 @@ -"""Tests for run_slack_event_handler management command.""" - -from io import StringIO -from unittest.mock import patch - -import pytest -from django.core.management import call_command - - -@pytest.mark.django_db -def test_command_dry_run_validates_tokens(settings): - settings.SLACK_BOT_TOKEN = {"T1": "xoxb-1"} - cmd_mod = "slack_event_handler.management.commands.run_slack_event_handler" - with patch(f"{cmd_mod}.logger") as log: - with patch(f"{cmd_mod}.get_slack_bot_token", return_value="tok"): - with patch(f"{cmd_mod}.get_slack_app_token", return_value="app"): - call_command("run_slack_event_handler", "--dry-run", 
stdout=StringIO()) - assert log.info.called - - -@pytest.mark.django_db -def test_command_dry_run_warns_when_no_teams(settings): - settings.SLACK_BOT_TOKEN = {} - cmd_mod = "slack_event_handler.management.commands.run_slack_event_handler" - with patch(f"{cmd_mod}.logger") as log: - call_command("run_slack_event_handler", "--dry-run", stdout=StringIO()) - assert log.warning.called - - -@pytest.mark.django_db -def test_command_runs_runner(settings): - settings.SLACK_BOT_TOKEN = {"T1": "x"} - cmd_mod = "slack_event_handler.management.commands.run_slack_event_handler" - with patch("slack_event_handler.runner.run_slack_event_handler") as run: - with patch(f"{cmd_mod}.get_slack_bot_token", return_value="b"): - with patch(f"{cmd_mod}.get_slack_app_token", return_value="a"): - call_command("run_slack_event_handler", stdout=StringIO()) - run.assert_called_once() - - -@pytest.mark.django_db -def test_command_keyboard_interrupt_logs(settings): - settings.SLACK_BOT_TOKEN = {"T1": "x"} - cmd_mod = "slack_event_handler.management.commands.run_slack_event_handler" - with patch( - "slack_event_handler.runner.run_slack_event_handler", - side_effect=KeyboardInterrupt, - ): - with patch(f"{cmd_mod}.logger") as log: - with patch(f"{cmd_mod}.get_slack_bot_token", return_value="b"): - with patch(f"{cmd_mod}.get_slack_app_token", return_value="a"): - call_command("run_slack_event_handler", stdout=StringIO()) - assert log.info.called - - -@pytest.mark.django_db -def test_command_tokens_map_not_dict_becomes_empty(settings): - settings.SLACK_BOT_TOKEN = ["not-a-dict"] - cmd_mod = "slack_event_handler.management.commands.run_slack_event_handler" - with patch(f"{cmd_mod}.logger") as log: - call_command("run_slack_event_handler", "--dry-run", stdout=StringIO()) - assert log.warning.called - - -@pytest.mark.django_db -def test_command_dry_run_value_error_from_token_helpers(settings): - settings.SLACK_BOT_TOKEN = {"T1": "x"} - cmd_mod = 
"slack_event_handler.management.commands.run_slack_event_handler" - with patch(f"{cmd_mod}.logger"): - with patch(f"{cmd_mod}.get_slack_bot_token", side_effect=ValueError("bad")): - with patch(f"{cmd_mod}.get_slack_app_token", side_effect=ValueError("bad")): - call_command("run_slack_event_handler", "--dry-run", stdout=StringIO()) - - -@pytest.mark.django_db -def test_command_runner_exception_reraises(settings): - settings.SLACK_BOT_TOKEN = {"T1": "x"} - cmd_mod = "slack_event_handler.management.commands.run_slack_event_handler" - with patch( - "slack_event_handler.runner.run_slack_event_handler", - side_effect=RuntimeError("runner boom"), - ): - with patch(f"{cmd_mod}.get_slack_bot_token", return_value="b"): - with patch(f"{cmd_mod}.get_slack_app_token", return_value="a"): - with pytest.raises(RuntimeError, match="runner boom"): - call_command("run_slack_event_handler", stdout=StringIO()) diff --git a/slack_event_handler/tests/test_pr_parser.py b/slack_event_handler/tests/test_pr_parser.py deleted file mode 100644 index f384f612..00000000 --- a/slack_event_handler/tests/test_pr_parser.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Tests for slack_event_handler.utils.pr_parser.""" - -from slack_event_handler.utils.pr_parser import extract_pr_urls - - -def test_extract_pr_urls_empty_text(): - valid, invalid = extract_pr_urls("") - assert valid == [] and invalid == [] - - -def test_extract_pr_urls_no_allowed_org_all_valid(): - text = "See https://github.com/foo/bar/pull/1 and https://github.com/Org2/x/pull/2" - valid, invalid = extract_pr_urls(text) - assert len(valid) == 2 - assert invalid == [] - assert valid[0]["owner"] == "foo" - assert valid[0]["pull_number"] == 1 - - -def test_extract_pr_urls_with_allowed_org_splits(): - text = "https://github.com/boostorg/beast/pull/99 https://github.com/other/x/pull/1" - valid, invalid = extract_pr_urls(text, allowed_org="boostorg") - assert len(valid) == 1 - assert valid[0]["repo"] == "beast" - assert len(invalid) == 1 - assert 
invalid[0]["owner"] == "other" - - -def test_extract_pr_urls_allowed_org_case_insensitive(): - url = "https://github.com/MyOrg/Repo/pull/3" - valid, invalid = extract_pr_urls(url, allowed_org="MYORG") - assert len(valid) == 1 and invalid == [] diff --git a/slack_event_handler/tests/test_rate_limiter.py b/slack_event_handler/tests/test_rate_limiter.py deleted file mode 100644 index f4d18cfc..00000000 --- a/slack_event_handler/tests/test_rate_limiter.py +++ /dev/null @@ -1,152 +0,0 @@ -"""Tests for slack_event_handler.utils.rate_limiter.""" - -from unittest.mock import patch - -import pytest - -from slack_event_handler.utils.rate_limiter import ( - compute_delay_at, - recent_timestamps_at, - record_posted, - try_reserve_slot, - wait_and_reserve_slot, - wait_for_slot, -) - - -def test_recent_timestamps_at_filters_by_cutoff(): - now = 1000.0 - posted = [100.0, 500.0, 900.0, 950.0] - recent = recent_timestamps_at(posted, now, window_seconds=200) - assert recent == [900.0, 950.0] - - -def test_compute_delay_at_zero_when_under_cap(): - assert compute_delay_at([], 1000.0) == 0.0 - assert compute_delay_at([999.0], 1000.0) == 0.0 - - -@pytest.mark.django_db -def test_compute_delay_at_positive_when_at_cap(settings): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 2 - settings.SLACK_PR_BOT_COMMENTS_WINDOW_SECONDS = 100 - now = 1000.0 - posted = [990.0, 995.0] - delay = compute_delay_at(posted, now) - assert delay > 0 - - -@pytest.mark.django_db -def test_try_reserve_slot_false_at_cap(settings, tmp_path): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 2 - settings.SLACK_PR_BOT_COMMENTS_WINDOW_SECONDS = 100 - path = tmp_path / "state.json" - import json - - now = 1000.0 - path.write_text(json.dumps({"postedAt": [990.0, 995.0], "queue": []})) - - with patch( - "slack_event_handler.utils.state._get_state_file_path", - return_value=str(path), - ): - with patch( - "slack_event_handler.utils.rate_limiter.time.time", return_value=now - ): - assert try_reserve_slot(None) is False 
- - -@pytest.mark.django_db -def test_try_reserve_slot_true_and_persists(settings, tmp_path): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 5 - path = tmp_path / "state.json" - - with patch( - "slack_event_handler.utils.state._get_state_file_path", - return_value=str(path), - ): - with patch( - "slack_event_handler.utils.rate_limiter.time.time", return_value=42.0 - ): - assert try_reserve_slot(None) is True - from slack_event_handler.utils.state import load_state - - loaded = load_state(None) - assert 42.0 in loaded["postedAt"] - - -@pytest.mark.django_db -def test_wait_and_reserve_slot_retries_until_reserved(settings, monkeypatch): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 5 - attempts = [] - - def fake_try_reserve(team_id=None): - attempts.append(1) - return len(attempts) > 1 - - sleeps = [] - - def fake_sleep(d): - sleeps.append(d) - - monkeypatch.setattr( - "slack_event_handler.utils.rate_limiter.try_reserve_slot", fake_try_reserve - ) - monkeypatch.setattr("slack_event_handler.utils.rate_limiter.time.sleep", fake_sleep) - monkeypatch.setattr( - "slack_event_handler.utils.rate_limiter.compute_delay", lambda _posted: 1.0 - ) - - with patch("slack_event_handler.utils.rate_limiter.load_state") as ls: - ls.return_value = {"postedAt": [], "queue": []} - wait_and_reserve_slot(None) - - assert sleeps == [1.0] - assert len(attempts) == 2 - - -@pytest.mark.django_db -def test_wait_for_slot_breaks_when_delay_zero(settings, monkeypatch): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 5 - - calls = {"n": 0} - - def fake_compute(state_list): - calls["n"] += 1 - return 0.0 if calls["n"] > 1 else 1.0 - - sleeps = [] - - def fake_sleep(d): - sleeps.append(d) - - monkeypatch.setattr( - "slack_event_handler.utils.rate_limiter.compute_delay", fake_compute - ) - monkeypatch.setattr("slack_event_handler.utils.rate_limiter.time.sleep", fake_sleep) - - with patch("slack_event_handler.utils.rate_limiter.load_state") as ls: - ls.return_value = {"postedAt": [], "queue": 
[]} - wait_for_slot(None) - - assert sleeps == [1.0] - - -@pytest.mark.django_db -def test_record_posted_appends_timestamp(settings, tmp_path): - settings.SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW = 10 - - path = tmp_path / "state.json" - - with patch( - "slack_event_handler.utils.state._get_state_file_path", - return_value=str(path), - ): - with patch( - "slack_event_handler.utils.rate_limiter.time.time", return_value=42.0 - ): - record_posted(None) - from slack_event_handler.utils.state import load_state - - loaded = load_state(None) - assert 42.0 in loaded["postedAt"] diff --git a/slack_event_handler/tests/test_runner.py b/slack_event_handler/tests/test_runner.py deleted file mode 100644 index f16b3b9c..00000000 --- a/slack_event_handler/tests/test_runner.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Tests for slack_event_handler.runner.""" - -from unittest.mock import MagicMock, patch - -import pytest - -from slack_event_handler.runner import run_slack_event_handler - - -@pytest.mark.django_db -def test_run_slack_event_handler_no_teams_logs_error(settings, tmp_path): - settings.SLACK_BOT_TOKEN = {} - ws = str(tmp_path / "slack-ws") - with patch("slack_event_handler.runner.get_workspace_root", return_value=ws): - with patch("slack_event_handler.runner.logger") as log: - run_slack_event_handler() - log.error.assert_called() - - -@pytest.mark.django_db -def test_run_slack_event_handler_workspace_root_failure_still_runs(settings): - settings.SLACK_BOT_TOKEN = {} - with patch( - "slack_event_handler.runner.get_workspace_root", - side_effect=OSError("no workspace"), - ): - with patch("slack_event_handler.runner.logger") as log: - run_slack_event_handler() - log.exception.assert_called() - log.error.assert_called() - - -@pytest.mark.django_db -def test_run_slack_event_handler_workspace_root_type_error_still_runs(settings): - settings.SLACK_BOT_TOKEN = {} - with patch( - "slack_event_handler.runner.get_workspace_root", - side_effect=TypeError("bad WORKSPACE_DIR type"), - ): - with 
patch("slack_event_handler.runner.logger") as log: - run_slack_event_handler() - log.exception.assert_called() - log.error.assert_called() - - -@pytest.mark.django_db -def test_run_slack_event_handler_starts_listener_threads( - settings, tmp_path, fake_slack_bolt -): - pytest.importorskip("github") - settings.SLACK_BOT_TOKEN = {"T123": "xoxb-test-token"} - mock_thread = MagicMock() - ws = str(tmp_path / "ws") - with patch("slack_event_handler.runner.get_workspace_root", return_value=ws): - with patch( - "slack_event_handler.runner.get_slack_app_token", - return_value="xapp-test", - ): - with patch( - "slack_event_handler.runner.threading.Thread", - return_value=mock_thread, - ) as mock_thread_cls: - run_slack_event_handler() - mock_thread_cls.assert_called_once() - mock_thread.start.assert_called_once() - mock_thread.join.assert_called_once() - - -@pytest.mark.django_db -def test_run_slack_event_handler_non_dict_tokens_treated_as_empty(settings, tmp_path): - settings.SLACK_BOT_TOKEN = "not-a-dict" - with patch( - "slack_event_handler.runner.get_workspace_root", return_value=str(tmp_path) - ): - with patch("slack_event_handler.runner.logger") as log: - run_slack_event_handler() - log.error.assert_called() - - -@pytest.mark.django_db -def test_run_slack_event_handler_skips_empty_bot_token( - settings, tmp_path, fake_slack_bolt -): - settings.SLACK_BOT_TOKEN = {"T1": " "} - with patch( - "slack_event_handler.runner.get_workspace_root", return_value=str(tmp_path) - ): - with patch("slack_event_handler.runner.logger") as log: - run_slack_event_handler() - log.error.assert_called() - - -@pytest.mark.django_db -def test_run_slack_event_handler_skips_when_app_token_missing( - settings, tmp_path, fake_slack_bolt -): - settings.SLACK_BOT_TOKEN = {"T9": "xoxb-valid-token"} - with patch( - "slack_event_handler.runner.get_workspace_root", return_value=str(tmp_path) - ): - with patch( - "slack_event_handler.runner.get_slack_app_token", - side_effect=ValueError("missing"), - ): - 
with patch("slack_event_handler.runner.logger") as log: - run_slack_event_handler() - assert log.warning.called - - -@pytest.mark.django_db -def test_run_slack_event_handler_two_teams_start_two_threads( - settings, tmp_path, fake_slack_bolt -): - settings.SLACK_BOT_TOKEN = {"TA": "xoxb-a", "TB": "xoxb-b"} - mock_thread = MagicMock() - with patch( - "slack_event_handler.runner.get_workspace_root", return_value=str(tmp_path) - ): - with patch( - "slack_event_handler.runner.get_slack_app_token", - side_effect=["app-a", "app-b"], - ): - with patch( - "slack_event_handler.runner.threading.Thread", - return_value=mock_thread, - ) as mock_tc: - run_slack_event_handler() - assert mock_tc.call_count == 2 - assert mock_thread.start.call_count == 2 diff --git a/slack_event_handler/tests/test_slack_internal_tokens_store.py b/slack_event_handler/tests/test_slack_internal_tokens_store.py deleted file mode 100644 index 667d3be3..00000000 --- a/slack_event_handler/tests/test_slack_internal_tokens_store.py +++ /dev/null @@ -1,122 +0,0 @@ -"""Tests for workspace JSON Slack internal token storage.""" - -import json -from unittest.mock import patch - -import pytest -from django.test import override_settings - -from slack_event_handler.utils import slack_internal_tokens_store as store - - -@override_settings( - WORKSPACE_DIR="/tmp/ws", - SLACK_INTERNAL_TOKENS_JSON="", -) -def test_save_and_load_tokens(tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - path = store.save_slack_internal_tokens( - "T1", "xoxc-abc", "xoxd-xyz", team_name="Team" - ) - assert path == tmp_path / "slack_event_handler" / "slack_internal_tokens.json" - data = json.loads(path.read_text(encoding="utf-8")) - assert data["teams"]["T1"]["xoxc"] == "xoxc-abc" - assert data["teams"]["T1"]["xoxd"] == "xoxd-xyz" - loaded = store.load_slack_internal_tokens("T1") - assert loaded["xoxc"] == "xoxc-abc" - assert loaded["team_name"] == "Team" - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True, 
WORKSPACE_DIR="/tmp/ws") -def test_get_slack_internal_token_pair(tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - store.save_slack_internal_tokens("T1", "xc", "xd") - with override_settings(SLACK_TEAM_IDS="T1"): - pair = store.get_slack_internal_token_pair("T1") - assert pair == ("xc", "xd") - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=False, WORKSPACE_DIR="/tmp/ws") -def test_get_pair_disabled(tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - store.save_slack_internal_tokens("T1", "xc", "xd") - assert store.get_slack_internal_token_pair("T1") is None - - -def test_save_requires_fields(): - with pytest.raises(ValueError): - store.save_slack_internal_tokens("", "a", "b") - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True, WORKSPACE_DIR="/tmp/ws") -@patch( - "slack_event_handler.utils.slack_tokens.probe_slack_internal_tokens", - return_value=True, -) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.extract_and_save_slack_internal_tokens", - return_value=("xc", "xd"), -) -def test_get_or_load_extracts_when_json_missing( - mock_extract, _mock_probe, tmp_path, settings -): - settings.WORKSPACE_DIR = str(tmp_path) - pair = store.get_or_load_slack_internal_token_pair("T1") - assert pair == ("xc", "xd") - mock_extract.assert_called_once_with("T1") - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True, WORKSPACE_DIR="/tmp/ws") -@patch( - "slack_event_handler.utils.slack_tokens.probe_slack_internal_tokens", - side_effect=[False, True], -) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.extract_and_save_slack_internal_tokens", - return_value=("new-xc", "new-xd"), -) -def test_get_or_load_reextracts_when_json_tokens_stale( - mock_extract, _mock_probe, tmp_path, settings -): - settings.WORKSPACE_DIR = str(tmp_path) - store.save_slack_internal_tokens("T1", "old-xc", "old-xd") - pair = store.get_or_load_slack_internal_token_pair("T1") - assert pair == ("new-xc", "new-xd") - 
mock_extract.assert_called_once_with("T1") - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True, WORKSPACE_DIR="/tmp/ws") -@patch( - "slack_event_handler.utils.slack_tokens.probe_slack_internal_tokens", - return_value=False, -) -@patch( - "slack_event_handler.utils.slack_internal_tokens_store.extract_and_save_slack_internal_tokens", - return_value=("bad-xc", "bad-xd"), -) -def test_get_or_load_logs_when_reextracted_tokens_still_invalid( - mock_extract, _mock_probe, tmp_path, settings, caplog -): - import logging - - settings.WORKSPACE_DIR = str(tmp_path) - store.save_slack_internal_tokens("T1", "old-xc", "old-xd") - with caplog.at_level(logging.ERROR): - pair = store.get_or_load_slack_internal_token_pair("T1") - assert pair is None - mock_extract.assert_called_once_with("T1") - assert "still invalid" in caplog.text - assert ".env.example" in caplog.text - - -@override_settings(ALLOW_INTERNAL_SLACK_TOKENS=True, WORKSPACE_DIR="/tmp/ws") -@patch( - "slack_event_handler.utils.slack_tokens.probe_slack_internal_tokens", - return_value=True, -) -def test_get_or_load_keeps_valid_json_tokens(_mock_probe, tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - store.save_slack_internal_tokens("T1", "xc", "xd") - pair = store.get_or_load_slack_internal_token_pair("T1") - assert pair == ("xc", "xd") - _mock_probe.assert_called_once_with("xc", "xd") diff --git a/slack_event_handler/tests/test_slack_listener_coverage.py b/slack_event_handler/tests/test_slack_listener_coverage.py deleted file mode 100644 index 08fb3867..00000000 --- a/slack_event_handler/tests/test_slack_listener_coverage.py +++ /dev/null @@ -1,725 +0,0 @@ -# ruff: noqa: S106 -- synthetic Slack tokens in tests. 
-"""Extra coverage for slack_event_handler.utils.slack_listener.""" - -from unittest.mock import MagicMock, patch - -import pytest - -from slack_event_handler.tests.conftest import ImmediateThread - -SCOPE_HUDDLE = 0 -SCOPE_PR_BOT = 1 - - -def _bolt_app_inst(**kwargs): - """Magic Slack App instance with .event usable as a Bolt decorator.""" - app_inst = MagicMock(**kwargs) - - def event_register(spec): - def deco(fn): - return fn - - return deco - - app_inst.event = event_register - return app_inst - - -def _bolt_app_mock_with_inst(app_inst): - mock_app_cls = MagicMock() - mock_app_cls.return_value = app_inst - return mock_app_cls - - -@pytest.mark.django_db -def test_slack_listener_raises_when_bot_token_missing(fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - with pytest.raises(ValueError, match="Missing bot_token"): - SlackListener(bot_token="", app_token="xapp-test") - - -@pytest.mark.django_db -def test_slack_listener_raises_when_app_token_missing(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = MagicMock() - mock_app_cls.return_value = MagicMock() - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - with patch( - "slack_event_handler.utils.slack_listener.get_slack_app_token", - return_value="", - ): - with pytest.raises(ValueError, match="SLACK_APP_TOKEN"): - SlackListener( - bot_token="xoxb-test", - app_token="", - team_id="T1", - ) - - -@pytest.mark.django_db -def test_resolve_pr_channel_not_found_logs(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "missing-channel" - - app_inst = 
_bolt_app_inst() - app_inst.client.conversations_list.return_value = { - "channels": [{"name": "other", "id": "C1"}], - "response_metadata": {}, - } - mock_app_cls = _bolt_app_mock_with_inst(app_inst) - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - assert listener._pr_channel_id is None - - -@pytest.mark.django_db -def test_resolve_pr_channel_pagination(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "alerts" - - app_inst = _bolt_app_inst() - app_inst.client.conversations_list.side_effect = [ - { - "channels": [{"name": "other", "id": "x"}], - "response_metadata": {"next_cursor": "c1"}, - }, - { - "channels": [{"name": "alerts", "id": "CAL"}], - "response_metadata": {}, - }, - ] - mock_app_cls = _bolt_app_mock_with_inst(app_inst) - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - assert listener._pr_channel_id == "CAL" - - -@pytest.mark.django_db -def test_resolve_pr_channel_api_error(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "alerts" - - app_inst = _bolt_app_inst() - app_inst.client.conversations_list.side_effect = OSError("api down") - mock_app_cls = _bolt_app_mock_with_inst(app_inst) - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with 
patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - assert listener._pr_channel_id is None - - -@pytest.mark.django_db -def test_send_user_reply_channel_and_dm(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - app_inst = _bolt_app_inst() - mock_app_cls = _bolt_app_mock_with_inst(app_inst) - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - listener._send_user_reply("C1", "9.9", False, "hi") - kwargs = app_inst.client.chat_postMessage.call_args.kwargs - assert kwargs["thread_ts"] == "9.9" - - app_inst.client.chat_postMessage.side_effect = RuntimeError("fail") - listener._send_user_reply("D1", "9.9", True, "dm") - - -@pytest.mark.django_db -def test_handle_pr_request_no_github_url(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - settings.SLACK_PR_BOT_TEAM = "" - - mock_app_cls = _bolt_app_mock_with_inst(_bolt_app_inst()) - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - with patch.object(listener, "_send_user_reply") as send: - listener._handle_pr_request("hello no urls", "C1", "1", "U1", False) - 
send.assert_called_once() - assert "No GitHub PR URL" in send.call_args[0][3] - - -def _bolt_app_mock(): - return _bolt_app_mock_with_inst(_bolt_app_inst()) - - -@pytest.mark.django_db -def test_handle_pr_request_enqueues_once_per_pr(settings, fake_slack_bolt): - import slack_event_handler.utils.slack_listener as sl_mod - - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - settings.SLACK_PR_BOT_TEAM = "" - - mock_app_cls = _bolt_app_mock() - - with patch.object(sl_mod, "App", mock_app_cls): - with patch.object(sl_mod, "set_slack_app"): - with patch.object(sl_mod, "start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - url = "https://github.com/acme/r/pull/3" - text = f"{url} {url}" - - with patch.object(listener, "_send_user_reply"): - with patch.object(sl_mod, "enqueue_job") as eq: - with patch.object(sl_mod, "estimated_delay_sec", return_value=0): - listener._handle_pr_request(text, "C1", "1", "U1", False) - assert eq.call_count == 1 - - -@pytest.mark.django_db -def test_handle_pr_request_rate_limit_ack(settings, fake_slack_bolt): - import slack_event_handler.utils.slack_listener as sl_mod - - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - settings.SLACK_PR_BOT_TEAM = "" - - mock_app_cls = _bolt_app_mock() - - with patch.object(sl_mod, "App", mock_app_cls): - with patch.object(sl_mod, "set_slack_app"): - with patch.object(sl_mod, "start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - with patch.object(listener, "_send_user_reply") as send: - with patch.object(sl_mod, "enqueue_job"): - with patch.object(sl_mod, "estimated_delay_sec", return_value=30): - listener._handle_pr_request( - "https://github.com/acme/r/pull/1", - "C1", - "1", - "U1", - 
False, - ) - ack = send.call_args[0][3] - assert "30s" in ack - - -@pytest.mark.django_db -def test_extract_file_id_from_url_regex_error(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = _bolt_app_mock_with_inst(_bolt_app_inst()) - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - with patch( - "slack_event_handler.utils.slack_listener.re.search", - side_effect=RuntimeError("regex"), - ): - assert listener._extract_file_id_from_url("https://x/F0123456789AB") is None - - -@pytest.mark.django_db -def test_extract_file_id_from_event_finds_view_ai_notes(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = _bolt_app_mock_with_inst(_bolt_app_inst()) - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - event = { - "blocks": [ - { - "elements": [ - { - "type": "rich_text_section", - "elements": [ - { - "type": "link", - "text": "View AI notes", - "url": "https://files.slack.com/archives/F09999999999", - } - ], - } - ] - } - ] - } - assert listener._extract_file_id_from_event(event) == "F09999999999" - - -@pytest.mark.django_db -def test_mark_file_processed_evicts_oldest(settings, fake_slack_bolt): - import slack_event_handler.utils.slack_listener as sl_mod - - from 
slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = _bolt_app_mock() - - with patch.object(sl_mod, "MAX_PROCESSED_FILE_IDS", 2): - with patch.object(sl_mod, "App", mock_app_cls): - with patch.object(sl_mod, "set_slack_app"): - with patch.object(sl_mod, "start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - assert listener._mark_file_processed("FA") is True - assert listener._mark_file_processed("FB") is True - assert listener._mark_file_processed("FC") is True - assert "FA" not in listener._processed_file_ids - - -@pytest.mark.django_db -def test_message_handler_huddle_success_sync(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {"T1": [SCOPE_HUDDLE]} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - captured = [] - - def event_register(spec): - def deco(fn): - captured.append((spec, fn)) - return fn - - return deco - - app_inst.event = event_register - mock_app_cls.return_value = app_inst - - event_body = { - "ai_context": {"type": "summary", "summary": {"type": "huddle"}}, - "blocks": [ - { - "elements": [ - { - "type": "rich_text_section", - "elements": [ - { - "type": "link", - "text": "View AI notes", - "url": "https://files.slack.com/archives/F08888888888", - } - ], - } - ] - } - ], - } - - with patch( - "slack_event_handler.utils.slack_listener.threading.Thread", ImmediateThread - ): - with patch("slack_event_handler.utils.slack_listener.time.sleep"): - with patch( - "slack_event_handler.utils.huddle_processor.process_huddle_canvas", - return_value={"success": True, "github_url": "https://g/pr/1"}, - ): - with patch( - "slack_event_handler.utils.slack_listener.App", mock_app_cls - ): - with patch( - "slack_event_handler.utils.slack_listener.set_slack_app" - ): - 
with patch( - "slack_event_handler.utils.slack_listener.start_worker" - ): - SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - msg_fn = next( - fn for spec, fn in captured if spec == "message" - ) - msg_fn(event_body, {"event": event_body}) - - -@pytest.mark.django_db -def test_message_handler_huddle_failure_unmarks(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {"T1": [SCOPE_HUDDLE]} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - captured = [] - - def event_register(spec): - def deco(fn): - captured.append((spec, fn)) - return fn - - return deco - - app_inst.event = event_register - mock_app_cls.return_value = app_inst - - event_body = { - "ai_context": {"type": "summary", "summary": {"type": "huddle"}}, - "blocks": [ - { - "elements": [ - { - "type": "rich_text_section", - "elements": [ - { - "type": "link", - "text": "View AI note", - "url": "https://files.slack.com/archives/F07777777777", - } - ], - } - ] - } - ], - } - - with patch( - "slack_event_handler.utils.slack_listener.threading.Thread", ImmediateThread - ): - with patch("slack_event_handler.utils.slack_listener.time.sleep"): - with patch( - "slack_event_handler.utils.huddle_processor.process_huddle_canvas", - return_value={"success": False}, - ): - with patch( - "slack_event_handler.utils.slack_listener.App", mock_app_cls - ): - with patch( - "slack_event_handler.utils.slack_listener.set_slack_app" - ): - with patch( - "slack_event_handler.utils.slack_listener.start_worker" - ): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - msg_fn = next( - fn for spec, fn in captured if spec == "message" - ) - msg_fn(event_body, {"event": event_body}) - assert "F07777777777" not in listener._processed_file_ids - - -@pytest.mark.django_db -def 
test_message_handler_huddle_process_exception_unmarks(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {"T1": [SCOPE_HUDDLE]} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - captured = [] - - def event_register(spec): - def deco(fn): - captured.append((spec, fn)) - return fn - - return deco - - app_inst.event = event_register - mock_app_cls.return_value = app_inst - - event_body = { - "ai_context": {"type": "summary", "summary": {"type": "huddle"}}, - "blocks": [ - { - "elements": [ - { - "type": "rich_text_section", - "elements": [ - { - "type": "link", - "text": "View AI notes", - "url": "https://files.slack.com/archives/F06666666666", - } - ], - } - ] - } - ], - } - - with patch( - "slack_event_handler.utils.slack_listener.threading.Thread", ImmediateThread - ): - with patch("slack_event_handler.utils.slack_listener.time.sleep"): - with patch( - "slack_event_handler.utils.huddle_processor.process_huddle_canvas", - side_effect=RuntimeError("proc"), - ): - with patch( - "slack_event_handler.utils.slack_listener.App", mock_app_cls - ): - with patch( - "slack_event_handler.utils.slack_listener.set_slack_app" - ): - with patch( - "slack_event_handler.utils.slack_listener.start_worker" - ): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - msg_fn = next( - fn for spec, fn in captured if spec == "message" - ) - msg_fn(event_body, {"event": event_body}) - assert "F06666666666" not in listener._processed_file_ids - - -@pytest.mark.django_db -def test_message_handler_huddle_missing_file_id_returns(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {"T1": [SCOPE_HUDDLE]} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - captured = [] - - def event_register(spec): - def 
deco(fn): - captured.append((spec, fn)) - return fn - - return deco - - app_inst.event = event_register - mock_app_cls.return_value = app_inst - - event_body = { - "ai_context": {"type": "summary", "summary": {"type": "huddle"}}, - "blocks": [], - } - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - msg_fn = next(fn for spec, fn in captured if spec == "message") - msg_fn(event_body, {"event": event_body}) - - -@pytest.mark.django_db -def test_message_handler_pr_channel_match(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {"T1": [SCOPE_PR_BOT]} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - settings.SLACK_PR_BOT_TEAM = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - captured = [] - - def event_register(spec): - def deco(fn): - captured.append((spec, fn)) - return fn - - return deco - - app_inst.event = event_register - mock_app_cls.return_value = app_inst - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - listener._pr_channel_id = "CPR" - - msg_fn = next(fn for spec, fn in captured if spec == "message") - with patch.object(listener, "_handle_pr_request") as hp: - msg_fn( - { - "channel_type": "channel", - "text": "https://github.com/acme/r/pull/1", - "channel": "CPR", - "ts": "1", - "user": "U1", - }, - {}, - ) - hp.assert_called_once() - - -@pytest.mark.django_db -def test_message_handler_neither_huddle_nor_pr_channel(settings, fake_slack_bolt): - from 
slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {"T1": [SCOPE_PR_BOT]} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - settings.SLACK_PR_BOT_TEAM = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - captured = [] - - def event_register(spec): - def deco(fn): - captured.append((spec, fn)) - return fn - - return deco - - app_inst.event = event_register - mock_app_cls.return_value = app_inst - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - listener._pr_channel_id = "CPR" - - msg_fn = next(fn for spec, fn in captured if spec == "message") - with patch.object(listener, "_handle_pr_request") as hp: - msg_fn( - { - "channel_type": "channel", - "text": "hello", - "channel": "COTHER", - "ts": "1", - "user": "U1", - }, - {}, - ) - hp.assert_not_called() - - -@pytest.mark.django_db -def test_is_huddle_ai_note_malformed_logs(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = _bolt_app_mock_with_inst(_bolt_app_inst()) - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - class BadDict(dict): - def get(self, key, default=None): - if key == "ai_context": - raise RuntimeError("bad") - return super().get(key, default) - - assert listener._is_huddle_ai_note_event(BadDict()) is False diff --git a/slack_event_handler/tests/test_slack_listener_handlers.py 
b/slack_event_handler/tests/test_slack_listener_handlers.py deleted file mode 100644 index b8d2c959..00000000 --- a/slack_event_handler/tests/test_slack_listener_handlers.py +++ /dev/null @@ -1,167 +0,0 @@ -"""Exercise slack_listener registered event handlers (Bolt mocked).""" - -from unittest.mock import MagicMock, patch - -import pytest - -# Mirror slack_listener team-scope constants (avoid importing slack_listener at collect time). -SCOPE_HUDDLE = 0 -SCOPE_PR_BOT = 1 - - -@pytest.mark.django_db -def test_message_handler_skips_subtypes(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - - captured = [] - - def event_register(spec): - def deco(fn): - captured.append((spec, fn)) - return fn - - return deco - - app_inst.event = event_register - mock_app_cls.return_value = app_inst - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - SlackListener( - bot_token="xoxb-test", app_token="xapp-test", team_id="T1" - ) - - msg_fn = next(fn for spec, fn in captured if spec == "message") - msg_fn({"subtype": "message_changed"}, {}) - msg_fn({"subtype": "message_deleted"}, {}) - - -@pytest.mark.django_db -def test_message_handler_pr_branch(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {"T1": [SCOPE_PR_BOT]} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - settings.SLACK_PR_BOT_TEAM = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - captured = [] - - def event_register(spec): - def deco(fn): - captured.append((spec, fn)) - return fn - - return deco - - app_inst.event = event_register - mock_app_cls.return_value = app_inst - - with 
patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - msg_fn = next(fn for spec, fn in captured if spec == "message") - with patch.object(listener, "_handle_pr_request") as hp: - msg_fn( - { - "channel_type": "im", - "text": "hello", - "channel": "D1", - "ts": "1", - "user": "U1", - }, - {}, - ) - hp.assert_called_once() - - -@pytest.mark.django_db -def test_message_handler_pr_disabled_scope_logs(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {"T1": [SCOPE_HUDDLE]} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - captured = [] - - def event_register(spec): - def deco(fn): - captured.append((spec, fn)) - return fn - - return deco - - app_inst.event = event_register - mock_app_cls.return_value = app_inst - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - SlackListener( - bot_token="xoxb-test", app_token="xapp-test", team_id="T1" - ) - - msg_fn = next(fn for spec, fn in captured if spec == "message") - msg_fn( - { - "channel_type": "channel", - "text": "hi", - "channel": "C99", - "ts": "1", - }, - {}, - ) - - -@pytest.mark.django_db -def test_misc_event_handlers_run(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - captured = [] - - def event_register(spec): - def deco(fn): - captured.append((spec, fn)) - return fn - - return deco - - 
app_inst.event = event_register - mock_app_cls.return_value = app_inst - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - SlackListener( - bot_token="xoxb-test", app_token="xapp-test", team_id="T1" - ) - - by_str = {spec: fn for spec, fn in captured if isinstance(spec, str)} - cb_fn = next(fn for spec, fn in captured if isinstance(spec, dict)) - - by_str["file_shared"]({}, {}) - by_str["reaction_added"]({}, {}) - by_str["app_mention"]({}, {}) - - cb_fn({}, {"event": {"type": "unknown"}}) diff --git a/slack_event_handler/tests/test_slack_listener_unit.py b/slack_event_handler/tests/test_slack_listener_unit.py deleted file mode 100644 index 5079a3be..00000000 --- a/slack_event_handler/tests/test_slack_listener_unit.py +++ /dev/null @@ -1,156 +0,0 @@ -# ruff: noqa: S106 -- synthetic Slack tokens in tests (not real secrets). 
-"""Unit tests for slack_event_handler.utils.slack_listener (Bolt mocked).""" - -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest - - -@pytest.mark.django_db -def test_save_event_to_file_writes_json(tmp_path, fake_slack_bolt): - from slack_event_handler.utils import slack_listener as sl - - with patch.object(sl, "_data_dir", return_value=str(tmp_path)): - p = sl.save_event_to_file("myevt", {"event": {"ts": "3.141"}}) - assert p is not None - assert Path(p).exists() - - -@pytest.mark.django_db -def test_save_event_to_file_returns_none_on_error(tmp_path, fake_slack_bolt): - from slack_event_handler.utils import slack_listener as sl - - with patch.object(sl, "_data_dir", return_value=str(tmp_path)): - with patch.object(sl.json, "dump", side_effect=OSError("fail")): - assert sl.save_event_to_file("x", {"event": {"ts": "1"}}) is None - - -@pytest.mark.django_db -def test_slack_listener_helpers(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - mock_app_cls.return_value = app_inst - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - fid_url = "https://files.slack.com/archives/F0123456789AB" - assert listener._extract_file_id_from_url(fid_url) == "F0123456789AB" - - assert listener._is_huddle_ai_note_event( - {"ai_context": {"type": "summary", "summary": {"type": "huddle"}}} - ) - assert listener._is_huddle_ai_note_event([]) is False - - assert listener._mark_file_processed("FX") is True - assert listener._mark_file_processed("FX") is False - listener._unmark_file_processed("FX") - assert 
listener._mark_file_processed("FX") is True - - -@pytest.mark.django_db -def test_slack_listener_resolve_pr_channel(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "alerts" - - mock_app_cls = MagicMock() - app_inst = MagicMock() - app_inst.client.conversations_list.return_value = { - "channels": [{"name": "alerts", "id": "CALERT"}], - "response_metadata": {}, - } - mock_app_cls.return_value = app_inst - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - assert listener._pr_channel_id == "CALERT" - - -@pytest.mark.django_db -def test_handle_pr_request_invalid_org(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - settings.SLACK_PR_BOT_TEAM = "boostorg" - - mock_app_cls = MagicMock() - mock_app_cls.return_value = MagicMock() - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - with patch.object(listener, "_send_user_reply") as send: - listener._handle_pr_request( - "https://github.com/other/repo/pull/9", - "C1", - "1.0", - "U1", - False, - ) - send.assert_called() - - -@pytest.mark.django_db -def test_slack_listener_start_calls_socket_handler(settings, fake_slack_bolt): - from slack_event_handler.utils.slack_listener import SlackListener - - settings.SLACK_TEAM_SCOPE = {} - 
settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_app_cls = MagicMock() - mock_app_cls.return_value = MagicMock() - - with patch("slack_event_handler.utils.slack_listener.App", mock_app_cls): - with patch("slack_event_handler.utils.slack_listener.set_slack_app"): - with patch("slack_event_handler.utils.slack_listener.start_worker"): - listener = SlackListener( - bot_token="xoxb-test", - app_token="xapp-test", - team_id="T1", - ) - - with patch("slack_event_handler.utils.slack_listener.SocketModeHandler") as H: - handler_inst = MagicMock() - H.return_value = handler_inst - listener.start() - handler_inst.start.assert_called_once() - - -@pytest.mark.django_db -def test_start_slack_listener_factory(settings, fake_slack_bolt): - from slack_event_handler.utils import slack_listener as sl - - settings.SLACK_TEAM_SCOPE = {} - settings.SLACK_PR_BOT_CHANNEL_NAME = "" - - mock_listener = MagicMock() - with patch.object(sl, "SlackListener", return_value=mock_listener): - sl.start_slack_listener(bot_token="a", app_token="b", team_id="T") - mock_listener.start.assert_called_once() diff --git a/slack_event_handler/tests/test_slack_tokens.py b/slack_event_handler/tests/test_slack_tokens.py deleted file mode 100644 index 677edaa2..00000000 --- a/slack_event_handler/tests/test_slack_tokens.py +++ /dev/null @@ -1,239 +0,0 @@ -"""Tests for slack_event_handler.utils.slack_tokens (no real Chrome profile).""" - -import json -import sys -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest -from django.test import override_settings - -from slack_event_handler.utils import slack_tokens as st - - -@pytest.fixture -def sample_local_config(): - return { - "teams": { - "T1": {"token": "xoxc-1", "name": "Team One", "user_id": "U1"}, - "T2": {"token": "xoxc-2", "name": "Team Two", "user_id": "U2"}, - } - } - - -def test_is_slack_internal_token_auth_error(): - assert st.is_slack_internal_token_auth_error("invalid_auth") - assert not 
st.is_slack_internal_token_auth_error("file_not_found") - - -@patch("slack_event_handler.utils.slack_tokens.requests.post") -def test_probe_slack_internal_tokens_ok(mock_post): - mock_resp = MagicMock() - mock_resp.raise_for_status = MagicMock() - mock_resp.json.return_value = {"ok": False, "error": "file_not_found"} - mock_post.return_value = mock_resp - assert st.probe_slack_internal_tokens("xc", "xd") is True - - -@patch("slack_event_handler.utils.slack_tokens.requests.post") -def test_probe_slack_internal_tokens_auth_error(mock_post): - mock_resp = MagicMock() - mock_resp.raise_for_status = MagicMock() - mock_resp.json.return_value = {"ok": False, "error": "invalid_auth"} - mock_post.return_value = mock_resp - assert st.probe_slack_internal_tokens("xc", "xd") is False - - -def test_probe_slack_internal_tokens_empty(): - assert st.probe_slack_internal_tokens("", "xd") is False - - -@override_settings(CHROME_PROFILE_PATH="", WORKSPACE_DIR="/tmp/ws") -def test_resolve_chrome_profile_uses_workspace_default(tmp_path, settings): - settings.WORKSPACE_DIR = str(tmp_path) - expected = tmp_path / "slack_event_handler" / "chrome_profile" - expected.mkdir(parents=True) - assert st._resolve_chrome_profile_root() == expected.resolve() - - -def test_resolve_chrome_profile_respects_custom_path(tmp_path): - custom = tmp_path / "custom_slack_chrome" - custom.mkdir() - with override_settings(CHROME_PROFILE_PATH=str(custom), WORKSPACE_DIR="/tmp/ws"): - assert st._resolve_chrome_profile_root() == custom.resolve() - - -def test_validate_chrome_profile_path_ok(): - assert "/home/user/profile" in st._validate_chrome_profile_path( - "/home/user/profile" - ) - - -@pytest.mark.parametrize("bad", ["", None, "bad\x00path", "???"]) -def test_validate_chrome_profile_path_bad(bad): - with pytest.raises(ValueError): - st._validate_chrome_profile_path(bad) - - -def test_parse_local_config_raw_strips_prefix_byte(): - payload = {"teams": {}} - raw = b"\x01" + json.dumps(payload).encode("utf-8") - 
assert st._parse_local_config_raw(raw) == payload - - -def test_extract_slack_tokens_from_config_success(sample_local_config): - out = st.extract_slack_tokens_from_config(sample_local_config, "xoxd-val", "T1") - assert out["xoxc"] == "xoxc-1" - assert out["xoxd"] == "xoxd-val" - assert out["team_id"] == "T1" - assert out["team_name"] == "Team One" - - -def test_extract_slack_tokens_from_config_missing_team(sample_local_config): - assert st.extract_slack_tokens_from_config(sample_local_config, "d", "TX") is None - - -def test_extract_slack_tokens_from_config_missing_xoxd(sample_local_config): - assert st.extract_slack_tokens_from_config(sample_local_config, "", "T1") is None - - -def test_get_all_team_ids_from_config(sample_local_config): - assert set(st.get_all_team_ids_from_config(sample_local_config)) == {"T1", "T2"} - - -def test_get_all_team_ids_with_explicit_config(sample_local_config): - assert st.get_all_team_ids(sample_local_config) == ["T1", "T2"] - - -@patch.object(st, "_read_local_config_v2", return_value=None) -@patch.object(st, "_resolve_chrome_profile_root", return_value=Path("/tmp/profile")) -def test_get_all_team_ids_empty_when_no_config(_resolve, _read): - assert st.get_all_team_ids() == [] - - -def test_read_local_config_v2_parses_leveldb(tmp_path): - profile = tmp_path / "chrome_profile" - leveldb_dir = profile / "Default" / "Local Storage" / "leveldb" - leveldb_dir.mkdir(parents=True) - config = {"teams": {"T1": {"token": "x"}}} - - with patch.object( - st, "_read_leveldb_value", return_value=b"\x01" + json.dumps(config).encode() - ): - out = st._read_local_config_v2(profile) - assert out == config - - -def test_read_local_config_v2_returns_none_when_no_leveldb(tmp_path): - profile = tmp_path / "empty_profile" - profile.mkdir() - assert st._read_local_config_v2(profile) is None - - -def test_read_xoxd_cookie_success(tmp_path): - profile = tmp_path / "profile" - cookies = profile / "Default" / "Cookies" - cookies.parent.mkdir(parents=True) - 
cookies.touch() - cookie = MagicMock() - cookie.name = "d" - cookie.value = "xoxd-abc" - mock_bc3 = MagicMock() - mock_bc3.chrome.return_value = [cookie] - with patch.dict(sys.modules, {"browser_cookie3": mock_bc3}): - assert st._read_xoxd_cookie(profile) == "xoxd-abc" - - -def test_read_xoxd_cookie_missing(tmp_path): - profile = tmp_path / "profile" - cookies = profile / "Default" / "Cookies" - cookies.parent.mkdir(parents=True) - cookies.touch() - mock_bc3 = MagicMock() - mock_bc3.chrome.return_value = [] - with patch.dict(sys.modules, {"browser_cookie3": mock_bc3}): - assert st._read_xoxd_cookie(profile) is None - - -def test_decrypt_chrome_linux_v10_cookie_roundtrip(): - pytest.importorskip("Cryptodome", reason="pycryptodomex required") - from Cryptodome.Cipher import AES - - value = "xoxd-test-token" - padded = value.encode("utf-8") - pad_len = 16 - (len(padded) % 16) - padded += bytes([pad_len]) * pad_len - payload = b"x" * 32 + padded - cipher = AES.new(st._chrome_linux_v10_cookie_key(), AES.MODE_CBC, iv=b" " * 16) - encrypted = b"v10" + cipher.encrypt(payload) - assert st._decrypt_chrome_linux_v10_cookie(encrypted) == value - - -def test_read_xoxd_cookie_sqlite_fallback(tmp_path): - pytest.importorskip("Cryptodome", reason="pycryptodomex required") - from Cryptodome.Cipher import AES - - profile = tmp_path / "profile" - cookies = profile / "Default" / "Cookies" - cookies.parent.mkdir(parents=True) - value = "xoxd-from-sqlite" - padded = value.encode("utf-8") - pad_len = 16 - (len(padded) % 16) - padded += bytes([pad_len]) * pad_len - payload = b"x" * 32 + padded - cipher = AES.new(st._chrome_linux_v10_cookie_key(), AES.MODE_CBC, iv=b" " * 16) - encrypted = b"v10" + cipher.encrypt(payload) - - import sqlite3 - - conn = sqlite3.connect(cookies) - conn.execute( - "CREATE TABLE cookies (host_key TEXT, name TEXT, encrypted_value BLOB)" - ) - conn.execute( - "INSERT INTO cookies VALUES (?, ?, ?)", - (".slack.com", "d", encrypted), - ) - conn.commit() - 
conn.close() - - mock_bc3 = MagicMock() - mock_bc3.chrome.side_effect = ValueError("dbus") - with patch.dict(sys.modules, {"browser_cookie3": mock_bc3}): - assert st._read_xoxd_cookie(profile) == value - - -@patch.object(st, "_read_xoxd_cookie", return_value="xoxd") -@patch.object(st, "_read_local_config_v2") -@patch.object(st, "_resolve_chrome_profile_root") -def test_extract_slack_tokens_auto_success( - mock_resolve, mock_config, mock_cookie, sample_local_config, tmp_path, settings -): - profile = tmp_path / "profile" - profile.mkdir() - settings.CHROME_PROFILE_PATH = str(profile) - mock_resolve.return_value = profile - mock_config.return_value = sample_local_config - out = st.extract_slack_tokens_auto("T1") - assert out["xoxc"] == "xoxc-1" - assert out["xoxd"] == "xoxd" - - -@patch.object(st, "_resolve_chrome_profile_root") -def test_extract_slack_tokens_auto_missing_profile(mock_resolve, settings): - settings.CHROME_PROFILE_PATH = "/nonexistent/profile/path" - mock_resolve.return_value = Path("/nonexistent/profile/path") - assert st.extract_slack_tokens_auto("T1") is None - - -@patch.object(st, "_read_xoxd_cookie", return_value=None) -@patch.object(st, "_read_local_config_v2", return_value={"teams": {}}) -@patch.object(st, "_resolve_chrome_profile_root") -def test_extract_slack_tokens_auto_no_cookie( - mock_resolve, mock_config, mock_cookie, tmp_path, settings -): - profile = tmp_path / "profile" - profile.mkdir() - settings.CHROME_PROFILE_PATH = str(profile) - mock_resolve.return_value = profile - assert st.extract_slack_tokens_auto("T1") is None diff --git a/slack_event_handler/tests/test_slack_tokens_validate.py b/slack_event_handler/tests/test_slack_tokens_validate.py deleted file mode 100644 index 441b5d6c..00000000 --- a/slack_event_handler/tests/test_slack_tokens_validate.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Validation helpers for slack_tokens.""" - -import pytest - -from slack_event_handler.utils.slack_tokens import ( - _validate_chrome_profile_path, 
-) - - -def test_validate_chrome_profile_path_accepts_standard_path(): - path = _validate_chrome_profile_path(" /home/user/chrome-profile ") - assert path.startswith("/") - - -def test_validate_chrome_profile_path_rejects_null_byte(): - with pytest.raises(ValueError, match="null bytes"): - _validate_chrome_profile_path("/bad\x00path") - - -def test_validate_chrome_profile_path_empty(): - with pytest.raises(ValueError): - _validate_chrome_profile_path("") diff --git a/slack_event_handler/tests/test_state.py b/slack_event_handler/tests/test_state.py deleted file mode 100644 index 327a7655..00000000 --- a/slack_event_handler/tests/test_state.py +++ /dev/null @@ -1,143 +0,0 @@ -"""Tests for slack_event_handler.utils.state.""" - -import threading -from unittest.mock import patch - -import pytest - -from slack_event_handler.utils import state as state_mod - - -@pytest.fixture -def data_dir(tmp_path): - d = tmp_path / "data" - d.mkdir(parents=True) - return d - - -def test_load_state_missing_file_returns_default(data_dir): - with patch( - "slack_event_handler.workspace.get_workspace_root", - return_value=data_dir.parent, - ): - with patch.object( - state_mod, "_get_state_file_path", return_value=str(data_dir / "state.json") - ): - s = state_mod.load_state(None) - assert s == {"postedAt": [], "queue": []} - - -def test_load_state_corrupt_json_quarantines_and_returns_default(data_dir, monkeypatch): - bad = data_dir / "state.json" - bad.write_text("{not json", encoding="utf-8") - - with patch( - "slack_event_handler.workspace.get_workspace_root", - return_value=data_dir.parent, - ): - with patch.object(state_mod, "_get_state_file_path", return_value=str(bad)): - monkeypatch.setattr(state_mod.time, "time", lambda: 12345.0) - s = state_mod.load_state(None) - assert s == {"postedAt": [], "queue": []} - quarantined = list(data_dir.glob("state.json.corrupt.*")) - assert len(quarantined) == 1 - - -def test_save_state_roundtrip(data_dir): - path = data_dir / "state.json" - payload 
= {"postedAt": [1.0, 2.0], "queue": [{"jobId": "x"}]} - - with patch( - "slack_event_handler.workspace.get_workspace_root", - return_value=data_dir.parent, - ): - with patch.object(state_mod, "_get_state_file_path", return_value=str(path)): - state_mod.save_state(payload, None) - loaded = state_mod.load_state(None) - assert loaded["postedAt"] == [1.0, 2.0] - assert loaded["queue"][0]["jobId"] == "x" - - -def test_get_state_file_path_team_id_sanitized(tmp_path): - root = tmp_path / "slack_event_handler" - root.mkdir(parents=True) - with patch("slack_event_handler.workspace.get_workspace_root", return_value=root): - p = state_mod._get_state_file_path("T01234/whee") - norm = p.replace("\\", "/") - assert "state_T01234_whee.json" in norm - assert "/data/" in norm or norm.endswith("/data/state_T01234_whee.json") - - -def test_sanitize_team_id_empty_returns_default(): - assert state_mod._sanitize_team_id_for_path("") == "default" - - -def test_get_lock_file_path_appends_lock_suffix(data_dir): - state_path = str(data_dir / "state.json") - with patch.object(state_mod, "_get_state_file_path", return_value=state_path): - assert state_mod._get_lock_file_path(None) == f"{state_path}.lock" - - -def test_get_lock_file_path_team_id(data_dir): - state_path = str(data_dir / "state_T9.json") - with patch.object(state_mod, "_get_state_file_path", return_value=state_path): - assert state_mod._get_lock_file_path("T9") == f"{state_path}.lock" - - -def test_thread_lock_for_same_lock_file_path(tmp_path): - root = tmp_path / "slack_event_handler" - root.mkdir(parents=True) - with patch("slack_event_handler.workspace.get_workspace_root", return_value=root): - lock_a = state_mod._thread_lock_for("T/1") - lock_b = state_mod._thread_lock_for("T?1") - assert lock_a is lock_b - - -def test_state_file_lock_blocks_until_released(data_dir): - state_path = str(data_dir / "state.json") - lock_path = f"{state_path}.lock" - holder_ready = threading.Event() - holder_release = threading.Event() - 
second_acquired = threading.Event() - - def hold_lock(): - with patch.object(state_mod, "_get_lock_file_path", return_value=lock_path): - with state_mod.state_file_lock(None): - holder_ready.set() - holder_release.wait(timeout=5) - - def try_lock(): - holder_ready.wait(timeout=5) - with patch.object(state_mod, "_get_lock_file_path", return_value=lock_path): - with state_mod.state_file_lock(None): - second_acquired.set() - - holder = threading.Thread(target=hold_lock) - waiter = threading.Thread(target=try_lock) - holder.start() - waiter.start() - holder_ready.wait(timeout=5) - assert not second_acquired.is_set() - holder_release.set() - waiter.join(timeout=5) - holder.join(timeout=5) - assert second_acquired.is_set() - - -def test_load_state_corrupt_json_quarantine_oserror_fallback(data_dir, monkeypatch): - bad = data_dir / "state.json" - bad.write_text("{not json", encoding="utf-8") - - def boom_replace(*a, **k): - raise OSError("disk full") - - monkeypatch.setattr(state_mod.os, "replace", boom_replace) - - with patch( - "slack_event_handler.workspace.get_workspace_root", - return_value=data_dir.parent, - ): - with patch.object(state_mod, "_get_state_file_path", return_value=str(bad)): - monkeypatch.setattr(state_mod.time, "time", lambda: 999.0) - s = state_mod.load_state(None) - assert s == {"postedAt": [], "queue": []} diff --git a/slack_event_handler/tests/test_workspace_paths.py b/slack_event_handler/tests/test_workspace_paths.py deleted file mode 100644 index 3a4e9fc3..00000000 --- a/slack_event_handler/tests/test_workspace_paths.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Tests for slack_event_handler.workspace.""" - -from pathlib import Path -from unittest.mock import patch - -import pytest - -from slack_event_handler.workspace import ( - get_data_dir, - get_workspace_root, - set_working_directory, -) - - -@pytest.fixture -def mock_workspace_path(tmp_path): - root = tmp_path / "slack_event_handler" - root.mkdir(parents=True) - return root - - -def 
test_get_workspace_root_uses_config_workspace(): - with patch("slack_event_handler.workspace.get_workspace_path") as m: - m.return_value = Path("/tmp/slack_wh") - assert get_workspace_root() == Path("/tmp/slack_wh") - - -def test_get_data_dir_creates_data_subdirectory(mock_workspace_path): - with patch( - "slack_event_handler.workspace.get_workspace_root", - return_value=mock_workspace_path, - ): - d = get_data_dir() - assert d == mock_workspace_path / "data" - assert d.is_dir() - - -def test_set_working_directory_changes_cwd(mock_workspace_path): - import os - - with patch( - "slack_event_handler.workspace.get_workspace_root", - return_value=mock_workspace_path, - ): - old = os.getcwd() - try: - set_working_directory() - assert os.getcwd() == str(mock_workspace_path.resolve()) - finally: - os.chdir(old) diff --git a/slack_event_handler/utils/__init__.py b/slack_event_handler/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/slack_event_handler/utils/github_pr_client.py b/slack_event_handler/utils/github_pr_client.py deleted file mode 100644 index 67b3b221..00000000 --- a/slack_event_handler/utils/github_pr_client.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -GitHub PR comment client for the Slack PR bot. -Reads SLACK_PR_BOT_GITHUB_TOKEN and SLACK_PR_BOT_COMMENT_TEMPLATE from Django settings. 
-""" - -import logging -import time - -from django.conf import settings -from github import Github -from github.GithubException import GithubException - -logger = logging.getLogger(__name__) - -_gh: Github | None = None - -MAX_RETRIES = 3 -RETRY_BASE_DELAY_SEC = 2 # exponential backoff: base * 2^attempt - - -def _get_client() -> Github: - global _gh - if _gh is None: - token = (getattr(settings, "SLACK_PR_BOT_GITHUB_TOKEN", "") or "").strip() - if not token: - raise ValueError( - "Missing SLACK_PR_BOT_GITHUB_TOKEN in Django settings / .env" - ) - _gh = Github(token) - return _gh - - -def post_pr_comment(owner: str, repo: str, pull_number: int) -> None: - """ - Posts a comment to a GitHub PR using the configured template. - Raises on network errors, 404 (not found), 403 (no access), etc. - """ - template = ( - getattr(settings, "SLACK_PR_BOT_COMMENT_TEMPLATE", "") - or "Automated comment from Slack bot." - ) - gh = _get_client() - repository = gh.get_repo(f"{owner}/{repo}") - pull = repository.get_pull(pull_number) - - for attempt in range(MAX_RETRIES): - try: - pull.create_issue_comment(template) - logger.debug("Posted PR comment to %s/%s#%d", owner, repo, pull_number) - return - except GithubException as e: - if attempt < MAX_RETRIES - 1: - delay_sec = RETRY_BASE_DELAY_SEC * (2**attempt) - logger.warning( - "GitHub PR comment failed (attempt %d/%d): %s; retrying in %ds", - attempt + 1, - MAX_RETRIES, - e, - delay_sec, - ) - time.sleep(delay_sec) - else: - logger.error( - "GitHub PR comment failed after %d attempts: %s", - MAX_RETRIES, - e, - ) - raise diff --git a/slack_event_handler/utils/huddle_markdown.py b/slack_event_handler/utils/huddle_markdown.py deleted file mode 100644 index 23287f5a..00000000 --- a/slack_event_handler/utils/huddle_markdown.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -Huddle Markdown: orchestration for Slack huddle -> markdown. -Reads HTML/JSON, fetches Slack channel/user info, then uses operations.md_ops for transcript MD. 
-""" - -import json -import logging -import re - -from core.operations.md_ops import html_to_markdown -from core.operations.md_ops.transcript import ( - generate_transcript_from_json, - parse_html_summary, - replace_channel_ids_with_names, - replace_user_ids_with_usernames, - write_huddle_transcript_md, -) - -from core.operations.slack_ops import SlackFetcher - -logger = logging.getLogger(__name__) - - -def generate_huddle_markdown( - html_file_path, - result_json_path, - output_dir=".", - bot_token=None, -): - """Generate markdown file from huddle HTML and JSON transcript.""" - try: - with open(html_file_path, "r", encoding="utf-8") as f: - html_content = f.read() - except (OSError, UnicodeError) as e: - logger.error("Error reading HTML file: %s", e) - return None - try: - with open(result_json_path, "r", encoding="utf-8") as f: - result_json = json.load(f) - except (OSError, UnicodeError, json.JSONDecodeError) as e: - logger.error("Error reading JSON file: %s", e) - return None - - html_data = parse_html_summary(html_content) - try: - fetcher = SlackFetcher(bot_token) - except ValueError as e: - logger.error("Error: %s", e) - return None - - channel_name = fetcher.get_channel_info(html_data["channel_id"]) - user_info_map = {} - for user_id in html_data["attendee_ids"]: - user_info_map[user_id] = fetcher.get_user_info(user_id) - for user_id in re.findall(r"@(U[A-Z0-9]+)", html_content): - if user_id not in user_info_map: - user_info_map[user_id] = fetcher.get_user_info(user_id) - - transcript = generate_transcript_from_json(result_json) - for entry in transcript: - if entry.get("user_id") and entry["user_id"] not in user_info_map: - user_info_map[entry["user_id"]] = fetcher.get_user_info(entry["user_id"]) - - summary_markdown = html_to_markdown(html_content) - summary_markdown = replace_user_ids_with_usernames(summary_markdown, user_info_map) - summary_markdown = replace_channel_ids_with_names( - summary_markdown, html_data.get("channel_id"), channel_name - ) - 
summary_markdown = re.sub(r"^## ", "#### ", summary_markdown, flags=re.MULTILINE) - summary_markdown = re.sub(r"^# ", "### ", summary_markdown, flags=re.MULTILINE) - - result_path = write_huddle_transcript_md( - output_dir, - html_content=html_content, - result_json=result_json, - channel_name=channel_name, - user_info_map=user_info_map, - summary_markdown=summary_markdown, - ) - return str(result_path) if result_path else None diff --git a/slack_event_handler/utils/huddle_processor.py b/slack_event_handler/utils/huddle_processor.py deleted file mode 100644 index 6d7d006f..00000000 --- a/slack_event_handler/utils/huddle_processor.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Process a Slack huddle canvas: fetch transcript, download HTML, generate markdown, upload to GitHub. -""" - -import json -import logging -import os - -from django.conf import settings - -from core.operations.github_ops import upload_file -from core.operations.file_ops import sanitize_filename -from core.operations.slack_ops import SlackFetcher, fetch_huddle_transcript - -from slack_event_handler.workspace import get_data_dir - -from .huddle_markdown import generate_huddle_markdown - -logger = logging.getLogger(__name__) - - -def process_huddle_canvas(file_id): - """ - Fetch huddle by file_id, generate markdown, upload to GitHub. - Returns dict with "success" (bool) and optionally "github_url" (str). 
- """ - result = fetch_huddle_transcript(file_id) - if not result or not result.get("ok"): - logger.warning("Failed to fetch huddle transcript for file_id: %s", file_id) - return {"success": False} - - file_data = result.get("file", {}) - download_url = file_data.get("url_private_download") or file_data.get("url_private") - if not download_url: - logger.warning("No download URL for file_id: %s", file_id) - return {"success": False} - - data_dir = get_data_dir() - work_dir = data_dir / file_id - work_dir.mkdir(parents=True, exist_ok=True) - - result_json_path = work_dir / "result.json" - with open(result_json_path, "w", encoding="utf-8") as f: - json.dump(result, f, indent=2) - - try: - fetcher = SlackFetcher() - except ValueError as e: - logger.error("SlackFetcher init failed: %s", e) - return {"success": False} - - filename = sanitize_filename(file_data.get("name", "huddle_summary.html")) - html_path = fetcher.download_file(download_url, str(work_dir), filename) - if not html_path or not os.path.isfile(html_path): - logger.error("Failed to download huddle HTML for file_id: %s", file_id) - return {"success": False} - - md_path = generate_huddle_markdown(html_path, result_json_path, work_dir) - if not md_path: - logger.error("Failed to generate markdown for file_id: %s", file_id) - return {"success": False} - - md_basename = os.path.basename(md_path) - dest_path = f"slack_huddles/{md_basename}" - branch = getattr(settings, "GITHUB_DEFAULT_BRANCH", None) or "main" - owner = (getattr(settings, "GITHUB_SLACK_HUDDLE_REPO_OWNER", "") or "").strip() - repo = (getattr(settings, "GITHUB_SLACK_HUDDLE_REPO_NAME", "") or "").strip() - if not owner or not repo: - logger.error( - "Missing GITHUB_SLACK_HUDDLE_REPO_OWNER or GITHUB_SLACK_HUDDLE_REPO_NAME" - ) - return {"success": False} - upload_result = upload_file( - owner, - repo, - dest_path, - md_path, - commit_message=f"Add huddle transcript: {md_basename}", - branch=branch, - ) - if not upload_result: - logger.error("Failed 
to upload %s to GitHub", md_path) - return {"success": False} - - github_url = f"https://github.com/{owner}/{repo}/blob/{branch}/{dest_path}" - return {"success": True, "github_url": github_url} diff --git a/slack_event_handler/utils/job_queue.py b/slack_event_handler/utils/job_queue.py deleted file mode 100644 index a4095ecf..00000000 --- a/slack_event_handler/utils/job_queue.py +++ /dev/null @@ -1,259 +0,0 @@ -""" -FIFO job queue and background worker for the Slack PR comment bot. - -State (queue + rate-limit timestamps) is persisted per team so multiple workspaces -can run in one process. Config is read from Django settings via rate_limiter helpers. -""" - -import logging -import threading -import time -import uuid -from typing import Optional - -from django.conf import settings - -from slack_event_handler.utils.rate_limiter import ( - SLOT_BUFFER_SEC, - compute_delay, - compute_delay_at, - recent_timestamps_at, - wait_and_reserve_slot, -) -from slack_event_handler.utils.github_pr_client import post_pr_comment -from slack_event_handler.utils.state import load_state, modify_state - -logger = logging.getLogger(__name__) - -KEY_JOB_ID = "jobId" -KEY_TEAM_ID = "teamId" -KEY_OWNER = "owner" -KEY_REPO = "repo" -KEY_PULL_NUMBER = "pullNumber" -KEY_CHANNEL = "channel" -KEY_MESSAGE_TS = "messageTs" -KEY_USER_ID = "userId" -KEY_IS_DM = "isDm" -KEY_ENQUEUED_AT = "enqueuedAt" - - -class _JobQueueRuntime: - """In-process PR-bot runtime: per-team Bolt apps and worker-busy flags. - - ``_apps_lock`` and ``_busy_lock`` are independent; never acquire both - simultaneously. Neither nests inside ``modify_state`` / ``state_file_lock``. 
- """ - - def __init__(self) -> None: - self._slack_app_by_team: dict[str, object] = {} - self._apps_lock = threading.Lock() - self._worker_busy_by_team: dict[str, bool] = {} - self._busy_lock = threading.Lock() - - def set_app(self, app: object, team_id: str) -> None: - with self._apps_lock: - self._slack_app_by_team[team_id] = app - - def get_app(self, team_id: str | None) -> object | None: - if team_id is None: - return None - with self._apps_lock: - return self._slack_app_by_team.get(team_id) - - def set_busy(self, team_id: str | None, busy: bool) -> None: - with self._busy_lock: - self._worker_busy_by_team[team_id] = busy - - def is_busy(self, team_id: str | None) -> bool: - with self._busy_lock: - return self._worker_busy_by_team.get(team_id, False) - - def clear(self) -> None: - with self._apps_lock: - self._slack_app_by_team.clear() - with self._busy_lock: - self._worker_busy_by_team.clear() - - -_runtime = _JobQueueRuntime() - - -def set_slack_app(app, team_id: str) -> None: - """Register the Bolt app for this team so the worker can send replies.""" - _runtime.set_app(app, team_id) - - -def enqueue_job( - owner: str, - repo: str, - pull_number: int, - channel: str, - message_ts: str, - user_id: str, - is_dm: bool = False, - team_id: Optional[str] = None, -) -> dict: - """Adds a new job to the persistent FIFO queue for this team and returns it.""" - job = { - KEY_JOB_ID: str(uuid.uuid4()), - KEY_TEAM_ID: team_id, - KEY_OWNER: owner, - KEY_REPO: repo, - KEY_PULL_NUMBER: pull_number, - KEY_CHANNEL: channel, - KEY_MESSAGE_TS: message_ts, - KEY_USER_ID: user_id, - KEY_IS_DM: is_dm, - KEY_ENQUEUED_AT: time.time(), - } - with modify_state(team_id) as state: - state["queue"].append(job) - return job - - -def estimated_delay_sec(team_id: Optional[str] = None) -> int: - """ - Returns estimated seconds before the newest queued job for this team can post. 
- - Simulates each job already ahead in the queue consuming a rate-limit slot - at the earliest available time, then computes the delay for the new job. - """ - max_per_window = int(getattr(settings, "SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW", 5)) - window_sec = int(getattr(settings, "SLACK_PR_BOT_COMMENTS_WINDOW_SECONDS", 3600)) - - state = load_state(team_id) - posted_at = list(state["postedAt"]) - busy = _runtime.is_busy(team_id) - jobs_ahead = max(0, len(state["queue"]) - 1) + (1 if busy else 0) - now = time.time() - sim_time = now - - for _ in range(jobs_ahead): - recent = recent_timestamps_at(posted_at, sim_time, window_sec) - if len(recent) >= max_per_window: - oldest = min(recent) - sim_time += max(0.0, oldest + window_sec - sim_time + SLOT_BUFFER_SEC) - posted_at = recent_timestamps_at(posted_at, sim_time, window_sec) - posted_at.append(sim_time) - - delay_at_sim = compute_delay_at(posted_at, sim_time) - total_delay = max(0.0, sim_time - now + (delay_at_sim if delay_at_sim > 0 else 0)) - return int(total_delay + 0.999) if total_delay > 0.999 else 0 - - -def _send_reply( - team_id: Optional[str], - channel: str, - thread_ts: str, - is_dm: bool, - text: str, -) -> None: - """Posts a thread reply for channel messages or a plain DM for direct messages.""" - app = _runtime.get_app(team_id) - if app is None: - return - try: - kwargs = {"channel": channel, "text": text} - if not is_dm: - kwargs["thread_ts"] = thread_ts - app.client.chat_postMessage(**kwargs) - except Exception as e: - logger.warning("Failed to send reply (channel=%s): %s", channel, e) - - -def _job_label(job: dict) -> str: - is_dm = job.get(KEY_IS_DM, False) - source = "dm" if is_dm else "channel" - team = job.get(KEY_TEAM_ID, "") - return ( - f"[job:{job[KEY_JOB_ID]}][team:{team}][{source}] " - f"{job[KEY_OWNER]}/{job[KEY_REPO]}#{job[KEY_PULL_NUMBER]}" - ) - - -def _process_job(job: dict) -> None: - team_id = job.get(KEY_TEAM_ID) - _runtime.set_busy(team_id, True) - - is_dm = job.get(KEY_IS_DM, 
False) - label = _job_label(job) - owner = job[KEY_OWNER] - repo = job[KEY_REPO] - pull_number = job[KEY_PULL_NUMBER] - channel = job[KEY_CHANNEL] - message_ts = job[KEY_MESSAGE_TS] - - state = load_state(team_id) - delay = compute_delay(state["postedAt"]) - if delay > 0: - logger.debug("%s – rate limited, waiting %ds", label, int(delay + 0.999)) - - wait_and_reserve_slot(team_id) - _runtime.set_busy(team_id, False) - - logger.debug("%s – posting GitHub comment", label) - post_pr_comment(owner, repo, pull_number) - _send_reply( - team_id, - channel, - message_ts, - is_dm, - f"✅ Comment posted to `{owner}/{repo}#{pull_number}`.", - ) - logger.debug("%s – comment posted", label) - - app = _runtime.get_app(team_id) - if app is None: - return - try: - app.client.reactions_add( - channel=channel, timestamp=message_ts, name="white_check_mark" - ) - except Exception as e: - if "already_reacted" not in str(e): - raise - - -def _worker(team_id: Optional[str]) -> None: - """Long-running FIFO worker daemon thread for one team.""" - logger.debug("PR job queue worker started for team %s", team_id or "default") - while True: - if not load_state(team_id)["queue"]: - time.sleep(1) - continue - - with modify_state(team_id) as state: - if not state["queue"]: - job = None - else: - job, *remaining = state["queue"] - state["queue"] = remaining - - if job is None: - time.sleep(1) - continue - - _runtime.set_busy(team_id, True) - label = _job_label(job) - is_dm = job.get(KEY_IS_DM, False) - try: - _process_job(job) - except Exception as e: - logger.warning("%s – FAILED: %s", label, e) - _send_reply( - team_id, - job[KEY_CHANNEL], - job[KEY_MESSAGE_TS], - is_dm, - f"❌ Could not post comment to " - f"`{job[KEY_OWNER]}/{job[KEY_REPO]}#{job[KEY_PULL_NUMBER]}`: {e}", - ) - finally: - _runtime.set_busy(team_id, False) - - -def start_worker(team_id: Optional[str] = None) -> None: - """Starts the background PR job queue worker for this team in a daemon thread.""" - name = 
f"pr-job-queue-worker-{team_id or 'default'}" - t = threading.Thread(target=_worker, args=(team_id,), daemon=True, name=name) - t.start() diff --git a/slack_event_handler/utils/pr_parser.py b/slack_event_handler/utils/pr_parser.py deleted file mode 100644 index 2c33d49d..00000000 --- a/slack_event_handler/utils/pr_parser.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -GitHub PR URL parser for the Slack PR comment bot. -Extracts PR URLs from Slack message text and splits by allowed org. -""" - -import re -from typing import Optional - -PR_URL_PATTERN = re.compile( - r"https://github\.com/([^/\s]+)/([^/\s]+)/pull/(\d+)", re.IGNORECASE -) - - -def extract_pr_urls( - text: str, allowed_org: Optional[str] = None -) -> tuple[list[dict], list[dict]]: - """ - Extract all GitHub PR URLs from a Slack message and split by allowed org. - - Args: - text: Raw message text (may contain one or more GitHub PR URLs). - allowed_org: If set, PRs under this owner are returned as "valid"; - PRs under other owners are returned as "invalid_org". If empty/None, - all found PRs are valid and invalid_org is empty. - - Returns: - (valid, invalid_org): Each is a list of dicts with keys url, owner, repo, pull_number. - valid = PRs that are under allowed_org (or all if allowed_org not set). - invalid_org = PRs that are under a different org (only when allowed_org is set). 
- """ - allowed = (allowed_org or "").strip().lower() - all_entries: list[dict] = [] - for match in PR_URL_PATTERN.finditer(text): - url = match.group(0) - owner = match.group(1) - repo = match.group(2) - pull_number = int(match.group(3)) - all_entries.append( - {"url": url, "owner": owner, "repo": repo, "pull_number": pull_number} - ) - - if not allowed: - return (all_entries, []) - - valid = [e for e in all_entries if (e["owner"] or "").strip().lower() == allowed] - invalid_org = [ - e for e in all_entries if (e["owner"] or "").strip().lower() != allowed - ] - return (valid, invalid_org) diff --git a/slack_event_handler/utils/rate_limiter.py b/slack_event_handler/utils/rate_limiter.py deleted file mode 100644 index c39c79ac..00000000 --- a/slack_event_handler/utils/rate_limiter.py +++ /dev/null @@ -1,92 +0,0 @@ -""" -Rolling-window rate limiter for the Slack PR comment bot. -Rate limit config is read from Django settings: - - SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW (default 5) - - SLACK_PR_BOT_COMMENTS_WINDOW_SECONDS (default 3600) -""" - -import time -from typing import Optional - -from django.conf import settings - -from slack_event_handler.utils.state import load_state, modify_state - -SLOT_BUFFER_SEC = 0.05 - - -def _max_per_window() -> int: - return int(getattr(settings, "SLACK_PR_BOT_COMMENTS_MAX_PER_WINDOW", 5)) - - -def _window_seconds() -> int: - return int(getattr(settings, "SLACK_PR_BOT_COMMENTS_WINDOW_SECONDS", 3600)) - - -def recent_timestamps_at( - posted_at: list[float], now: float, window_seconds: int | None = None -) -> list[float]: - """Returns timestamps still inside the rolling window as of time now.""" - window = window_seconds if window_seconds is not None else _window_seconds() - cutoff = now - window - return [ts for ts in posted_at if ts > cutoff] - - -def compute_delay_at(posted_at: list[float], now: float) -> float: - """ - Returns seconds to wait from now before the next slot opens, - or 0.0 if a slot is available right now. 
- """ - recent = recent_timestamps_at(posted_at, now) - if len(recent) < _max_per_window(): - return 0.0 - oldest = min(recent) - return max(0.0, oldest + _window_seconds() - now + SLOT_BUFFER_SEC) - - -def compute_delay(posted_at: list[float]) -> float: - """ - Returns seconds to wait before the next slot opens, - or 0.0 if a slot is available right now. - """ - return compute_delay_at(posted_at, time.time()) - - -def try_reserve_slot(team_id: Optional[str] = None) -> bool: - """ - Atomically check availability and reserve a slot timestamp for this team. - - Returns True if a slot was reserved, False if the rolling window is full. - """ - now = time.time() - with modify_state(team_id) as state: - recent = recent_timestamps_at(state["postedAt"], now) - if len(recent) >= _max_per_window(): - return False - state["postedAt"] = recent + [now] - return True - - -def wait_and_reserve_slot(team_id: Optional[str] = None) -> None: - """Blocks until a rate-limit slot is atomically reserved for this team.""" - while not try_reserve_slot(team_id): - delay = compute_delay(load_state(team_id)["postedAt"]) - if delay > 0: - time.sleep(delay) - - -def wait_for_slot(team_id: Optional[str] = None) -> None: - """Blocks until a slot appears available (does not reserve). 
Prefer wait_and_reserve_slot.""" - while True: - state = load_state(team_id) - delay = compute_delay(state["postedAt"]) - if delay == 0: - break - time.sleep(delay) - - -def record_posted(team_id: Optional[str] = None) -> None: - """Appends a post timestamp without checking the cap (legacy / test helper).""" - with modify_state(team_id) as state: - recent = recent_timestamps_at(state["postedAt"], time.time()) - state["postedAt"] = recent + [time.time()] diff --git a/slack_event_handler/utils/slack_internal_tokens_store.py b/slack_event_handler/utils/slack_internal_tokens_store.py deleted file mode 100644 index 43329e5f..00000000 --- a/slack_event_handler/utils/slack_internal_tokens_store.py +++ /dev/null @@ -1,234 +0,0 @@ -"""Persist Slack session credentials as JSON under workspace/slack_event_handler/.""" - -from __future__ import annotations - -import json -import logging -import os -from datetime import datetime, timezone -from pathlib import Path -from typing import Any - -from django.conf import settings - -from slack_event_handler.workspace import get_slack_internal_tokens_json_path - -logger = logging.getLogger(__name__) - -SLACK_TOKENS_RELOGIN_HINT = "Session credentials invalid or unavailable. Check workspace configuration per .env.example." 
- - -def slack_internal_tokens_json_path() -> Path: - """Resolved path to the tokens JSON file.""" - override = (getattr(settings, "SLACK_INTERNAL_TOKENS_JSON", "") or "").strip() - if override: - path = Path(override).expanduser() - if not path.is_absolute(): - path = Path.cwd() / path - return path.resolve() - return get_slack_internal_tokens_json_path().resolve() - - -def _read_document(path: Path) -> dict[str, Any]: - if not path.is_file(): - return {"teams": {}} - raw = path.read_text(encoding="utf-8") - if not raw.strip(): - return {"teams": {}} - data = json.loads(raw) - if not isinstance(data, dict): - raise ValueError(f"Invalid tokens file (expected object): {path}") - teams = data.get("teams") - if teams is None: - data["teams"] = {} - elif not isinstance(teams, dict): - raise ValueError(f"Invalid tokens file (teams must be object): {path}") - return data - - -def _write_document(path: Path, data: dict[str, Any]) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - tmp = path.with_suffix(".json.tmp") - payload = json.dumps(data, indent=2, sort_keys=True) - tmp.write_text(payload + "\n", encoding="utf-8") - os.replace(tmp, path) - try: - os.chmod(path, 0o600) - except OSError: - pass - logger.debug("Saved Slack internal tokens to %s", path) - - -def save_slack_internal_tokens( - team_id: str, - xoxc: str, - xoxd: str, - *, - team_name: str | None = None, - user_id: str | None = None, -) -> Path: - """Write session credentials for team_id into workspace JSON. 
Returns path written.""" - team_id = (team_id or "").strip() - xoxc = (xoxc or "").strip() - xoxd = (xoxd or "").strip() - if not team_id or not xoxc or not xoxd: - raise ValueError("team_id, xoxc, and xoxd are required") - - path = slack_internal_tokens_json_path() - doc = _read_document(path) - entry: dict[str, Any] = { - "xoxc": xoxc, - "xoxd": xoxd, - "updated_at": datetime.now(timezone.utc).isoformat(), - } - if team_name: - entry["team_name"] = team_name - if user_id: - entry["user_id"] = user_id - doc["teams"][team_id] = entry - _write_document(path, doc) - return path - - -def load_slack_internal_tokens(team_id: str) -> dict[str, str] | None: - """Load token record for team_id, or None if missing.""" - team_id = (team_id or "").strip() - if not team_id: - return None - path = slack_internal_tokens_json_path() - try: - doc = _read_document(path) - except (OSError, json.JSONDecodeError, ValueError) as e: - logger.warning("Could not read Slack internal tokens from %s: %s", path, e) - return None - entry = doc.get("teams", {}).get(team_id) - if not isinstance(entry, dict): - return None - xoxc = (entry.get("xoxc") or "").strip() - xoxd = (entry.get("xoxd") or "").strip() - if not xoxc or not xoxd: - return None - out = {"xoxc": xoxc, "xoxd": xoxd, "team_id": team_id} - if entry.get("team_name"): - out["team_name"] = str(entry["team_name"]) - if entry.get("user_id"): - out["user_id"] = str(entry["user_id"]) - return out - - -def extract_and_save_slack_internal_tokens(team_id: str) -> tuple[str, str] | None: - """Load session credentials from workspace storage and persist to workspace JSON.""" - from slack_event_handler.utils.slack_tokens import extract_slack_tokens_auto - - tokens = extract_slack_tokens_auto(team_id) - if not tokens or "xoxc" not in tokens or "xoxd" not in tokens: - return None - save_slack_internal_tokens( - team_id, - tokens["xoxc"], - tokens["xoxd"], - team_name=tokens.get("team_name"), - user_id=tokens.get("user_id"), - ) - return 
tokens["xoxc"], tokens["xoxd"] - - -def get_slack_internal_token_pair(team_id: str | None = None) -> tuple[str, str] | None: - """Return session credential pair from workspace JSON when internal mode is enabled.""" - allow = getattr(settings, "ALLOW_INTERNAL_SLACK_TOKENS", False) - if isinstance(allow, str): - allow = allow.strip().lower() == "true" - if not allow: - return None - - tid = (team_id or "").strip() - if not tid: - from core.operations.slack_ops.tokens import get_default_team_key - - tid = (get_default_team_key() or "").strip() - if not tid: - return None - - record = load_slack_internal_tokens(tid) - if not record: - return None - return record["xoxc"], record["xoxd"] - - -def _resolve_team_id(team_id: str | None = None) -> str: - tid = (team_id or "").strip() - if not tid: - from core.operations.slack_ops.tokens import get_default_team_key - - tid = (get_default_team_key() or "").strip() - return tid - - -def log_slack_internal_tokens_still_invalid(team_id: str) -> None: - """Log when session credentials remain invalid after refresh.""" - logger.error( - "Slack session credentials still invalid for team %s. %s", - team_id, - SLACK_TOKENS_RELOGIN_HINT, - ) - - -def log_slack_internal_tokens_extract_failed(team_id: str) -> None: - """Log when session credentials could not be loaded from workspace storage.""" - logger.error( - "Failed to load Slack session credentials for team %s. 
%s", - team_id, - SLACK_TOKENS_RELOGIN_HINT, - ) - - -def _extract_validate_and_return(team_id: str) -> tuple[str, str] | None: - """Refresh credentials from workspace storage; return pair only if auth probe passes.""" - from slack_event_handler.utils.slack_tokens import probe_slack_internal_tokens - - pair = extract_and_save_slack_internal_tokens(team_id) - if not pair: - log_slack_internal_tokens_extract_failed(team_id) - return None - if probe_slack_internal_tokens(pair[0], pair[1]): - return pair - log_slack_internal_tokens_still_invalid(team_id) - return None - - -def get_or_load_slack_internal_token_pair( - team_id: str | None = None, -) -> tuple[str, str] | None: - """ - Return session credential pair from workspace JSON. - - Refreshes from workspace storage when JSON is missing or credentials fail auth probe. - Returns None if credentials remain invalid. - """ - from slack_event_handler.utils.slack_tokens import probe_slack_internal_tokens - - tid = _resolve_team_id(team_id) - if not tid: - return None - - allow = getattr(settings, "ALLOW_INTERNAL_SLACK_TOKENS", False) - if isinstance(allow, str): - allow = allow.strip().lower() == "true" - if not allow: - return None - - pair = get_slack_internal_token_pair(tid) - if pair: - if probe_slack_internal_tokens(pair[0], pair[1]): - return pair - logger.info( - "Slack session credentials in JSON are stale for team %s; refreshing", - tid, - ) - return _extract_validate_and_return(tid) - - logger.info( - "Slack session credentials not in JSON; loading for team %s", - tid, - ) - return _extract_validate_and_return(tid) diff --git a/slack_event_handler/utils/slack_listener.py b/slack_event_handler/utils/slack_listener.py deleted file mode 100644 index f59d314a..00000000 --- a/slack_event_handler/utils/slack_listener.py +++ /dev/null @@ -1,432 +0,0 @@ -""" -Unified Slack Event Listener for slack_event_handler. - -Handles two event streams in a single Socket Mode connection: - 1. 
Huddle AI note events → process_huddle_canvas() (slack_event_handler) - 2. GitHub PR URL messages on the configured channel / DMs → PR comment job queue -""" - -import json -import logging -import os -import re -import threading -import time -from collections import OrderedDict -from datetime import datetime - -from django.conf import settings -from slack_bolt import App -from slack_bolt.adapter.socket_mode import SocketModeHandler - -from core.operations.slack_ops import ( - get_slack_app_token, -) - -from slack_event_handler.utils.job_queue import ( - enqueue_job, - estimated_delay_sec, - set_slack_app, - start_worker, -) -from slack_event_handler.utils.pr_parser import extract_pr_urls -from slack_event_handler.workspace import get_data_dir - -MAX_PROCESSED_FILE_IDS = 1000 - -# Team scope: which features are enabled per team (see SLACK_TEAM_SCOPE_ in settings). -SCOPE_HUDDLE = 0 -SCOPE_PR_BOT = 1 - -logger = logging.getLogger(__name__) - - -def _data_dir() -> str: - """Return workspace data dir path (avoids CWD dependency when run from runserver).""" - return str(get_data_dir()) - - -def save_event_to_file(event_type: str, body: dict) -> str | None: - """Save raw event body to a JSON file in the data folder (for debugging).""" - try: - data_dir = _data_dir() - os.makedirs(data_dir, exist_ok=True) - event = body.get("event", {}) - ts = event.get("ts") or event.get("event_ts") or str(datetime.now().timestamp()) - ts_clean = ts.replace(".", "_") - filepath = os.path.join(data_dir, f"{event_type}_{ts_clean}.json") - with open(filepath, "w", encoding="utf-8") as f: - json.dump(body, f, indent=2, default=str, ensure_ascii=False) - logger.debug("Saved event to: %s", filepath) - return filepath - except Exception as e: - logger.error("Error saving event to file: %s", e) - return None - - -class SlackListener: - """Unified Slack Event Listener using Slack Bolt (Socket Mode).""" - - def __init__( - self, - bot_token: str | None = None, - app_token: str | None = None, - 
team_id: str | None = None, - ): - self._team_id = team_id - token = (bot_token or "").strip() - if token: - self.bot_token = token - else: - raise ValueError( - "Missing bot_token. Pass bot_token or set SLACK_TEAM_IDS and " - "SLACK_BOT_TOKEN_ in .env." - ) - - app_token = (app_token or "").strip() - self.app_token = app_token or get_slack_app_token(self._team_id) - - if not self.bot_token: - raise ValueError("Missing SLACK_BOT_TOKEN. Set it in .env file.") - if not self.app_token: - raise ValueError( - "Missing SLACK_APP_TOKEN_. Set SLACK_TEAM_IDS and SLACK_APP_TOKEN_ in .env." - ) - - self.app = App(token=self.bot_token) - - # Per-team scope: 0 = huddle, 1 = PR bot (from SLACK_TEAM_SCOPE_). Default both. - _scope_map = getattr(settings, "SLACK_TEAM_SCOPE", None) or {} - self._team_scope = _scope_map.get(self._team_id, [SCOPE_HUDDLE, SCOPE_PR_BOT]) - - # Huddle dedup cache (LRU, capped at MAX_PROCESSED_FILE_IDS) - self._processed_file_ids: OrderedDict = OrderedDict() - self._processed_file_ids_lock = threading.Lock() - - # PR bot: resolve configured channel ID (None disables PR handling) - self._pr_channel_id: str | None = self._resolve_pr_channel() - - # Wire the PR job queue to this Bolt app and start the worker for this team. 
- set_slack_app(self.app, self._team_id) - start_worker(self._team_id) - - self._register_handlers() - logger.debug( - "SlackListener initialised team_id=%s scope=%s (PR channel: %s)", - self._team_id or "default", - self._team_scope, - self._pr_channel_id or "disabled", - ) - - # ------------------------------------------------------------------ - # PR bot helpers - # ------------------------------------------------------------------ - - def _resolve_pr_channel(self) -> str | None: - """Resolve the configured PR bot channel name to its Slack channel ID.""" - channel_name: str = ( - getattr(settings, "SLACK_PR_BOT_CHANNEL_NAME", "") or "" - ).strip() - if not channel_name: - return None - clean = channel_name.lstrip("#") - try: - cursor = None - while True: - kwargs = {"types": "public_channel,private_channel", "limit": 200} - if cursor: - kwargs["cursor"] = cursor - res = self.app.client.conversations_list(**kwargs) - for ch in res.get("channels", []): - if ch.get("name") == clean: - logger.debug( - "PR bot channel resolved: #%s → %s", clean, ch["id"] - ) - return ch["id"] - cursor = res.get("response_metadata", {}).get("next_cursor") - if not cursor: - break - logger.warning( - "PR bot channel '#%s' not found via conversations.list. 
" - "PR handling will be disabled for channel messages (DMs still work).", - clean, - ) - except Exception as e: - logger.error("Error resolving PR bot channel '%s': %s", clean, e) - return None - - def _send_user_reply( - self, channel: str, message_ts: str, is_dm: bool, text: str - ) -> None: - """Posts a thread reply for channel messages or a plain message for DMs.""" - try: - kwargs = {"channel": channel, "text": text} - if not is_dm: - kwargs["thread_ts"] = message_ts - self.app.client.chat_postMessage(**kwargs) - except Exception as e: - logger.warning("Error sending reply to %s: %s", channel, e) - - def _handle_pr_request( - self, - text: str, - channel: str, - message_ts: str, - user_id: str, - is_dm: bool, - ) -> None: - """Full PR comment request pipeline: parse → deduplicate → enqueue → ack.""" - allowed_org: str = (getattr(settings, "SLACK_PR_BOT_TEAM", "") or "").strip() - valid, invalid_org = extract_pr_urls(text, allowed_org=allowed_org) - - org_hint = ( - f"only PRs under the `{allowed_org}` org are supported." - if allowed_org - else "Set SLACK_PR_BOT_TEAM in .env to the GitHub org name (e.g. your-org) to enable PR comments." - ) - for entry in invalid_org: - self._send_user_reply( - channel, - message_ts, - is_dm, - f"⚠️ Ignored `{entry['url']}` — {org_hint}", - ) - - if not valid and not invalid_org: - example_org = allowed_org or "owner" - self._send_user_reply( - channel, - message_ts, - is_dm, - f"No GitHub PR URL found. 
Paste a link like " - f"`https://github.com/{example_org}/repo/pull/123` to post a comment.", - ) - return - - seen: set[str] = set() - for entry in valid: - key = f"{entry['owner']}/{entry['repo']}#{entry['pull_number']}" - if key in seen: - continue - seen.add(key) - - enqueue_job( - owner=entry["owner"], - repo=entry["repo"], - pull_number=entry["pull_number"], - channel=channel, - message_ts=message_ts, - user_id=user_id, - is_dm=is_dm, - team_id=self._team_id, - ) - delay_sec = estimated_delay_sec(self._team_id) - pr_ref = f"`{entry['owner']}/{entry['repo']}#{entry['pull_number']}`" - ack = ( - f"✅ Request received for {pr_ref}. " - f"Rate limit reached — estimated delay: {delay_sec}s." - if delay_sec > 0 - else f"✅ Request received for {pr_ref}." - ) - self._send_user_reply(channel, message_ts, is_dm, ack) - - # ------------------------------------------------------------------ - # Huddle helpers (slack_event_handler) - # ------------------------------------------------------------------ - - def _extract_file_id_from_url(self, url: str) -> str | None: - """Extract Slack file ID (pattern: F + 10+ uppercase alphanumerics) from a URL.""" - try: - match = re.search(r"/(F[A-Z0-9]{10,})$", url) - return match.group(1) if match else None - except Exception as e: - logger.warning("Error extracting file ID from URL %s: %s", url, e) - return None - - def _is_huddle_ai_note_event(self, event: dict) -> bool: - """Return True if this Slack event is a huddle AI note summary.""" - try: - ai_context = event.get("ai_context", {}) - if ai_context.get("type") == "summary": - summary = ai_context.get("summary", {}) - return summary.get("type") == "huddle" - except Exception: - logger.exception( - "Malformed event in _is_huddle_ai_note_event (event keys=%s)", - list(event.keys()) if isinstance(event, dict) else type(event).__name__, - ) - return False - - def _extract_file_id_from_event(self, event: dict) -> str | None: - """Extract the huddle canvas file ID from a huddle AI note 
event's message blocks.""" - try: - for block in event.get("blocks", []): - for element in block.get("elements", []): - if element.get("type") == "rich_text_section": - for sub in element.get("elements", []): - if sub.get("type") == "link": - if sub.get("text", "").strip().lower() in ( - "view ai notes", - "view ai note", - ): - file_id = self._extract_file_id_from_url( - sub.get("url", "") - ) - if file_id: - return file_id - except Exception as e: - logger.warning("Error extracting file ID from huddle event: %s", e) - return None - - def _mark_file_processed(self, file_id: str) -> bool: - """Atomically mark file_id as seen; returns True if newly added (False if duplicate).""" - with self._processed_file_ids_lock: - if file_id in self._processed_file_ids: - return False - while len(self._processed_file_ids) >= MAX_PROCESSED_FILE_IDS: - self._processed_file_ids.popitem(last=False) - self._processed_file_ids[file_id] = None - return True - - def _unmark_file_processed(self, file_id: str) -> None: - """Remove file_id from the dedup cache (e.g. 
after a processing failure).""" - with self._processed_file_ids_lock: - self._processed_file_ids.pop(file_id, None) - - # ------------------------------------------------------------------ - # Event handler registration - # ------------------------------------------------------------------ - - def _register_handlers(self) -> None: - @self.app.event("message") - def handle_message_events(event, body): - subtype = event.get("subtype") - if subtype in ("message_changed", "message_deleted"): - return - - # -- Huddle AI note path (only if scope includes 0) -- - if SCOPE_HUDDLE in self._team_scope and self._is_huddle_ai_note_event( - event - ): - logger.debug("Huddle AI note event detected") - save_event_to_file("huddle_ai_note", body) - file_id = self._extract_file_id_from_event(event) - if not file_id: - logger.warning( - "Could not extract file ID from huddle AI note event" - ) - return - if not self._mark_file_processed(file_id): - logger.debug("File %s already processed, skipping", file_id) - return - - def _process_later(fid): - time.sleep(30) - logger.debug("Processing huddle canvas for file_id: %s", fid) - try: - from slack_event_handler.utils.huddle_processor import ( - process_huddle_canvas, - ) - - result = process_huddle_canvas(fid) - if result and result.get("success"): - logger.debug("Processed huddle canvas %s", fid) - if result.get("github_url"): - logger.debug("GitHub URL: %s", result["github_url"]) - else: - logger.error("Failed to process huddle canvas: %s", fid) - self._unmark_file_processed(fid) - except Exception as e: - logger.exception( - "Error processing huddle canvas %s: %s", fid, e - ) - self._unmark_file_processed(fid) - - threading.Thread( - target=_process_later, args=(file_id,), daemon=True - ).start() - logger.debug( - "Huddle AI note for file_id %s queued for processing in 30s", - file_id, - ) - return - - # -- PR bot path (only if scope includes 1) -- - if SCOPE_PR_BOT not in self._team_scope: - logger.debug( - "Unhandled regular 
message event (PR bot disabled for this team)" - ) - return - - channel_type = event.get("channel_type") - is_dm = channel_type == "im" - - if is_dm or ( - self._pr_channel_id and event.get("channel") == self._pr_channel_id - ): - source = "dm" if is_dm else "channel" - logger.debug( - "PR bot message [%s] user=%s channel=%s", - source, - event.get("user", "?"), - event.get("channel"), - ) - self._handle_pr_request( - text=event.get("text") or "", - channel=event["channel"], - message_ts=event["ts"], - user_id=event.get("user", ""), - is_dm=is_dm, - ) - return - - logger.debug("Unhandled regular message event (not huddle, not PR channel)") - - @self.app.event("file_shared") - def handle_file_shared(event, body): - logger.debug("File shared event received") - - @self.app.event("reaction_added") - def handle_reaction_added(event, body): - logger.debug("Reaction added event received") - - @self.app.event("app_mention") - def handle_app_mention(event, body): - logger.debug("App mention event received") - - @self.app.event({"type": "event_callback"}) - def handle_all_events(event, body): - event_type = body.get("event", {}).get("type", "unknown") - logger.debug("Received event: %s", event_type) - - # ------------------------------------------------------------------ - # Start - # ------------------------------------------------------------------ - - def start(self) -> None: - """Start listening for events using Socket Mode (blocks forever).""" - data_dir = _data_dir() - os.makedirs(data_dir, exist_ok=True) - logger.debug( - "Starting unified Slack Event Handler (Socket Mode), data dir: %s", - os.path.abspath(data_dir), - ) - handler = SocketModeHandler(self.app, self.app_token) - handler.start() - - -def start_slack_listener( - bot_token: str | None = None, - app_token: str | None = None, - team_id: str | None = None, -) -> None: - """ - Start the unified Slack event listener for one workspace. - For multiple workspaces, call this once per team from separate threads. 
- """ - listener = SlackListener(bot_token, app_token, team_id) - listener.start() - - -if __name__ == "__main__": - start_slack_listener() diff --git a/slack_event_handler/utils/slack_tokens.py b/slack_event_handler/utils/slack_tokens.py deleted file mode 100644 index 75a50a98..00000000 --- a/slack_event_handler/utils/slack_tokens.py +++ /dev/null @@ -1,364 +0,0 @@ -"""Slack session credential helpers for huddle transcript flows.""" - -import json -import logging -import re -import shutil -import tempfile -from pathlib import Path - -import requests -from django.conf import settings - -logger = logging.getLogger(__name__) - -# Slack files.info errors that indicate stale session credentials (not missing file). -SLACK_INTERNAL_TOKEN_AUTH_ERRORS = frozenset( - { - "invalid_auth", - "not_authed", - "token_expired", - "token_revoked", - "invalid_cookie", - "account_inactive", - } -) - -# Dummy file id for auth probe; file_not_found means auth succeeded. -SLACK_TOKEN_PROBE_FILE_ID = "F00000000000" - -# Chromium localStorage key for Slack app (https://app.slack.com) -LOCAL_CONFIG_V2_KEY = b"_https://app.slack.com\x00\x01localConfig_v2" -LOCAL_CONFIG_V2_MARKER = b"localConfig_v2" - -# Session storage path: validate normalized POSIX form (Windows drive letters via ":"). -CHROME_PROFILE_PATH_PATTERN = re.compile(r"^[a-zA-Z0-9/_. \-:]+$") - - -def _validate_chrome_profile_path(path: str) -> str: - """Validate CHROME_PROFILE_PATH format. 
Raises ValueError if invalid.""" - if not path or not isinstance(path, str): - raise ValueError("CHROME_PROFILE_PATH must be a non-empty string") - path = path.strip() - if "\x00" in path: - raise ValueError("CHROME_PROFILE_PATH must not contain null bytes") - normalized = Path(path).as_posix() - if not CHROME_PROFILE_PATH_PATTERN.match(normalized): - raise ValueError( - "CHROME_PROFILE_PATH must contain only path characters (letters, digits, /, _, ., -, space, :), got: %s" - % (path[:100],) - ) - return path - - -def _resolve_chrome_profile_root() -> Path: - """Return validated session storage directory for Slack credentials.""" - from slack_event_handler.workspace import get_chrome_profile_path - - raw = (getattr(settings, "CHROME_PROFILE_PATH", "") or "").strip() - if not raw: - return get_chrome_profile_path() - validated = _validate_chrome_profile_path(raw) - root = Path(validated).expanduser() - if not root.is_absolute(): - root = Path.cwd() / root - return root.resolve() - - -def _leveldb_path(profile_root: Path) -> Path: - return profile_root / "Default" / "Local Storage" / "leveldb" - - -def _cookies_path(profile_root: Path) -> Path: - return profile_root / "Default" / "Cookies" - - -def _parse_local_config_raw(raw: bytes) -> dict: - """Parse localConfig_v2 payload (strip optional prefix byte).""" - if not raw: - raise ValueError("localConfig_v2 is empty") - if raw[0:1] in (b"\x00", b"\x01"): - text = raw[1:].decode("utf-8", errors="replace") - else: - text = raw.decode("utf-8", errors="replace") - return json.loads(text) - - -def _read_leveldb_value(leveldb_dir: Path, key: bytes) -> bytes | None: - """Read a single key from local storage; copy to temp dir if locked.""" - try: - import plyvel - except ImportError: - logger.warning( - "plyvel is not installed; cannot read session storage at %s. 
" - "See .env.example for supported environments.", - leveldb_dir, - ) - return None - - try: - db = plyvel.DB(str(leveldb_dir), create_if_missing=False) - try: - value = db.get(key) - if value is not None: - return value - for db_key, db_value in db.iterator(): - if LOCAL_CONFIG_V2_MARKER in db_key: - return db_value - return None - finally: - db.close() - except plyvel.Error as e: - err = str(e).lower() - if "lock" not in err and "resource temporarily unavailable" not in err: - raise - logger.debug("LevelDB locked at %s, copying to temp dir", leveldb_dir) - with tempfile.TemporaryDirectory(prefix="leveldb-") as tmp: - shutil.copytree(leveldb_dir, Path(tmp) / "leveldb", dirs_exist_ok=True) - db = plyvel.DB(str(Path(tmp) / "leveldb"), create_if_missing=False) - try: - value = db.get(key) - if value is not None: - return value - for db_key, db_value in db.iterator(): - if LOCAL_CONFIG_V2_MARKER in db_key: - return db_value - return None - finally: - db.close() - - -def _read_local_config_v2(profile_root: Path) -> dict | None: - """Load and parse localConfig_v2 from session storage.""" - leveldb_dir = _leveldb_path(profile_root) - if not leveldb_dir.is_dir(): - logger.warning("LevelDB not found at %s", leveldb_dir) - return None - try: - raw = _read_leveldb_value(leveldb_dir, LOCAL_CONFIG_V2_KEY) - if not raw: - logger.warning("localConfig_v2 not found in %s", leveldb_dir) - return None - return _parse_local_config_raw(raw) - except json.JSONDecodeError as e: - logger.warning("Error parsing localConfig_v2 JSON: %s", e) - return None - except Exception as e: - logger.warning("Error reading localConfig_v2: %s", e) - return None - - -def _chrome_linux_v10_cookie_key() -> bytes: - """AES key for Chromium v10 encrypted session values on Linux.""" - from Cryptodome.Protocol.KDF import PBKDF2 - - return PBKDF2(b"peanuts", b"saltysalt", dkLen=16, count=1) - - -def _decrypt_chrome_linux_v10_cookie(encrypted_value: bytes) -> str: - """Decrypt Chromium v10 encrypted session 
blobs (AES-128-CBC).""" - if not encrypted_value.startswith(b"v10"): - raise ValueError("unsupported Chrome cookie encryption (expected v10 prefix)") - from Cryptodome.Cipher import AES - - cipher = AES.new(_chrome_linux_v10_cookie_key(), AES.MODE_CBC, iv=b" " * 16) - plain = cipher.decrypt(encrypted_value[3:]) - pad = plain[-1] - if pad < 1 or pad > 16 or plain[-pad:] != bytes([pad]) * pad: - raise ValueError("invalid Chrome cookie padding") - return plain[32:-pad].decode("utf-8") - - -def _read_xoxd_cookie_from_sqlite(cookies_file: Path) -> str | None: - """Read session value from SQLite storage with Linux v10 decryption.""" - import sqlite3 - - conn = sqlite3.connect(f"file:{cookies_file}?mode=ro", uri=True) - try: - rows = conn.execute( - """ - SELECT encrypted_value FROM cookies - WHERE name = 'd' AND (host_key LIKE '%slack.com' OR host_key = '.slack.com') - ORDER BY length(encrypted_value) DESC - """ - ).fetchall() - finally: - conn.close() - for (encrypted_value,) in rows: - if not encrypted_value: - continue - try: - value = _decrypt_chrome_linux_v10_cookie(encrypted_value) - if value: - return value - except Exception as e: - logger.debug("Could not decrypt cookie row: %s", e) - return None - - -def _read_xoxd_cookie(profile_root: Path) -> str | None: - """Read secondary session credential from configured storage.""" - cookies_file = _cookies_path(profile_root) - if not cookies_file.is_file(): - logger.warning("Session storage database not found at %s", cookies_file) - return None - try: - import browser_cookie3 - - jar = browser_cookie3.chrome( - cookie_file=str(cookies_file), - domain_name="slack.com", - ) - for cookie in jar: - if cookie.name == "d" and cookie.value: - return cookie.value - except Exception as e: - logger.debug("browser_cookie3 could not read cookie 'd': %s", e) - - try: - value = _read_xoxd_cookie_from_sqlite(cookies_file) - if value: - return value - except Exception as e: - logger.warning("Error reading session credential from 
SQLite: %s", e) - return None - - logger.warning("Secondary session credential not found in %s", cookies_file) - return None - - -def extract_slack_tokens_from_config( - local_config: dict, xoxd: str, team_id: str -) -> dict | None: - """Build session credential dict from parsed localConfig, or None.""" - try: - teams = local_config.get("teams", {}) - team_data = teams.get(team_id) - if not team_data: - logger.warning( - "Team ID '%s' not found in localConfig_v2. Available: %s", - team_id, - list(teams.keys()), - ) - return None - xoxc_token = team_data.get("token") - team_name = team_data.get("name") - user_id = team_data.get("user_id") - if not xoxc_token: - logger.warning("Primary session credential not found in team data") - return None - if not xoxd: - logger.warning("Secondary session credential not found") - return None - tokens = { - "xoxc": xoxc_token, - "xoxd": xoxd, - "team_id": team_id, - "team_name": team_name, - "user_id": user_id, - } - logger.debug("Session credentials loaded for team %s", team_name) - return tokens - except Exception as e: - logger.warning("Error loading session credentials: %s", e) - return None - - -def get_all_team_ids_from_config(local_config: dict) -> list[str]: - """Get all available team IDs from parsed localConfig.""" - try: - teams = local_config.get("teams", {}) - return list(teams.keys()) - except Exception as e: - logger.warning("Error getting team IDs: %s", e) - return [] - - -def get_all_team_ids(local_config: dict | None = None) -> list[str]: - """Get team IDs from localConfig; reads workspace storage if not provided.""" - if local_config is not None: - return get_all_team_ids_from_config(local_config) - try: - profile_root = _resolve_chrome_profile_root() - config = _read_local_config_v2(profile_root) - if not config: - return [] - return get_all_team_ids_from_config(config) - except ValueError as e: - logger.warning("%s", e) - return [] - - -def is_slack_internal_token_auth_error(error: str | None) -> bool: - 
"""True if Slack API error indicates expired or invalid session credentials.""" - return (error or "").strip() in SLACK_INTERNAL_TOKEN_AUTH_ERRORS - - -def probe_slack_internal_tokens( - xoxc: str, - xoxd: str, - file_id: str = SLACK_TOKEN_PROBE_FILE_ID, -) -> bool: - """Return True if session credentials authenticate against Slack files.info.""" - xoxc = (xoxc or "").strip() - xoxd = (xoxd or "").strip() - if not xoxc or not xoxd: - return False - try: - response = requests.post( - "https://slack.com/api/files.info", - headers={"Authorization": f"Bearer {xoxc}"}, - cookies={"d": xoxd}, - data={"file": file_id, "include_transcription": "true"}, - timeout=30, - ) - response.raise_for_status() - result = response.json() - except Exception as e: - logger.debug("Slack token probe request failed: %s", e) - return False - if result.get("ok"): - return True - err = (result.get("error") or "").strip() - if is_slack_internal_token_auth_error(err): - logger.debug("Slack token probe auth error: %s", err) - return False - logger.debug("Slack token probe non-auth response (tokens valid): %s", err) - return True - - -def extract_slack_tokens_auto(team_id: str) -> dict | None: - """Load session credentials for team_id from configured workspace paths.""" - logger.debug("Loading Slack session credentials for team %s", team_id) - try: - profile_root = _resolve_chrome_profile_root() - except ValueError as e: - logger.error("%s", e) - return None - if not profile_root.is_dir(): - logger.error( - "Session storage not found at %s. See .env.example.", - profile_root, - ) - return None - local_config = _read_local_config_v2(profile_root) - if not local_config: - logger.error( - "Failed to read session configuration from workspace storage. See .env.example." 
- ) - return None - team_ids = get_all_team_ids_from_config(local_config) - if team_ids: - logger.debug("Available team IDs: %s", ", ".join(team_ids)) - xoxd = _read_xoxd_cookie(profile_root) - if not xoxd: - logger.error( - "Failed to read secondary session credential from workspace storage." - ) - return None - logger.debug("Loading session credentials for team ID: %s", team_id) - tokens = extract_slack_tokens_from_config(local_config, xoxd, team_id) - if tokens: - return tokens - logger.warning("Failed to load session credentials for team %s", team_id) - return None diff --git a/slack_event_handler/utils/state.py b/slack_event_handler/utils/state.py deleted file mode 100644 index 1739648a..00000000 --- a/slack_event_handler/utils/state.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -JSON file persistence for the PR bot job queue and rate-limit state. - -State file layout: - { "postedAt": [, ...], "queue": [, ...] } - -When team_id is provided, state is stored in state_.json for multi-workspace support. -""" - -import json -import logging -import os -import re -import tempfile -import threading -import time -from contextlib import contextmanager -from copy import deepcopy -from typing import Any, Generator, Optional - -try: - import fcntl -except ImportError: # pragma: no cover - Windows - fcntl = None # type: ignore[assignment] - -import portalocker - -logger = logging.getLogger(__name__) - -_DEFAULT_STATE: dict[str, Any] = {"postedAt": [], "queue": []} - - -class _TeamThreadLockRegistry: - """In-process mutex registry paired with per-team file advisory locks. - - Guard lock is held only briefly to create/lookup per-path locks; never - held while waiting on the file lock. Acquisition order in - ``state_file_lock``: in-process team lock → advisory file lock. 
- """ - - def __init__(self) -> None: - self._locks: dict[str, threading.Lock] = {} - self._guard = threading.Lock() - - def lock_for(self, lock_file_path: str) -> threading.Lock: - with self._guard: - lock = self._locks.get(lock_file_path) - if lock is None: - lock = threading.Lock() - self._locks[lock_file_path] = lock - return lock - - -_thread_lock_registry = _TeamThreadLockRegistry() - - -def _thread_lock_for(team_id: Optional[str]) -> threading.Lock: - """In-process mutex paired with the file lock (required for reliable Windows locking).""" - return _thread_lock_registry.lock_for(_get_lock_file_path(team_id)) - - -def _sanitize_team_id_for_path(team_id: str) -> str: - """Safe filename segment from Slack team_id (e.g. T01234ABCD -> T01234ABCD).""" - if not team_id: - return "default" - return re.sub(r"[^a-zA-Z0-9_-]", "_", team_id) - - -def _get_state_file_path(team_id: Optional[str] = None) -> str: - """Resolve the state file path. If team_id is None, state.json; else state_.json.""" - from slack_event_handler.workspace import get_data_dir - - data_dir = get_data_dir() - if team_id: - safe = _sanitize_team_id_for_path(team_id) - return str(data_dir / f"state_{safe}.json") - return str(data_dir / "state.json") - - -def _get_lock_file_path(team_id: Optional[str] = None) -> str: - """Resolve the advisory lock file path (sibling of the state JSON file).""" - return f"{_get_state_file_path(team_id)}.lock" - - -@contextmanager -def state_file_lock(team_id: Optional[str] = None) -> Generator[None, None, None]: - """Exclusive advisory lock for per-team state read-modify-write critical sections.""" - with _thread_lock_for(team_id): - lock_path = _get_lock_file_path(team_id) - _ensure_dir(lock_path) - if fcntl is not None: - fd = os.open(lock_path, os.O_CREAT | os.O_RDWR) - try: - fcntl.flock(fd, fcntl.LOCK_EX) - yield - finally: - fcntl.flock(fd, fcntl.LOCK_UN) - os.close(fd) - else: - with portalocker.Lock(lock_path, timeout=-1): - yield - - -@contextmanager -def 
modify_state( - team_id: Optional[str] = None, -) -> Generator[dict[str, Any], None, None]: - """Load state under lock, yield for mutation, then save before releasing the lock.""" - with state_file_lock(team_id): - state = load_state(team_id) - yield state - save_state(state, team_id) - - -def _ensure_dir(path: str) -> None: - os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True) - - -def load_state(team_id: Optional[str] = None) -> dict[str, Any]: - """Load state for the given team. team_id=None uses state.json (single-workspace).""" - path = _get_state_file_path(team_id) - _ensure_dir(path) - if not os.path.exists(path): - return deepcopy(_DEFAULT_STATE) - try: - with open(path, "r", encoding="utf-8") as f: - return json.load(f) - except json.JSONDecodeError: - logger.exception("Corrupt state file decoding %s", path) - quarantine = f"{path}.corrupt.{int(time.time())}" - try: - os.replace(path, quarantine) - except OSError as e: - logger.warning("Could not quarantine %s to %s: %s", path, quarantine, e) - return deepcopy(_DEFAULT_STATE) - - -def save_state(state: dict[str, Any], team_id: Optional[str] = None) -> None: - """Save state for the given team. team_id=None uses state.json (single-workspace).""" - path = _get_state_file_path(team_id) - _ensure_dir(path) - dir_path = os.path.dirname(os.path.abspath(path)) - with tempfile.NamedTemporaryFile( - mode="w", - encoding="utf-8", - dir=dir_path, - delete=False, - suffix=".tmp", - ) as f: - json.dump(state, f, indent=2) - f.flush() - os.fsync(f.fileno()) - temp_path = f.name - os.replace(temp_path, path) diff --git a/slack_event_handler/workspace.py b/slack_event_handler/workspace.py deleted file mode 100644 index fc5edb3c..00000000 --- a/slack_event_handler/workspace.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Workspace paths for slack_event_handler. 
- -Layout: workspace/slack_event_handler/ - - data/ (state.json, raw event files) - - chrome_profile/ (session storage for huddle credentials) - - slack_internal_tokens.json (session credentials, not .env) -""" - -import os -from pathlib import Path - -from config.workspace import get_workspace_path - -_APP_SLUG = "slack_event_handler" -CHROME_PROFILE_DIRNAME = "chrome_profile" -SLACK_INTERNAL_TOKENS_FILENAME = "slack_internal_tokens.json" - - -def get_workspace_root() -> Path: - """Return this app's workspace directory (e.g. workspace/slack_event_handler/).""" - return get_workspace_path(_APP_SLUG) - - -def get_data_dir() -> Path: - """Return workspace/slack_event_handler/data/; creates if missing.""" - path = get_workspace_root() / "data" - path.mkdir(parents=True, exist_ok=True) - return path - - -def get_chrome_profile_path() -> Path: - """Session storage directory for Slack huddle credentials.""" - path = get_workspace_root() / CHROME_PROFILE_DIRNAME - path.mkdir(parents=True, exist_ok=True) - return path - - -def get_slack_internal_tokens_json_path() -> Path: - """JSON file storing session credentials per team.""" - return get_workspace_root() / SLACK_INTERNAL_TOKENS_FILENAME - - -def set_working_directory() -> None: - """Change current working directory to this app's workspace root (for runner).""" - root = get_workspace_root() - os.chdir(root) diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py index bd3a3e29..14e1a499 100644 --- a/wg21_paper_tracker/tests/test_services.py +++ b/wg21_paper_tracker/tests/test_services.py @@ -8,6 +8,7 @@ from wg21_paper_tracker.services import ( get_or_create_mailing, get_or_create_paper, + get_or_create_paper_author, mark_paper_downloaded, ) @@ -258,3 +259,149 @@ def test_mark_paper_downloaded_normalizes_paper_id(db): mark_paper_downloaded(" P1000R0 ", year=2025) paper.refresh_from_db() assert paper.is_downloaded is True + + +# --- get_or_create_paper edge cases --- + + 
+@pytest.mark.django_db +def test_get_or_create_paper_requires_paper_id(db): + mailing, _ = get_or_create_mailing("2025-01", "Title") + with pytest.raises(ValueError, match="paper_id is required"): + get_or_create_paper( + paper_id="", + url="https://example.com/p.pdf", + title="T", + document_date=None, + mailing=mailing, + year=2025, + ) + + +@pytest.mark.django_db +def test_get_or_create_paper_promotes_placeholder_year(db): + """Placeholder row (year=0) is promoted when a real year is supplied.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + placeholder, created_placeholder = get_or_create_paper( + paper_id="p5000", + url="https://example.com/p5000-draft.pdf", + title="Draft", + document_date=None, + mailing=mailing, + year=None, + ) + assert created_placeholder is True + assert placeholder.year == 0 + + paper, created = get_or_create_paper( + paper_id="p5000", + url="https://example.com/p5000.pdf", + title="Final", + document_date=date(2025, 3, 1), + mailing=mailing, + year=2025, + ) + assert created is False + paper.refresh_from_db() + assert paper.pk == placeholder.pk + assert paper.year == 2025 + assert paper.title == "Final" + + +@pytest.mark.django_db +def test_get_or_create_paper_replaces_authors_on_update(db): + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p7777", + url="https://example.com/p7777.pdf", + title="T", + document_date=None, + mailing=mailing, + author_names=["Alice"], + year=2025, + ) + assert paper.authors.count() == 1 + + paper2, created2 = get_or_create_paper( + paper_id="p7777", + url="https://example.com/p7777.pdf", + title="T", + document_date=None, + mailing=mailing, + author_names=["Bob", "Carol"], + year=2025, + ) + assert created2 is False + assert paper2.pk == paper.pk + names = list( + paper2.authors.order_by("author_order").values_list( + "profile__display_name", flat=True + ) + ) + assert names == ["Bob", "Carol"] + + +@pytest.mark.django_db +def 
test_get_or_create_paper_invalid_year_stored_as_zero(db): + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p8888", + url="https://example.com/p8888.pdf", + title="T", + document_date=None, + mailing=mailing, + year=99999, + ) + assert paper.year == 0 + + +@pytest.mark.django_db +def test_mark_paper_downloaded_requires_non_empty_paper_id(db): + with pytest.raises(ValueError, match="paper_id is required"): + mark_paper_downloaded("", year=2025) + + +# --- get_or_create_paper_author --- + + +@pytest.mark.django_db +def test_get_or_create_paper_author_creates_and_updates_order(db): + from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile + + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p3333", + url="https://example.com/p3333.pdf", + title="T", + document_date=None, + mailing=mailing, + year=2025, + ) + profile, _ = get_or_create_wg21_paper_author_profile("Author One") + + link, created = get_or_create_paper_author(paper, profile, 1) + assert created is True + assert link.author_order == 1 + + link2, created2 = get_or_create_paper_author(paper, profile, 2) + assert created2 is False + link2.refresh_from_db() + assert link2.author_order == 2 + + +@pytest.mark.django_db +def test_get_or_create_paper_author_rejects_invalid_order(db): + from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile + + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p4444", + url="https://example.com/p4444.pdf", + title="T", + document_date=None, + mailing=mailing, + year=2025, + ) + profile, _ = get_or_create_wg21_paper_author_profile("Author") + with pytest.raises(ValueError, match="author_order must be a positive integer"): + get_or_create_paper_author(paper, profile, 0)