diff --git a/.env.example b/.env.example index 4e33ca8..00b349d 100644 --- a/.env.example +++ b/.env.example @@ -1,69 +1,141 @@ -# FileForge Configuration -# Copy this file to .env and adjust values +# FileMorph Configuration +# Copy this file to .env and adjust values for your deployment. +# +# This file is structured top-down: required for every deployment first, +# then Cloud-overlay (account / payment / email features), then Compliance- +# Edition tunables, then operational knobs. A Community-Edition self-host +# only needs the first section. +# +# ────────────────────────────────────────────────────────────────────── +# === Required for every deployment === +# ────────────────────────────────────────────────────────────────────── -# Server +# Server bind. APP_HOST=0.0.0.0 makes the app listen on every interface +# inside the container; the reverse proxy (Caddy/nginx) is what limits +# external reach. APP_DEBUG=true enables FastAPI's interactive docs at +# /docs and verbose tracebacks — leave off in production. APP_HOST=0.0.0.0 APP_PORT=8000 APP_DEBUG=false -# Security -# Path to the JSON file storing hashed API keys +# Path to the JSON file holding hashed API keys for anonymous-tier +# clients. The file is created on first key generation; bind-mount the +# parent directory into your container if you want keys to survive +# restarts. API_KEYS_FILE=data/api_keys.json -# Limits +# Maximum HTTP request body size accepted by the upload endpoints. +# 100 MB is a sane default that fits typical convert/compress workloads +# without inviting OOM on small hosts. Operators with bigger files raise +# this; tier quotas (anonymous/free/pro/business) still apply on top via +# app/core/quotas.py. MAX_UPLOAD_SIZE_MB=100 -# CORS: comma-separated list of allowed origins (* for all) +# CORS: comma-separated list of allowed origins. `*` is fine for dev or +# a single-origin same-domain deployment. Production deployments behind +# a real domain should list explicit origins (e.g. +# `https://files.example.com,https://www.files.example.com`) so a +# malicious page on a different domain can't talk to your API. CORS_ORIGINS=* -# Optional: route heavy upload POSTs (convert/compress, single + batch) through -# a separate subdomain. Empty string = same-origin (default, simplest). Set -# only when the main site sits behind a proxy that caps request bodies and -# uploads must bypass it. See docs/self-hosting.md for CORS implications. +# Public canonical URL of the deployment. Used by the canonical/og:url +# meta tags, the sitemap, and the JSON-LD structured data. Localhost is +# fine for dev; set to your domain for production so search engines +# index the right URLs. +APP_BASE_URL=http://localhost:8000 + +# Optional: route heavy upload POSTs (convert/compress, single + batch) +# through a separate subdomain. Empty string = same-origin (default, +# simplest). Set this only when the main site sits behind a proxy that +# caps request bodies and uploads must bypass it via a tunnel +# subdomain. See docs/self-hosting.md for CORS implications. API_BASE_URL= -# Public canonical URL for the deployment — used for canonical/og:url meta -# tags, sitemap entries, and JSON-LD structured data. Default localhost is -# fine for dev; set to your domain for prod (e.g. https://files.example.com). -APP_BASE_URL=http://localhost:8000 +# ────────────────────────────────────────────────────────────────────── +# === Cloud-overlay (optional — accounts, payments, transactional email) === +# ────────────────────────────────────────────────────────────────────── +# A Community Edition deployment can leave everything below empty. +# Setting any of these activates the corresponding sub-processor (see +# docs/sub-processors.md). The application disables each feature +# automatically when its primary key/url is empty — no further toggle +# needed. + +# JWT signing secret. Required for the user-account features +# (registration, login, refresh, role checks). Must be at least 32 +# characters; rotate by changing this value (all sessions invalidate +# on the next request). +JWT_SECRET=dev-secret-change-me-min-32-chars-long -# Whether to expose /pricing as a commercial offer surface. Self-hosted -# Community deployments leave this off — there's no commercial offer to -# advertise. Set to `true` only on a SaaS deployment that has (or will -# soon have) Stripe configured. The page renders a "Coming Soon" banner -# automatically when STRIPE_SECRET_KEY is empty. +# PostgreSQL DSN for the Cloud overlay (users, api_keys, file_jobs, +# audit_events tables). Use the asyncpg driver. Empty string disables +# the database completely — registration/login routes return 503. +# DATABASE_URL=postgresql+asyncpg://user:pass@host:5432/filemorph + +# Stripe (leave empty to disable billing). When present, the /pricing +# page shows live upgrade buttons gated behind the BGB §356 (5) +# withdrawal-waiver checkbox; without these values the page renders a +# "Coming Soon" banner. +STRIPE_SECRET_KEY= +STRIPE_WEBHOOK_SECRET= +STRIPE_PRO_PRICE_ID= +STRIPE_BUSINESS_PRICE_ID= + +# Whether to expose /pricing as a commercial offer surface at all. +# Self-hosted Community deployments leave this off — there is no +# commercial offer to advertise. Set to `true` only on a SaaS +# deployment that has (or will soon have) Stripe configured. PRICING_PAGE_ENABLED=false -# S10-lite analytics — per-day counters for page views, conversions, -# registrations, and failures. Visible at /cockpit (admin-only). Counters -# are aggregates, not personal data: no cookie banner needed. Default on; -# set to `false` if you don't want the daily_metrics table populated at -# all (the cockpit Analytics tab then shows an empty-state notice). -METRICS_ENABLED=true +# Transactional email (password-reset, billing receipts, account +# verification). Empty SMTP_HOST disables outgoing mail; the routes +# that need it (forgot-password, register) return a graceful 503. +# SMTP_HOST=smtp.zoho.eu +# SMTP_PORT=587 +# SMTP_USER=no-reply@example.com +# SMTP_PASSWORD= +# SMTP_FROM=no-reply@example.com + +# ────────────────────────────────────────────────────────────────────── +# === Compliance-Edition tunables (audit log, retention, output integrity) === +# ────────────────────────────────────────────────────────────────────── +# These knobs target operators in regulated environments (DACH +# Behörden, healthcare, legal). The defaults below are Cloud-edition +# safe (fire-and-forget audit, no retention beyond the request); +# Compliance customers tighten them per their privacy/audit policy. -# NEU-B.1: Compliance-Edition tamper-evident audit log. Default off -# (Cloud-edition fire-and-forget — failures are logged at WARNING and -# never break the request). Set to `true` for ISO 27001 A.12.4.1 / -# BORA §50 / BeurkG §39a compliance — the convert/compress route -# refuses to serve a result it could not log to audit_events. Requires -# DATABASE_URL. +# NEU-B.1 — Compliance-Edition tamper-evident audit log gate. Default +# off (Cloud-edition: failures are logged at WARNING and never break +# the request). Set to `true` for ISO 27001 A.12.4.1 / BORA §50 / +# BeurkG §39a compliance — the convert/compress route then refuses +# to serve a result it could not log to audit_events. Requires +# DATABASE_URL above. AUDIT_FAIL_CLOSED=false -# NEU-B.2 retention policy. The Cloud edition is zero-retention by -# design (every conversion flushes its temp dir on completion or -# failure; there is no S3/R2 storage layer active). RETENTION_HOURS -# is an informational knob for self-hosters running a future -# storage-key-backed pipeline (FileJob.expires_at). Compliance-edition -# operators with an eDiscovery / GoBD retention requirement set this -# to the value their privacy policy declares; Cloud / Community keep -# it at 0. +# NEU-B.2 — Retention policy in hours. Cloud edition is zero-retention +# by design (every conversion flushes its temp dir on completion or +# failure; no S3/R2 storage layer is active). This knob is informational +# for self-hosters running a future storage-key-backed pipeline +# (FileJob.expires_at). Compliance-edition operators with an +# eDiscovery / GoBD retention requirement set this to the value their +# privacy policy declares; Cloud / Community keep it at 0. RETENTION_HOURS=0 +# ────────────────────────────────────────────────────────────────────── +# === Operational knobs (sweep cadence, concurrency, metrics) === +# ────────────────────────────────────────────────────────────────────── + +# S10-lite analytics — per-day counters for page views, conversions, +# registrations, and failures. Visible at /cockpit (admin-only). +# Counters are aggregates, not personal data — no cookie banner needed. +# Default on; set to `false` to leave the daily_metrics table empty +# (the cockpit Analytics tab then shows an empty-state notice). +METRICS_ENABLED=true + # Background sweep that removes orphaned `fm_*` temp dirs left behind # by crashes mid-conversion. The request path always cleans its own # temp dir in a `finally` block, so this only catches crash-recovery # cases. Set to 0 to disable the periodic sweep (the startup sweep -# still runs). +# still runs once on boot regardless). TEMP_SWEEP_INTERVAL_MINUTES=60 # How old (minutes) an `fm_*` temp dir must be before the sweep @@ -72,10 +144,10 @@ TEMP_SWEEP_INTERVAL_MINUTES=60 # Operators with very long batch pipelines raise this to match. TEMP_SWEEP_MAX_AGE_MINUTES=10 -# NEU-D.1 capacity guard. Total parallel conversions across all -# callers. Default 4 is sized for a 4 GB host with the existing -# per-tier output caps; raise to ~CPU-count on a bigger box. -# Past the cap → 503 + Retry-After. +# NEU-D.1 — global concurrency cap across all callers. Default 4 is +# sized for a 4 GB host with the existing per-tier output caps; raise +# to roughly CPU-count on a bigger box. Past the cap, requests get +# 503 + Retry-After. MAX_GLOBAL_CONCURRENCY=4 # How long a request waits for a free slot before giving up. Small diff --git a/.githooks/pre-commit b/.githooks/pre-commit index 8f5e69f..8ea360d 100644 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -13,7 +13,7 @@ FAIL=0 # locale/ catalogs are mechanically extracted from the impressum/privacy/terms templates above — # they cannot avoid carrying the same address/email strings. Treating them as public is consistent # with the source templates being public. -ALLOW_RE='^(app/templates/(impressum|privacy|terms)\.html|COMMERCIAL-LICENSE\.md|docs/gdpr-account-deletion-design\.md|docs/api-usage-guide\.md|docs/self-hosting\.md|docs/dpa-template\.md|docs-internal/.*|\.githooks/.*|\.github/workflows/scope-guard\.yml|CHANGELOG\.md|locale/.*\.(po|pot|mo))$' +ALLOW_RE='^(app/templates/(impressum|privacy|terms)\.html|COMMERCIAL-LICENSE\.md|docs/gdpr-account-deletion-design\.md|docs/api-usage-guide\.md|docs/self-hosting\.md|docs/dpa-template\.md|docs-internal/.*|\.githooks/.*|\.github/workflows/scope-guard\.yml|CHANGELOG\.md|locale/.*\.(po|pot|mo)|\.env\.example)$' # Personal/operational identifiers that should never land in public code. PATTERNS='lennart\.seidel@icloud\.com|lennart@filemorph\.io|Reetwerder|21029 Hamburg' diff --git a/.githooks/pre-push b/.githooks/pre-push index 40bd925..c50d8f5 100644 --- a/.githooks/pre-push +++ b/.githooks/pre-push @@ -15,7 +15,7 @@ set -e ZERO=0000000000000000000000000000000000000000 # Same patterns as pre-commit — keep in sync. -ALLOW_RE='^(app/templates/(impressum|privacy|terms)\.html|COMMERCIAL-LICENSE\.md|docs/gdpr-account-deletion-design\.md|docs/api-usage-guide\.md|docs/self-hosting\.md|docs/dpa-template\.md|docs-internal/.*|\.githooks/.*|\.github/workflows/scope-guard\.yml|CHANGELOG\.md|locale/.*\.(po|pot|mo))$' +ALLOW_RE='^(app/templates/(impressum|privacy|terms)\.html|COMMERCIAL-LICENSE\.md|docs/gdpr-account-deletion-design\.md|docs/api-usage-guide\.md|docs/self-hosting\.md|docs/dpa-template\.md|docs-internal/.*|\.githooks/.*|\.github/workflows/scope-guard\.yml|CHANGELOG\.md|locale/.*\.(po|pot|mo)|\.env\.example)$' PATTERNS='lennart\.seidel@icloud\.com|lennart@filemorph\.io|Reetwerder|21029 Hamburg' OPS_PATTERNS='/opt/filemorph(/|$|[[:space:]])|/var/log/filemorph|/home/deploy([[:space:]]|/)|Hetzner CX|HETZNER_HOST|HETZNER_SSH_USER|HETZNER_SSH_KEY|OPS_REPO_DISPATCH_PAT|GHCR_PAT|appleboy/ssh-action' SECRET_ASSIGN='(JWT_SECRET|SMTP_PASSWORD|STRIPE_SECRET_KEY|STRIPE_WEBHOOK_SECRET|DATABASE_URL|API_KEY|POSTGRES_PASSWORD|GHCR_PAT|OPS_REPO_DISPATCH_PAT|HETZNER_SSH_KEY)[[:space:]]*=[[:space:]]*[^[:space:]$]' diff --git a/app/core/config.py b/app/core/config.py index 37ab8bb..6b3624a 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -17,7 +17,7 @@ class Settings(BaseSettings): api_keys_file: str = "" # resolved below if empty - max_upload_size_mb: int = 2000 + max_upload_size_mb: int = 100 cors_origins: str = "http://localhost:8000" jwt_secret: str = "dev-secret-change-me-min-32-chars-long" diff --git a/docs/api-reference.md b/docs/api-reference.md index 278d6e3..4feba44 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -9,25 +9,95 @@ All responses are either a file download (`application/octet-stream`) or JSON. ## Authentication -All endpoints except `/health` and `/formats` require an API key in the request header: +FileMorph supports two parallel authentication schemes: -``` -X-API-Key: your-api-key-here -``` +| Scheme | Header | Issued by | Use case | +|---|---|---|---| +| **API key** (Community) | `X-API-Key: ` | `scripts/generate_api_key.py` | Self-host scripts, automation, CLI tooling | +| **JWT Bearer** (Cloud overlay) | `Authorization: Bearer ` | `POST /api/v1/auth/login` | Browser sessions, multi-user deployments | + +Either header satisfies the auth requirement on `/convert`, `/compress`, and their `/batch` variants. `/health` and `/formats` are public; the auth-flow endpoints (`/api/v1/auth/*`, `/api/v1/keys`, `/api/v1/billing/*`) require a JWT. -Generate a key with: +### API key (Community Edition) + +Generate a key: ```bash python scripts/generate_api_key.py # or via Docker: docker compose exec filemorph python scripts/generate_api_key.py ``` -Keys are stored as SHA-256 hashes in `data/api_keys.json`. The plaintext key is shown exactly once at generation time. +Keys are stored as SHA-256 hashes in `data/api_keys.json`. The plaintext key is shown exactly once at generation time. There is no key-rotation endpoint in the Community Edition — generate a new key and remove the old hash from the JSON file. + +### JWT Bearer (Cloud overlay) + +When `DATABASE_URL` is configured, the Cloud overlay enables registration / login / refresh: + +```bash +# Register (returns access + refresh tokens) +curl -X POST http://localhost:8000/api/v1/auth/register \ + -H "Content-Type: application/json" \ + -d '{"email":"alice@example.com","password":"correct-horse-battery-staple"}' + +# Login on a returning device +curl -X POST http://localhost:8000/api/v1/auth/login \ + -H "Content-Type: application/json" \ + -d '{"email":"alice@example.com","password":"correct-horse-battery-staple"}' + +# Use the access token +curl http://localhost:8000/api/v1/auth/me \ + -H "Authorization: Bearer " + +# Refresh expired access tokens (15 min TTL on access, 30 d on refresh) +curl -X POST http://localhost:8000/api/v1/auth/refresh \ + -H "Content-Type: application/json" \ + -d '{"refresh_token":""}' +``` + +Logged-in users can also generate API keys bound to their account at `POST /api/v1/keys`; those keys count against the user's tier quota rather than the anonymous tier. --- ## Endpoints +### Cloud-Edition endpoints (account / billing / keys) + +The endpoints in this section only respond when the Cloud overlay is configured (`DATABASE_URL` set, and where applicable `JWT_SECRET`, `STRIPE_SECRET_KEY`). Without those, they return `503 Service Unavailable`. All require `Authorization: Bearer ` unless noted. + +**Auth (`/api/v1/auth/*`)** + +| Method + Path | Auth | Purpose | +|---|---|---| +| `POST /api/v1/auth/register` | none | Create account; returns access + refresh tokens. Sends a verification email (fire-and-forget). | +| `POST /api/v1/auth/login` | none | Exchange email + password for access (15 min) + refresh (30 d) tokens. | +| `POST /api/v1/auth/refresh` | none (refresh-token in body) | Issue a new access token. | +| `GET /api/v1/auth/me` | Bearer | Return the currently authenticated user. | +| `POST /api/v1/auth/forgot-password` | none | Issue a single-use password-reset link via email (30 min TTL). | +| `POST /api/v1/auth/reset-password` | reset-token in body | Set a new password and invalidate older sessions via password-hash rotation. | +| `POST /api/v1/auth/verify-email` | verify-token | Mark the user's email as verified. | +| `POST /api/v1/auth/resend-verification` | Bearer | Re-send the verification mail (auth-required to avoid spam). | +| `DELETE /api/v1/auth/account` | Bearer | Self-service account deletion. Requires re-confirmation: current password, registered email, and the literal string `DELETE`. Free-tier accounts only — accounts with a Stripe customer ID return `409` and route to `privacy@filemorph.io` for the manual paid-tier path (HGB §257 / AO §147 retention). | + +**API keys (`/api/v1/keys`)** + +| Method + Path | Auth | Purpose | +|---|---|---| +| `POST /api/v1/keys` | Bearer | Create a new API key bound to the authenticated user. Plaintext key is shown exactly once in the response. | +| `GET /api/v1/keys` | Bearer | List the user's keys (id, name, prefix, created, last-used). | +| `DELETE /api/v1/keys/{id}` | Bearer | Revoke a key. | + +**Billing (`/api/v1/billing/*`)** + +| Method + Path | Auth | Purpose | +|---|---|---| +| `POST /api/v1/billing/checkout/{tier}` | Bearer | Start a Stripe Checkout for `pro` / `business`. Body MUST include `withdrawal_waiver_acknowledged: true` (BGB §356 (5) consent — see `terms.html` § 9). Returns the Stripe Checkout URL; an `auth.billing.withdrawal_waiver_recorded` audit event is written before the redirect. | +| `POST /api/v1/billing/portal` | Bearer | Return a Stripe Customer Portal URL so the user can manage card / cancel / re-subscribe. | +| `POST /api/v1/billing/webhook` | Stripe signature | Stripe → FileMorph webhook receiver. Handles `customer.subscription.{created,updated,deleted}`. Not exposed in OpenAPI. | + +For schema details (request bodies, response shapes), open the auto-generated Swagger UI at `/docs` on the live deployment. + +--- + ### POST `/api/v1/convert` Convert a file from one format to another. @@ -171,6 +241,58 @@ curl -X POST http://localhost:8000/api/v1/compress \ --- +### POST `/api/v1/convert/batch` + +Convert several files in one request. Returns a ZIP archive with all converted outputs. + +**Authentication**: Required (`X-API-Key` or `Authorization: Bearer`) + +**Request**: `multipart/form-data` + +| Field | Type | Required | Description | +|---|---|---|---| +| `files` | files (≥1) | Yes | One or more files to convert | +| `target_formats` | string[] | Yes | Target format per file. Either one value (applied to all) or one per file (length must match `files`) | +| `quality` | integer | No | Quality 1–100 (default 85). Applied uniformly. | + +**Response**: `200 OK` (`application/zip`) — archive with one entry per successful conversion. If at least one file fails, a `manifest.json` is added at archive root listing per-file results (success ZIP-only is preferred for all-success runs to keep the output clean). + +A run with **every** file failing returns `422 Unprocessable Content` with a JSON body listing per-file errors. + +```bash +curl -X POST http://localhost:8000/api/v1/convert/batch \ + -H "X-API-Key: YOUR_KEY" \ + -F "files=@a.heic" -F "files=@b.png" -F "files=@c.gif" \ + -F "target_formats=jpg" \ + --output batch.zip +``` + +--- + +### POST `/api/v1/compress/batch` + +Compress several files in one request. Same response shape as `/convert/batch`. + +**Authentication**: Required + +**Request**: `multipart/form-data` + +| Field | Type | Required | Description | +|---|---|---|---| +| `files` | files (≥1) | Yes | One or more files to compress | +| `quality` | integer | No | Quality 1–100 (default 85). Mutually exclusive with `target_size_kb`. | +| `target_size_kb` | integer | No | Per-file target size. Mutually exclusive with `quality`. | + +```bash +curl -X POST http://localhost:8000/api/v1/compress/batch \ + -H "X-API-Key: YOUR_KEY" \ + -F "files=@photo1.jpg" -F "files=@photo2.jpg" \ + -F "quality=70" \ + --output batch.zip +``` + +--- + ### GET `/api/v1/formats` Returns all supported conversion and compression formats. @@ -221,6 +343,22 @@ Health check for monitoring and load balancer probes. --- +## Response Headers + +Every successful conversion / compression carries integrity and classification metadata in response headers. CORS-enabled deployments expose these to browser clients (see `expose_headers` in `app/main.py`). + +| Header | Value | Set on | +|---|---|---| +| `X-Output-SHA256` | Hex-encoded SHA-256 of the response body | every `/convert`, `/compress`, and their batch variants | +| `X-Data-Classification` | One of `public`, `internal`, `confidential`, `restricted` | every response — echoes the request header value, defaults to `internal` when absent (NEU-C.3 / BSI-style taxonomy) | +| `X-FileMorph-Achieved-Bytes` | Actual output size in bytes | only on `/compress` calls with `target_size_kb` | +| `X-FileMorph-Final-Quality` | Quality value the binary search settled on (1–100) | only on `/compress` calls with `target_size_kb` | +| `Retry-After` | Seconds the client should wait before retrying | only on `503 Service Unavailable` (concurrency cap) | + +The `X-Data-Classification` value is also written to the audit-log entry for the request, so a downstream auditor can answer "what classification of data was processed in this call" from the database alone (see `app/core/audit.py`). + +--- + ## Error Responses All errors return JSON with a `detail` field: @@ -234,11 +372,14 @@ All errors return JSON with a `detail` field: | HTTP Status | Meaning | |---|---| | `400 Bad Request` | Missing or malformed request data (e.g. filename without extension) | -| `401 Unauthorized` | Missing or invalid `X-API-Key` | -| `413 Request Entity Too Large` | File exceeds `MAX_UPLOAD_SIZE_MB` (default: 100 MB) | -| `422 Unprocessable Entity` | Unsupported format combination, or missing form field | +| `401 Unauthorized` | Missing or invalid `X-API-Key` / `Authorization: Bearer` | +| `403 Forbidden` | Authenticated but role/tier doesn't permit the action (e.g. non-admin hitting `/cockpit/*`) | +| `413 Content Too Large` | File exceeds `MAX_UPLOAD_SIZE_MB` (default: 100 MB) | +| `415 Unsupported Media Type` | `target_size_kb` set on a lossless format (PNG/TIFF), or otherwise incompatible request shape | +| `422 Unprocessable Content` | Unsupported format combination, missing form field, or every file in a batch failed | | `429 Too Many Requests` | Rate limit exceeded (see Rate Limiting section below) | | `500 Internal Server Error` | Conversion failed (e.g. corrupt file, missing binary) | +| `503 Service Unavailable` | Global concurrency cap reached (`MAX_GLOBAL_CONCURRENCY`). Response carries `Retry-After`. | --- @@ -252,9 +393,10 @@ Per-route limits (per IP address): | `POST /api/v1/convert/batch` | 3 / minute | | `POST /api/v1/compress` | 10 / minute | | `POST /api/v1/compress/batch` | 3 / minute | -| `GET /api/v1/health` | 30 / minute | +| `GET /api/v1/health`, `GET /api/v1/ready` | 30 / minute | | `GET /api/v1/formats` | 120 / minute | | Auth endpoints (`/api/v1/auth/*`) | 3–5 / minute | +| Billing endpoints (`/api/v1/billing/*`) | 5 / minute | | Default (other routes) | 60 / minute | When exceeded, the response is `429 Too Many Requests`. For higher diff --git a/tests/test_billing_consent.py b/tests/test_billing_consent.py index 1bc6221..7c47056 100644 --- a/tests/test_billing_consent.py +++ b/tests/test_billing_consent.py @@ -204,6 +204,10 @@ def test_checkout_pro_with_acknowledgement_records_audit_event(client): rows = _events_by_type("billing.checkout.withdrawal_waiver_recorded") assert len(rows) == 1 assert str(rows[0].actor_user_id) == str(user.id) + # M10: pin actor_ip so a regression that drops `request.client.host` + # surfaces as a test failure rather than a silent loss of dispute + # reproducibility. TestClient's default client host is "testclient". + assert rows[0].actor_ip == "testclient" payload = json.loads(rows[0].payload_json) assert payload == {"tier": "pro"} @@ -222,6 +226,7 @@ def test_checkout_business_with_acknowledgement_records_audit_event(client): rows = _events_by_type("billing.checkout.withdrawal_waiver_recorded") assert len(rows) == 1 assert str(rows[0].actor_user_id) == str(user.id) + assert rows[0].actor_ip == "testclient" # M10 payload = json.loads(rows[0].payload_json) assert payload == {"tier": "business"} @@ -236,3 +241,87 @@ def test_checkout_unauthenticated_returns_401(client): ) assert res.status_code == 401 assert _events_by_type("billing.checkout.withdrawal_waiver_recorded") == [] + + +# ── 4. Audit-chain integrity (H3) ──────────────────────────────────────────── +# +# The hash-chain is what makes the audit log defensible at dispute time: +# verify_chain() walks the table and returns the first id where the +# recomputed SHA-256 of (prev_hash || payload) does not match the stored +# record_hash. These two tests pin both halves of the contract — the +# happy-path return of None for an intact chain, and the tamper-detection +# return of the mutated row's id when payload_json is altered after the +# fact. Without this guard, a refactor that switches the canonical-JSON +# serialiser, the hashing primitive, or the chaining order could silently +# break dispute reproducibility — the audit log would still record events +# (so existing tests pass) but verify_chain would no longer detect edits. + + +def test_audit_event_chain_intact_across_two_writes(client): + """Two real audit events written through the live route → verify_chain + returns None (chain intact). The /auth/login calls also write audit + rows; the chain spans all of them, so we check the global table + rather than just the checkout-typed subset. + """ + asyncio.run(_insert_user(email="chain-1@example.com")) + asyncio.run(_insert_user(email="chain-2@example.com")) + + for email, tier in [("chain-1@example.com", "pro"), ("chain-2@example.com", "business")]: + token = _login(client, email) + res = client.post( + f"/api/v1/billing/checkout/{tier}", + headers={"Authorization": f"Bearer {token}"}, + json={"withdrawal_waiver_acknowledged": True}, + ) + assert res.status_code == 200, res.text + + # Per-type sanity: both checkouts logged their consent event. Each + # event's record_hash chains forward through the global table — the + # second checkout's prev_hash will not equal the first checkout's + # record_hash unless they happen to be adjacent in id-order, which + # depends on what other events the auth flow writes. + checkout_rows = _events_by_type("billing.checkout.withdrawal_waiver_recorded") + assert len(checkout_rows) == 2 + + async def _verify(): + async with _TestSession() as s: + return await audit_module.verify_chain(s) + + assert asyncio.run(_verify()) is None + + +def test_audit_event_chain_detects_payload_tampering(client): + """Mutate one row's payload_json after-the-fact → verify_chain returns + that row's id. Pins the property that record_hash binds the payload.""" + asyncio.run(_insert_user(email="chain-tamper@example.com")) + token = _login(client, "chain-tamper@example.com") + res = client.post( + "/api/v1/billing/checkout/pro", + headers={"Authorization": f"Bearer {token}"}, + json={"withdrawal_waiver_acknowledged": True}, + ) + assert res.status_code == 200, res.text + + rows = _events_by_type("billing.checkout.withdrawal_waiver_recorded") + assert len(rows) == 1 + tampered_id = rows[0].id + + # Tamper: rewrite the payload to claim "business" while record_hash + # still binds the original "pro" payload. SQLite has no UPDATE + # trigger (Postgres does, via migration 005), so we can mutate + # directly to prove verify_chain catches it. + async def _tamper(): + async with _TestSession() as s: + row = ( + await s.execute(select(AuditEvent).where(AuditEvent.id == tampered_id)) + ).scalar_one() + row.payload_json = json.dumps({"tier": "business"}, separators=(",", ":")) + await s.commit() + + asyncio.run(_tamper()) + + async def _verify(): + async with _TestSession() as s: + return await audit_module.verify_chain(s) + + assert asyncio.run(_verify()) == tampered_id diff --git a/tests/test_hook_allowlist_regression.py b/tests/test_hook_allowlist_regression.py new file mode 100644 index 0000000..4be73ba --- /dev/null +++ b/tests/test_hook_allowlist_regression.py @@ -0,0 +1,247 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Regression guard for the .githooks/pre-commit + .githooks/pre-push allow/deny regexes (H4). + +The two scope-guard hooks share three regexes — ``ALLOW_RE``, ``FORBIDDEN_PATHS``, +and ``INTERNAL_PATHS`` — that decide whether content-pattern checks fire and +whether a file is hard-blocked. Silent regressions in these regexes are the +worst kind: a hook that lets a leak through (FN) is dangerous, but a hook +that suddenly blocks every i18n update (FP) is just as bad — the developer +gets a rejection on every push, blames their content, doesn't realise the +allowlist itself has drifted. + +This test extracts each regex literally from the hook script, compiles it +with Python's ``re`` (POSIX ERE is a subset of Python regex for the patterns +we use here), and pins the contract: + +- Files that MUST be allowed (locale/*.po, the impressum/privacy/terms HTML + templates, ``.env.example``, the public DPA template, etc.) match + ``ALLOW_RE``. A future commit that drops one of these silently breaks + the corresponding workflow (i18n update, GDPR doc edit, ...). +- Files that MUST be forbidden in the public repo (``CLAUDE.md`` at root, + ``compose.prod.yml``, ``runbooks/...``, ``docs-internal/...``) match + ``FORBIDDEN_PATHS``. +- Internal docs that MUST land in ``docs-internal/`` (admin-cockpit, + email-setup, runbook, marketing-plan, ...) match ``INTERNAL_PATHS``. + +The pre-commit and pre-push hooks share the same regex strings; the test +also enforces that they stay in sync, since drift between them would let +``--no-verify`` bypass the local hook AND defeat the pre-push backstop. +""" + +from __future__ import annotations + +import re +from pathlib import Path + +import pytest + +REPO = Path(__file__).resolve().parent.parent +PRE_COMMIT = REPO / ".githooks" / "pre-commit" +PRE_PUSH = REPO / ".githooks" / "pre-push" + + +def _extract_var(script: str, var: str) -> str: + """Pull the single-quoted assignment for ``var`` out of the shell script. + + The hooks declare each pattern as a one-line ``VAR='regex'`` assignment; + we match exactly that and return the regex (without surrounding quotes). + Multi-line values would defeat this — keep the hook formatting tight. + """ + m = re.search(rf"^{re.escape(var)}='([^']+)'", script, re.MULTILINE) + if m is None: + raise AssertionError(f"could not find {var}= assignment in hook script") + return m.group(1) + + +@pytest.fixture(scope="module") +def hooks() -> dict[str, dict[str, str]]: + """Load both hook scripts and extract the three regexes from each.""" + pre_commit_text = PRE_COMMIT.read_text(encoding="utf-8") + pre_push_text = PRE_PUSH.read_text(encoding="utf-8") + return { + "pre-commit": { + "ALLOW_RE": _extract_var(pre_commit_text, "ALLOW_RE"), + "FORBIDDEN_PATHS": _extract_var(pre_commit_text, "FORBIDDEN_PATHS"), + "INTERNAL_PATHS": _extract_var(pre_commit_text, "INTERNAL_PATHS"), + }, + "pre-push": { + "ALLOW_RE": _extract_var(pre_push_text, "ALLOW_RE"), + "FORBIDDEN_PATHS": _extract_var(pre_push_text, "FORBIDDEN_PATHS"), + "INTERNAL_PATHS": _extract_var(pre_push_text, "INTERNAL_PATHS"), + }, + } + + +# ── Drift between the two hooks ───────────────────────────────────────────── + + +def test_pre_commit_and_pre_push_regexes_stay_in_sync(hooks): + """If the two hooks drift, ``--no-verify`` defeats the local hook and + the pre-push backstop scans against a different rule set, leaving + real gaps. Treat the three regexes as a single source of truth.""" + for var in ("ALLOW_RE", "FORBIDDEN_PATHS", "INTERNAL_PATHS"): + assert hooks["pre-commit"][var] == hooks["pre-push"][var], ( + f"{var} drifted between pre-commit and pre-push — keep them identical." + ) + + +# ── ALLOW_RE: paths that MUST be content-scanning-exempt ──────────────────── + + +@pytest.mark.parametrize( + "path", + [ + # Locale catalogues — extracted mechanically from impressum/privacy/ + # terms templates which carry the operator's business address. Must stay + # allowed or every pybabel update is rejected by the hook. + "locale/de/LC_MESSAGES/messages.po", + "locale/en/LC_MESSAGES/messages.po", + "locale/messages.pot", + "locale/de/LC_MESSAGES/messages.mo", + # Public legal templates that intentionally contain the operator's + # business address. + "app/templates/impressum.html", + "app/templates/privacy.html", + "app/templates/terms.html", + # GDPR / DPA / commercial documents — public, but contain the + # business address by design. + "COMMERCIAL-LICENSE.md", + "docs/gdpr-account-deletion-design.md", + "docs/api-usage-guide.md", + "docs/self-hosting.md", + "docs/dpa-template.md", + # Hook scripts and CI workflow self-edits. + ".githooks/pre-commit", + ".githooks/pre-push", + ".github/workflows/scope-guard.yml", + # Repository-wide manifests that may carry contact info. + "CHANGELOG.md", + ".env.example", + # docs-internal/ is forbidden in PUBLIC commits but content checks + # don't apply to it (it never lands here at all). Listing it here + # is for completeness — the FORBIDDEN_PATHS match below blocks it. + "docs-internal/anything.md", + ], +) +def test_allow_re_includes_required_path(hooks, path): + pattern = re.compile(hooks["pre-commit"]["ALLOW_RE"]) + assert pattern.match(path), ( + f"{path} should be in ALLOW_RE — content-pattern scans would otherwise " + "block legitimate updates (i18n catalogues, address-bearing legal pages, ...)." + ) + + +@pytest.mark.parametrize( + "path", + [ + # Application code: must NOT be allowed; pattern scans must run. + "app/main.py", + "app/api/routes/billing.py", + "app/templates/dashboard.html", + # Random doc: not address-bearing → no exemption. + "README.md", + "docs/threat-model.md", + # Anything outside the allowlist's literal entries. + "scripts/i18n.py", + ], +) +def test_allow_re_excludes_normal_files(hooks, path): + pattern = re.compile(hooks["pre-commit"]["ALLOW_RE"]) + assert not pattern.match(path), ( + f"{path} should NOT be in ALLOW_RE — content-pattern scans must run on it." + ) + + +# ── FORBIDDEN_PATHS: ops-only filenames that must never land in public ────── + + +@pytest.mark.parametrize( + "path", + [ + "compose.prod.yml", + "deploy.sh", + ".env.production", + ".env.production.example", + "CLAUDE.md", + "runbooks/architecture-two-repo.md", + "runbooks/anything-else.md", + "docs-internal/whatever.md", + ], +) +def test_forbidden_paths_blocks_ops_artifacts(hooks, path): + pattern = re.compile(hooks["pre-commit"]["FORBIDDEN_PATHS"]) + assert pattern.search(path), ( + f"{path} should match FORBIDDEN_PATHS — ops-only artifact must never land in public repo." + ) + + +@pytest.mark.parametrize( + "path", + [ + # Public docker-compose for self-hosters. + "compose.yml", + # Deploy script in scripts/ is fine (only top-level deploy.sh is forbidden). + "scripts/deploy.py", + # Public env example. + ".env.example", + # Application code — must not be forbidden. + "app/main.py", + # Public docs. + "docs/self-hosting.md", + ], +) +def test_forbidden_paths_does_not_block_public_artifacts(hooks, path): + pattern = re.compile(hooks["pre-commit"]["FORBIDDEN_PATHS"]) + assert not pattern.search(path), ( + f"{path} should NOT match FORBIDDEN_PATHS — public asset incorrectly blocked." + ) + + +# ── INTERNAL_PATHS: docs/ files that must move to docs-internal/ ──────────── + + +@pytest.mark.parametrize( + "path", + [ + "docs/admin-cockpit.md", + "docs/email-setup.md", + "docs/open-tasks.md", + "docs/filemorph-io-runbook.md", + "docs/marketing-plan.md", + "docs/seo-strategy.md", + "docs/business-case.md", + "docs/claims-audit.md", + "docs/launch-gate-snapshot.md", + "docs/launch-readiness-tracker.md", + "docs/seo-audit.md", + "docs/user-acquisition-strategy.md", + "docs/requirements-v2.md", + "docs/sprint-5-multi-file-plan.md", + ], +) +def test_internal_paths_redirects_business_docs(hooks, path): + pattern = re.compile(hooks["pre-commit"]["INTERNAL_PATHS"]) + assert pattern.match(path), ( + f"{path} should match INTERNAL_PATHS — business/ops doc must land in docs-internal/." + ) + + +@pytest.mark.parametrize( + "path", + [ + # Public docs the self-hoster needs. + "docs/self-hosting.md", + "docs/api-reference.md", + "docs/api-usage-guide.md", + "docs/threat-model.md", + "docs/security-overview.md", + "docs/sub-processors.md", + "docs/dpa-template.md", + "docs/gdpr-account-deletion-design.md", + ], +) +def test_internal_paths_keeps_public_docs(hooks, path): + pattern = re.compile(hooks["pre-commit"]["INTERNAL_PATHS"]) + assert not pattern.match(path), ( + f"{path} should NOT match INTERNAL_PATHS — public doc incorrectly redirected." + ) diff --git a/tests/test_i18n.py b/tests/test_i18n.py index 0d6d35f..64789cc 100644 --- a/tests/test_i18n.py +++ b/tests/test_i18n.py @@ -332,3 +332,50 @@ def test_supported_locale_renders_each_page(client, locale): for page in pages: r = client.get(f"/{locale}{page}" if page != "/" else f"/{locale}/") assert r.status_code == 200, f"/{locale}{page} returned {r.status_code}" + + +# ── M9 — DE-content smoke (catalog-loaded asserter) ───────────────────────── +# +# The 200-status smoke above passes even when the .mo catalog is missing, +# corrupt, or out of sync — Babel falls back to the msgid (EN) silently. +# These tests pin a stable DE-only string per page, so a missing catalog +# (or a future PR that drops a translation) surfaces as a hard failure +# rather than a silent regression to English. +# +# Strings chosen for stability: +# - ``Datenschutz``: navbar / privacy heading; would never appear in EN. +# - ``Nutzungsbedingungen``: terms-of-use heading; would render as +# "Terms" or "Terms of Use" in EN. +# - ``Impressum``: legal-imprint header; same word in both locales but +# the page also carries DE-only ``Verantwortlich`` which we pin as +# the second anchor for the DE branch. +# - ``Widerrufsrecht``: BGB §356 right-of-withdrawal language; appears +# on /de/privacy as part of the consumer-protection clause and is +# untranslatable in an EN render. + + +@pytest.mark.parametrize( + "path,de_marker", + [ + ("/de/privacy", "Datenschutz"), + ("/de/terms", "Nutzungsbedingungen"), + ("/de/impressum", "Verantwortlich"), + ("/de/security", "Sicherheit"), + ], +) +def test_de_page_renders_german_content(client, path, de_marker): + """A DE-marker string must appear in the rendered output. If the .mo + catalog is missing or corrupt, gettext falls back to the EN msgid and + this assertion fails — without it the 200-status test passes silently. + """ + r = client.get(path) + assert r.status_code == 200, f"{path} returned {r.status_code}" + text = r.text + # Locale-resolution sanity (catches routing drift independently of + # the catalog status). + assert ' None: def test_enterprise_de_renders_authoritative_german(client, monkeypatch) -> None: """PR-i18n-2c: /de/enterprise carries the legal-authoritative DE text and must NOT show the EN-only disclaimer header (which calls itself - out as a non-binding translation).""" + out as a non-binding translation). + + M6 hardening: pin both `` N def test_impressum_en_has_preamble_then_german(client) -> None: """PR-i18n-2c: /en/impressum prepends an EN preamble explaining why - the legal body below stays in German (TMG § 5 Pflichtangaben).""" + the legal body below stays in German (TMG § 5 Pflichtangaben). + + M7 hardening: pin the *order* — preamble before the German body. + A template inversion (DE block above the EN explanation) would still + pass a presence-only check but break the document's purpose + (English speakers must see the preamble first to understand why the + rest is in German). + """ res = client.get("/en/impressum") assert res.status_code == 200 text = res.text - assert "as required by German law" in text, ( - "/en/impressum missing EN preamble explaining DE legal text" - ) - # Raw DE TMG content must still be present below the preamble - assert "Verantwortlich" in text, ( - "/en/impressum missing the raw DE TMG section after the preamble" + preamble_marker = "as required by German law" + body_marker = "Verantwortlich" + assert preamble_marker in text, "/en/impressum missing EN preamble explaining DE legal text" + assert body_marker in text, "/en/impressum missing the raw DE TMG section" + # Order pin: EN preamble must precede the DE body. + assert text.index(preamble_marker) < text.index(body_marker), ( + "/en/impressum has DE legal body before the EN preamble — template inverted?" )