agent-diff-bench
diff --git a/‎backend/pyproject.toml‎
Lines changed: 5 additions & 0 deletions b/‎backend/pyproject.toml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎backend/tests/integration/test_slack_api_docs.py‎
Lines changed: 141 additions & 0 deletions b/‎backend/tests/integration/test_slack_api_docs.py‎
Lines changed: 141 additions & 0 deletions
diff --git a/‎backend/tests/validation/CONFORMANCE.md‎
Lines changed: 86 additions & 0 deletions b/‎backend/tests/validation/CONFORMANCE.md‎
Lines changed: 86 additions & 0 deletions
@@ -27,3 +27,8 @@ dependencies = [
 
 [tool.pytest.ini_options]
 addopts = ["--tb=short"]
+markers = [
+    "conformance: API conformance/parity tests against production APIs",
+    "external: requires live API credentials (tokens/keys)",
+    "replica_only: tests against replica only (no external credentials needed)",
+]
@@ -357,3 +357,144 @@ async def test_search_messages_doc_shape(self, slack_client: AsyncClient) -> Non
         }
         assert expected_match_keys <= match.keys()
         assert HIGHLIGHT_START in match["text"] and HIGHLIGHT_END in match["text"]
+
+    async def test_auth_test_doc_shape(self, slack_client: AsyncClient) -> None:
+        resp = await slack_client.post("/auth.test", json={})
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["ok"] is True
+        assert {"user_id", "user", "team_id", "team"} <= data.keys()
+        assert data["user_id"] == USER_AGENT
+
+    async def test_chat_update_doc_shape(self, slack_client: AsyncClient) -> None:
+        post_resp = await slack_client.post(
+            "/chat.postMessage",
+            json={"channel": CHANNEL_GENERAL, "text": "Original text for update"},
+        )
+        assert post_resp.status_code == 200
+        ts = post_resp.json()["ts"]
+
+        resp = await slack_client.post(
+            "/chat.update",
+            json={"channel": CHANNEL_GENERAL, "ts": ts, "text": "Updated text"},
+        )
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["ok"] is True
+        assert {"ok", "channel", "ts", "text"} <= data.keys()
+        assert data["text"] == "Updated text"
+
+    async def test_conversations_archive_doc_shape(
+        self, slack_client: AsyncClient
+    ) -> None:
+        channel_name = _unique_name("doc-archive")
+        create_resp = await slack_client.post(
+            "/conversations.create", json={"name": channel_name, "is_private": False}
+        )
+        assert create_resp.status_code == 200
+        channel_id = create_resp.json()["channel"]["id"]
+
+        resp = await slack_client.post(
+            "/conversations.archive", json={"channel": channel_id}
+        )
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["ok"] is True
+
+    async def test_conversations_unarchive_doc_shape(
+        self, slack_client: AsyncClient
+    ) -> None:
+        channel_name = _unique_name("doc-unarch")
+        create_resp = await slack_client.post(
+            "/conversations.create", json={"name": channel_name, "is_private": False}
+        )
+        assert create_resp.status_code == 200
+        channel_id = create_resp.json()["channel"]["id"]
+
+        await slack_client.post(
+            "/conversations.archive", json={"channel": channel_id}
+        )
+
+        resp = await slack_client.post(
+            "/conversations.unarchive", json={"channel": channel_id}
+        )
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["ok"] is True
+
+    async def test_conversations_rename_doc_shape(
+        self, slack_client: AsyncClient
+    ) -> None:
+        channel_name = _unique_name("doc-rename")
+        create_resp = await slack_client.post(
+            "/conversations.create", json={"name": channel_name, "is_private": False}
+        )
+        assert create_resp.status_code == 200
+        channel_id = create_resp.json()["channel"]["id"]
+
+        new_name = _unique_name("doc-renamed")
+        resp = await slack_client.post(
+            "/conversations.rename",
+            json={"channel": channel_id, "name": new_name},
+        )
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["ok"] is True
+        assert data["channel"]["name"] == new_name
+
+    async def test_conversations_kick_doc_shape(
+        self, slack_client: AsyncClient, slack_client_john: AsyncClient
+    ) -> None:
+        channel_name = _unique_name("doc-kick")
+        create_resp = await slack_client.post(
+            "/conversations.create", json={"name": channel_name, "is_private": False}
+        )
+        assert create_resp.status_code == 200
+        channel_id = create_resp.json()["channel"]["id"]
+
+        await slack_client.post(
+            "/conversations.invite",
+            json={"channel": channel_id, "users": USER_JOHN},
+        )
+
+        resp = await slack_client.post(
+            "/conversations.kick",
+            json={"channel": channel_id, "user": USER_JOHN},
+        )
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["ok"] is True
+
+    async def test_conversations_members_doc_shape(
+        self, slack_client: AsyncClient
+    ) -> None:
+        resp = await slack_client.get(
+            f"/conversations.members?channel={CHANNEL_GENERAL}&limit=10"
+        )
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["ok"] is True
+        assert "members" in data
+        assert isinstance(data["members"], list)
+        assert "response_metadata" in data
+
+    async def test_users_list_doc_shape(self, slack_client: AsyncClient) -> None:
+        resp = await slack_client.get("/users.list?limit=5")
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["ok"] is True
+        assert "members" in data
+        assert isinstance(data["members"], list)
+        if data["members"]:
+            user = data["members"][0]
+            assert {"id", "name", "profile"} <= user.keys()
+
+    async def test_users_conversations_doc_shape(
+        self, slack_client: AsyncClient
+    ) -> None:
+        resp = await slack_client.get(f"/users.conversations?user={USER_AGENT}&limit=5")
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["ok"] is True
+        assert "channels" in data
+        assert isinstance(data["channels"], list)
@@ -0,0 +1,86 @@
+# API Conformance Testing
+
+## Overview
+
+This directory contains conformance tests that validate Agent-Diff API replicas against their real-world production counterparts. The tests compare **response schema/shape** (field presence, types, and structure), **status codes**, **error semantics**, and **mutation behavior** -- not exact values, since IDs and timestamps will naturally differ between environments.
+
+## Per-Service Methodology
+
+### Box (REST API)
+
+**Approach:** Dual-fire against production Box API and replica. Each operation is executed against both environments, and response schemas are compared using recursive shape extraction.
+
+- **Token:** `BOX_DEV_TOKEN` (Box developer token)
+- **Endpoints tested:** 33/33 implemented endpoints
+- **What is validated:** Response field presence and types, status code parity, error shapes (404, 400, 409), CRUD operations (folders, files, comments, tasks, hubs, collections, search), file upload/download, file version upload
+- **Enterprise-only fields** (54 fields like `role`, `enterprise`, `sync_state`) are excluded from comparison, as they only appear for enterprise Box accounts
+- **Last run:** 105/106 passed (99%)
+
+### Google Calendar (REST API)
+
+**Approach:** Dual-fire against Google Calendar API v3 and replica. Creates matching resources (calendars, events) in both environments, then validates all operations.
+
+- **Token:** `GOOGLE_CALENDAR_ACCESS_TOKEN` (OAuth2 bearer token)
+- **Endpoints tested:** 37/37 implemented endpoints (calendars, calendarList, events, ACL, settings, colors, freeBusy, batch, watch, channels)
+- **What is validated:** Response schema parity, status codes, CRUD operations, recurring events, quickAdd, event move, ETag behavior, batch requests, error handling, delete operations
+- **Optional data-dependent fields** (55+ fields like `nextPageToken`, `attendees`, `conferenceData`) are excluded from comparison
+
+### Linear (GraphQL API)
+
+**Approach:** Dual-fire against Linear production GraphQL API and replica. Creates matching resources (issues, labels, comments) in both environments, then validates queries and mutations. Additionally runs **focused schema introspection** to detect drift between production and replica GraphQL schemas.
+
+- **Token:** `LINEAR_API_KEY` (Linear API key)
+- **Operations tested:** 31 queries + 16 mutations + schema introspection
+- **Queries validated:** Issue filters (string, number, ID, team, assignee, creator, state, date, label, comment comparators), search operations (with pagination, ordering, partial match), resource queries (teams, projects, users, workflowStates, issueLabels, viewer), pagination/sorting, query by identifier, error handling
+- **Mutations validated:** issueCreate, issueUpdate, issueDelete, issueArchive/Unarchive, commentCreate, commentUpdate, commentDelete, issueLabelCreate, issueLabelUpdate, issueLabelDelete, issueAddLabel, issueRemoveLabel
+- **Schema introspection:** Compares focused type surfaces (StringComparator, IssueFilter, Issue, Query, Mutation, etc.) between production and replica schemas
+- **Last run:** 89/90 passed (98%) -- the single failure is schema drift on newer Linear API fields (expected as Linear evolves its API)
+
+### Slack (Docs-Golden)
+
+**Approach:** Replica-only, validated against documented Slack API contracts. Unlike Box/Calendar/Linear, Slack conformance does not compare against a live Slack workspace because live-workspace parity is difficult to standardize (workspace state, installed apps, and permissions vary).
+
+- **No external token required**
+- **Methods tested:** 22/28 implemented methods
+- **What is validated:** Response field presence (exact key sets), error semantics (`ok: false` with specific error codes), warning shapes, pagination structure
+- **Methods covered:** auth.test, chat.postMessage, chat.update, chat.delete, conversations.create, conversations.join, conversations.history, conversations.replies, conversations.info, conversations.leave, conversations.setTopic, conversations.archive, conversations.unarchive, conversations.rename, conversations.kick, conversations.members, reactions.add, reactions.get, users.info, users.list, users.conversations, search.messages
+- **Last run:** 22/22 passed (100%)
+
+## How to Run
+
+```bash
+# All conformance tests (requires all tokens set)
+pytest -m conformance -v
+
+# Individual services
+BOX_DEV_TOKEN=<token> pytest tests/validation/test_box_parity.py -v -s
+GOOGLE_CALENDAR_ACCESS_TOKEN=<token> pytest tests/validation/test_calendar_parity_comprehensive.py -v -s
+LINEAR_API_KEY=<key> pytest tests/validation/test_linear_parity_comprehensive.py -v -s
+
+# Slack (no external token needed)
+pytest tests/validation/test_slack_conformance.py -v
+
+# Or run standalone (with detailed output):
+BOX_DEV_TOKEN=<token> python tests/validation/test_box_parity.py
+GOOGLE_CALENDAR_ACCESS_TOKEN=<token> python tests/validation/test_calendar_parity_comprehensive.py
+LINEAR_API_KEY=<key> python tests/validation/test_linear_parity_comprehensive.py
+```
+
+**Prerequisites:**
+- Backend replica must be running (`docker-compose up` from `ops/`)
+- Slack tests must be run either inside Docker (`docker exec ops-backend-1 pytest ...`) or from an environment that has local database access
+
+## Interpreting Results
+
+- **Pass threshold:** The pytest entry points assert a pass rate of at least 70%. This threshold allows for minor schema differences (e.g., enterprise-only fields, newer API fields) while still catching significant divergence.
+- **Schema mismatches** indicate fields present in one environment but not the other. These are logged with the specific field path and should be investigated -- many are benign (optional fields, tier-specific fields).
+- **Error parity** means both environments return the same error class (e.g., both return 404, or both return a GraphQL error with similar keywords). Exact error messages may differ.
+
+## Coverage Summary
+
+| Service  | Protocol | Endpoints Tested | Test Count | Pass Rate | Methodology |
+|----------|----------|-----------------|------------|-----------|-------------|
+| Box      | REST     | 33/33           | 106        | 99%       | Production parity |
+| Calendar | REST     | 37/37           | 77         | 100%      | Production parity |
+| Linear   | GraphQL  | 47 operations   | 90         | 98%       | Production parity + introspection |
+| Slack    | REST     | 22/28 methods   | 22         | 100%      | Docs-golden |