diff --git a/.claude/settings.json b/.claude/settings.json index 01dbe140..27da5362 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -1,5 +1,16 @@ { "permissions": { + "allow": [ + "Bash(just:*)", + "Bash(uv run:*)", + "Bash(npm:*)", + "Bash(git status:*)", + "Bash(git diff:*)", + "Bash(git log:*)", + "Bash(git add:*)", + "Bash(git commit:*)", + "Read(*)" + ], "deny": [ "Bash(rm -rf:*)", "Bash(git push --force:*)", @@ -16,17 +27,6 @@ "Edit(alembic/*)", "Edit(.github/*)", "Write(alembic/*)" - ], - "allow": [ - "Bash(just:*)", - "Bash(uv run:*)", - "Bash(npm:*)", - "Bash(git status:*)", - "Bash(git diff:*)", - "Bash(git log:*)", - "Bash(git add:*)", - "Bash(git commit:*)", - "Read(*)" ] }, "hooks": { diff --git a/client/bypassed-login-layout.png b/client/bypassed-login-layout.png deleted file mode 100644 index 755a4e3a..00000000 Binary files a/client/bypassed-login-layout.png and /dev/null differ diff --git a/client/current-state.png b/client/current-state.png deleted file mode 100644 index 7dc8b96d..00000000 Binary files a/client/current-state.png and /dev/null differ diff --git a/client/debug-screenshot.png b/client/debug-screenshot.png deleted file mode 100644 index 7dc8b96d..00000000 Binary files a/client/debug-screenshot.png and /dev/null differ diff --git a/client/layout-error-check.png b/client/layout-error-check.png deleted file mode 100644 index c42deae2..00000000 Binary files a/client/layout-error-check.png and /dev/null differ diff --git a/client/vite.config.ts.timestamp-1768838215376-66642caaea0b4.mjs b/client/vite.config.ts.timestamp-1768838215376-66642caaea0b4.mjs deleted file mode 100644 index 03803fa4..00000000 --- a/client/vite.config.ts.timestamp-1768838215376-66642caaea0b4.mjs +++ /dev/null @@ -1,71 +0,0 @@ -// vite.config.ts -import { defineConfig } from "file:///Users/forrest.murray/Documents/ralph-refactor/client/node_modules/vite/dist/node/index.js"; -import react from "file:///Users/forrest.murray/Documents/ralph-refactor/client/node_modules/@vitejs/plugin-react-swc/index.mjs"; -import path from "path"; -var __vite_injected_original_dirname = "/Users/forrest.murray/Documents/ralph-refactor/client"; -var vite_config_default = defineConfig({ - plugins: [react()], - resolve: { - alias: { - "@": path.resolve(__vite_injected_original_dirname, "./src") - } - }, - test: { - environment: "jsdom", - globals: true, - setupFiles: ["./src/test/setup.ts"], - css: true, - include: ["src/**/*.{test,spec}.{ts,tsx}"], - coverage: { - provider: "v8", - reporter: ["text", "html", "lcov"], - reportsDirectory: "coverage", - exclude: ["**/node_modules/**", "tests/**", "build/**", "src/client/**"] - } - }, - server: { - port: 3e3, - proxy: { - "/users": { - target: "http://localhost:8000", - changeOrigin: true - }, - "/workshops": { - target: "http://localhost:8000", - changeOrigin: true - }, - "/dbsql-export": { - target: "http://localhost:8000", - changeOrigin: true - }, - "/health": { - target: "http://localhost:8000", - changeOrigin: true - }, - "/test": { - target: "http://localhost:8000", - changeOrigin: true - }, - "/databricks": { - target: "http://localhost:8000", - changeOrigin: true - } - } - }, - build: { - outDir: "build", - minify: "terser", - terserOptions: { - compress: { - // Temporarily keep console statements for debugging - // TODO: Re-enable drop_console: true for production - drop_console: false, - drop_debugger: true - } - } - } -}); -export { - vite_config_default as default -}; -//# sourceMappingURL=data:application/json;base64,ewogICJ2ZXJzaW9uIjogMywKICAic291cmNlcyI6IFsidml0ZS5jb25maWcudHMiXSwKICAic291cmNlc0NvbnRlbnQiOiBbImNvbnN0IF9fdml0ZV9pbmplY3RlZF9vcmlnaW5hbF9kaXJuYW1lID0gXCIvVXNlcnMvZm9ycmVzdC5tdXJyYXkvRG9jdW1lbnRzL3JhbHBoLXJlZmFjdG9yL2NsaWVudFwiO2NvbnN0IF9fdml0ZV9pbmplY3RlZF9vcmlnaW5hbF9maWxlbmFtZSA9IFwiL1VzZXJzL2ZvcnJlc3QubXVycmF5L0RvY3VtZW50cy9yYWxwaC1yZWZhY3Rvci9jbGllbnQvdml0ZS5jb25maWcudHNcIjtjb25zdCBfX3ZpdGVfaW5qZWN0ZWRfb3JpZ2luYWxfaW1wb3J0X21ldGFfdXJsID0gXCJmaWxlOi8vL1VzZXJzL2ZvcnJlc3QubXVycmF5L0RvY3VtZW50cy9yYWxwaC1yZWZhY3Rvci9jbGllbnQvdml0ZS5jb25maWcudHNcIjtpbXBvcnQgeyBkZWZpbmVDb25maWcgfSBmcm9tICd2aXRlJ1xuaW1wb3J0IHJlYWN0IGZyb20gJ0B2aXRlanMvcGx1Z2luLXJlYWN0LXN3YydcbmltcG9ydCBwYXRoIGZyb20gJ3BhdGgnXG5cbi8vIGh0dHBzOi8vdml0ZWpzLmRldi9jb25maWcvXG5leHBvcnQgZGVmYXVsdCBkZWZpbmVDb25maWcoe1xuICBwbHVnaW5zOiBbcmVhY3QoKV0sXG4gIHJlc29sdmU6IHtcbiAgICBhbGlhczoge1xuICAgICAgJ0AnOiBwYXRoLnJlc29sdmUoX19kaXJuYW1lLCAnLi9zcmMnKSxcbiAgICB9LFxuICB9LFxuICB0ZXN0OiB7XG4gICAgZW52aXJvbm1lbnQ6ICdqc2RvbScsXG4gICAgZ2xvYmFsczogdHJ1ZSxcbiAgICBzZXR1cEZpbGVzOiBbJy4vc3JjL3Rlc3Qvc2V0dXAudHMnXSxcbiAgICBjc3M6IHRydWUsXG4gICAgaW5jbHVkZTogWydzcmMvKiovKi57dGVzdCxzcGVjfS57dHMsdHN4fSddLFxuICAgIGNvdmVyYWdlOiB7XG4gICAgICBwcm92aWRlcjogJ3Y4JyxcbiAgICAgIHJlcG9ydGVyOiBbJ3RleHQnLCAnaHRtbCcsICdsY292J10sXG4gICAgICByZXBvcnRzRGlyZWN0b3J5OiAnY292ZXJhZ2UnLFxuICAgICAgZXhjbHVkZTogWycqKi9ub2RlX21vZHVsZXMvKionLCAndGVzdHMvKionLCAnYnVpbGQvKionLCAnc3JjL2NsaWVudC8qKiddLFxuICAgIH0sXG4gIH0sXG4gIHNlcnZlcjoge1xuICAgIHBvcnQ6IDMwMDAsXG4gICAgcHJveHk6IHtcbiAgICAgICcvdXNlcnMnOiB7XG4gICAgICAgIHRhcmdldDogJ2h0dHA6Ly9sb2NhbGhvc3Q6ODAwMCcsXG4gICAgICAgIGNoYW5nZU9yaWdpbjogdHJ1ZSxcbiAgICAgIH0sXG4gICAgICAnL3dvcmtzaG9wcyc6IHtcbiAgICAgICAgdGFyZ2V0OiAnaHR0cDovL2xvY2FsaG9zdDo4MDAwJyxcbiAgICAgICAgY2hhbmdlT3JpZ2luOiB0cnVlLFxuICAgICAgfSxcbiAgICAgICcvZGJzcWwtZXhwb3J0Jzoge1xuICAgICAgICB0YXJnZXQ6ICdodHRwOi8vbG9jYWxob3N0OjgwMDAnLFxuICAgICAgICBjaGFuZ2VPcmlnaW46IHRydWUsXG4gICAgICB9LFxuICAgICAgJy9oZWFsdGgnOiB7XG4gICAgICAgIHRhcmdldDogJ2h0dHA6Ly9sb2NhbGhvc3Q6ODAwMCcsXG4gICAgICAgIGNoYW5nZU9yaWdpbjogdHJ1ZSxcbiAgICAgIH0sXG4gICAgICAnL3Rlc3QnOiB7XG4gICAgICAgIHRhcmdldDogJ2h0dHA6Ly9sb2NhbGhvc3Q6ODAwMCcsXG4gICAgICAgIGNoYW5nZU9yaWdpbjogdHJ1ZSxcbiAgICAgIH0sXG4gICAgICAnL2RhdGFicmlja3MnOiB7XG4gICAgICAgIHRhcmdldDogJ2h0dHA6Ly9sb2NhbGhvc3Q6ODAwMCcsXG4gICAgICAgIGNoYW5nZU9yaWdpbjogdHJ1ZSxcbiAgICAgIH0sXG4gICAgfSxcbiAgfSxcbiAgYnVpbGQ6IHtcbiAgICBvdXREaXI6ICdidWlsZCcsXG4gICAgbWluaWZ5OiAndGVyc2VyJyxcbiAgICB0ZXJzZXJPcHRpb25zOiB7XG4gICAgICBjb21wcmVzczoge1xuICAgICAgICAvLyBUZW1wb3JhcmlseSBrZWVwIGNvbnNvbGUgc3RhdGVtZW50cyBmb3IgZGVidWdnaW5nXG4gICAgICAgIC8vIFRPRE86IFJlLWVuYWJsZSBkcm9wX2NvbnNvbGU6IHRydWUgZm9yIHByb2R1Y3Rpb25cbiAgICAgICAgZHJvcF9jb25zb2xlOiBmYWxzZSxcbiAgICAgICAgZHJvcF9kZWJ1Z2dlcjogdHJ1ZSxcbiAgICAgIH0sXG4gICAgfSxcbiAgfSxcbn0pIl0sCiAgIm1hcHBpbmdzIjogIjtBQUFpVixTQUFTLG9CQUFvQjtBQUM5VyxPQUFPLFdBQVc7QUFDbEIsT0FBTyxVQUFVO0FBRmpCLElBQU0sbUNBQW1DO0FBS3pDLElBQU8sc0JBQVEsYUFBYTtBQUFBLEVBQzFCLFNBQVMsQ0FBQyxNQUFNLENBQUM7QUFBQSxFQUNqQixTQUFTO0FBQUEsSUFDUCxPQUFPO0FBQUEsTUFDTCxLQUFLLEtBQUssUUFBUSxrQ0FBVyxPQUFPO0FBQUEsSUFDdEM7QUFBQSxFQUNGO0FBQUEsRUFDQSxNQUFNO0FBQUEsSUFDSixhQUFhO0FBQUEsSUFDYixTQUFTO0FBQUEsSUFDVCxZQUFZLENBQUMscUJBQXFCO0FBQUEsSUFDbEMsS0FBSztBQUFBLElBQ0wsU0FBUyxDQUFDLCtCQUErQjtBQUFBLElBQ3pDLFVBQVU7QUFBQSxNQUNSLFVBQVU7QUFBQSxNQUNWLFVBQVUsQ0FBQyxRQUFRLFFBQVEsTUFBTTtBQUFBLE1BQ2pDLGtCQUFrQjtBQUFBLE1BQ2xCLFNBQVMsQ0FBQyxzQkFBc0IsWUFBWSxZQUFZLGVBQWU7QUFBQSxJQUN6RTtBQUFBLEVBQ0Y7QUFBQSxFQUNBLFFBQVE7QUFBQSxJQUNOLE1BQU07QUFBQSxJQUNOLE9BQU87QUFBQSxNQUNMLFVBQVU7QUFBQSxRQUNSLFFBQVE7QUFBQSxRQUNSLGNBQWM7QUFBQSxNQUNoQjtBQUFBLE1BQ0EsY0FBYztBQUFBLFFBQ1osUUFBUTtBQUFBLFFBQ1IsY0FBYztBQUFBLE1BQ2hCO0FBQUEsTUFDQSxpQkFBaUI7QUFBQSxRQUNmLFFBQVE7QUFBQSxRQUNSLGNBQWM7QUFBQSxNQUNoQjtBQUFBLE1BQ0EsV0FBVztBQUFBLFFBQ1QsUUFBUTtBQUFBLFFBQ1IsY0FBYztBQUFBLE1BQ2hCO0FBQUEsTUFDQSxTQUFTO0FBQUEsUUFDUCxRQUFRO0FBQUEsUUFDUixjQUFjO0FBQUEsTUFDaEI7QUFBQSxNQUNBLGVBQWU7QUFBQSxRQUNiLFFBQVE7QUFBQSxRQUNSLGNBQWM7QUFBQSxNQUNoQjtBQUFBLElBQ0Y7QUFBQSxFQUNGO0FBQUEsRUFDQSxPQUFPO0FBQUEsSUFDTCxRQUFRO0FBQUEsSUFDUixRQUFRO0FBQUEsSUFDUixlQUFlO0FBQUEsTUFDYixVQUFVO0FBQUE7QUFBQTtBQUFBLFFBR1IsY0FBYztBQUFBLFFBQ2QsZUFBZTtBQUFBLE1BQ2pCO0FBQUEsSUFDRjtBQUFBLEVBQ0Y7QUFDRixDQUFDOyIsCiAgIm5hbWVzIjogW10KfQo= diff --git a/debt-docs/ARCHITECTURE_DEBT.md b/debt-docs/ARCHITECTURE_DEBT.md new file mode 100644 index 00000000..771eae73 --- /dev/null +++ b/debt-docs/ARCHITECTURE_DEBT.md @@ -0,0 +1,532 @@ +# Architecture & Modularity Debt + +## Overview + +The codebase has significant architectural debt around **separation of concerns**, **modularity**, and **abstraction usage**. The backend lacks a proper service layer — route handlers perform business logic, and the single `DatabaseService` class (3,692 lines, 100+ methods) acts as a god service. On the frontend, an OpenAPI-generated client exists but is widely bypassed in favor of raw `fetch()` calls, type definitions are duplicated across files, and context providers mix API, state, and storage concerns. These issues compound testing debt (untestable monoliths) and make the codebase brittle to change. + +Cross-references: CQ-1 (god file workshops.py), CQ-2 (god component JudgeTuningPage), SEC-4 (unprotected endpoints), PERF-5 (dead cache), DX-8 (API client not documented). + +--- + +## Items + +### ARCH-1: God Service - DatabaseService (3,692 lines, 100+ public methods) + +**Severity**: CRITICAL +**Location**: `server/services/database_service.py` + +**Description**: A single class handles all database operations across every domain: workshop CRUD, user authentication, trace management, discovery findings, annotations, rubric parsing and reconstruction, MLflow configuration, participant notes, judge prompts and evaluations, facilitator config, IRR calculations, and trace randomization. Over 100 public methods with no domain boundaries. + +Examples of unrelated responsibilities in one class: +- `authenticate_facilitator_from_yaml()` (line ~170) — auth logic +- `add_finding()` (line ~738) — discovery domain +- `_parse_rubric_questions()` (line ~1151) — rubric parsing +- `get_irr_data()` (line ~2500+) — statistical analysis +- `randomize_trace_assignment()` (line ~2700+) — assignment algorithm + +**Impact**: Cannot test any domain in isolation. Changes to annotation logic risk breaking auth. No clear ownership boundaries. Impossible to onboard to a specific domain without understanding all 3,692 lines. + +**Remediation**: Split into focused services: +- `WorkshopService` — workshop CRUD, phase management +- `AuthenticationService` — login, facilitator auth, session management +- `AnnotationService` — annotation submission, retrieval, assignment +- `RubricService` — rubric parsing, reconstruction, validation +- `TraceService` — trace CRUD, assignment, randomization +- `JudgeService` — judge prompts, evaluations, metrics +- `ParticipantService` — participant management, notes + +**Acceptance Criteria**: +- [ ] No single service file exceeds 600 lines +- [ ] Each service handles exactly one domain +- [ ] Services communicate through defined interfaces, not shared state +- [ ] All existing tests still pass after split + +--- + +### ARCH-2: Business Logic in Route Handlers (Backend) + +**Severity**: CRITICAL +**Location**: `server/routers/workshops.py` (multiple locations), `server/routers/users.py` + +**Description**: Route handlers perform domain logic, orchestration, and direct database operations instead of delegating to services. Routes should be thin — validate input, call a service, return output. + +**Key violations**: + +1. **Direct database queries in route handler** (`workshops.py:541-547`): + ```python + workshop_db = db.query(WorkshopDB).filter(WorkshopDB.id == workshop_id).first() + workshop_db.show_participant_notes = not current_value + db.commit() + ``` + `DatabaseService.get_workshop()` already exists but is bypassed. + +2. **Complex orchestration inline in route** (`workshops.py:1100-1230`): + Auto-evaluation logic manually creates jobs, manages state, orchestrates AlignmentService + DatabricksService + token management, and spawns background threads — all inside a route handler. + +3. **Round-robin annotation assignment in route** (`workshops.py:1300-1350`): + ```python + for i, trace in enumerate(traces): + annotator = annotators[i % len(annotators)] + assignments[annotator.user_id].append(trace.id) + ``` + Load-balancing algorithm belongs in an `AnnotationAssignmentService`. + +4. **Login handler orchestrates 4 service calls with inline auth logic** (`users.py:33-69`): + Route checks facilitator YAML auth, then regular auth, then validates workshop access, then activates user — all inline. Should be a single `auth_service.login(credentials)` call. + +**Impact**: Routes are untestable without full stack. Business logic scattered across routes and services with no single source of truth. Adding a new auth method requires modifying route handlers. + +**Remediation**: Extract business logic to service methods. Route handlers should be 5-15 lines: parse input, call service, return response. + +**Acceptance Criteria**: +- [ ] No `db.query()` calls in any router file +- [ ] No `db.commit()` calls in any router file +- [ ] Route handlers are < 20 lines (excluding request model definitions) +- [ ] All orchestration logic lives in services + +--- + +### ARCH-3: Generated API Client Exists but Is Widely Bypassed + +**Severity**: CRITICAL +**Location**: +- `client/src/hooks/useWorkshopApi.ts` — 15+ raw `fetch()` calls +- `client/src/hooks/useDatabricksApi.ts` — 7+ raw `fetch()` calls +- `client/src/components/RubricSuggestionPanel.tsx:65` +- `client/src/components/AnnotationStartPage.tsx:51` +- `client/src/components/DiscoveryStartPage.tsx:46` +- `client/src/components/RoleBasedWorkflow.tsx:79` + +**Description**: An OpenAPI-generated TypeScript client exists at `client/src/client/services/` with type-safe methods (e.g., `WorkshopsService.getFindingsWorkshopsWorkshopIdFindingsGet()`). However, 16+ locations use raw `fetch()` with manually constructed URLs: + +```typescript +// useWorkshopApi.ts — raw fetch instead of generated client +const response = await fetch(`/workshops/${workshopId}/all-traces`); +const response = await fetch(`/workshops/${workshopId}/annotations-with-users`); +const response = await fetch(`/workshops/${workshopId}/aggregate-all-feedback`, { method: 'POST' }); + +// Components — raw fetch instead of generated client +const response = await fetch(`/workshops/${workshopId}/generate-rubric-suggestions`, { method: 'POST' }); +const response = await fetch(`/workshops/${workshopId}/begin-annotation`, { method: 'POST' }); +``` + +Some generated service methods exist for these exact endpoints but are not used (e.g., `WorkshopsService.isUserDiscoveryCompleteWorkshopsWorkshopIdUsersUserIdDiscoveryCompleteGet()` exists but `RoleBasedWorkflow.tsx:79` calls `fetch()` directly). + +**Impact**: No type safety on 16+ API calls. If backend changes a response shape, TypeScript won't catch it. Endpoint URLs duplicated as strings. Error handling patterns inconsistent between raw fetch and generated client. + +**Remediation**: Replace all raw `fetch()` calls with corresponding generated service methods. If a method doesn't exist in the generated client, regenerate from the current OpenAPI spec. + +**Acceptance Criteria**: +- [ ] Zero raw `fetch()` calls to backend API in application code +- [ ] All API calls go through generated `WorkshopsService`, `UsersService`, `DatabricksService` +- [ ] justfile recipe exists for regenerating the client (`just generate-api-client`) +- [ ] CI check validates generated client is up-to-date with backend + +--- + +### ARCH-4: God Components (Frontend) + +**Severity**: HIGH +**Location**: +- `client/src/components/TraceViewer.tsx` — 1,650 lines +- `client/src/components/FacilitatorDashboard.tsx` — 1,325 lines +- `client/src/components/TraceDataViewer.tsx` — 730 lines +- `client/src/components/RoleBasedWorkflow.tsx` — 630 lines + +**Description**: These components mix data fetching, business logic, and rendering in a single file. (CQ-2 already covers `JudgeTuningPage.tsx` at 2,754 lines.) + +Specific violations: + +- **FacilitatorDashboard.tsx**: Lines 1-52 import 10+ React Query hooks. Lines 71-124 compute progress metrics, user contributions, and trace coverage via complex `useMemo` + array operations. Lines 126-400+ render multiple dashboard panels. All three concerns (data, logic, UI) tightly coupled. + +- **TraceViewer.tsx**: Lines 42-140 define utility functions (`isMarkdownContent()`, `isUrl()`, `isJsonString()`, `fixMalformedJson()`) that belong in `utils/`. Lines 150-500+ mix smart JSON rendering logic with the component render tree. + +- **TraceDataViewer.tsx**: Lines 42-140 contain `extractLLMContent()` — a 100-line function for parsing various LLM response formats. This is domain logic, not a UI concern. + +- **RoleBasedWorkflow.tsx**: Lines 48-65 contain business logic for starting phases. Lines 75-97 define inline query functions instead of using custom hooks. + +**Impact**: Cannot test business logic without rendering components. Cannot reuse progress calculation or LLM content extraction outside these components. High cognitive load. + +**Remediation**: Extract: +- Business logic to service modules (e.g., `services/workshopProgressService.ts`, `utils/llmResponseParser.ts`) +- Data fetching to custom hooks (e.g., `useFacilitatorDashboardData()`) +- Sub-panels to separate components +- Utility functions to `utils/` + +**Acceptance Criteria**: +- [ ] No component file exceeds 500 lines +- [ ] Business logic extractable and testable without React +- [ ] Utility functions in `utils/` with independent unit tests + +--- + +### ARCH-5: Infrastructure Code in Router Module (Jobs + Threading) + +**Severity**: HIGH +**Location**: `server/routers/workshops.py:24-130, 1120+` + +**Description**: The workshops router contains infrastructure that doesn't belong there: + +1. **AlignmentJob dataclass with file-based persistence** (lines 24-130): + ```python + class AlignmentJob: + def save(self): + temp_path = self._meta_path + ".tmp" + with open(temp_path, "w") as f: + json.dump(data, f) + os.rename(temp_path, self._meta_path) + ``` + Job persistence (file I/O, JSON serialization, atomic writes) embedded in a router module. + +2. **Background threads spawned directly in routes** (lines 1120+): + ```python + def run_auto_evaluation_for_new_traces(): + from server.database import SessionLocal + thread_db = SessionLocal() + # ...complex evaluation logic... + threading.Thread(target=run_auto_evaluation_for_new_traces, daemon=True).start() + ``` + Routes create database sessions in background threads with no pool management, no cancellation mechanism, and errors only caught via logging. + +3. **Retry utility function** (lines 170-227): + Generic retry logic with exponential backoff defined inline in the router. + +**Impact**: Job lifecycle is untestable. Thread resource leaks possible. Retry logic is not reusable. Circular dependency risk from late imports inside threads. + +**Remediation**: +- Extract `AlignmentJob` + persistence to `services/job_service.py` with a `JobRepository` abstraction +- Extract background work to `services/background_task_service.py` (or use FastAPI `BackgroundTasks`) +- Extract retry logic to `utils/retry.py` or a decorator + +**Acceptance Criteria**: +- [ ] No `threading.Thread` usage in router files +- [ ] No file I/O in router files +- [ ] Job persistence abstracted behind a repository interface +- [ ] Background tasks use FastAPI's `BackgroundTasks` or a managed task queue + +--- + +### ARCH-6: Frontend Context Providers Mix API, State, and Storage Concerns + +**Severity**: HIGH +**Location**: +- `client/src/context/UserContext.tsx` +- `client/src/context/WorkshopContext.tsx` +- `client/src/context/WorkflowContext.tsx` + +**Description**: Context providers are doing triple duty — API calls, state management, and localStorage persistence — all in one layer. + +- **UserContext.tsx**: + - Lines 62-121: `initializeUser()` mixes localStorage reads, `UsersService` API calls, error handling, and permission loading + - Lines 132-159: `loadPermissions()` handles API calls AND state updates AND localStorage cleanup + - Lines 161-169: `updateLastActive()` fires API calls with silent failures (empty catch block) + +- **WorkshopContext.tsx**: + - Lines 28-80: `getWorkshopIdFromUrl()` utility function mixes URL parsing with localStorage management + - Lines 87-122: State initialization with direct localStorage access mixed with React Query cache clearing + +- **WorkflowContext.tsx**: + - Lines 41-54: Multiple `useQuery` hooks with inline `fetch()` calls instead of using generated services + +**Impact**: Cannot test state logic without mocking APIs and localStorage. Storage strategy change (e.g., sessionStorage, IndexedDB) requires modifying context providers. Silent failures in API calls make debugging hard. + +**Remediation**: Separate concerns into layers: +- `services/authService.ts` — API calls for auth +- `hooks/usePersistedState.ts` — localStorage abstraction +- Contexts hold state only, delegate API/storage to services and hooks + +**Acceptance Criteria**: +- [ ] Context providers contain zero `fetch()` or service calls +- [ ] Context providers contain zero direct `localStorage` access +- [ ] API calls extracted to hooks or services +- [ ] Storage abstracted behind a custom hook + +--- + +### ARCH-7: Ad-Hoc Authorization Pattern + +**Severity**: HIGH +**Location**: `server/routers/users.py`, `server/routers/workshops.py` + +**Description**: Authorization checks are implemented inline in individual route handlers rather than via middleware or decorators. Each endpoint independently decides whether and how to check permissions. + +```python +# users.py:117-120 — inline role check +inviter = db_service.get_user(invitation_data.invited_by) +if not inviter or inviter.role != UserRole.FACILITATOR: + raise HTTPException(status_code=403, detail='Only facilitators can create invitations') +``` + +Additionally, `server/models.py:74-127` places permission calculation logic on a Pydantic response model (`UserPermissions.for_role()`), which is a data contract, not a business logic layer. + +Cross-reference: SEC-4 covers the specific unprotected endpoints. This item covers the systemic pattern. + +**Impact**: Easy to forget auth on new endpoints. Authorization logic duplicated and inconsistent. Cannot audit protection from a single location. Violates the auth spec's intent to keep permission logic abstract and swappable. + +**Remediation**: Create a FastAPI dependency-based auth system: +```python +async def require_role(role: UserRole): + def dependency(user: User = Depends(get_current_user)): + if user.role != role: + raise HTTPException(403) + return user + return dependency + +@router.post('/invitations/') +async def create_invitation(..., user: User = Depends(require_role(UserRole.FACILITATOR))): +``` +Move `UserPermissions.for_role()` to an `AuthorizationService`. + +**Acceptance Criteria**: +- [ ] All protected endpoints use a shared auth dependency +- [ ] Permission logic lives in a service, not in Pydantic models +- [ ] New endpoints require explicit auth opt-in (or opt-out with comment) +- [ ] Auth coverage auditable from a single module + +--- + +### ARCH-8: Duplicated Type Definitions (Frontend) + +**Severity**: MEDIUM +**Location**: +- `RubricSuggestion`: defined in both `RubricSuggestionPanel.tsx:34-41` and `useWorkshopApi.ts:816-823` +- `TraceData`: defined in `TraceDataViewer.tsx:21-27`, `AnnotationReviewPage.tsx:30-36`, and `FocusedAnalysisView.tsx:32-45` +- `ParticipantNote`: defined in `useWorkshopApi.ts:656-666` (should come from generated client) + +**Description**: The same TypeScript interfaces are defined independently in multiple files. Some of these types already exist in the generated OpenAPI client at `client/src/client/models/` but are re-declared manually. + +**Impact**: Type drift — if one definition is updated, others become stale. Increases maintenance burden. Defeats the purpose of having a generated client. + +**Remediation**: Centralize shared types: +1. Import types from generated client where they exist +2. For types not in the generated client, create `client/src/types/` with shared definitions +3. Remove duplicate definitions from components and hooks + +**Acceptance Criteria**: +- [ ] Each type is defined in exactly one location +- [ ] Components import from `types/` or generated client, never define inline +- [ ] No duplicate interface definitions across files + +--- + +### ARCH-9: Duplicated Business Logic (Frontend) + +**Severity**: MEDIUM +**Location**: +- Rubric parsing: `utils/rubricUtils.ts` (canonical) vs. `RubricViewPage.tsx:22-30` (`convertApiRubricToQuestions()`) vs. `AnnotationReviewPage.tsx:39-48` (`parseRubricQuestionsWithType()`) +- Rating display: `AnnotationReviewPage.tsx:103-123` (`getRatingStars()`, `getBinaryDisplay()`) with no shared utility +- Trace data conversion: `AnnotationReviewPage.tsx:30-36` (`convertTraceToTraceData()`) and `FocusedAnalysisView.tsx:73-82` (similar inline conversion) + +**Description**: The same transformation logic is independently implemented in multiple components. `rubricUtils.ts` already exists as the canonical location for rubric parsing, but components implement their own local versions. + +**Impact**: Bug fixes must be applied to each copy. Behavior divergence between components displaying the same data differently. + +**Remediation**: +- Delete local rubric conversion functions, import from `rubricUtils.ts` +- Create `utils/ratingUtils.ts` for rating display functions +- Create `utils/traceUtils.ts` for trace data conversion + +**Acceptance Criteria**: +- [ ] Each transformation function exists in exactly one utility module +- [ ] Components import from utils, never reimplement +- [ ] Utility modules have independent unit tests + +--- + +### ARCH-10: Scattered localStorage Access + +**Severity**: MEDIUM +**Location**: +- `client/src/context/UserContext.tsx` — lines 64, 78, 87, 126, 128, 142, 233 (7 call sites) +- `client/src/context/WorkshopContext.tsx` — lines 40, 42, 58, 65, 70, 115, 118, 126 (8 call sites) +- `client/src/components/AnnotationStartPage.tsx:78` + +**Description**: Direct `localStorage.getItem()` and `localStorage.setItem()` calls scattered across 3+ context providers and multiple components (20+ total call sites). No abstraction layer, no key namespace management, no type safety on stored values. + +**Impact**: Changing storage strategy (e.g., sessionStorage, IndexedDB for larger data) requires modifying every call site. Key name collisions possible. Stored values not validated on read. + +**Remediation**: Create a `useLocalStorage(key, defaultValue)` hook or a `StorageService` class that: +- Centralizes all storage keys as constants +- Provides type-safe get/set +- Handles serialization/deserialization +- Enables swapping storage backend + +**Acceptance Criteria**: +- [ ] Zero direct `localStorage` calls in components or contexts +- [ ] All storage access goes through a shared abstraction +- [ ] Storage keys defined as constants in one location + +--- + +### ARCH-11: Late/Internal Imports in workshops.py + +**Severity**: MEDIUM +**Location**: `server/routers/workshops.py` — 30+ locations + +**Description**: Services and utilities are imported inside function bodies rather than at module level: +- Line 310: `from server.utils.jsonpath_utils import validate_jsonpath` +- Line 352: `from server.utils.jsonpath_utils import apply_jsonpath` +- Line 622: `from server.database import SessionLocal` +- Line 1101: `from server.services.token_storage_service import token_storage` +- Line 1133: `from server.services.alignment_service import AlignmentService` +- Line 1165: `from server.models import JudgePromptCreate, JudgeEvaluation` +- Line 2060: `from server.services.databricks_service import DatabricksService` + +30+ internal imports scattered throughout the file. + +Cross-reference: CQ-10 covers `import time` in `database_service.py`. This item covers the broader pattern of hiding dependencies. + +**Impact**: Makes dependency graph impossible to trace statically. Hides circular dependencies instead of fixing them. Complicates refactoring — moving a function may silently break a late import. Prevents tools like `isort` from managing imports. + +**Remediation**: Move all imports to module level. If circular imports exist, resolve them by: +1. Extracting shared models/types to a separate module +2. Using dependency injection instead of direct imports +3. Using `TYPE_CHECKING` for type-only imports + +**Acceptance Criteria**: +- [ ] Zero function-body imports (except justified circular dependency avoidance with comment) +- [ ] All dependencies visible at top of file +- [ ] `isort` runs clean + +--- + +### ARCH-12: Direct Environment Variable Mutations in Routes + +**Severity**: MEDIUM +**Location**: +- `server/routers/workshops.py:1456-1459` +- `server/routers/databricks.py:198-206` + +**Description**: Route handlers directly mutate `os.environ` to configure Databricks/MLflow clients: +```python +# workshops.py:1456-1459 +os.environ['DATABRICKS_HOST'] = mlflow_config.databricks_host.rstrip('/') +if not has_oauth: + os.environ['DATABRICKS_TOKEN'] = mlflow_config.databricks_token + +# databricks.py:198-206 — multiple os.environ writes and deletes +``` + +**Impact**: Global state mutation from request handlers. In a multi-worker/multi-thread environment, one request can overwrite another's config. No cleanup guarantee — if a route handler errors, environment may be left in a modified state. Untestable without real environment modification. + +**Remediation**: Use a `DatabricksConfigService` or context manager that sets credentials for the scope of an operation without mutating global state. Or use SDK client instances with per-call configuration. + +**Acceptance Criteria**: +- [ ] Zero `os.environ` writes in router files +- [ ] Credential configuration handled through service abstraction +- [ ] Configuration scoped per-operation, not global + +--- + +### ARCH-13: Permission Logic in Pydantic Response Model + +**Severity**: MEDIUM +**Location**: `server/models.py:74-127` + +**Description**: `UserPermissions.for_role()` is a classmethod on a Pydantic model that maps roles to permission sets: +```python +class UserPermissions(BaseModel): + @classmethod + def for_role(cls, role: UserRole) -> 'UserPermissions': + if role == UserRole.FACILITATOR: + return cls(can_view_discovery=True, can_create_findings=False, ...) +``` + +Pydantic models are data contracts (request/response schemas). Role-to-permission mapping is authorization business logic. + +**Impact**: Permission rules coupled to serialization schema. Cannot change permission logic without touching the API contract module. Difficult to test permission rules independently. + +**Remediation**: Move to an `AuthorizationService.get_permissions_for_role(role)` method. The Pydantic model should only define the shape. + +**Acceptance Criteria**: +- [ ] `UserPermissions` model has no business logic methods +- [ ] Permission mapping lives in a service +- [ ] Service is independently testable + +--- + +### ARCH-14: React Query Key Sprawl + +**Severity**: LOW +**Location**: `client/src/hooks/useWorkshopApi.ts`, various components + +**Description**: Query keys are hardcoded strings scattered across hooks and components: +```typescript +['workshop', workshopId] +['traces', workshopId] +['participant-notes', workshopId] +['annotations', workshopId] +['findings', workshopId] +``` + +No centralized `queryKeys.ts` constants file. + +**Impact**: Cache invalidation requires knowing the exact key string used in each hook. Typos create silent cache misses. Cannot grep for all queries related to a domain. + +**Remediation**: Create `client/src/constants/queryKeys.ts`: +```typescript +export const queryKeys = { + workshop: (id: string) => ['workshop', id] as const, + traces: (workshopId: string) => ['traces', workshopId] as const, + // ... +}; +``` + +**Acceptance Criteria**: +- [ ] All query keys defined in `queryKeys.ts` +- [ ] Zero hardcoded query key arrays in hooks or components +- [ ] Cache invalidation uses the same key factory functions + +--- + +### ARCH-15: Inconsistent React Query Configuration + +**Severity**: LOW +**Location**: `client/src/hooks/useWorkshopApi.ts` (multiple hooks) + +**Description**: Each hook has different cache/refetch settings with no shared configuration: +- `useListWorkshops()`: `staleTime: 30000`, `refetchOnMount: true` +- `useWorkshop()`: `staleTime: 10000`, `refetchInterval: 30000` +- `useAllTraces()`: `staleTime: 30 * 1000`, `gcTime: 10 * 60 * 1000` +- `useParticipantNotes()`: `refetchInterval: 15 * 1000` +- Other hooks: `refetchInterval: false` with comment "DISABLED: Was causing Chrome hangs" + +**Impact**: Difficult to reason about total polling load. Inconsistent user experience (some data refreshes, some doesn't). Tuning history captured in scattered comments rather than centralized config. + +**Remediation**: Define query configuration presets: +```typescript +const QUERY_PRESETS = { + realtime: { staleTime: 5_000, refetchInterval: 10_000 }, + standard: { staleTime: 30_000, refetchOnMount: true }, + static: { staleTime: Infinity }, +}; +``` + +**Acceptance Criteria**: +- [ ] Query configuration presets defined in one location +- [ ] Each hook references a preset, not inline numbers +- [ ] Total polling load documented + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|------|-------|--------|--------| +| P0 | ARCH-1 | Split DatabaseService into domain services | L | Critical - enables isolation and testing | +| P0 | ARCH-2 | Extract business logic from route handlers | L | Critical - enables thin routes and testable logic | +| P0 | ARCH-3 | Replace raw fetch() with generated API client | M | Critical - type safety across API boundary | +| P1 | ARCH-4 | Decompose god components (FacilitatorDashboard, TraceViewer, etc.) | L | High - enables component testing | +| P1 | ARCH-5 | Extract job persistence and background tasks from router | M | High - enables job lifecycle testing | +| P1 | ARCH-6 | Separate API/state/storage in context providers | M | High - enables context testing | +| P1 | ARCH-7 | Implement dependency-based auth pattern | M | High - consistent authorization | +| P2 | ARCH-8 | Centralize frontend type definitions | S | Medium - prevents type drift | +| P2 | ARCH-9 | Deduplicate business logic to shared utils | S | Medium - single source of truth | +| P2 | ARCH-10 | Abstract localStorage behind shared hook | S | Medium - enables storage changes | +| P2 | ARCH-11 | Move late imports to module level | M | Medium - dependency visibility | +| P2 | ARCH-12 | Replace direct env var mutations with service | S | Medium - thread safety | +| P2 | ARCH-13 | Move permission logic out of Pydantic model | S | Medium - separation of concerns | +| P3 | ARCH-14 | Create queryKeys.ts constants | S | Low - cache management clarity | +| P3 | ARCH-15 | Standardize React Query configuration | S | Low - consistency | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days diff --git a/debt-docs/CODE_QUALITY_DEBT.md b/debt-docs/CODE_QUALITY_DEBT.md new file mode 100644 index 00000000..5648a87d --- /dev/null +++ b/debt-docs/CODE_QUALITY_DEBT.md @@ -0,0 +1,431 @@ +# Code Quality & Architecture Debt + +## Overview + +The codebase has significant code quality debt concentrated in two areas: the backend workshop router (`server/routers/workshops.py` at 5,229 lines) and the frontend judge tuning page (`client/src/pages/JudgeTuningPage.tsx` at 2,754 lines). These god files create cascading issues with testing, maintenance, and onboarding. Additionally, inconsistent error handling patterns, debug artifacts in production code, and missing type safety compound the problem. + +--- + +## Items + +### CQ-1: God File - workshops.py Router (5,229 lines) + +**Severity**: CRITICAL +**Location**: `server/routers/workshops.py` + +**Description**: Single file handles all workshop API endpoints including job management, trace handling, phase transitions, evaluations, alignment, and metrics. Contains functions exceeding 500 lines (`run_alignment_background()` at line 3389+, `run_evaluation_background()` at line 3583+) with 4-5 levels of nesting. + +**Impact**: Untestable in isolation, high merge conflict risk, impossible to navigate, violates single responsibility principle. + +**Remediation**: Split into domain-specific router modules: +- `routers/workshop_crud.py` - CRUD operations and phase management +- `routers/workshop_traces.py` - Trace ingestion and assignment +- `routers/workshop_annotations.py` - Annotation submission and retrieval +- `routers/workshop_evaluation.py` - Judge evaluation and alignment jobs +- `routers/workshop_metrics.py` - IRR, alignment metrics, results + +**Acceptance Criteria**: +- [ ] No single router file exceeds 800 lines +- [ ] No single function exceeds 50 lines +- [ ] Each module has independent unit tests +- [ ] All existing E2E tests still pass + +--- + +### CQ-2: God Component - JudgeTuningPage.tsx (2,754 lines) + +**Severity**: CRITICAL +**Location**: `client/src/pages/JudgeTuningPage.tsx` + +**Description**: Single React component with 49 `useState`/hook calls (lines 62-124), 16+ error/loading states, duplicated polling logic across 4 operations, and extensive `as any` casts (lines 1811, 1813, 1815, 1821, 2014, 2016, 2020, 2064, 2068, 2122, 2191, 2193, 2202, 2203). + +**Impact**: Impossible to test individual features, state synchronization bugs, high cognitive load, performance issues from excessive re-renders. + +**Remediation**: Extract into composed components and custom hooks: +- `useEvaluationPolling` hook for shared polling logic +- `useAlignmentJob` hook for alignment state management +- `JudgeConfigPanel`, `AlignmentPanel`, `EvaluationResultsPanel` components +- Proper TypeScript interfaces to replace all `any` types + +**Acceptance Criteria**: +- [ ] No single component file exceeds 500 lines +- [ ] Zero `any` type annotations +- [ ] Polling logic extracted to a single reusable hook +- [ ] Each sub-component has unit tests + +--- + +### CQ-3: Bare Except Clauses + +**Severity**: CRITICAL +**Location**: `server/routers/workshops.py:94, 4080, 4086, 4093, 4099` + +**Description**: Multiple bare `except:` statements that catch and silently swallow all exceptions including `KeyboardInterrupt` and `SystemExit`: +```python +# Line 94 +except: + pass + +# Lines 4080-4099 in metrics calculation +try: + kappa = cohen_kappa_score(human, predicted) +except: + kappa = 0.0 +``` + +**Impact**: Hides critical errors, makes debugging impossible, can mask data corruption. + +**Remediation**: Replace with specific exception types and add logging: +```python +except (ValueError, TypeError) as e: + logger.warning(f"Kappa calculation failed: {e}") + kappa = 0.0 +``` + +**Acceptance Criteria**: +- [ ] Zero bare `except:` clauses in codebase +- [ ] All exception handlers use specific types +- [ ] All caught exceptions are logged with context + +--- + +### CQ-4: TypeScript `any` Types Throughout Frontend + +**Severity**: HIGH +**Location**: +- `client/src/pages/JudgeTuningPage.tsx:70, 109` - `useState(null)` +- `client/src/pages/JudgeTuningPage.tsx:1811-2203` - 14+ `as any` casts +- `client/src/context/UserContext.tsx:84, 137, 221` - `any` error types +- `client/src/pages/AnnotationDemo.tsx:205` - `any` variable + +**Description**: Widespread use of `any` type defeating TypeScript's type safety. Pattern examples: +```typescript +const [mlflowConfig, setMlflowConfig] = useState(null); +(prompt.model_parameters as any)?.judge_name +``` + +**Impact**: IDE autocomplete disabled, refactoring becomes risky, runtime errors not caught at compile time. + +**Remediation**: Define proper interfaces for all `any` usages. Create types for MLflow config, alignment results, judge parameters. + +**Acceptance Criteria**: +- [ ] Zero `any` annotations in production code (excluding generated code in `client/src/client/`) +- [ ] All API response types have corresponding TypeScript interfaces + +--- + +### CQ-5: Debug Print Statements in Production Code + +**Severity**: HIGH +**Location**: +- `server/routers/workshops.py:963-977` - Discovery endpoint debug prints +- `server/routers/workshops.py:2391, 2426` - Error handling prints +- `server/services/judge_service.py:64-68` - Evaluate prompt debug print +- `server/database.py` - 135+ print() calls across backend + +**Description**: Debug `print()` statements left in production code: +```python +print(f" DEBUG begin_discovery: ...") +print(f" DEBUG trace_ids: {[t.id for t in traces]}") +print(f" DEBUG: Taking first {trace_limit} traces...") +``` + +**Impact**: Exposes sensitive data in logs, clutters stdout, no log level control, unprofessional in production. + +**Remediation**: Replace all `print()` with `logger.debug()` or `logger.info()` as appropriate. + +**Acceptance Criteria**: +- [ ] Zero `print()` statements in `/server/` (except `make_openapi.py`) +- [ ] All logging uses the `logging` module with appropriate levels + +--- + +### CQ-6: Console.log Statements in Production Client Code + +**Severity**: HIGH +**Location**: +- `client/src/pages/JudgeTuningPage.tsx:151` and 40+ other locations +- `client/src/pages/WorkshopDemoLanding.tsx:47-48` - DEBUG_ENABLE_USER_SWITCHING flag +- `client/src/context/UserContext.tsx` - Multiple console.error calls + +**Description**: `console.log`, `console.error`, and `console.warn` calls throughout client code. Combined with `drop_console: false` in `client/vite.config.ts:68`, these ship to production. + +**Impact**: Performance degradation, security risk (leaks internal state to browser console), bundle size increase. + +**Remediation**: Remove all console statements, re-enable `drop_console: true` in Vite config, use a structured logging service for production error reporting. + +**Acceptance Criteria**: +- [ ] Zero `console.log` in production code +- [ ] `drop_console: true` in `vite.config.ts` +- [ ] `console.error` replaced with error boundary or logging service + +--- + +### CQ-7: Duplicated Polling/Retry Logic (Client) + +**Severity**: HIGH +**Location**: +- `client/src/pages/JudgeTuningPage.tsx` - 4 separate polling implementations +- `client/src/pages/AnnotationDemo.tsx:213, 816` - Retry with exponential backoff +- `client/src/pages/TraceViewerDemo.tsx:265, 463` - Polling intervals +- `client/src/components/IntakeWaitingView.tsx:43` - Fixed 5-second polling + +**Description**: Nearly identical polling patterns repeated with different intervals and inconsistent error handling: +```typescript +// Pattern repeated 4+ times with variations +const pollInterval = setInterval(async () => { ... }, timeout); +await new Promise(r => setTimeout(r, 1500 * (attempt + 1))); +retryIntervalRef.current = setInterval(() => { ... }, 2000); +``` + +**Impact**: Maintenance nightmare, inconsistent retry behavior across features, hard to update globally. + +**Remediation**: Extract to a single `usePolling(fn, interval, options)` hook with exponential backoff support. + +**Acceptance Criteria**: +- [ ] Single `usePolling` hook in `/client/src/hooks/` +- [ ] All polling uses the shared hook +- [ ] Configurable backoff, max retries, and cleanup + +--- + +### CQ-8: Deeply Nested Error Handling + +**Severity**: HIGH +**Location**: `server/routers/workshops.py:1380-1640` + +**Description**: 5-6 levels of nested try/except in background job processing: +```python +try: + try: + try: + try: + ... + except Exception: + except Exception: + except Exception: +except Exception: +``` +Lines 1384, 1428, 1476, 1480, 1584, 1596, 1604 show the nesting. + +**Impact**: Extremely difficult to follow control flow, impossible to debug, high cognitive load. + +**Remediation**: Use early returns and guard clauses. Extract nested operations to separate functions. Use context managers for resource cleanup. + +**Acceptance Criteria**: +- [ ] Maximum nesting depth of 3 in any function +- [ ] Background job functions decomposed into steps + +--- + +### CQ-9: Inconsistent Exception Handling Patterns + +**Severity**: HIGH +**Location**: `server/routers/workshops.py` + +**Description**: Three different exception handling styles mixed in the same file: +1. Lines 97, 627, 656: `except Exception as e` with logging +2. Lines 188, 2080: Specific exception types (`OperationalError`, `ValueError`) +3. Lines 94, 4080-4099: Bare `except:` (see CQ-3) + +**Impact**: Inconsistent error recovery, unclear error contract, debugging harder. + +**Remediation**: Establish a consistent pattern: catch specific exceptions, log with context, re-raise or return appropriate HTTP status. + +**Acceptance Criteria**: +- [ ] Documented error handling pattern in CONTRIBUTING.md +- [ ] All handlers follow the documented pattern + +--- + +### CQ-10: Redundant Imports Inside Methods + +**Severity**: MEDIUM +**Location**: `server/services/database_service.py:78, 125, 137, 523, 657, 752, 1498` + +**Description**: `import time` appears 8 times inside method bodies instead of once at module level. + +**Impact**: Code smell, minor performance penalty, violates Python style. + +**Remediation**: Move all imports to module level. + +**Acceptance Criteria**: +- [ ] Zero in-function imports (except for circular dependency avoidance) + +--- + +### CQ-11: Hardcoded Magic Numbers and Strings + +**Severity**: MEDIUM +**Location**: +- `server/routers/workshops.py:21` - `JOB_DIR = "/tmp/workshop_jobs"` +- `server/routers/workshops.py:170` - `max_retries=5, base_delay=0.5` +- `server/routers/workshops.py:190` - `random.uniform(0, 0.5)` jitter +- `client/src/pages/JudgeTuningPage.tsx:114` - `'databricks-claude-sonnet-4-5'` +- `client/src/pages/JudgeTuningPage.tsx:1142, 1152` - `setTimeout(poll, 2000)` / `5000` + +**Impact**: Difficult to tune behavior, scattered configuration, hard to maintain. + +**Remediation**: Extract to named constants or configuration values. + +**Acceptance Criteria**: +- [ ] All retry parameters in a shared config +- [ ] All polling intervals as named constants +- [ ] All model names configurable + +--- + +### CQ-12: Manual In-Memory Cache (Thread-Unsafe) + +**Severity**: MEDIUM +**Location**: `server/services/database_service.py:115-139` + +**Description**: Manual cache with 30-second TTL using timestamp checking. Since `DatabaseService` is instantiated per-request, the cache is never actually reused across requests. +```python +self._cache = {} +self._cache_ttl = 30 # Useless - new instance per request +``` + +**Impact**: False sense of caching, wasted code, potential thread-safety issues if ever shared. + +**Remediation**: Either remove the dead cache code, or implement module-level caching with proper invalidation. + +**Acceptance Criteria**: +- [ ] Cache either works correctly or is removed +- [ ] If kept, thread-safe and shared across requests + +--- + +### CQ-13: Global State in sqlite_rescue.py + +**Severity**: MEDIUM +**Location**: `server/sqlite_rescue.py:57, 382, 413, 447, 486` + +**Description**: Multiple global variables for state management: +```python +global _workspace_client +global _backup_timer, _backup_timer_running +global _shutdown_handlers_installed +``` + +**Impact**: Hidden dependencies, difficult to test, potential race conditions. + +**Remediation**: Encapsulate in a class with explicit lifecycle management. + +**Acceptance Criteria**: +- [ ] Zero module-level mutable globals +- [ ] State encapsulated in a testable class + +--- + +### CQ-14: useEffect Hooks Without Proper Cleanup + +**Severity**: MEDIUM +**Location**: `client/src/pages/JudgeTuningPage.tsx:180, 244, 339, 349, 420, 442, 454, 487, 494` + +**Description**: Multiple useEffect hooks setting intervals/listeners. Some may have missing dependency arrays or incomplete cleanup functions. + +**Impact**: Memory leaks, resource accumulation over time in long-running sessions. + +**Remediation**: Audit all useEffect hooks for proper cleanup. Will be largely resolved by CQ-2 refactoring. + +**Acceptance Criteria**: +- [ ] Every useEffect with intervals/listeners has a cleanup return +- [ ] React strict mode enabled to catch missing cleanups + +--- + +### CQ-15: Commented-Out Code and Unclear TODOs + +**Severity**: LOW +**Location**: +- `server/services/judge_service.py:444, 449` - `"TODO: this was ostensibly here for a reason, but I don't know what it is."` +- `server/services/irr_service.py:193` - Same unclear TODO +- `server/services/database_service.py:2711, 2739` - `"TODO: pretty sure this does nothing"` +- `server/services/databricks_service.py:250` - `"TODO: this is a noop, actually handle connection testing?"` + +**Impact**: Code archaeology required to understand intent, maintenance overhead. + +**Remediation**: Either fix the code or remove it. Convert actionable TODOs to GitHub issues with context. + +**Acceptance Criteria**: +- [ ] Zero TODOs without linked GitHub issues +- [ ] No commented-out code blocks + +--- + +### CQ-16: Missing Python Type Annotations + +**Severity**: LOW +**Location**: +- `server/services/database_service.py:66` - `operation` param untyped +- `server/routers/workshops.py:170` - `operations_fn` untyped +- Multiple service methods missing return type hints + +**Impact**: Reduced IDE support, harder to understand APIs. + +**Remediation**: Add type hints to all public functions. Run mypy in strict mode. + +**Acceptance Criteria**: +- [ ] All public functions have parameter and return type annotations +- [ ] mypy passes with no errors + +--- + +### CQ-17: Naming Inconsistencies Across Backend/Frontend Boundary + +**Severity**: LOW +**Location**: +- TypeScript code uses `snake_case` for API response fields (`judge_type`, `model_parameters`) +- Mixed naming in `JudgeTuningPage.tsx:147` - `pJudgeName` vs `p.model_parameters?.judge_name` + +**Impact**: Slight cognitive overhead at the API boundary. + +**Remediation**: Use consistent camelCase in TypeScript with transformer at the API layer. + +**Acceptance Criteria**: +- [ ] Consistent naming convention within each language +- [ ] API response transformation layer if needed + +--- + +### CQ-18: Type Ignore Suppressions + +**Severity**: LOW +**Location**: `server/db_config.py:123, 125, 127` + +**Description**: `# type: ignore` comments suppressing type checking instead of fixing types. + +**Impact**: Undermines type safety. + +**Remediation**: Fix underlying type issues. + +**Acceptance Criteria**: +- [ ] Zero `# type: ignore` comments without justification + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|-----|-------|--------|--------| +| P0 | CQ-3 | Replace bare except clauses | S | High - prevents silent failures | +| P0 | CQ-5 | Remove debug print statements | S | High - production hygiene | +| P0 | CQ-6 | Remove console.log + enable drop_console | S | High - production hygiene | +| P1 | CQ-1 | Split workshops.py into modules | L | Critical - enables testing and maintenance | +| P1 | CQ-2 | Decompose JudgeTuningPage | L | Critical - enables testing and maintenance | +| P1 | CQ-7 | Extract usePolling hook | M | High - reduces duplication | +| P1 | CQ-4 | Eliminate `any` types | M | High - type safety | +| P1 | CQ-8 | Flatten nested error handling | M | High - readability | +| P2 | CQ-9 | Standardize exception patterns | M | Medium - consistency | +| P2 | CQ-11 | Extract magic numbers to constants | S | Medium - maintainability | +| P2 | CQ-12 | Fix or remove dead cache code | S | Medium - correctness | +| P2 | CQ-10 | Move imports to module level | S | Low - code quality | +| P2 | CQ-13 | Encapsulate sqlite_rescue globals | M | Medium - testability | +| P2 | CQ-14 | Audit useEffect cleanup | S | Medium - memory leaks | +| P3 | CQ-15 | Resolve or remove TODOs | S | Low - clarity | +| P3 | CQ-16 | Add missing type annotations | M | Low - IDE support | +| P3 | CQ-17 | Fix naming inconsistencies | S | Low - consistency | +| P3 | CQ-18 | Remove type: ignore comments | S | Low - type safety | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days diff --git a/debt-docs/COMPLEXITY_DEBT.md b/debt-docs/COMPLEXITY_DEBT.md new file mode 100644 index 00000000..3cf23419 --- /dev/null +++ b/debt-docs/COMPLEXITY_DEBT.md @@ -0,0 +1,523 @@ +# Code Complexity Debt + +## Overview + +Code complexity is concentrated in a small number of files on both server and client. On the backend, `workshops.py` (5,229 lines) and `database_service.py` (3,692 lines) contain functions with cyclomatic complexity exceeding 20, nesting depths of 6+ levels, and 200+ line functions that mix multiple concerns. On the frontend, `JudgeTuningPage.tsx` (2,754 lines) manages 23 useState hooks and 10 useEffect hooks in a single component, while `TraceViewer.tsx` (1,650 lines) has a 5-layer JSON parsing fallback chain. These files account for the majority of both maintenance burden and bug risk. + +Cross-references: CQ-1 (god file workshops.py), CQ-2 (god component JudgeTuningPage), CQ-8 (deeply nested error handling), ARCH-1 (god service DatabaseService), ARCH-4 (god frontend components). + +--- + +## Items + +### Server Complexity + +--- + +### CPLX-1: begin_annotation_phase() — Cyclomatic Complexity ~20 + +**Severity**: CRITICAL +**Location**: `server/routers/workshops.py:1301-1663` (362 lines) + +**Description**: This route handler performs request parsing, trace selection, database updates, trace tagging, and auto-evaluation setup — including spawning a background thread with its own 200-line nested function (`run_auto_evaluation_background()` at lines 1437-1637). + +**Complexity breakdown**: +- 27+ decision points (if statements, except clauses) +- 6+ nesting levels: route handler → conditional → for loop → try/except → if/else → inner try/except +- 13 separate try/except blocks within the nested background function (lines 1438, 1454, 1466, 1476, 1480, 1517, 1540, 1564, 1584, 1590, 1596, 1604, 1634) +- Background function captures parent scope via closure (no explicit parameters) +- Generator consumption inside a loop inside a thread inside a route handler + +**Impact**: Untestable without running the full stack. A single bug in any branch is nearly impossible to isolate. Any modification risks unintended side effects in other branches. + +**Remediation**: Decompose into: +1. `validate_annotation_request()` — input validation +2. `select_annotation_traces()` — trace selection logic +3. `AutoEvaluationService.start()` — background evaluation orchestration +4. Route handler becomes ~20 lines calling the above + +**Acceptance Criteria**: +- [ ] No function exceeds cyclomatic complexity of 10 +- [ ] Background thread logic extracted to a service +- [ ] Each decomposed function has independent unit tests + +--- + +### CPLX-2: add_annotation() — Cyclomatic Complexity ~22 + +**Severity**: CRITICAL +**Location**: `server/services/database_service.py:1400-1672` (272 lines) + +**Description**: The most complex function in the codebase. Handles rating validation, retry logic with multiple exception types, update-vs-create branching, and MLflow sync — all in a single method. + +**Complexity breakdown**: +- 30+ decision points across all branches +- 6+ nesting levels: retry loop → try → if (existing) → validation → if/elif/else → inner try +- 3 different exception handlers (IntegrityError, OperationalError, Exception), each with their own nested if/else for retry decisions +- Final-attempt recovery logic (lines 1604-1635) adds 3 more nesting levels +- Complex rating validation (lines 1451-1492) builds 3 intermediate dicts without intermediate types: + - `question_judge_types_by_id` + - `question_judge_types_by_index` + - `validated_ratings` + +**Impact**: Estimated cognitive complexity of 35-40. Any change to annotation logic requires understanding all 272 lines. Rating validation bugs are extremely hard to trace. + +**Remediation**: Extract to separate methods: +1. `_validate_ratings(annotation_data, rubric)` → validated ratings dict +2. `_retry_with_backoff(operation_fn)` → generic retry wrapper +3. `_update_existing_annotation(db_annotation, validated_data)` → update path +4. `_create_new_annotation(validated_data)` → create path + +**Acceptance Criteria**: +- [ ] Rating validation is a standalone testable function +- [ ] Retry logic is a generic utility, not embedded in business logic +- [ ] Update and create paths are separate methods +- [ ] No function exceeds 50 lines + +--- + +### CPLX-3: run_alignment() — Mixed Concerns, 360+ Lines + +**Severity**: CRITICAL +**Location**: `server/services/alignment_service.py:944-1311` (367 lines, 6 parameters) + +**Description**: A single function that handles Databricks environment setup, MLflow experiment setup, trace searching, model URI construction, optimizer setup, background thread execution with async logging, and result reporting. + +**Complexity breakdown**: +- Cyclomatic complexity ~14-16 +- 5+ nesting levels +- 5 sequential phases with different concerns: + 1. Databricks environment setup (lines 982-990) — credential management + 2. MLflow experiment setup (lines 1001-1005) — external service + 3. Trace search (lines 1008-1019) — data retrieval + 4. Model URI construction with OAuth vs. token branching (lines 1022-1027, 1085-1090) — config logic + 5. Background thread with log capture (lines 1165-1202) — execution management +- Complex nested dict construction for results (lines 1608-1615) + +**Impact**: Cannot test any phase independently. A failure in trace search is indistinguishable from a failure in model construction. Thread management mixes with business logic. + +**Remediation**: Extract each phase into a dedicated method or service. The main function becomes an orchestrator calling 5 focused operations. + +**Acceptance Criteria**: +- [ ] Each phase is a separate testable method +- [ ] Thread management extracted to infrastructure layer +- [ ] Function body is < 50 lines of orchestration + +--- + +### CPLX-4: run_evaluation_with_answer_sheet() — Complex Generator Pipeline + +**Severity**: HIGH +**Location**: `server/services/alignment_service.py:380-943` (563 lines) + +**Description**: Uses a generator pattern that yields both strings (progress messages) and dicts (evaluation results) without a discriminated union type. Callers must use `isinstance()` checks to differentiate. + +**Complexity breakdown**: +- Cyclomatic complexity ~16-18 +- 5+ nesting levels in conditional logic +- Multiple branching paths for `require_human_ratings` True/False +- Nested loops with exception handling (lines 462-471, 498-584) +- Complex trace filtering pipeline: DataFrame → dict → database query → filtered DataFrame +- Type transitions without intermediate validation + +**Impact**: Generator protocol is implicit — callers must know to check `isinstance(message, dict)`. Any change to yielded structure breaks callers silently. + +**Remediation**: Define an explicit result type (e.g., `EvaluationProgress` dataclass with a `type` discriminator). Split into trace preparation, evaluation execution, and result collection phases. + +**Acceptance Criteria**: +- [ ] Generator yields typed dataclass, not raw strings/dicts +- [ ] Trace filtering logic is a separate function +- [ ] Function body < 100 lines + +--- + +### CPLX-5: add_finding() — Retry Loop with 5 Nesting Levels + +**Severity**: HIGH +**Location**: `server/services/database_service.py:738-870` (132 lines) + +**Description**: Discovery finding creation wrapped in a retry loop with 3 different exception handlers, each with their own recovery logic. + +**Complexity breakdown**: +- Cyclomatic complexity ~18 +- Outer retry loop (lines 761-866) +- 5+ nesting levels: for → try → if/else → except → if +- 3 exception handlers (IntegrityError, OperationalError, Exception) with different retry decisions +- Complex recovery logic (lines 815-839) + +**Impact**: Nearly identical retry pattern to `add_annotation()` — duplicated complexity. + +**Remediation**: Extract generic retry wrapper (shared with CPLX-2). Business logic becomes a simple create-or-skip operation. + +**Acceptance Criteria**: +- [ ] Retry logic extracted to shared utility +- [ ] Business logic < 30 lines +- [ ] Each exception type has documented recovery behavior + +--- + +### CPLX-6: sync_annotations_to_mlflow() — Multi-Level State Tracking + +**Severity**: HIGH +**Location**: `server/services/database_service.py:1972-2073` (101 lines) + +**Description**: Tracks 6+ counters (`synced_count`, `total_logged`, `skipped_no_mlflow_id`, etc.) while iterating over annotations with complex question parsing and fallback logic. + +**Complexity breakdown**: +- String parsing without validation (lines 1988-2004): splits question strings by colon, assumes format +- Nested loops with conditionals (lines 2027-2058): outer loop over annotations, inner checks for trace existence, MLflow ID, ratings +- Two separate error collection mechanisms (`sync_errors` and `errors` lists) +- Deep dict access patterns: `annotation_db.trace.mlflow_trace_id` + +**Impact**: Question parsing is fragile — malformed question strings cause silent failures. Error tracking split across two lists makes debugging hard. + +**Remediation**: Extract question parsing to `RubricParser.parse_question_titles()`. Use a single `SyncResult` dataclass for tracking counters and errors. + +**Acceptance Criteria**: +- [ ] Question parsing is a standalone tested function +- [ ] Single result type tracks all sync state +- [ ] No implicit string format assumptions without validation + +--- + +### CPLX-7: Database Schema Updates via Raw SQL + +**Severity**: MEDIUM +**Location**: `server/database.py:589-733` (`_apply_schema_updates()`, 144 lines) + +**Description**: Runtime schema modifications using raw SQL strings with complex conditional logic for checking table/column existence across SQLite and PostgreSQL. + +**Complexity breakdown**: +- 12+ separate ALTER TABLE statements +- Each guarded by existence checks (try/except or conditional) +- Database-specific branching (SQLite vs PostgreSQL syntax differences) +- No migration framework — changes are applied imperatively on startup + +**Impact**: Schema drift between environments possible. No rollback mechanism. Hard to understand current schema state. + +**Remediation**: Migrate all schema changes to Alembic migrations. Remove runtime SQL. + +**Acceptance Criteria**: +- [ ] All schema changes managed by Alembic +- [ ] Zero raw ALTER TABLE in application code +- [ ] Migration history is the source of truth for schema + +--- + +### CPLX-8: Server Dependency Graph — High Fan-In/Fan-Out + +**Severity**: MEDIUM +**Location**: `server/services/database_service.py`, `server/routers/workshops.py` + +**Description**: Two modules sit at the center of the dependency graph with extreme coupling: + +**database_service.py (fan-in: 80+ import locations)**: +- Imported by: workshops.py (50+ refs), alignment_service.py (15+ refs), judge_service.py (10+ refs), rubric_generation_service.py (5+ refs), databricks_service.py (3+ refs) +- Imports from: 14 database models, 20+ API models, 3 utility modules, 2 config modules (fan-out: 40+ imports) + +**workshops.py (fan-out: 20+ internal imports)**: +- Imports from 10+ internal modules +- 30+ of those imports are late/function-level (hiding the coupling) + +**Impact**: Any change to database_service.py potentially affects 5+ consumers. Cannot refactor without understanding all 80+ usage sites. + +**Remediation**: Splitting DatabaseService (ARCH-1) naturally reduces fan-in. Moving late imports to module level (ARCH-11) makes coupling visible. + +**Acceptance Criteria**: +- [ ] No module has fan-in > 30 +- [ ] No module has fan-out > 15 +- [ ] All imports at module level + +--- + +## Client Complexity + +--- + +### CPLX-9: JudgeTuningPage — 23 useState + 10 useEffect Hooks + +**Severity**: CRITICAL +**Location**: `client/src/pages/JudgeTuningPage.tsx` (2,754 lines) + +**Description**: A single component managing the entire judge tuning workflow with 23 useState hooks (lines 62-123) and 10 useEffect hooks. Related state is scattered rather than grouped. + +**State clusters that should be reducers**: +- **Evaluation state** (5 vars): `evaluations`, `metrics`, `evaluationError`, `hasEvaluated`, `evaluationComplete` +- **Auto-eval state** (4 vars): `autoEvalStatus`, `autoEvalJobId`, `autoEvalDerivedPrompt`, `isPollingAutoEval` +- **Prompt state** (3 vars): `currentPrompt`, `originalPromptText`, `isModified` +- **Alignment state** (4 vars): `isRunningAlignment`, `alignmentLogs`, `alignmentResult`, `showAlignmentLogs` + +**State cascading risk**: Changing `selectedQuestionIndex` triggers useEffect (line 244) → sets `currentPrompt`, `originalPromptText`, `selectedPromptId` → triggers useEffect (line 454) → sets `hasEvaluated`, `evaluationComplete`. Multiple renders and potential race conditions. + +**Complex useEffect dependency arrays**: +- Line 336: `[selectedQuestionIndex, selectedQuestion, workshopId, prompts, rubric]` (5 deps) +- Line 417: `[workshopId, isPollingAutoEval, autoEvalStatus, updateAlignmentLogs]` (4 deps, polling loop) + +**Impact**: Extremely high cognitive load. State bugs are nearly impossible to trace through 23 variables and 10 effects. Every re-render evaluates all hooks. + +**Remediation**: +1. Extract state clusters to `useReducer` (evaluation, auto-eval, prompt, alignment) +2. Extract to custom hooks: `useAutoEvaluation()`, `useAlignmentJob()`, `usePromptManagement()` +3. Split into sub-components: `PromptEditor`, `EvaluationSection`, `AlignmentSection`, `JudgeSelector` + +**Acceptance Criteria**: +- [ ] No component has > 8 useState hooks +- [ ] Related state grouped in useReducer +- [ ] Custom hooks extract reusable state logic +- [ ] Component file < 500 lines + +--- + +### CPLX-10: handleRunAlignment() — 200+ Line Async Function + +**Severity**: HIGH +**Location**: `client/src/pages/JudgeTuningPage.tsx:1308-1622` (314 lines) + +**Description**: A single async function that handles alignment job submission, status polling (1-second intervals, max 180 attempts), auto-evaluation validation, result processing, and UI state updates. + +**Complexity breakdown**: +- Cyclomatic complexity ~12+ +- Nested try-catch-finally +- While loop polling with multiple break conditions +- Multiple sequential state updates within the loop +- Complex conditional for auto-eval (3 pre-condition checks) + +**Impact**: Cannot unit test any part of this function in isolation. Polling logic, business validation, and UI updates are interleaved. + +**Remediation**: Extract to a custom `useAlignmentJob()` hook that returns `{ start, status, logs, result }`. Polling logic should be a separate `usePolling()` utility. + +**Acceptance Criteria**: +- [ ] Polling logic in a reusable hook +- [ ] Business validation separate from UI updates +- [ ] No function > 50 lines + +--- + +### CPLX-11: extractLLMResponseContent() — 5-Layer Parsing Fallback + +**Severity**: HIGH +**Location**: `client/src/components/TraceViewer.tsx:768-1000+` (~230 lines) + +**Description**: Handles 6+ different LLM response formats with a cascading fallback chain. Each format requires different parsing logic. + +**Parsing layers**: +1. Direct JSON parse +2. Flattened format (lines 796-817) +3. OpenAI/ChatCompletion format (lines 820-895) +4. Anthropic format (lines 899+) +5. Quoted object unescaping with regex +6. Character-by-character newline reconstruction + +**Supporting helper functions** (also in TraceViewer.tsx): +- `fixUnescapedNewlines()` (lines 126-170): character-by-character state machine, CC ~5 +- `extractJudgeResultFromMalformed()` (lines 176-213): string pattern matching, CC ~6 +- `isMarkdownContent()` (lines 50-74): 4+ nested pattern checks +- `fixQuotedJsonObjects()` (lines 103-120): regex replacement with fallback + +**Impact**: Very high cognitive load — unclear which parsing path will succeed for any given input. No way to test parsing independently from the component. + +**Remediation**: Extract to `utils/llmResponseParser.ts` with a `parseLLMResponse(raw: unknown): ParsedResponse` function. Each format handler becomes a named function. Add comprehensive unit tests for each format. + +**Acceptance Criteria**: +- [ ] Parsing logic in a standalone utility module +- [ ] Each format handler is a named, testable function +- [ ] Unit tests cover all 6 formats plus malformed input + +--- + +### CPLX-12: extractLLMContent() — Duplicate Extraction Logic + +**Severity**: HIGH +**Location**: `client/src/components/TraceDataViewer.tsx:42-212` (170 lines) + +**Description**: A second LLM content extraction function, separate from the one in TraceViewer.tsx (CPLX-11). Handles 6+ format variations with nested try-catch and conditional chains. + +**Complexity breakdown**: +- Cyclomatic complexity ~12+ +- Overlapping responsibility with `extractLLMResponseContent()` in TraceViewer.tsx +- `extractContentFromString()` helper (lines 48-65): nested try-catch with JSON parsing fallback +- 2 useMemo hooks for computed output values + +**Impact**: Two independent implementations parsing the same data formats. Bugs fixed in one aren't fixed in the other. + +**Remediation**: Consolidate with CPLX-11 into a single `utils/llmResponseParser.ts`. Both components import from the shared utility. + +**Acceptance Criteria**: +- [ ] Single LLM content extraction implementation +- [ ] Both TraceViewer and TraceDataViewer use the shared utility + +--- + +### CPLX-13: FacilitatorDashboard useMemo Chains + +**Severity**: MEDIUM +**Location**: `client/src/components/FacilitatorDashboard.tsx:71-188` + +**Description**: Three complex `useMemo` computations that derive dashboard metrics from raw data. While the component avoids useState (good), the computation logic is dense. + +**Specific computations**: +- `userContributions` (lines 93-123): nested `reduce()` with ternary on `focusPhase`, CC ~4 +- `traceCoverageDetails` (lines 126-188): multiple if/else branches for phase filtering, nested ternary for reviewer counting, complex sort logic, CC ~8+ +- Progress calculations (lines 71-84): deep conditional for determining active trace counts based on phase + +**Impact**: Business logic for progress calculation is locked inside a React component. Cannot be tested without rendering. + +**Remediation**: Extract computation functions to `services/workshopMetrics.ts`: +- `calculateUserContributions(findings, annotations, focusPhase)` +- `calculateTraceCoverage(traces, findings, annotations, phase)` +- `calculateDiscoveryProgress(workshop, traces, findings)` + +**Acceptance Criteria**: +- [ ] Computation functions testable without React +- [ ] useMemo calls delegate to imported functions +- [ ] Unit tests for each metric calculation + +--- + +### CPLX-14: Deep JSX Nesting with Inline Ternaries + +**Severity**: MEDIUM +**Location**: `client/src/pages/JudgeTuningPage.tsx:1673-2754` (1,081 lines of JSX) + +**Description**: The render section alone is 1,081 lines with JSX nesting reaching 8 levels, compounded by inline ternary operators at the deepest levels. + +**Worst case** (lines 1812-1827): +``` + → {map()} →
+ → {ternary → ternary} +``` +8 levels deep with 2 nested ternaries at the leaf. + +**Other examples**: +- Lines 2014-2022: multiple chained `&&` conditions with complex metric display logic +- Lines 1962-1964: IIFE inside JSX with return statement +- 97 `&&` operators in the render section + +**Impact**: Impossible to read without IDE support. Changes to layout risk breaking conditional logic. Cannot extract sub-sections without untangling state dependencies. + +**Remediation**: Extract sub-sections to components: +- `PromptHistorySelector` — prompt dropdown with version badges +- `EvaluationMetricsCard` — metrics display with warnings +- `EvaluationResultsTable` — paginated results grid +- `AlignmentControlPanel` — alignment job controls and logs + +**Acceptance Criteria**: +- [ ] No JSX nesting deeper than 5 levels +- [ ] No inline ternaries deeper than 1 level +- [ ] Each extracted component < 200 lines + +--- + +### CPLX-15: WorkflowContext — 8-Dependency useEffect + +**Severity**: MEDIUM +**Location**: `client/src/context/WorkflowContext.tsx:126` + +**Description**: A single useEffect with 8 dependencies: `[traces, findings, rubric, annotations, participants, workshopId, user, workshop?.current_phase]`. Any change to any of these triggers the effect. + +**Impact**: Difficult to reason about when the effect fires. Potential for cascading updates when multiple deps change in the same render cycle. + +**Remediation**: Split into focused effects — one per concern (e.g., phase tracking, data loading, user state). + +**Acceptance Criteria**: +- [ ] No useEffect with > 4 dependencies +- [ ] Each effect has a clear, documented trigger condition + +--- + +### CPLX-16: 21 `as any` Type Casts in JudgeTuningPage + +**Severity**: MEDIUM +**Location**: `client/src/pages/JudgeTuningPage.tsx:1811-2068` + +**Description**: 21 instances of `as any` concentrated in two areas: +1. `(prompt.model_parameters as any)?.judge_name` — 5 occurrences (lines 1811, 1813, 1815, 1821, 1823) +2. `(metrics as any).total_evaluations_all` — 6 occurrences (lines 2014, 2016, 2020, 2022, 2064, 2068) + +**Root cause**: `model_parameters` and metrics types are not properly defined. The backend sends fields that the TypeScript types don't declare. + +**Impact**: Type system provides zero safety for these code paths. Renamed or removed backend fields won't be caught at compile time. + +**Remediation**: Define proper interfaces: +```typescript +interface ModelParameters { + judge_name?: string; + aligned?: boolean; + // ...other known fields +} + +interface EvaluationMetrics { + total_evaluations: number; + total_evaluations_all?: number; + // ...other fields +} +``` + +**Acceptance Criteria**: +- [ ] Zero `as any` casts in application code +- [ ] All backend response shapes have corresponding TypeScript interfaces + +--- + +## Complexity Summary Tables + +### Server — Top Functions by Cyclomatic Complexity + +| Function | File | CC | Lines | Nesting | +|----------|------|----|-------|---------| +| `add_annotation()` | database_service.py | ~22 | 272 | 6+ | +| `begin_annotation_phase()` | workshops.py | ~20 | 362 | 6+ | +| `add_finding()` | database_service.py | ~18 | 132 | 5+ | +| `run_evaluation_with_answer_sheet()` | alignment_service.py | ~18 | 563 | 5+ | +| `run_alignment()` | alignment_service.py | ~16 | 367 | 5+ | +| `get_active_annotation_traces()` | database_service.py | ~14 | 95 | 4+ | +| `_calculate_eval_metrics()` | alignment_service.py | ~13 | 172 | 4+ | +| `create_tables()` | database.py | ~12 | 66 | 4+ | + +### Client — Top Components/Functions by Complexity + +| Component/Function | File | CC | Lines | Hooks | +|--------------------|------|----|-------|-------| +| `JudgeTuningPage` | JudgeTuningPage.tsx | — | 2,754 | 23 useState, 10 useEffect | +| `handleRunAlignment()` | JudgeTuningPage.tsx | ~12 | 314 | — | +| `extractLLMResponseContent()` | TraceViewer.tsx | ~15 | 230+ | — | +| `extractLLMContent()` | TraceDataViewer.tsx | ~12 | 170 | — | +| `traceCoverageDetails` | FacilitatorDashboard.tsx | ~8 | 62 | — | +| `loadInitialData()` | JudgeTuningPage.tsx | ~8 | 197 | — | + +### File-Level Complexity + +| File | LOC | Functions/Components | Single Responsibility? | +|------|-----|---------------------|----------------------| +| `workshops.py` | 5,229 | 100+ route handlers | NO — 20+ domains | +| `database_service.py` | 3,692 | 60+ methods | NO — 15+ domains | +| `JudgeTuningPage.tsx` | 2,754 | 1 component | NO — prompt editing + eval + alignment + export | +| `TraceViewer.tsx` | 1,650 | 1 component + 6 helpers | NO — JSON parsing + rendering + LLM extraction | +| `AnnotationDemo.tsx` | 1,557 | 1 component | MIXED — display + annotation + notes | +| `RubricCreationDemo.tsx` | 1,405 | 1 component | MIXED — display + creation + suggestions | +| `alignment_service.py` | 1,377 | 2 classes, 8 methods | PARTIALLY — alignment + evaluation | +| `FacilitatorDashboard.tsx` | 1,325 | 1 component | MIXED — metrics + phase control + findings | + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|------|-------|--------|--------| +| P0 | CPLX-1 | Decompose begin_annotation_phase() | M | Critical — 362-line route with background thread | +| P0 | CPLX-2 | Decompose add_annotation() | M | Critical — CC 22, most complex backend function | +| P0 | CPLX-9 | Split JudgeTuningPage state into reducers + hooks | L | Critical — 23 useState, 10 useEffect | +| P0 | CPLX-3 | Decompose run_alignment() | M | Critical — 367-line mixed-concern function | +| P1 | CPLX-4 | Type and split run_evaluation_with_answer_sheet() | M | High — implicit generator protocol | +| P1 | CPLX-10 | Extract handleRunAlignment() to custom hook | M | High — 314-line async function | +| P1 | CPLX-11 | Extract LLM parsing to shared utility | M | High — 5-layer fallback chain | +| P1 | CPLX-12 | Consolidate duplicate LLM extraction | S | High — two implementations of same logic | +| P1 | CPLX-5 | Extract generic retry wrapper | S | High — duplicated retry patterns | +| P2 | CPLX-6 | Simplify sync_annotations_to_mlflow() | M | Medium — fragile string parsing | +| P2 | CPLX-13 | Extract FacilitatorDashboard computations | S | Medium — untestable business logic | +| P2 | CPLX-14 | Decompose JudgeTuningPage JSX | L | Medium — 1,081 lines of render | +| P2 | CPLX-16 | Replace `as any` with proper types | S | Medium — 21 type safety gaps | +| P2 | CPLX-15 | Split 8-dep useEffect in WorkflowContext | S | Medium — unpredictable triggers | +| P3 | CPLX-7 | Migrate runtime SQL to Alembic | M | Medium — schema drift risk | +| P3 | CPLX-8 | Reduce dependency graph fan-in/fan-out | L | Medium — enables all other refactoring | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days diff --git a/debt-docs/DEPENDENCY_DEBT.md b/debt-docs/DEPENDENCY_DEBT.md new file mode 100644 index 00000000..02b58ae0 --- /dev/null +++ b/debt-docs/DEPENDENCY_DEBT.md @@ -0,0 +1,111 @@ +# Dependency Debt + +## Overview + +The dependency debt in this codebase is relatively contained. The main issues are two unused Python packages that add bloat and attack surface, overly loose version pinning that could introduce breaking changes during deployment, and a minor dependency duplication. Frontend dependencies are well-managed with no unused packages detected. + +--- + +## Items + +### DEP-1: Unused Python Dependencies (litellm, dspy) + +**Severity**: HIGH +**Location**: `pyproject.toml:41, 43` + +**Description**: +```toml +"dspy>=3.0.4", +"litellm>=1.75.9", +``` + +Neither `litellm` nor `dspy` is imported anywhere in the codebase. Grep for `from litellm`, `import litellm`, `from dspy`, and `import dspy` returned zero results across all files. + +**Impact**: +- Increased install time and image size (litellm alone pulls in 50+ transitive dependencies) +- Larger attack surface for CVEs +- Maintenance burden for version upgrades +- Developer confusion about what's actually used + +**Remediation**: Remove both from `pyproject.toml` dependencies. Verify no indirect usage first with `uv pip show --files`. + +**Acceptance Criteria**: +- [ ] `dspy` removed from pyproject.toml +- [ ] `litellm` removed from pyproject.toml +- [ ] `uv sync` succeeds without them +- [ ] All tests pass after removal + +--- + +### DEP-2: Overly Loose Version Pinning + +**Severity**: MEDIUM +**Location**: `pyproject.toml:24-47` + +**Description**: Critical packages use `>=` pinning which allows major version jumps: + +| Package | Current | Risk | +|---------|---------|------| +| `fastapi>=0.104.1` | Allows FastAPI 1.0+ with breaking changes | +| `pydantic>=2.5.0` | Allows Pydantic 3.x with breaking changes | +| `mlflow[databricks,genai]>=3.9` | Allows MLflow 4.x with breaking changes | +| `uvicorn[standard]>=0.24.0` | Allows major version changes | +| `cryptography>=41.0.0` | Security-sensitive package with loose pin | +| `sqlalchemy>=2.0.23` | ORM changes can break query patterns | + +**Impact**: A `pip install` or `uv sync` could pull in a new major version with breaking API changes, causing production failures that are hard to diagnose. + +**Remediation**: Use compatible release syntax to allow patch updates but block major/minor: +```toml +"fastapi>=0.104.1,<1.0.0", +"pydantic>=2.5.0,<3.0.0", +"mlflow[databricks,genai]>=3.9,<4.0.0", +"sqlalchemy>=2.0.23,<3.0.0", +``` + +The `uv.lock` file provides reproducible installs, but the pyproject.toml constraints are still important for communicating compatibility intent and for environments that don't use the lock file. + +**Acceptance Criteria**: +- [ ] All critical packages have upper-bound version constraints +- [ ] `uv sync` and `pip install` still resolve successfully +- [ ] Documented process for bumping version constraints + +--- + +### DEP-3: Duplicated httpx Dependency + +**Severity**: LOW +**Location**: `pyproject.toml:32` (main deps) and `pyproject.toml:65` (test deps) + +**Description**: +```toml +# Main dependencies +"httpx>=0.25.0", + +# [project.optional-dependencies] test +"httpx>=0.25.0", # For testing +``` + +`httpx` appears in both main dependencies and test optional-dependencies. Since it's in main, the test duplicate is redundant. + +**Impact**: Minor confusion about whether httpx is a runtime or test dependency. If it's truly test-only, it shouldn't be in main deps. + +**Remediation**: Determine if httpx is used at runtime: +- If yes: remove from `[project.optional-dependencies] test` +- If no: move to test-only and remove from main dependencies + +**Acceptance Criteria**: +- [ ] `httpx` appears in exactly one dependency section +- [ ] Comment explains why it's in that section + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|-----|-------|--------|--------| +| P1 | DEP-1 | Remove unused litellm and dspy | S | High - reduces bloat and attack surface | +| P2 | DEP-2 | Add upper-bound version constraints | S | Medium - prevents breaking upgrades | +| P3 | DEP-3 | Deduplicate httpx dependency | S | Low - minor cleanup | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days diff --git a/debt-docs/DEPLOYMENT_DEBT.md b/debt-docs/DEPLOYMENT_DEBT.md new file mode 100644 index 00000000..e4215db3 --- /dev/null +++ b/debt-docs/DEPLOYMENT_DEBT.md @@ -0,0 +1,392 @@ +# Deployment Debt + +## Overview + +The deployment infrastructure has critical gaps in environment configuration validation, health checks, and operational observability. The application can silently fall back to SQLite when PostgreSQL is expected, health endpoints always report healthy even when the database is broken, and there is no structured logging for production troubleshooting. Worker and connection pool configurations are inconsistent between backends. + +--- + +## Items + +### DEPLOY-1: Missing Environment Configuration Validation + +**Severity**: CRITICAL +**Location**: +- `server/db_config.py:54-73, 138-160` +- `server/app.py` (startup) + +**Description**: +```python +def from_env(cls) -> LakebaseConfig | None: + host = os.getenv("PGHOST") + database = os.getenv("PGDATABASE") + user = os.getenv("PGUSER") + if not all([host, database, user]): + return None # Silently returns None - falls back to SQLite +``` + +When `DATABASE_ENV=postgres` is set but required PostgreSQL variables (`PGHOST`, `PGDATABASE`, `PGUSER`) are missing: +1. Falls back to SQLite **silently** (line 159) +2. No warning about the mismatch +3. Data written to ephemeral SQLite is lost on restart +4. Token refresh errors are warned but non-fatal (line 112) + +**Impact**: Production deployment can silently use wrong database. Data loss when container restarts. + +**Remediation**: Fail fast if `DATABASE_ENV=postgres` but PostgreSQL config is incomplete: +```python +if os.getenv("DATABASE_ENV") == "postgres": + config = LakebaseConfig.from_env() + if config is None: + raise RuntimeError("DATABASE_ENV=postgres but PGHOST/PGDATABASE/PGUSER not set") +``` + +**Acceptance Criteria**: +- [ ] App fails to start if DATABASE_ENV doesn't match available config +- [ ] All required env vars validated on startup +- [ ] Startup log clearly states which backend is active + +--- + +### DEPLOY-2: CORS Allows All Origins in Production + +**Severity**: CRITICAL +**Location**: `server/config.py:24`, `server/app.py:201-207` + +**Description**: See SEC-2 in SECURITY_DEBT.md. Listed here as well because this is a deployment configuration issue. + +```python +CORS_ORIGINS: list = ['*'] # Default allows all origins +``` + +**Impact**: Security vulnerability in production. See SEC-2 for details. + +**Remediation**: Read CORS_ORIGINS from environment variable. Default to localhost for development. + +**Acceptance Criteria**: +- [ ] CORS_ORIGINS configurable via environment variable +- [ ] Production deployment sets specific origins + +--- + +### DEPLOY-3: Health Check Always Returns Healthy + +**Severity**: HIGH +**Location**: `server/app.py:212-251` + +**Description**: +```python +@app.get("/health") +async def health(): + return {"status": "healthy"} # Always 200, even if DB is down + +@app.get("/health/detailed") +async def detailed_health(): + with engine.connect() as conn: + conn.execute(text("SELECT 1")) + # Doesn't check: + # - Database migrations are current + # - Required tables exist + # - Configuration is valid + # - Encryption key is set +``` + +The basic health endpoint always returns 200. The detailed health only checks `SELECT 1` but not whether the schema is initialized or migrations applied. + +**Impact**: Load balancer routes traffic to containers that can't serve requests. App appears healthy but returns 500s on every real request. + +**Remediation**: Make `/health` check database connectivity and required tables: +```python +@app.get("/health") +async def health(): + try: + with engine.connect() as conn: + conn.execute(text("SELECT 1")) + return {"status": "healthy"} + except Exception: + return JSONResponse(status_code=503, content={"status": "unhealthy"}) +``` + +Add a readiness probe that validates full initialization. + +**Acceptance Criteria**: +- [ ] `/health` returns 503 if database is unreachable +- [ ] `/health/detailed` checks table existence and migration status +- [ ] Startup readiness probe validates full initialization + +--- + +### DEPLOY-4: Worker Count Inconsistency + +**Severity**: HIGH +**Location**: +- `server/config.py:12` - `WORKERS: int = int(os.getenv('WORKERS', '4'))` +- `app.yaml:5` - `gunicorn server.app:app -w 2` + +**Description**: `config.py` defaults to 4 workers but `app.yaml` starts gunicorn with 2. The config.py value is unused when running via gunicorn (which has its own `-w` flag). + +**Impact**: Confusion about actual worker count. If the app auto-scales or is started via config.py's uvicorn path, it uses 4 workers which may exceed container resources. + +**Remediation**: Align worker configuration. Use a single source of truth: +```yaml +# app.yaml +command: gunicorn server.app:app -w ${WORKERS:-2} --worker-class uvicorn.workers.UvicornWorker +``` + +**Acceptance Criteria**: +- [ ] Single source of truth for worker count +- [ ] Worker count matches container CPU allocation +- [ ] config.py WORKERS value used or removed + +--- + +### DEPLOY-5: Database Pool Size Inconsistency + +**Severity**: HIGH +**Location**: `server/db_config.py:215-223` (SQLite) vs `server/db_config.py:255-267` (PostgreSQL) + +**Description**: +```python +# SQLite +pool_size=20, max_overflow=30, pool_recycle=3600 # 1 hour + +# PostgreSQL +pool_size=5, max_overflow=10, pool_recycle=300 # 5 minutes +``` + +SQLite gets 4x the pool size of PostgreSQL, which is backwards. SQLite has file-level locking and benefits less from large pools. PostgreSQL (especially serverless Lakebase) has connection limits. + +**Impact**: SQLite: wasted resources. PostgreSQL: potential connection exhaustion if pool is too small under load. + +**Remediation**: Tune pool sizes based on backend characteristics and worker count: +- SQLite: `pool_size=5, max_overflow=10` (file-locked, less concurrency) +- PostgreSQL: `pool_size=10, max_overflow=5` (per worker, with connection limits) +- Make configurable via environment variables + +**Acceptance Criteria**: +- [ ] Pool sizes appropriate for each backend +- [ ] Configurable via environment variables +- [ ] Documented rationale for default values + +--- + +### DEPLOY-6: Migration Strategy Has No Rollback Support + +**Severity**: HIGH +**Location**: `migrations/versions/` (all files), `server/database.py:589-734` + +**Description**: +1. Migration files may lack complete `downgrade()` implementations +2. No documented rollback procedure +3. Schema changes in `_apply_schema_updates()` use `CREATE TABLE IF NOT EXISTS` and `ALTER TABLE ADD COLUMN IF NOT EXISTS` which bypass Alembic entirely +4. No pre-deployment backup strategy documented + +**Impact**: Failed deployments cannot be rolled back. Schema changes outside Alembic create version drift. + +**Remediation**: +1. Audit all migration files for complete `downgrade()` functions +2. Move all manual schema updates from `_apply_schema_updates()` into proper Alembic migrations +3. Document rollback procedure +4. Add pre-deployment backup requirement + +**Acceptance Criteria**: +- [ ] All migrations have tested `downgrade()` functions +- [ ] Zero manual schema updates outside Alembic +- [ ] Rollback procedure documented in deployment guide +- [ ] Pre-deployment backup automated + +--- + +### DEPLOY-7: No Structured Logging + +**Severity**: MEDIUM +**Location**: Throughout `server/` (mix of `print()` and `logging.getLogger()`) + +**Description**: Logs are free-form strings with no structured format: +```python +logger.warning("Database connection failed (attempt %d/%d), ...", attempt + 1, max_retries, ...) +``` + +No JSON logging, no request correlation IDs, no structured fields for filtering. + +**Impact**: Cannot query logs for specific errors, set up alerts, or track error patterns across deployments. + +**Remediation**: Implement structured JSON logging: +```python +import structlog +logger = structlog.get_logger() +logger.warning("db_connection_failed", attempt=attempt, max_retries=max_retries, error=str(e)) +``` + +**Acceptance Criteria**: +- [ ] All logging in JSON format +- [ ] Request ID propagated through all log messages +- [ ] Log levels consistent (DEBUG for verbose, INFO for operations, WARNING for recoverable, ERROR for failures) + +--- + +### DEPLOY-8: Build Process Missing Security and Quality Checks + +**Severity**: MEDIUM +**Location**: `justfile:240-257` (ui-build recipe) + +**Description**: Frontend build does not include: +- `npm audit` for vulnerability checking +- `knip` for dead code detection (installed but never run) +- Bundle size validation +- License compliance check + +```bash +# justfile ui-build +npm -C {{client-dir}} install +npm -C {{client-dir}} run build +# No audit, no knip, no size check +``` + +**Impact**: Known vulnerabilities ship to production. Unused code bloats bundle. + +**Remediation**: Add checks to build pipeline: +```bash +npm -C client audit --audit-level=moderate +npx -C client knip +# Bundle size check against budget +``` + +**Acceptance Criteria**: +- [ ] `npm audit` runs in CI +- [ ] `knip` runs in CI (or as justfile command) +- [ ] Bundle size budget documented and enforced + +--- + +### DEPLOY-9: Database Backup Strategy Gaps + +**Severity**: MEDIUM +**Location**: `app.yaml:14-38` + +**Description**: SQLite rescue backup is optional and commented out: +```yaml +# env: +# - name: SQLITE_VOLUME_PATH +# valueFrom: db_backup_volume +``` + +When using PostgreSQL (Lakebase): no backup configuration at all. Relies entirely on Databricks infrastructure backups. + +**Impact**: SQLite data loss if rescue not configured. PostgreSQL recovery depends on Databricks SLA. + +**Remediation**: +1. Make SQLite backup mandatory when using SQLite backend +2. Document PostgreSQL backup options +3. Add pre-deployment backup job for major migrations + +**Acceptance Criteria**: +- [ ] SQLite backup enabled by default (warn if not configured) +- [ ] PostgreSQL backup strategy documented +- [ ] Pre-migration backup procedure documented + +--- + +### DEPLOY-10: No API Versioning + +**Severity**: MEDIUM +**Location**: `server/app.py:189-193` + +**Description**: +```python +app = FastAPI( + title="Databricks App API", + version="0.1.0", # Hardcoded, never updated +) +``` + +Version is hardcoded and never incremented. No way to determine which version is deployed. + +**Impact**: Cannot correlate issues to specific deployments. No API versioning for backward compatibility. + +**Remediation**: Read version from `pyproject.toml` or git tag: +```python +from importlib.metadata import version +app_version = version("human-eval-workshop") +``` + +Add `/version` endpoint returning version and git commit. + +**Acceptance Criteria**: +- [ ] Version read from pyproject.toml +- [ ] `/version` endpoint returns version and build info +- [ ] Version auto-incremented on release + +--- + +### DEPLOY-11: No API Rate Limiting + +**Severity**: LOW +**Location**: `server/app.py:196-207` (middleware configuration) + +**Description**: No rate limiting middleware. Users can send unlimited requests. + +**Impact**: Potential for abuse, resource exhaustion, accidental DDoS from polling bugs. + +**Remediation**: Add `slowapi` middleware with per-user and per-IP limits. + +**Acceptance Criteria**: +- [ ] Rate limiting configured (e.g., 100 req/min per user) +- [ ] Limits configurable via environment +- [ ] 429 responses include `Retry-After` header + +--- + +### DEPLOY-12: No Request Timeout Configuration + +**Severity**: LOW +**Location**: `server/config.py:16, 48` + +**Description**: No per-request timeout. Long operations hold connections indefinitely. See PERF-9 for details. + +**Acceptance Criteria**: +- [ ] Default request timeout configured +- [ ] Long operations use background tasks + +--- + +### DEPLOY-13: CI Pipeline Missing Caching + +**Severity**: LOW +**Location**: `.github/workflows/e2e-test.yml` + +**Description**: CI workflow installs all dependencies from scratch on every run. No caching for: +- Python packages (`uv` cache) +- Node modules +- Playwright browsers + +**Impact**: Slow CI pipeline, wasted compute. + +**Remediation**: Add caching steps to CI workflow. + +**Acceptance Criteria**: +- [ ] Python package cache across runs +- [ ] Node modules cache across runs +- [ ] Playwright browser cache across runs + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|-----|-------|--------|--------| +| P0 | DEPLOY-1 | Validate env config on startup | S | Critical - prevents silent wrong backend | +| P0 | DEPLOY-2 | Fix CORS for production | S | Critical - security (see SEC-2) | +| P1 | DEPLOY-3 | Fix health checks to validate DB | M | High - prevents bad routing | +| P1 | DEPLOY-4 | Align worker count configuration | S | High - prevents resource mismatch | +| P1 | DEPLOY-5 | Fix pool size per backend | S | High - prevents connection issues | +| P1 | DEPLOY-6 | Audit and fix migration rollback | M | High - deployment safety | +| P2 | DEPLOY-7 | Implement structured logging | M | Medium - observability | +| P2 | DEPLOY-8 | Add security/quality checks to build | M | Medium - vulnerability prevention | +| P2 | DEPLOY-9 | Document backup strategy | S | Medium - data safety | +| P2 | DEPLOY-10 | Add API versioning | S | Medium - operational visibility | +| P3 | DEPLOY-11 | Add rate limiting | M | Low - abuse prevention | +| P3 | DEPLOY-12 | Add request timeout | S | Low - resource protection | +| P3 | DEPLOY-13 | Add CI caching | S | Low - faster pipelines | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days diff --git a/debt-docs/DX_DEBT.md b/debt-docs/DX_DEBT.md new file mode 100644 index 00000000..0d4b44f8 --- /dev/null +++ b/debt-docs/DX_DEBT.md @@ -0,0 +1,335 @@ +# Developer Experience & Documentation Debt + +## Overview + +Developer experience debt spans stale workspace artifacts, missing tooling configuration, inconsistent logging patterns, and significant documentation gaps. The most impactful issue is the complete absence of architecture documentation - no system diagrams, data flow docs, or deployment architecture guide. Combined with monolithic 5000+ line files and 6+ configuration formats, onboarding friction is high. The logging inconsistency (135+ print statements mixed with proper logging) makes production debugging impractical. + +--- + +## Items + +### DX-1: Missing Architecture Documentation + +**Severity**: CRITICAL +**Location**: None - entirely missing + +**Description**: No architecture documentation exists anywhere in the repo: +- No system diagrams +- No data flow documentation (MLflow traces -> ingestion -> annotation -> export) +- No deployment architecture guide (local vs Databricks Apps) +- No integration documentation (SQLite/Postgres, MLflow, Databricks APIs, custom LLM providers) +- No component communication patterns (frontend state management, API client organization) + +The `/doc/` directory contains only release notes and FACILITATOR_GUIDE.md. The `/specs/` directory covers behavioral specs but not architecture. + +**Impact**: Onboarding takes 2x longer. Architectural decisions are implicit and easily violated. New contributors misunderstand the system. Production troubleshooting requires reading code. + +**Remediation**: Create `doc/ARCHITECTURE.md` covering: +1. System overview diagram +2. Data flow: traces -> ingestion -> discovery -> rubric -> annotation -> results -> export +3. Backend layers: routers -> services -> database +4. Frontend architecture: pages, components, hooks, context, API client +5. Deployment: local dev vs Databricks Apps, database backends, volume management +6. Integration points: MLflow, Databricks SDK, custom LLM providers + +**Acceptance Criteria**: +- [ ] `doc/ARCHITECTURE.md` exists with system diagram +- [ ] Data flow documented end-to-end +- [ ] Deployment architecture documented +- [ ] New contributor can understand the system from docs alone + +--- + +### DX-2: Stale Git Artifacts and Workspace Clutter + +**Severity**: HIGH +**Location**: Git staging area (from `git status`) + +**Description**: Multiple deleted files not committed, temp files, and database artifacts: + +**Deleted but not committed**: +- `client/bypassed-login-layout.png` +- `client/current-state.png` +- `client/debug-screenshot.png` +- `client/layout-error-check.png` +- `client/vite.config.ts.timestamp-*.mjs` (Vite temp file) +- `e2e-test-output.txt` + +**Should be in .gitignore**: +- `.coverage` (pytest coverage artifacts) +- `mlflow.db` (MLflow database) +- `workshop*.db` (stale workshop databases - 4+ MB) +- `.claude/mlflow/*.log` (Claude tracing logs showing as modified) + +**Impact**: Pollutes git status, confuses developers, bloats repository. + +**Remediation**: +1. `git rm --cached` the deleted files +2. Add missing patterns to `.gitignore`: `*.db`, `.coverage`, `.claude/mlflow/*.log` +3. Clean up untracked files + +**Acceptance Criteria**: +- [ ] `git status` is clean (no stale deletions) +- [ ] `.gitignore` covers all generated artifacts +- [ ] No database files or coverage artifacts in repo + +--- + +### DX-3: Missing Frontend Tooling Configuration + +**Severity**: HIGH +**Location**: `client/` directory + +**Description**: Several expected configuration files are missing: +- **No ESLint config** - `eslint` is in package.json but no `.eslintrc.cjs` or `eslint.config.js` exists. `npm run lint` uses default rules only. +- **No Prettier config** - `prettier` is listed but no `.prettierrc` exists. Formatting is inconsistent. +- **No pre-commit hooks** - No `.pre-commit-config.yaml` or `husky` configuration. Developers can commit unformatted code, debug statements, and lint violations. + +**Impact**: Inconsistent code quality across frontend. No automated enforcement of standards. + +**Remediation**: +1. Create `client/eslint.config.js` with TypeScript + React rules +2. Create `client/.prettierrc` with project standards +3. Add pre-commit hooks (husky + lint-staged for frontend, pre-commit for Python) + +**Acceptance Criteria**: +- [ ] ESLint config file exists and `npm run lint` enforces rules +- [ ] Prettier config exists and `npm run format` is available +- [ ] Pre-commit hooks run lint + format on staged files + +--- + +### DX-4: 135+ Print Statements Mixed with Logging + +**Severity**: HIGH +**Location**: Throughout `server/` (see CQ-5 in CODE_QUALITY_DEBT.md) + +**Description**: Backend mixes `print()` with `logging.getLogger()`: +```python +# server/database.py +print(' Creating database tables...') +print(f' Created/verified schema: {schema_name}') + +# server/routers/workshops.py +print(f" DEBUG trace_ids: {[t.id for t in traces]}") +``` + +135+ print statements found across the backend. No structured format, no log levels. + +**Impact**: Cannot filter, aggregate, or alert on logs. Production debugging requires reading raw stdout. Overlaps with CQ-5 and DEPLOY-7. + +**Remediation**: Replace all `print()` with `logger.info()` / `logger.debug()`. See DEPLOY-7 for structured logging plan. + +**Acceptance Criteria**: +- [ ] Zero `print()` statements in production code +- [ ] All logging uses `logging` module with appropriate levels + +--- + +### DX-5: Missing API Endpoint Documentation + +**Severity**: MEDIUM +**Location**: `server/routers/` (all files) + +**Description**: OpenAPI/Swagger descriptions are minimal: +```python +@router.post('/auth/login', response_model=AuthResponse) +async def login(login_data: UserLogin, ...): + """Authenticate a user with email and password.""" # Too minimal +``` + +Missing from most endpoints: +- Parameter constraints (required fields, valid ranges) +- Error response documentation +- Request/response examples +- Authentication requirements + +**Impact**: Frontend developers must read backend code to understand API contracts. + +**Remediation**: Add OpenAPI descriptions with `response_model`, `responses`, and `description` parameters. + +**Acceptance Criteria**: +- [ ] All endpoints have descriptive docstrings +- [ ] Error responses documented (400, 401, 403, 404, 500) +- [ ] OpenAPI UI at `/docs` is usable for API exploration + +--- + +### DX-6: Configuration Sprawl (6+ Formats) + +**Severity**: MEDIUM +**Location**: Multiple files + +**Description**: Configuration is spread across 8+ files in different formats: + +| File | Format | Purpose | +|------|--------|---------| +| `pyproject.toml` | TOML | Python project, tools (black, ruff, mypy, pytest) | +| `client/package.json` | JSON | Frontend deps, scripts | +| `client/vite.config.ts` | TypeScript | Build, dev server, test | +| `alembic.ini` | INI | Migration config | +| `app.yaml` | YAML | Databricks Apps deployment | +| `config/auth.yaml` | YAML | Auth configuration | +| `.env.local` | Env | Environment variables | +| `justfile` | Just | Task runner (with inline Python/shell) | + +**Impact**: New developers must understand 6 different config formats. No single source of truth. Configuration validation is minimal. + +**Remediation**: Document all configuration in a single guide (`doc/CONFIGURATION.md`). Add startup validation for all required config values. + +**Acceptance Criteria**: +- [ ] `doc/CONFIGURATION.md` lists all config files and their purpose +- [ ] All required environment variables documented with defaults +- [ ] Startup validates required configuration + +--- + +### DX-7: No Database Schema Documentation + +**Severity**: MEDIUM +**Location**: `server/database.py`, `migrations/versions/` + +**Description**: No human-readable schema documentation: +- No entity relationship diagram (ERD) +- No schema evolution timeline +- No data model explanation +- Relationships between tables undocumented + +Developers must read Alembic migration code or SQLAlchemy models to understand the schema. + +**Impact**: Schema relationships unclear. Easy to create queries that violate implicit constraints. + +**Remediation**: Create `doc/DATABASE.md` with: +1. ERD diagram (can use Mermaid in markdown) +2. Table descriptions and relationships +3. Important constraints and invariants +4. Query patterns and recommended joins + +**Acceptance Criteria**: +- [ ] `doc/DATABASE.md` exists with ERD +- [ ] All tables and key columns documented +- [ ] Updated when schema changes + +--- + +### DX-8: Generated API Client Not Documented + +**Severity**: MEDIUM +**Location**: `client/src/client/` (generated code) + +**Description**: +```typescript +/* generated using openapi-typescript-codegen -- do not edit */ +``` + +Generated from OpenAPI but no documentation on: +- How/when to regenerate +- What OpenAPI spec URL to use +- How to update when backend changes +- Whether manual edits are allowed + +**Impact**: Frontend developers don't know how to update the API client when backend endpoints change. + +**Remediation**: Add `client/src/client/README.md` or comments in the generator script explaining: +1. How to regenerate: `npx openapi-typescript-codegen --input http://localhost:8000/openapi.json --output src/client` +2. When to regenerate: after backend API changes +3. What not to modify: generated files + +**Acceptance Criteria**: +- [ ] Regeneration process documented +- [ ] justfile recipe for regeneration +- [ ] CI check that generated code is up to date + +--- + +### DX-9: Console Error Handling Without Structured Logging (Frontend) + +**Severity**: MEDIUM +**Location**: `client/src/context/UserContext.tsx` + +**Description**: +```typescript +console.error('Failed to load permissions on cached user:', permError); +console.error('Error initializing user:', e); +console.warn('Failed to load permissions, using defaults'); +``` + +Frontend error handling goes to browser console only. No error reporting service. + +**Impact**: Production errors invisible to the team. No error aggregation or alerting. + +**Remediation**: Integrate an error reporting service (Sentry, LogRocket, or custom) for production error visibility. + +**Acceptance Criteria**: +- [ ] Error reporting service configured +- [ ] Unhandled errors captured automatically +- [ ] Key error paths reported with context + +--- + +### DX-10: Onboarding Friction in README + +**Severity**: MEDIUM +**Location**: `README.md`, `CONTRIBUTING.md` + +**Description**: +- README line 11 says "Download project-with-build.zip" but no link provided +- Setup instructions scattered between README and CONTRIBUTING.md +- No troubleshooting section +- Minimum versions buried in text (Python 3.11+, Node.js 22.16+) +- CONTRIBUTING.md has excellent spec-driven dev guidance but no: + - Local development quick start + - Common development tasks (how to add an endpoint, add a component) + - Debugging tips + +**Impact**: New contributors spend extra time figuring out setup and conventions. + +**Remediation**: Add quick start section to README, troubleshooting FAQ, and common tasks guide. + +**Acceptance Criteria**: +- [ ] README has clear quick start (clone, setup, run) +- [ ] Version requirements prominently listed +- [ ] Troubleshooting section for common issues +- [ ] CONTRIBUTING.md includes "How to add an endpoint" guide + +--- + +### DX-11: Unresolved TODO/FIXME Comments + +**Severity**: LOW +**Location**: See CQ-15 in CODE_QUALITY_DEBT.md + +**Description**: 6+ TODO comments with no linked issues or clear context: +```python +# TODO: pretty sure this does nothing (no commit, no update)? +# TODO: this was ostensibly here for a reason, but I don't know what it is. +# TODO: this is a noop, actually handle connection testing? +``` + +**Impact**: Uncertainty about code behavior. Developer stops to investigate each one. + +**Remediation**: Convert each to a GitHub issue or resolve. Remove unclear comments. + +**Acceptance Criteria**: +- [ ] Every TODO has a linked GitHub issue +- [ ] Unclear TODOs resolved or removed + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|-----|-------|--------|--------| +| P0 | DX-2 | Clean up stale git artifacts | S | High - immediate workspace hygiene | +| P1 | DX-1 | Create architecture documentation | L | Critical - onboarding and understanding | +| P1 | DX-3 | Add ESLint, Prettier, pre-commit hooks | M | High - code quality enforcement | +| P1 | DX-4 | Replace print() with logging | M | High - production debugging (see CQ-5) | +| P2 | DX-5 | Add API endpoint documentation | M | Medium - developer productivity | +| P2 | DX-6 | Document configuration | M | Medium - onboarding | +| P2 | DX-7 | Create database schema documentation | M | Medium - understanding | +| P2 | DX-8 | Document API client regeneration | S | Medium - maintenance | +| P2 | DX-10 | Improve README and CONTRIBUTING | M | Medium - onboarding | +| P3 | DX-9 | Add frontend error reporting | M | Medium - observability | +| P3 | DX-11 | Resolve TODO comments | S | Low - code clarity | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days diff --git a/debt-docs/PERFORMANCE_DEBT.md b/debt-docs/PERFORMANCE_DEBT.md new file mode 100644 index 00000000..5e14552b --- /dev/null +++ b/debt-docs/PERFORMANCE_DEBT.md @@ -0,0 +1,291 @@ +# Performance Debt + +## Overview + +The codebase has performance debt primarily in the database layer (missing indexes, N+1 query patterns, no pagination on list endpoints) and in the frontend (inefficient polling without backoff, dead cache code, large JSON columns fetched unnecessarily). While the current scale may tolerate these issues, they will become blockers as workshops grow to hundreds of traces and dozens of concurrent users. + +--- + +## Items + +### PERF-1: Missing Database Indexes on Foreign Keys + +**Severity**: HIGH +**Location**: `server/database.py:90-407` (model definitions) + +**Description**: Frequently filtered/joined columns lack indexes: + +| Table | Column(s) | Used in | Current Index | +|-------|-----------|---------|---------------| +| `discovery_findings` | `workshop_id`, `trace_id`, `user_id` | Filter queries in database_service.py | None | +| `annotations` | `workshop_id`, `trace_id`, `user_id` | Filter queries throughout | None | +| `traces` | `workshop_id` | Lines 496, 502, 543, 588, 676, 721 in database_service.py | None | +| `judge_evaluations` | `workshop_id`, `prompt_id`, `trace_id` | Evaluation queries | None | +| `participant_notes` | `workshop_id`, `user_id` | Already indexed (only one) | `ix_participant_notes_workshop_user` | + +Only one index exists: `ix_participant_notes_workshop_user` on line 714 of database.py. + +**Impact**: Full table scans on every query. For a workshop with 1000 traces and 50 annotations per trace, queries that should be O(log n) are O(n). PostgreSQL partially mitigates this with query planning, but SQLite does not. + +**Remediation**: Add composite indexes via Alembic migration: +```python +Index('ix_annotations_workshop_trace', 'workshop_id', 'trace_id') +Index('ix_annotations_workshop_user', 'workshop_id', 'user_id') +Index('ix_traces_workshop', 'workshop_id') +Index('ix_findings_workshop_trace', 'workshop_id', 'trace_id') +Index('ix_evaluations_workshop_prompt', 'workshop_id', 'prompt_id') +``` + +**Acceptance Criteria**: +- [ ] Indexes added for all frequently filtered foreign keys +- [ ] Alembic migration created (not manual ALTER TABLE) +- [ ] Query performance validated with EXPLAIN on representative data + +--- + +### PERF-2: N+1 Query Patterns in Database Service + +**Severity**: HIGH +**Location**: `server/services/database_service.py:217-257` + +**Description**: List operations fetch all records then process individually: +```python +db_workshops = query.all() +return [self._workshop_from_db(w) for w in db_workshops] +``` + +The `get_workshops_for_user()` method (line 237-240) queries all workshop_ids, then makes a second query for full workshop objects. `get_findings_with_user_details()` (lines 897-906) does a JOIN but still loads all columns for both tables regardless of what's needed. + +**Impact**: For a user in 20 workshops, each with 100 traces, this can generate hundreds of database round trips per request. + +**Remediation**: +- Use SQLAlchemy `joinedload()` or `selectinload()` for eager loading relationships +- Combine multi-step queries into single statements +- Use `load_only()` to select specific columns + +**Acceptance Criteria**: +- [ ] No queries inside loops (verified by code review) +- [ ] Relationship loading strategy documented per query +- [ ] Response times validated under load (50+ workshops) + +--- + +### PERF-3: No Pagination on List Endpoints + +**Severity**: HIGH +**Location**: `server/routers/workshops.py:229-252` and others + +**Description**: List endpoints return entire result sets with no limit: +```python +@router.get("/") +async def list_workshops(...) -> List[Workshop]: + return db_service.get_workshops_for_user(user_id) # No limit +``` + +Affected endpoints: +- `list_workshops()` - all workshops +- `get_discovery_findings()` - all findings +- `get_participant_notes()` - all notes +- `get_all_annotations()` - all annotations + +**Impact**: With 1000+ workshops or traces, response payloads become megabytes, causing timeouts and high memory usage. + +**Remediation**: Add `skip` and `limit` query parameters: +```python +@router.get("/") +async def list_workshops(skip: int = 0, limit: int = 50, ...) -> List[Workshop]: + return db_service.get_workshops_for_user(user_id, skip=skip, limit=limit) +``` + +**Acceptance Criteria**: +- [ ] All list endpoints support `skip` and `limit` parameters +- [ ] Default limit of 50, maximum limit of 200 +- [ ] Response includes total count for pagination UI + +--- + +### PERF-4: Frontend Polling Without Exponential Backoff + +**Severity**: MEDIUM +**Location**: +- `client/src/components/IntakeWaitingView.tsx:43` - Fixed 5-second interval +- `client/src/pages/JudgeTuningPage.tsx:1142, 1152` - Fixed 2s/5s intervals +- `client/src/pages/AnnotationDemo.tsx` - Various fixed intervals + +**Description**: +```typescript +const interval = setInterval(loadStatus, 5000); // Fixed forever +``` + +Polling runs at fixed intervals regardless of: +- Whether status has changed +- How many users are polling simultaneously +- Server load + +With 100 users in intake phase = 1200 requests/minute to the same endpoint. + +**Impact**: Unnecessary server load, wasted bandwidth, poor mobile experience. + +**Remediation**: Implement exponential backoff in the shared `usePolling` hook (see CQ-7): +- Start at 2s, increase to 30s if no change +- Stop polling when terminal state reached +- Use `ETag` or `If-Modified-Since` for conditional requests + +**Acceptance Criteria**: +- [ ] All polling uses exponential backoff +- [ ] Terminal states stop polling +- [ ] Server load tested with 50+ concurrent users + +--- + +### PERF-5: Dead Cache Code (Per-Request Instance) + +**Severity**: MEDIUM +**Location**: `server/services/database_service.py:113-139` + +**Description**: +```python +def __init__(self, db: Session): + self.db = db + self._cache = {} + self._cache_ttl = 30 # 30 seconds +``` + +`DatabaseService` is instantiated per-request (line 245 in workshops.py: `db_service = DatabaseService(db)`). The cache is created and destroyed with each request, so the 30-second TTL is never utilized. + +**Impact**: Dead code that provides false sense of caching. Adds complexity without benefit. + +**Remediation**: Either: +1. Remove the dead cache code entirely, OR +2. Implement module-level cache (e.g., `functools.lru_cache` or Redis) that persists across requests + +**Acceptance Criteria**: +- [ ] Cache either works correctly or is removed +- [ ] If kept, validated that cache hits occur in practice + +--- + +### PERF-6: Large JSON Columns Fetched Unnecessarily + +**Severity**: MEDIUM +**Location**: `server/database.py:127, 147, 150-151, 199-200` + +**Description**: +```python +assigned_traces = Column(JSON, default=list) # Line 127 - per-user trace list +active_discovery_trace_ids = Column(JSON, default=list) # Line 150 - could be thousands +active_annotation_trace_ids = Column(JSON, default=list) # Line 151 - could be thousands +context = Column(JSON, nullable=True) # Line 199 - full trace context +trace_metadata = Column(JSON, nullable=True) # Line 200 - arbitrary metadata +``` + +These columns are always loaded even when only the workshop name or trace ID is needed. For a workshop with 5000 traces, `active_annotation_trace_ids` alone could be 100KB. + +**Impact**: Excessive memory usage and network transfer for simple queries. + +**Remediation**: +- Use `load_only()` in queries that don't need JSON columns +- Consider moving trace ID lists to junction tables +- For PostgreSQL: use `jsonb` with partial indexes + +**Acceptance Criteria**: +- [ ] List/summary endpoints don't load JSON columns +- [ ] Detail endpoints load JSON columns selectively + +--- + +### PERF-7: Blocking `time.sleep()` in Async Context + +**Severity**: MEDIUM +**Location**: `server/routers/workshops.py:170-201` + +**Description**: +```python +def _retry_db_operations(operations_fn, db_session, max_retries=5, base_delay=0.5): + for attempt in range(max_retries): + try: + return operations_fn() + except OperationalError as e: + time.sleep(delay) # Blocks the event loop +``` + +`time.sleep()` blocks the async event loop. During retries (up to 5 with exponential backoff), no other requests can be processed on that worker. + +**Impact**: Under high concurrency with database contention, this can cascade into thread pool exhaustion. + +**Remediation**: Use `asyncio.sleep()` in an async function, or run the synchronous retry in a thread pool: +```python +await asyncio.sleep(delay) +``` + +**Acceptance Criteria**: +- [ ] Zero `time.sleep()` calls in async code paths +- [ ] Retry logic uses `asyncio.sleep()` or runs in executor + +--- + +### PERF-8: Production Bundle Includes Console Statements + +**Severity**: LOW +**Location**: `client/vite.config.ts:67-69` + +**Description**: +```typescript +terserOptions: { + compress: { + drop_console: false, // TODO: Re-enable for production + }, +}, +``` + +All `console.log`, `console.error`, `console.warn` from both application code and dependencies are included in the production bundle. + +**Impact**: ~5-10KB unnecessary bundle size, minor runtime overhead from string formatting. + +**Remediation**: Set `drop_console: true`. Replace needed error reporting with a dedicated service. + +**Acceptance Criteria**: +- [ ] `drop_console: true` in vite.config.ts +- [ ] No runtime errors after enabling (replace needed console calls first) + +--- + +### PERF-9: No Request Timeout for Heavy Operations + +**Severity**: LOW +**Location**: `server/config.py:16, 48` + +**Description**: No per-request timeout configured. Long-running operations (large trace ingestion, evaluation jobs) can hold connections open indefinitely. + +```python +KEEP_ALIVE_TIMEOUT: int = int(os.getenv('KEEP_ALIVE_TIMEOUT', '65')) +'timeout_graceful_shutdown': 30, +# No request-level timeout +``` + +**Impact**: Resource exhaustion under high load or with large uploads. + +**Remediation**: Add request timeout middleware or per-endpoint timeouts for heavy operations. + +**Acceptance Criteria**: +- [ ] Default request timeout of 60 seconds +- [ ] Long-running operations use background tasks instead +- [ ] Timeout configurable via environment variable + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|-----|-------|--------|--------| +| P1 | PERF-1 | Add database indexes for foreign keys | M | High - immediate query speedup | +| P1 | PERF-3 | Add pagination to list endpoints | M | High - prevents timeouts at scale | +| P1 | PERF-2 | Fix N+1 query patterns | M | High - reduces DB round trips | +| P2 | PERF-4 | Implement exponential backoff in polling | M | Medium - reduces server load | +| P2 | PERF-5 | Fix or remove dead cache code | S | Medium - removes dead code | +| P2 | PERF-7 | Replace time.sleep with asyncio.sleep | S | Medium - unblocks event loop | +| P2 | PERF-6 | Selective column loading for JSON | M | Medium - reduces memory/bandwidth | +| P3 | PERF-8 | Enable drop_console in Vite build | S | Low - bundle size | +| P3 | PERF-9 | Add request timeout middleware | S | Low - resource protection | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days diff --git a/debt-docs/REACT_PATTERNS_DEBT.md b/debt-docs/REACT_PATTERNS_DEBT.md new file mode 100644 index 00000000..a18a38cf --- /dev/null +++ b/debt-docs/REACT_PATTERNS_DEBT.md @@ -0,0 +1,482 @@ +# React Patterns Debt + +## Overview + +The React frontend has strong library choices (TanStack Query, Radix UI, Tailwind) and some well-designed hooks (`useWorkshopApi.ts` has excellent optimistic update patterns). But the component architecture has significant structural debt: React Router is essentially unused — all real navigation happens via component state, breaking browser history and deep linking. The largest component (JudgeTuningPage, 2,754 lines) has 32 useState hooks and 15+ direct fetch() calls, functioning as a mini-application. There are no error boundaries, no form library, no code splitting, and contexts mix data fetching, cache orchestration, and state management. The codebase would benefit from treating React Router as the navigation backbone, decomposing page-level components, and extracting concerns that have outgrown their containers. + +--- + +## Items + +### RC-1: React Router Unused for Application Navigation + +**Severity**: CRITICAL +**Location**: +- `client/src/App.tsx:16-35` — only 2 routes defined +- `client/src/pages/WorkshopDemoLanding.tsx:64-250` — internal state-based "routing" +- `client/src/context/WorkflowContext.tsx` — phase state replaces URL state +- `client/src/context/WorkshopContext.tsx:157-183` — manual `popstate` listener + +**Description**: React Router v6 is installed but the application defines only 2 routes: +- `/` → `WorkshopDemoLanding` +- `/trace-viewer-demo` → `TraceDataViewerDemo` + +All real navigation (intake → discovery → rubric → annotation → results → judge_tuning) happens inside `WorkshopDemoLanding` via component state: + +```typescript +// WorkshopDemoLanding.tsx — acts as a component-based router +const getViewForPhaseWithState = (role, requestedPhase, state) => { + // Maps phase → component view based on state + // Returns 'discovery-start', 'annotation-monitor', etc. +} +``` + +Phase changes call `setCurrentPhase()` in WorkflowContext, which updates component state — but **never updates the URL**. Meanwhile, `WorkshopContext` has a manual `popstate` listener to sync workshop IDs from URL, creating a half-implemented URL ↔ state bridge. + +**What breaks**: +- Browser back/forward doesn't work for phase navigation +- Bookmarking a specific phase is impossible +- Deep linking (e.g., sharing a URL to the annotation phase) doesn't work +- `Cmd+click` to open in new tab doesn't work for sidebar navigation +- Analytics/logging can't track page views by URL + +**Remediation**: Move phase navigation into React Router: +``` +/workshop/:workshopId/discovery +/workshop/:workshopId/annotation +/workshop/:workshopId/rubric +/workshop/:workshopId/judge-tuning +/workshop/:workshopId/results +``` + +Use `` for nested layouts, route loaders for data fetching, and route guards for phase gating. + +**Acceptance Criteria**: +- [ ] Each workshop phase has a distinct URL +- [ ] Browser back/forward navigates between phases +- [ ] Deep linking to any phase works +- [ ] No `currentPhase` component state — URL is source of truth + +--- + +### RC-2: JudgeTuningPage — 2,754-Line Component with 32 useState Hooks + +**Severity**: CRITICAL +**Location**: `client/src/pages/JudgeTuningPage.tsx` + +**Description**: This single component is 2,754 lines with: +- **32 `useState` hooks** (lines 62-123) managing: + - Prompt editing state (5 hooks) + - Evaluation results and metrics (4 hooks) + - Configuration state (4 hooks) + - Loading/error flags (5 hooks) + - Alignment state (5 hooks) + - Auto-evaluation state (4 hooks) + - UI state — pagination, expanded rows, mode (5 hooks) +- **15+ direct `fetch()` calls** bypassing TanStack Query and the generated client +- **Multiple `useEffect` chains** for data loading, polling, and state synchronization +- **Manual polling** with `setInterval` for auto-evaluation status +- **Business logic** (prompt derivation, metrics calculation, model mapping) mixed with rendering +- **`any` types** on lines 70, 109 (`useState`) + +This component functions as a complete sub-application. It handles prompt management, evaluation execution, alignment calculation, metrics display, auto-evaluation polling, and export — each of which is a separate concern. + +**State that should be server state (TanStack Query)**: +- `prompts`, `evaluations`, `metrics`, `rubric`, `mlflowConfig` — all fetched from the server +- `autoEvalStatus`, `autoEvalJobId` — server polling state + +**State that should be derived (useMemo/computed)**: +- `isModified` — derivable from `currentPrompt !== originalPromptText` +- `hasEvaluated` — derivable from `evaluations.length > 0` + +**State that could use useReducer**: +- The 5 alignment state hooks (`isRunningAlignment`, `alignmentLogs`, `alignmentResult`, `showAlignmentLogs`, `alignmentModel`) are a state machine + +**Remediation**: Decompose into: +1. `JudgeTuningPage` — layout orchestrator only +2. `PromptEditor` — prompt creation/editing/selection +3. `EvaluationRunner` — evaluation execution and results +4. `AlignmentPanel` — human-judge alignment calculation +5. `AutoEvaluationStatus` — auto-eval polling and display +6. `useJudgePrompts()` — TanStack Query hook for prompt CRUD +7. `useEvaluations()` — TanStack Query hook for evaluation state +8. `useAutoEvaluation()` — TanStack Query hook with `refetchInterval` for polling + +**Acceptance Criteria**: +- [ ] JudgeTuningPage under 300 lines +- [ ] Zero direct `fetch()` calls — all through TanStack Query hooks +- [ ] Server state managed by `useQuery`/`useMutation`, not `useState` +- [ ] No manual `setInterval` polling + +--- + +### RC-3: No Error Boundaries Anywhere in the Application + +**Severity**: HIGH +**Location**: Entire `client/src/` — zero `ErrorBoundary` components exist + +**Description**: The application has no React error boundaries. If any component throws during rendering (e.g., accessing a property of `undefined`, JSON parse failure, API response shape change), the entire application crashes with a white screen. + +Components most at risk: +- **TraceViewer.tsx** (1,650 lines) — complex JSON parsing with multiple fallback paths +- **JudgeTuningPage.tsx** (2,754 lines) — many data dependencies, any missing field crashes render +- **FacilitatorDashboard.tsx** (1,325 lines) — derived metrics from multiple queries + +Error handling currently relies on: +- `try/catch` in event handlers (doesn't catch render errors) +- Toast notifications for API errors (doesn't catch render errors) +- Optional chaining (`?.`) to prevent null access (fragile, can still miss cases) + +**Remediation**: Add error boundaries at three levels: +1. **App-level**: Catch any unhandled error with a "Something went wrong" fallback +2. **Page-level**: Each route gets a boundary so one page crashing doesn't break others +3. **Widget-level**: TraceViewer, evaluation grid, and dashboard cards each get boundaries so a single widget failure shows a graceful degradation, not a page crash + +**Acceptance Criteria**: +- [ ] App-level error boundary with recovery action (reload) +- [ ] Page-level error boundaries for each phase view +- [ ] Widget-level boundaries for TraceViewer, dashboard, and evaluation components +- [ ] Error boundary logs errors to a reporting service + +--- + +### RC-4: Large Components — 6 Components Over 500 Lines + +**Severity**: HIGH +**Location**: + +| Component | Lines | useState | Responsibilities | +|-----------|-------|----------|------------------| +| `pages/JudgeTuningPage.tsx` | 2,754 | 32 | Prompt editing, evaluation, alignment, auto-eval, metrics, export | +| `components/TraceViewer.tsx` | 1,650 | 0 | JSON parsing, LLM extraction, smart rendering, tab UI | +| `components/FacilitatorDashboard.tsx` | 1,325 | 6 | Metrics, phase management, trace ops, progress cards, actions | +| `components/TraceDataViewer.tsx` | 730 | 0 | Trace display, content extraction, download, SQL generation | +| `pages/WorkshopDemoLanding.tsx` | 649 | 0 | Phase routing, sidebar, layout orchestration | +| `components/RoleBasedWorkflow.tsx` | 630 | 0 | Phase navigation, role-based menu, completion badges | +| `components/FocusedAnalysisView.tsx` | 605 | 3 | Trace navigation, CSV export, annotation UI | +| `components/DatabricksModelTester.tsx` | 582 | 11 | Connection testing, model selection, chat, endpoint calling | + +**Description**: Each of these components handles multiple distinct responsibilities. The React mental model is "small, composable components" — when a component exceeds ~300 lines, it usually means multiple concerns are entangled. + +**TraceViewer** (1,650 lines) is particularly notable: ~90 utility functions for JSON repair, parsing, and content extraction are defined *inside* the component file. These are pure functions with no React dependencies — they belong in a `utils/` module. + +**FacilitatorDashboard** (1,325 lines) renders completely different UIs based on `focusPhase` ('discovery' | 'annotation' | null). Contains 60 ternary operators and 27 `&&` conditional chains. These conditional branches are separate views that should be separate components. + +**Remediation**: Apply single-responsibility principle. Extract: +- Pure utility functions → `utils/` modules +- Conditional UI branches → sub-components +- State machines (loading/error/success) → custom hooks or `useReducer` +- Business logic (metrics calculation, data transformation) → hooks + +**Acceptance Criteria**: +- [ ] No component over 500 lines +- [ ] Pure utility functions in `utils/`, not in component files +- [ ] Conditional rendering branches extracted into named components + +--- + +### RC-5: Context Providers Mix Concerns — Data Fetching, Caching, State, and Side Effects + +**Severity**: HIGH +**Location**: +- `client/src/context/UserContext.tsx` (288 lines) +- `client/src/context/WorkshopContext.tsx` (208 lines) +- `client/src/context/WorkflowContext.tsx` (180 lines) + +**Description**: Each context provider handles multiple cross-cutting concerns: + +**UserContext** (288 lines) does: +1. Authentication (login/logout API calls) +2. Permission loading (separate API call) +3. localStorage persistence (read + write + sync) +4. User validation (checks if cached user still exists via API) +5. Activity tracking (`updateLastActive()` API call) +6. Fallback permission defaults when API fails +7. Mixed API patterns (uses both `UsersService` generated client AND raw `fetch()`) + +**WorkshopContext** (208 lines) does: +1. Workshop ID state management +2. URL parsing (extracts workshop ID from path or query params) +3. UUID validation +4. localStorage persistence +5. React Query cache orchestration (`queryClient.clear()`, `invalidateQueries()`, `removeQueries()`) +6. `popstate` event listener for browser back/forward +7. User ↔ Workshop sync (when user logs in with workshop_id) +8. Hardcoded filter for a known invalid ID (`'569c0be9-...'`) + +**WorkflowContext** (180 lines) does: +1. Current phase state +2. Phase progression calculation (derived from 6 TanStack Query hooks) +3. Phase auto-completion logic (rubric complete when annotations start, etc.) +4. Phase enablement rules +5. Backend ↔ frontend phase sync via useEffect + +**Problems**: +- WorkshopContext calls `queryClient.clear()` 4 times — wiping the entire cache instead of targeted invalidation +- WorkflowContext has a useEffect with 9 dependencies, creating re-render risk +- UserContext mixes imperative API calls with declarative React state +- No context splits — changing workshop ID triggers permission reload, cache clear, and phase recalculation simultaneously + +**Remediation**: +- Split UserContext into `AuthContext` (login/logout) and `PermissionsContext` (role/permissions) +- Move cache orchestration out of WorkshopContext into query invalidation hooks +- Replace WorkflowContext's derived state with TanStack Query selectors +- Remove localStorage sync from contexts — use a dedicated persistence hook + +**Acceptance Criteria**: +- [ ] Each context has a single responsibility +- [ ] Zero `queryClient.clear()` calls in contexts +- [ ] No localStorage reads/writes in context providers +- [ ] Context updates don't trigger unnecessary re-renders of unrelated components + +--- + +### RC-6: No Form Library — All Forms Use Manual useState + +**Severity**: MEDIUM +**Location**: +- `client/src/components/UserLogin.tsx` — manual `useState` for form fields +- `client/src/pages/IntakePage.tsx` — manual `useState` for MLflow config form +- `client/src/components/DatabricksModelTester.tsx` — 11 `useState` for form state +- `client/src/components/CustomLLMProviderConfig.tsx` — 9 `useState` including `formData` object +- `client/src/components/FacilitatorUserManager.tsx` — manual `useState` for new user form +- `client/src/components/FacilitatorInvitationManager.tsx` — manual `useState` for invitation form +- `client/src/components/AnnotationStartPage.tsx` — 5 `useState` for configuration form + +**Description**: Every form in the application manages field state manually: +```typescript +const [formData, setFormData] = useState({ email: '', name: '', role: 'participant' }); +const [isLoading, setIsLoading] = useState(false); +const [error, setError] = useState(null); +``` + +No form library (react-hook-form, formik) is used. Consequences: +- **No validation**: Only HTML5 `required` and `type="email"` — no custom validation +- **No field-level errors**: Errors are displayed as a single string, not per-field +- **No dirty tracking**: Can't tell if a form has been modified +- **No submit state management**: Manual `isLoading`/`isSubmitting` flags +- **No form reset**: Manual re-initialization of state +- **Inconsistent patterns**: Each form implements its own submission/error pattern + +The `DatabricksModelTester` is the most extreme example — 11 `useState` hooks for what is essentially a single form with model selection, prompt input, and configuration fields. + +**Remediation**: Adopt `react-hook-form` (already in the React ecosystem, works with Radix UI components): +- Replaces 5-11 `useState` hooks per form with a single `useForm()` call +- Built-in validation with `zod` schema (can share with Pydantic schemas) +- Field-level error display +- Dirty/touched tracking +- Submit state management + +**Acceptance Criteria**: +- [ ] Form library adopted for all forms with 3+ fields +- [ ] Client-side validation with per-field error messages +- [ ] Consistent form submission pattern across all components + +--- + +### RC-7: Conditional Rendering Complexity — FacilitatorDashboard Has 60 Ternaries + +**Severity**: MEDIUM +**Location**: +- `client/src/components/FacilitatorDashboard.tsx` — 60 ternary operators, 27 `&&` chains +- `client/src/components/AnnotationStartPage.tsx:380-400` — 4-way nested ternary in button + +**Description**: `FacilitatorDashboard` renders completely different layouts based on `focusPhase`: +- `focusPhase === 'discovery'` → Discovery monitoring view +- `focusPhase === 'annotation'` → Annotation monitoring view +- `focusPhase === null` → General dashboard view + +These are **three different views** packed into one component with ternary operators: + +```typescript +// Lines 553-837: Cards section +{focusPhase !== 'annotation' && ( // Discovery card + {focusPhase === 'discovery' ? Viewing : null} + {(currentPhase === 'discovery' || focusPhase === 'discovery') ? ( + // Discovery-specific content + ) : ( + // General content + )} +)} + +// Similar pattern repeats for annotation card, actions, etc. +``` + +`AnnotationStartPage` has a 4-way nested ternary for a single button: +```typescript +{isStarting ? (<>Starting...) + : totalTraces === 0 ? (<>No Traces Available) + : !rubric ? (<>Rubric Required) + : (<>Start Annotation Phase)} +``` + +**Impact**: Hard to read, hard to test individual branches, high cognitive complexity. Adding a new phase requires touching conditional logic throughout the file. + +**Remediation**: Extract conditional branches into separate components: +```typescript +// Instead of 60 ternaries: +{focusPhase === 'discovery' && } +{focusPhase === 'annotation' && } +{!focusPhase && } +``` + +**Acceptance Criteria**: +- [ ] No component has more than 10 ternary operators +- [ ] Conditional rendering branches that exceed 20 lines are extracted to named components +- [ ] No nested ternaries deeper than 2 levels + +--- + +### RC-8: useEffect Chains and Missing Dependencies + +**Severity**: MEDIUM +**Location**: +- `client/src/components/CustomLLMProviderConfig.tsx:50-52` — `loadStatus` not in deps +- `client/src/context/WorkflowContext.tsx:62-126` — 9-dependency useEffect +- `client/src/pages/JudgeTuningPage.tsx` — multiple useEffect chains for data sync +- `client/src/components/IntakeWaitingView.tsx:23-45` — useEffect + setInterval for polling + +**Description**: Several anti-patterns with useEffect: + +**Missing dependencies**: +```typescript +// CustomLLMProviderConfig.tsx +const loadStatus = async () => { /* uses workshopId */ }; +useEffect(() => { + loadStatus(); // loadStatus recreated every render, not in deps +}, [workshopId]); +``` + +**Over-large dependency arrays** (re-render risk): +```typescript +// WorkflowContext.tsx — useEffect with 9 dependencies +useEffect(() => { + // Complex phase auto-completion logic +}, [traces, findings, rubric, annotations, participants, workshopId, user, workshop?.current_phase, ...]); +``` + +**useEffect for data fetching** (should be TanStack Query): +```typescript +// IntakeWaitingView.tsx — manual polling +useEffect(() => { + const loadStatus = async () => { /* fetch */ }; + loadStatus(); + const interval = setInterval(loadStatus, 5000); + return () => clearInterval(interval); +}, [workshopId]); +``` + +**State cascades** (one effect triggers state that triggers another): +- JudgeTuningPage: loading prompts → sets prompts state → triggers effect to load evaluations → sets evaluations state → triggers effect to calculate metrics + +**Remediation**: +- Replace data-fetching useEffects with `useQuery` +- Wrap callbacks in `useCallback` and include in dependency arrays +- Break large useEffects into smaller, focused effects +- Use `useMemo` for derived computations instead of useEffect + setState + +**Acceptance Criteria**: +- [ ] Zero useEffect calls for data fetching (all use TanStack Query) +- [ ] All useEffect dependency arrays are complete (no ESLint exhaustive-deps warnings) +- [ ] No useEffect chains that could be replaced by derived state + +--- + +### RC-9: Direct DOM Manipulation Instead of React Patterns + +**Severity**: LOW +**Location**: +- `client/src/components/FocusedAnalysisView.tsx:178-186` — `document.createElement('a')` for download +- `client/src/components/TraceDataViewer.tsx:436,449` — duplicate `document.createElement('a')` for download +- `client/src/components/Pagination.tsx:70-71` — `document.addEventListener('keydown')` + +**Description**: File download is implemented imperatively in 3 locations with duplicated code: +```typescript +const a = document.createElement('a'); +a.href = url; +a.download = `trace_${traceId}_data.csv`; +a.click(); +window.URL.revokeObjectURL(url); +``` + +This pattern is repeated identically in `FocusedAnalysisView` and twice in `TraceDataViewer`. It's not a React pattern violation per se (file downloads require imperative code), but the duplication is unnecessary. + +**Remediation**: Extract to a shared utility: +```typescript +// utils/download.ts +export function downloadBlob(content: string, filename: string, mimeType = 'text/plain') { + const blob = new Blob([content], { type: mimeType }); + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = filename; + a.click(); + URL.revokeObjectURL(url); +} +``` + +**Acceptance Criteria**: +- [ ] File download logic in a single shared utility +- [ ] Zero duplicate DOM manipulation code + +--- + +### RC-10: No Suspense Boundaries or Loading Skeletons + +**Severity**: LOW +**Location**: Entire `client/src/` — zero `` usage + +**Description**: The application uses no React Suspense boundaries. Every component manages its own loading state: +```typescript +if (isLoading) return
Loading...
; +if (error) return
Error: {error}
; +``` + +This creates: +- Inconsistent loading UI across the app (some show spinners, some show text, some show nothing) +- No skeleton/placeholder UI during data loading +- Full-page loading states instead of granular component-level loading +- No streaming or progressive rendering + +Combined with the missing code splitting (RC-1/TP-11), there's no loading fallback for lazy-loaded routes either. + +**Remediation**: Add Suspense boundaries with skeleton components, especially for route-level code splitting and data-heavy components. + +**Acceptance Criteria**: +- [ ] Suspense boundaries at route level for code splitting +- [ ] Consistent loading skeleton components for data-heavy views +- [ ] TanStack Query's `suspense: true` option evaluated for key queries + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|-----|-------|--------|--------| +| P0 | RC-1 | Move navigation into React Router with URL-based phases | L | Critical — browser history, deep linking, bookmarking | +| P0 | RC-2 | Decompose JudgeTuningPage (2,754 lines, 32 useState) | L | Critical — unmaintainable, untestable | +| P1 | RC-3 | Add error boundaries at app, page, and widget level | S | High — prevents white-screen crashes | +| P1 | RC-4 | Break down 6 components over 500 lines | L | High — single responsibility, testability | +| P1 | RC-5 | Split context providers into single-responsibility | M | High — reduces unnecessary re-renders | +| P2 | RC-6 | Adopt form library (react-hook-form) | M | Medium — validation, consistency | +| P2 | RC-7 | Extract conditional rendering branches into components | M | Medium — readability, testability | +| P2 | RC-8 | Fix useEffect chains and missing dependencies | M | Medium — correctness, prevents bugs | +| P3 | RC-9 | Extract DOM manipulation to shared utilities | S | Low — DRY | +| P3 | RC-10 | Add Suspense boundaries and loading skeletons | M | Low — progressive UX | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days + +--- + +## Cross-References + +| This Item | Related Items | +|-----------|---------------| +| RC-1 (Router unused) | TP-11 (no code splitting) — lazy routes need Router | +| RC-2 (JudgeTuningPage) | TP-1 (fetch bypass), TP-3 (TanStack partial adoption) | +| RC-3 (No error boundaries) | DX-9 (frontend error reporting) | +| RC-4 (Large components) | CQ-1 (god file), CQ-2 (monolithic components) | +| RC-5 (Context concerns) | TP-4 (QueryClient misconfiguration), ARCH-11 (context overload) | +| RC-6 (No form library) | TP-7 (no Pydantic validation) — form + API validation gap | +| RC-7 (Conditional complexity) | CQ-8 (deep nesting) | +| RC-8 (useEffect chains) | TP-3 (manual fetch patterns), PERF-4 (polling) | diff --git a/debt-docs/README.md b/debt-docs/README.md new file mode 100644 index 00000000..884f4e5a --- /dev/null +++ b/debt-docs/README.md @@ -0,0 +1,116 @@ +# Tech Debt Registry + +This directory contains the comprehensive tech debt audit for the Human Evaluation Workshop codebase. Each document follows the spec format used in `/specs/` with an added prioritized backlog section. + +## Debt Categories + +| Document | Category | Critical | High | Medium | Low | +|----------|----------|----------|------|--------|-----| +| [ARCHITECTURE_DEBT.md](ARCHITECTURE_DEBT.md) | Separation of concerns, modularity, abstractions | 3 | 4 | 6 | 2 | +| [COMPLEXITY_DEBT.md](COMPLEXITY_DEBT.md) | Cyclomatic/cognitive complexity, nesting, coupling | 4 | 5 | 5 | 2 | +| [CODE_QUALITY_DEBT.md](CODE_QUALITY_DEBT.md) | Code quality, architecture, complexity | 3 | 6 | 5 | 4 | +| [TESTING_DEBT.md](TESTING_DEBT.md) | Test coverage, quality, infrastructure | 4 | 4 | 4 | 0 | +| [SECURITY_DEBT.md](SECURITY_DEBT.md) | Auth, secrets, CORS, encryption | 3 | 3 | 3 | 2 | +| [DEPENDENCY_DEBT.md](DEPENDENCY_DEBT.md) | Unused deps, pinning, duplication | 0 | 1 | 1 | 1 | +| [PERFORMANCE_DEBT.md](PERFORMANCE_DEBT.md) | Queries, indexes, caching, polling | 0 | 3 | 4 | 2 | +| [DEPLOYMENT_DEBT.md](DEPLOYMENT_DEBT.md) | Config, health checks, CI/CD, logging | 2 | 3 | 4 | 3 | +| [DX_DEBT.md](DX_DEBT.md) | Tooling, onboarding, docs, artifacts | 1 | 3 | 5 | 1 | +| [TOOLING_PATTERNS_DEBT.md](TOOLING_PATTERNS_DEBT.md) | Library misuse, framework underutilization | 2 | 4 | 3 | 1 | +| [REACT_PATTERNS_DEBT.md](REACT_PATTERNS_DEBT.md) | React architecture, components, hooks, routing | 2 | 3 | 3 | 2 | +| [WORKSHOP_ID_SIMPLIFICATION.md](WORKSHOP_ID_SIMPLIFICATION.md) | Suggestions: reduce workshop_id threading | — | — | — | — | + +## Keyword Index + +| Keyword | Document(s) | +|---------|-------------| +| separation of concerns, SoC | ARCHITECTURE_DEBT | +| god service, god component, modularity | ARCHITECTURE_DEBT, CODE_QUALITY_DEBT | +| service layer, business logic in routes | ARCHITECTURE_DEBT | +| generated client, raw fetch, API bypass | ARCHITECTURE_DEBT | +| duplicate types, type drift | ARCHITECTURE_DEBT | +| duplicate logic, rubric parsing | ARCHITECTURE_DEBT | +| localStorage, storage abstraction | ARCHITECTURE_DEBT | +| late imports, internal imports | ARCHITECTURE_DEBT, CODE_QUALITY_DEBT | +| os.environ, environment mutation | ARCHITECTURE_DEBT | +| auth pattern, auth decorator, middleware | ARCHITECTURE_DEBT, SECURITY_DEBT | +| context provider, state management | ARCHITECTURE_DEBT | +| query keys, React Query config | ARCHITECTURE_DEBT | +| job persistence, background threads | ARCHITECTURE_DEBT | +| permission logic, UserPermissions | ARCHITECTURE_DEBT | +| cyclomatic complexity, CC, decision points | COMPLEXITY_DEBT | +| cognitive complexity, nesting depth | COMPLEXITY_DEBT, CODE_QUALITY_DEBT | +| useState explosion, useEffect chains | COMPLEXITY_DEBT | +| retry loop, retry pattern | COMPLEXITY_DEBT, CODE_QUALITY_DEBT | +| generator protocol, implicit types | COMPLEXITY_DEBT | +| fan-in, fan-out, dependency graph | COMPLEXITY_DEBT | +| as any, type cast | COMPLEXITY_DEBT, CODE_QUALITY_DEBT | +| LLM parsing, JSON fallback | COMPLEXITY_DEBT | +| schema updates, raw SQL | COMPLEXITY_DEBT, DEPLOYMENT_DEBT | +| useReducer, state cascade | COMPLEXITY_DEBT | +| bare except, error handling | CODE_QUALITY_DEBT | +| god file, large file, complexity | CODE_QUALITY_DEBT | +| any type, type safety | CODE_QUALITY_DEBT | +| dead code, unused, TODO | CODE_QUALITY_DEBT, DX_DEBT | +| print, console.log, debug | CODE_QUALITY_DEBT, DX_DEBT | +| polling, retry, duplication | CODE_QUALITY_DEBT, PERFORMANCE_DEBT | +| test coverage, missing tests | TESTING_DEBT | +| spec tagging | TESTING_DEBT | +| E2E, Playwright, flaky | TESTING_DEBT | +| mock, fixture, test infra | TESTING_DEBT | +| hardcoded password, credentials | SECURITY_DEBT | +| CORS, allow_origins | SECURITY_DEBT, DEPLOYMENT_DEBT | +| encryption, ENCRYPTION_KEY | SECURITY_DEBT | +| auth, admin, unprotected | SECURITY_DEBT | +| password_hash, exposure | SECURITY_DEBT | +| litellm, dspy, unused dep | DEPENDENCY_DEBT | +| version pinning | DEPENDENCY_DEBT | +| N+1, index, query | PERFORMANCE_DEBT | +| pagination, list endpoints | PERFORMANCE_DEBT | +| cache, TTL | PERFORMANCE_DEBT | +| bundle size, drop_console | PERFORMANCE_DEBT, DX_DEBT | +| health check, readiness | DEPLOYMENT_DEBT | +| migration, rollback | DEPLOYMENT_DEBT | +| worker, pool, connection | DEPLOYMENT_DEBT | +| structured logging | DEPLOYMENT_DEBT, DX_DEBT | +| rate limiting | DEPLOYMENT_DEBT | +| eslint, prettier, pre-commit | DX_DEBT | +| architecture docs, onboarding | DX_DEBT | +| stale artifacts, git cleanup | DX_DEBT | +| config sprawl | DX_DEBT | +| TanStack Query, useQuery, refetchInterval | TOOLING_PATTERNS_DEBT | +| generated client, fetch bypass, OpenAPI | TOOLING_PATTERNS_DEBT, ARCHITECTURE_DEBT | +| response_model, Pydantic, raw dict | TOOLING_PATTERNS_DEBT | +| Alembic bypass, runtime schema, _apply_schema | TOOLING_PATTERNS_DEBT, DEPLOYMENT_DEBT | +| SQLAlchemy .query(), select(), deprecated API | TOOLING_PATTERNS_DEBT | +| Depends(), dependency injection, DI | TOOLING_PATTERNS_DEBT | +| BackgroundTasks, threading.Thread | TOOLING_PATTERNS_DEBT | +| QueryClient, staleTime, cache invalidation | TOOLING_PATTERNS_DEBT | +| junction table, JSON column, relational | TOOLING_PATTERNS_DEBT | +| code splitting, React.lazy, manualChunks | TOOLING_PATTERNS_DEBT | +| React Router, navigation, deep linking, URL | REACT_PATTERNS_DEBT | +| useState explosion, component size, decomposition | REACT_PATTERNS_DEBT, COMPLEXITY_DEBT | +| error boundary, ErrorBoundary, white screen | REACT_PATTERNS_DEBT | +| context provider, re-render, provider hell | REACT_PATTERNS_DEBT | +| form handling, react-hook-form, validation | REACT_PATTERNS_DEBT | +| conditional rendering, ternary, && chain | REACT_PATTERNS_DEBT | +| useEffect chain, missing deps, data cascade | REACT_PATTERNS_DEBT | +| Suspense, loading skeleton, progressive | REACT_PATTERNS_DEBT | +| JudgeTuningPage, FacilitatorDashboard, TraceViewer | REACT_PATTERNS_DEBT, CODE_QUALITY_DEBT | +| workshop_id, workshop context, scoped service | WORKSHOP_ID_SIMPLIFICATION, ARCHITECTURE_DEBT | +| JWT claims, auth token, workshop scope | WORKSHOP_ID_SIMPLIFICATION, SECURITY_DEBT | + +## How to Use This Registry + +1. **Before starting work**: Check if your area has known debt items +2. **During implementation**: If you encounter new debt, add it to the relevant document +3. **Sprint planning**: Use the Prioritized Backlog section in each document to plan remediation +4. **Tracking**: Update item status as debt is resolved + +## Severity Definitions + +| Severity | Definition | +|----------|------------| +| **CRITICAL** | Active risk to production stability, data integrity, or security. Fix immediately. | +| **HIGH** | Significant impact on maintainability, reliability, or developer productivity. Fix this sprint. | +| **MEDIUM** | Moderate impact on code quality or DX. Plan for remediation within 1-2 sprints. | +| **LOW** | Minor improvements. Address opportunistically or during related work. | diff --git a/debt-docs/SECURITY_DEBT.md b/debt-docs/SECURITY_DEBT.md new file mode 100644 index 00000000..4091216d --- /dev/null +++ b/debt-docs/SECURITY_DEBT.md @@ -0,0 +1,383 @@ +# Security Debt + +## Overview + +The codebase has several critical security issues that require immediate attention. Hardcoded credentials are committed to version control, CORS is configured to allow all origins with credentials, admin endpoints lack authentication, and the encryption key auto-generates on startup (causing data loss on restart). While some of these configurations may be acceptable for local development, they represent serious risks if deployed to production. + +--- + +## Items + +### SEC-1: Hardcoded Credentials in Version Control + +**Severity**: CRITICAL +**Location**: +- `config/auth.yaml:5-6` - Facilitator email and plaintext password +- `config/auth.yaml:20` - Default user password `changeme123` +- `server/utils/config.py:29-30` - Hardcoded fallback password `facilitator123` + +**Description**: +```yaml +# config/auth.yaml +facilitators: + - email: "facilitator123@email.com" + password: "facilitator123" # Plaintext in repo + +# ... +default_user_password: "changeme123" # Default for all new users +``` + +```python +# server/utils/config.py +'email': 'facilitator@databricks.com', +'password': 'facilitator123', # Hardcoded fallback +``` + +**Impact**: Anyone with access to the repository has valid credentials. These are in git history permanently. Default passwords are universally known attack vectors. + +**Remediation**: +1. Move all credentials to environment variables +2. Remove hardcoded passwords from `auth.yaml` - require env vars only +3. Update `server/utils/config.py` to fail if env vars not set +4. Rotate all passwords that were committed + +**Acceptance Criteria**: +- [ ] Zero plaintext passwords in any tracked file +- [ ] `config/auth.yaml` reads passwords from environment variables +- [ ] `server/utils/config.py` raises error if credentials not configured +- [ ] Existing committed passwords rotated + +--- + +### SEC-2: Overly Permissive CORS Configuration + +**Severity**: CRITICAL +**Location**: +- `server/config.py:24` - `CORS_ORIGINS: list = ['*']` +- `server/app.py:201-207` - CORS middleware configuration + +**Description**: +```python +# server/config.py +CORS_ORIGINS: list = ['*'] + +# server/app.py +app.add_middleware( + CORSMiddleware, + allow_origins=ServerConfig.CORS_ORIGINS, # ['*'] + allow_credentials=True, # Sends cookies cross-origin + allow_methods=["*"], # All HTTP methods + allow_headers=["*"], # All headers +) +``` + +`allow_origins=['*']` combined with `allow_credentials=True` is a textbook security misconfiguration. Any website can make authenticated requests to this API. + +**Impact**: Enables CSRF attacks, cross-origin credential theft. Any malicious site visited by a logged-in user can access the full API. + +**Remediation**: +```python +CORS_ORIGINS: list = os.getenv('CORS_ORIGINS', 'http://localhost:3000,http://localhost:5173').split(',') +``` +In production, set to specific Databricks App domains. Remove `allow_credentials=True` if not needed, or ensure origins are restricted. + +**Acceptance Criteria**: +- [ ] `CORS_ORIGINS` reads from environment variable +- [ ] Default is localhost-only, not `['*']` +- [ ] `allow_methods` and `allow_headers` are explicitly listed +- [ ] Production deployment has CORS_ORIGINS set to specific domains + +--- + +### SEC-3: Encryption Key Auto-Generated on Startup + +**Severity**: CRITICAL +**Location**: `server/utils/encryption.py:21-29` + +**Description**: +```python +if not self.secret_key: + self.secret_key = self._generate_key() + logger.warning('No encryption key found. Generated new key.') +``` + +If `ENCRYPTION_KEY` environment variable is not set, the application generates a new random key on every startup. This means: +1. All previously encrypted data (tokens, secrets) becomes **permanently unrecoverable** after restart +2. The warning log is non-fatal - the app continues running with a new key +3. No validation that the key format is correct + +**Impact**: Silent data loss on every container restart. Encrypted tokens from previous sessions cannot be decrypted. + +**Remediation**: Make `ENCRYPTION_KEY` required. Fail fast on startup if not set: +```python +if not self.secret_key: + raise ValueError('ENCRYPTION_KEY environment variable not set.') +``` + +**Acceptance Criteria**: +- [ ] App fails to start if `ENCRYPTION_KEY` is not set +- [ ] Key format is validated on startup +- [ ] Documentation lists `ENCRYPTION_KEY` as required env var + +--- + +### SEC-4: Admin Endpoints Without Authentication + +**Severity**: HIGH +**Location**: `server/routers/users.py:90-111` + +**Description**: +```python +@router.post('/admin/facilitators/') +async def create_facilitator_config(config_data: FacilitatorConfigCreate, ...): + """Create a pre-configured facilitator (admin only).""" + # In a real system, you'd check admin permissions here + +@router.get('/admin/facilitators/') +async def list_facilitator_configs(...): + """List all pre-configured facilitators (admin only).""" + # NOTE: This endpoint is not protected +``` + +Additional unprotected endpoints: + +| Endpoint | Risk | +|----------|------| +| `POST /users/` (create_user) | Any user can create accounts for any workshop | +| `DELETE /users/{user_id}` | Any user can delete any other user | +| `PUT /users/{user_id}/role` | Any user can escalate to FACILITATOR role | + +**Impact**: Unauthenticated users can create facilitator accounts, delete users, escalate privileges. Full admin takeover possible. + +**Remediation**: Create authentication dependency and apply to all admin endpoints: +```python +async def require_facilitator(user: User = Depends(get_current_user)) -> User: + if user.role != UserRole.FACILITATOR: + raise HTTPException(status_code=403, detail="Forbidden") + return user +``` + +**Acceptance Criteria**: +- [ ] All `/admin/` endpoints require facilitator authentication +- [ ] User deletion requires facilitator role +- [ ] Role changes require facilitator role +- [ ] Tests verify 403 for unauthorized access + +--- + +### SEC-5: Password Hash Exposed in API Responses + +**Severity**: HIGH +**Location**: `server/models.py:62-71` + +**Description**: +```python +class User(BaseModel): + # ... other fields ... + password_hash: Optional[str] = None # For internal use only +``` + +The `password_hash` field is included in the `User` Pydantic model that is returned by API endpoints. The comment "For internal use only" does not prevent Pydantic from serializing it. + +**Impact**: Password hashes are sent to clients in API responses. Attackers can perform offline brute-force attacks against the hashes. + +**Remediation**: Use `Field(exclude=True)` or create a separate `UserResponse` model: +```python +class UserResponse(BaseModel): + id: str + email: str + name: str + role: UserRole + workshop_id: Optional[str] = None + status: UserStatus + # No password_hash field +``` + +**Acceptance Criteria**: +- [ ] `password_hash` never appears in any API response +- [ ] Separate response model used for user endpoints +- [ ] Test verifies hash is not in response body + +--- + +### SEC-6: Tokens Stored Unencrypted in Memory + +**Severity**: HIGH +**Location**: `server/services/token_storage_service.py:8-24` + +**Description**: +```python +class TokenStorageService: + def __init__(self): + self._tokens: Dict[str, Dict[str, any]] = {} + + def store_token(self, workshop_id: str, token: str, ...): + self._tokens[workshop_id] = { + 'token': token, # Plaintext Databricks token + 'expires_at': expiry_time, + } +``` + +Databricks tokens stored in plaintext in memory. If the process is compromised or memory is dumped, tokens are immediately readable. + +**Impact**: Token theft if process memory is accessed. No rotation mechanism documented. + +**Remediation**: Encrypt tokens before storing using the existing encryption utility: +```python +from server.utils.encryption import encrypt_sensitive_data +self._tokens[workshop_id] = { + 'token': encrypt_sensitive_data(token), + ... +} +``` + +**Acceptance Criteria**: +- [ ] Tokens encrypted at rest in memory +- [ ] Decryption happens only at point of use +- [ ] Token expiry is enforced + +--- + +### SEC-7: Error Messages Expose Internal Details + +**Severity**: MEDIUM +**Location**: +- `server/routers/dbsql_export.py:59-72` - `f"Export failed: {export_result.get('error')}"` +- `server/routers/databricks.py:28, 49, 118, 157, 246, 289` - `f"Failed to initialize: {str(e)}"` + +**Description**: Internal exception messages returned directly to clients: +```python +raise HTTPException(status_code=500, detail=f'Failed to initialize Databricks service: {str(e)}') +``` + +This can expose: stack traces, library names/versions, infrastructure details (e.g., "PGPASSWORD not found"), file paths. + +**Impact**: Information disclosure helps attackers understand the tech stack and find additional vulnerabilities. + +**Remediation**: Log full errors internally, return generic messages to clients: +```python +logger.error(f"Databricks init failed", exc_info=True) +raise HTTPException(status_code=500, detail="Service initialization failed. Please try again.") +``` + +**Acceptance Criteria**: +- [ ] No `str(e)` in HTTPException detail messages +- [ ] All 500 errors return generic messages +- [ ] Full errors logged server-side + +--- + +### SEC-8: Database Paths and Schema Names in Logs + +**Severity**: MEDIUM +**Location**: +- `server/app.py:49, 52-54, 90, 113` +- `server/routers/dbsql_export.py:52-54` + +**Description**: +```python +logger.info(f"Database path: {db_path}") +logger.info(f"Target: {request.catalog}.{request.schema_name}") +print(f" SQLite rescue configured: {rescue_status['volume_backup_path']}") +``` + +**Impact**: If logs are shipped to external services, infrastructure details are exposed. + +**Remediation**: Log generic identifiers, not full paths. Use debug level for infrastructure details. + +**Acceptance Criteria**: +- [ ] No file paths in INFO-level logs +- [ ] Infrastructure details at DEBUG level only + +--- + +### SEC-9: Sensitive Data in Print Statements + +**Severity**: MEDIUM +**Location**: `server/routers/workshops.py:963-977` + +**Description**: +```python +print(f" DEBUG trace_ids: {[t.id for t in traces]}") +print(f" DEBUG: Selected traces: {trace_ids_to_use}") +``` + +Trace IDs and potentially sensitive workshop data printed to stdout. + +**Impact**: Sensitive data in logs. Overlaps with CQ-5 but security-specific concern. + +**Remediation**: Remove debug prints (see CQ-5). If needed, use logger.debug() which is not output at INFO level. + +**Acceptance Criteria**: +- [ ] Zero print statements with data content +- [ ] Debug output uses logger.debug(), disabled by default + +--- + +### SEC-10: Missing Security Headers + +**Severity**: LOW +**Location**: `server/app.py` - No security headers middleware + +**Description**: Standard security headers not set: +- No `X-Content-Type-Options: nosniff` +- No `X-Frame-Options: DENY` +- No `Strict-Transport-Security` +- No `Content-Security-Policy` + +**Impact**: Defense-in-depth gap. Won't prevent attacks alone but is standard practice. + +**Remediation**: Add security headers middleware: +```python +@app.middleware("http") +async def add_security_headers(request, call_next): + response = await call_next(request) + response.headers["X-Content-Type-Options"] = "nosniff" + response.headers["X-Frame-Options"] = "DENY" + return response +``` + +**Acceptance Criteria**: +- [ ] Security headers present on all responses +- [ ] Headers configurable for different deployment environments + +--- + +### SEC-11: Fernet Key Format Not Validated + +**Severity**: LOW +**Location**: `server/utils/encryption.py:32` + +**Description**: +```python +self.cipher = Fernet(self.secret_key.encode()) +``` +No try/except around Fernet initialization. Invalid key format causes cryptic error on startup. + +**Impact**: Poor error message if ENCRYPTION_KEY is malformed. + +**Remediation**: Validate key format and provide helpful error message. (Will be addressed by SEC-3 fix.) + +**Acceptance Criteria**: +- [ ] Clear error message if key format is invalid +- [ ] Documentation shows how to generate a valid key + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|-----|-------|--------|--------| +| P0 | SEC-1 | Remove hardcoded credentials from repo | S | Critical - credentials in version control | +| P0 | SEC-2 | Restrict CORS to specific origins | S | Critical - CSRF/credential theft | +| P0 | SEC-3 | Require ENCRYPTION_KEY, fail on missing | S | Critical - silent data loss | +| P0 | SEC-4 | Add auth to admin endpoints | M | High - privilege escalation | +| P1 | SEC-5 | Exclude password_hash from API responses | S | High - password hash exposure | +| P1 | SEC-6 | Encrypt tokens in memory | S | High - token theft risk | +| P2 | SEC-7 | Sanitize error messages to clients | M | Medium - information disclosure | +| P2 | SEC-8 | Remove infrastructure details from logs | S | Medium - info disclosure | +| P2 | SEC-9 | Remove data from print statements | S | Medium - overlaps with CQ-5 | +| P3 | SEC-10 | Add security headers middleware | S | Low - defense in depth | +| P3 | SEC-11 | Validate Fernet key format | S | Low - DX improvement | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days diff --git a/debt-docs/TESTING_DEBT.md b/debt-docs/TESTING_DEBT.md new file mode 100644 index 00000000..657a781b --- /dev/null +++ b/debt-docs/TESTING_DEBT.md @@ -0,0 +1,351 @@ +# Testing Debt + +## Overview + +The codebase has significant testing gaps, particularly in backend services and frontend components. Of 13 backend services, only 1 has partial test coverage. Of 60+ frontend components, fewer than 10 have tests. Critical paths like judge evaluation, MLflow intake, encryption, and password handling are completely untested. The test infrastructure also lacks factories and fixtures for common entities, leading to duplicated mock code across test files. + +--- + +## Items + +### TD-1: Backend Services - Zero Test Coverage (8 services) + +**Severity**: CRITICAL +**Location**: The following services have NO test files: + +| Service | Methods | Risk | +|---------|---------|------| +| `server/services/databricks_service.py` | 13+ methods (`call_serving_endpoint`, `list_serving_endpoints`, `test_connection`) | Production LLM calls untested | +| `server/services/judge_service.py` | 8 methods (`evaluate_prompt`, `_evaluate_with_mlflow`, `_calculate_performance_metrics`, `export_judge`, `select_few_shot_examples`) | Core evaluation logic untested | +| `server/services/mlflow_intake_service.py` | 5+ methods (`configure_mlflow`, `search_traces`, `fetch_trace_details`, `ingest_traces`) | Critical data pipeline untested | +| `server/services/rubric_generation_service.py` | 3+ methods (`generate_rubric_suggestions`) | AI feature response parsing untested | +| `server/services/dbsql_export_service.py` | 4+ methods (`export_workshop_data`, `read_table`, `insert_overwrite_table`) | Data export untested | +| `server/utils/encryption.py` | `encrypt`, `decrypt`, `is_encrypted` | Security-critical, zero coverage | +| `server/utils/password.py` | `hash_password`, `verify_password`, `validate_password_strength` | Auth utilities untested | +| `server/utils/config.py` | `load_auth_config`, `get_facilitator_config` | Config loading untested | + +**Impact**: Core business logic for evaluation, data intake, and security has zero validation. Regressions will reach production undetected. + +**Remediation**: Write unit tests for each service. Mock external dependencies (MLflow, Databricks SDK). Prioritize by production risk. + +**Acceptance Criteria**: +- [ ] Every service file has a corresponding test file +- [ ] All public methods have at least one test +- [ ] Security utilities (encryption, password) have round-trip and edge case tests +- [ ] Coverage for services reaches 80%+ + +--- + +### TD-2: Frontend Components - Near-Zero Test Coverage + +**Severity**: CRITICAL +**Location**: 50+ components with NO tests including: + +| Component | Lines | Risk | +|-----------|-------|------| +| `client/src/pages/IntakePage.tsx` | 707 | MLflow intake flow untested | +| `client/src/pages/JudgeTuningPage.tsx` | 2754 | Most complex page, zero tests | +| `client/src/pages/AnnotationReviewPage.tsx` | - | Review workflow untested | +| `client/src/pages/DBSQLExportPage.tsx` | - | Export flow untested | +| `client/src/pages/FindingsReviewPage.tsx` | - | Results untested | +| `client/src/components/FacilitatorDashboard.tsx` | 1325 | Most complex component, zero tests | +| `client/src/components/AnnotationStartPage.tsx` | 408 | Annotation workflow untested | +| `client/src/components/RubricViewPage.tsx` | 169 | Rubric editing untested | +| `client/src/components/RubricSuggestionPanel.tsx` | - | AI suggestions untested | +| `client/src/components/CustomLLMProviderConfig.tsx` | - | LLM config untested | +| `client/src/components/JsonPathSettings.tsx` | - | Settings untested | +| `client/src/components/RoleBasedWorkflow.tsx` | - | Role routing untested | + +**Coverage Summary**: ~7% of components tested (4 of 60+). + +**Impact**: UI regressions will not be caught. Form validation, error states, loading states, and conditional rendering all untested. + +**Remediation**: Prioritize tests for complex components (FacilitatorDashboard, JudgeTuningPage, IntakePage). Use React Testing Library for behavior tests. + +**Acceptance Criteria**: +- [ ] All pages have at least rendering + basic interaction tests +- [ ] Complex components (>500 lines) have comprehensive test suites +- [ ] Coverage for components reaches 60%+ + +--- + +### TD-3: Workshop Router - Massive Endpoint Coverage Gap + +**Severity**: CRITICAL +**Location**: `server/routers/workshops.py` (5,229 lines, 100+ endpoints) + +**Description**: Only 3 test cases exist for the entire workshop router: +- `GET /workshops/{id}` - basic retrieval +- `GET /workshops/{id}/traces` - trace listing +- Edge case for missing workshop + +**Missing test coverage for**: +- Annotation submission and editing +- Trace assignment and reassignment +- Phase transition logic +- Conflict resolution +- Background job lifecycle (alignment, evaluation) +- Discovery findings CRUD +- Participant notes +- JSONPath settings +- Auto-assign annotations + +**Impact**: The largest and most complex file in the codebase has <5% test coverage. + +**Remediation**: Systematically add tests for each endpoint group. Pair with CQ-1 (splitting the router) to make testing tractable. + +**Acceptance Criteria**: +- [ ] Every HTTP endpoint has at least one success and one failure test +- [ ] Phase transition logic has boundary tests +- [ ] Background job handlers have lifecycle tests + +--- + +### TD-4: Missing E2E Workflow Coverage + +**Severity**: CRITICAL +**Location**: `client/tests/e2e/` + +**Description**: E2E tests cover ~15% of user flows. Currently tested: +- Facilitator login + workshop creation +- Discovery phase with participant invites +- Rubric creation workflow +- Basic annotation +- Auto-evaluation with judge + +**Missing E2E flows**: +- Complete annotation-to-results workflow +- Judge tuning/evaluation iteration cycle +- DBSQL export workflow +- Multi-workshop participant flows +- Facilitator dashboard interactions +- Error recovery flows (network failures, partial submissions) +- Concurrent annotation handling +- Large-scale trace ingestion (1000+ traces) + +**Impact**: End-to-end regressions in major workflows will not be caught. + +**Remediation**: Add E2E tests for each major user flow, prioritized by usage frequency and risk. + +**Acceptance Criteria**: +- [ ] Complete annotation-to-results E2E test +- [ ] Judge tuning E2E test +- [ ] DBSQL export E2E test +- [ ] At least 60% of major workflows covered + +--- + +### TD-5: Missing Spec Tags on Client Tests + +**Severity**: HIGH +**Location**: Client test files in `client/src/` + +**Description**: Only 18 spec tags across 9 files. 50+ client test files have zero `@spec:` annotations. The project's spec-driven development model requires all tests to be tagged per `CLAUDE.md`. + +**Currently tagged files**: +- `JudgeTypeSelector.test.tsx` (4 tags) +- `Pagination.test.tsx` (2 tags) +- `useWorkshopApi.test.ts` (1 tag) +- `rubricUtils.test.ts` (2 tags) + +**Impact**: Cannot filter tests by spec, cannot verify spec coverage, violates project conventions. + +**Remediation**: Add spec tags to all existing client tests. Include spec tags in test templates for new tests. + +**Acceptance Criteria**: +- [ ] 100% of test files have at least one `@spec:` tag +- [ ] `just spec-tagging-check` passes with zero violations + +--- + +### TD-6: Test Infrastructure - Missing Factories and Fixtures + +**Severity**: HIGH +**Location**: `tests/conftest.py` (96 lines) + +**Description**: `conftest.py` has basic fixtures (`async_client`, `mock_db_session`, `override_get_db`) but is missing: +- Workshop factory for different phases/states +- Trace data builders for common scenarios +- Annotation factory for bulk test data +- Mock MLflow/Databricks services at fixture level +- Frontend API response mock utilities + +Each test file reimplements `FakeDatabaseService` or similar mocks independently. + +**Impact**: Duplicated setup code, inconsistent mocks, high effort to write new tests. + +**Remediation**: Create shared factories and fixtures: +```python +# conftest.py additions +@pytest.fixture +def workshop_factory(): + def _create(phase=WorkshopPhase.INTAKE, traces=5, annotations=0): + ... + return _create +``` + +**Acceptance Criteria**: +- [ ] Workshop, trace, annotation, and rubric factories exist +- [ ] Mock services are shared via fixtures +- [ ] New tests can use factories instead of manual setup + +--- + +### TD-7: Untested Hook - useDatabricksApi + +**Severity**: HIGH +**Location**: `client/src/hooks/useDatabricksApi.ts` - NO corresponding test file + +**Description**: The Databricks API integration hook has zero test coverage. Other hooks (`useWorkshopApi`, `useJsonPathExtraction`) have tests. + +**Impact**: Databricks connection testing, endpoint listing, and model evaluation untested. + +**Remediation**: Create `useDatabricksApi.test.ts` with mocked API responses. + +**Acceptance Criteria**: +- [ ] Test file exists with tests for each hook function +- [ ] Error handling and loading states tested + +--- + +### TD-8: Shallow/Trivial Tests + +**Severity**: HIGH +**Location**: +- `client/src/components/LoadingSpinner.test.tsx` - Only 3 assertions, missing timeout states, error scenarios, accessibility +- `client/src/components/TraceDataViewer.test.tsx` - Likely snapshot-only testing without behavior validation +- `client/src/hooks/useWorkshopApi.test.ts` - Only 2 test functions for an extensive API hook + +**Impact**: Tests pass but provide false confidence. Changes to these components could introduce undetected regressions. + +**Remediation**: Add meaningful assertions for behavior, edge cases, and error states. + +**Acceptance Criteria**: +- [ ] Each test file has tests for success, error, and edge cases +- [ ] No snapshot-only tests without behavioral assertions alongside + +--- + +### TD-9: Flaky E2E Test Patterns + +**Severity**: MEDIUM +**Location**: +- `client/tests/e2e/facilitator-create-workshop.spec.ts:18` - `waitForTimeout(500)` hardcoded wait +- Other E2E tests use conditional click logic that hides flakiness + +**Description**: Fixed timeouts instead of polling with assertions: +```typescript +await page.waitForTimeout(500); // Hardcoded wait +const createNewButton = page.getByRole('button', { name: /Create New/i }); +if (await createNewButton.isVisible().catch(() => false)) { + await createNewButton.click(); +} +``` + +**Impact**: Tests pass locally but may fail in CI under load. Conditional logic masks real failures. + +**Remediation**: Replace `waitForTimeout` with `expect.poll()` or `waitForSelector`. Remove conditional try/catch in assertions. + +**Acceptance Criteria**: +- [ ] Zero `waitForTimeout` calls in E2E tests +- [ ] All assertions use `expect.poll()` or Playwright auto-waiting + +--- + +### TD-10: Over-Mocked Tests (Testing Implementation Details) + +**Severity**: MEDIUM +**Location**: +- `tests/unit/services/test_alignment_service.py` - Tests private `_normalize_judge_prompt` method +- `tests/unit/routers/test_databricks_router.py` - FakeService doesn't validate error conditions + +**Description**: Tests mock internal implementation details instead of testing behavior through public interfaces. This makes refactoring break tests unnecessarily. + +**Impact**: Brittle tests that break on implementation changes but pass on behavioral regressions. + +**Remediation**: Test through public interfaces. Mock at boundaries (HTTP, database) not internal methods. + +**Acceptance Criteria**: +- [ ] No tests directly call private methods (prefixed with `_`) +- [ ] Mocks validate both success and error paths + +--- + +### TD-11: Missing Edge Case Coverage + +**Severity**: MEDIUM +**Location**: Various + +**Description**: Known untested edge cases: +- Database: Transaction rollback, concurrent writes, large result sets +- Router: Error paths for service initialization (`server/routers/databricks.py:27-28`) +- UI: Pagination with 0 items, rubric with special characters, traces >1MB, concurrent API calls + +**Impact**: Edge cases are where most production bugs hide. + +**Remediation**: Add targeted edge case tests for each critical path. + +**Acceptance Criteria**: +- [ ] Each service has at least 2 edge case tests +- [ ] Empty state, large data, and concurrent access tested + +--- + +### TD-12: Duplicated Mock Patterns + +**Severity**: MEDIUM +**Location**: Multiple test files in `tests/unit/routers/` + +**Description**: Each router test reimplements `FakeDatabaseService`: +```python +class FakeDatabaseService: + def get_workshop(self, workshop_id: str): + assert workshop_id == "w1" + return workshop +``` +Mock validates input but doesn't simulate realistic DB behavior (missing error cases, empty results). + +**Impact**: Inconsistent mock behavior, duplicated code, easy to miss error paths. + +**Remediation**: Create a shared `FakeDatabaseService` in `conftest.py` with configurable behavior. + +**Acceptance Criteria**: +- [ ] Single shared mock service with configurable responses +- [ ] Error injection support for failure path testing + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|-----|-------|--------|--------| +| P0 | TD-1 | Add tests for encryption.py and password.py | S | Critical - security code untested | +| P0 | TD-1 | Add tests for judge_service.py | M | Critical - core evaluation untested | +| P0 | TD-1 | Add tests for databricks_service.py | M | Critical - production LLM calls | +| P0 | TD-3 | Add workshop router endpoint tests | L | Critical - largest file, <5% coverage | +| P1 | TD-1 | Add tests for mlflow_intake_service.py | M | Critical - data pipeline | +| P1 | TD-2 | Add tests for FacilitatorDashboard, JudgeTuningPage | L | Critical - most complex UI | +| P1 | TD-4 | Add annotation-to-results E2E test | M | Critical - major workflow gap | +| P1 | TD-6 | Create test factories and shared fixtures | M | High - accelerates all future testing | +| P2 | TD-5 | Add spec tags to all client tests | S | High - project convention | +| P2 | TD-7 | Add useDatabricksApi tests | S | High - untested hook | +| P2 | TD-4 | Add judge tuning and DBSQL export E2E tests | M | High - workflow gaps | +| P2 | TD-8 | Deepen shallow test suites | M | High - false confidence | +| P3 | TD-9 | Fix flaky E2E patterns | S | Medium - CI reliability | +| P3 | TD-10 | Refactor over-mocked tests | M | Medium - test maintainability | +| P3 | TD-11 | Add edge case tests | M | Medium - production resilience | +| P3 | TD-12 | Consolidate duplicated mocks | S | Medium - test DX | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days + +**Coverage Summary**: + +| Layer | Current | Target | +|-------|---------|--------| +| Backend Services | ~15% | 80% | +| Backend Routers | ~5% (workshops) | 70% | +| Backend Utils | ~20% | 90% | +| Frontend Pages | 0% | 60% | +| Frontend Components | ~7% | 60% | +| Frontend Hooks | ~33% | 90% | +| E2E Workflows | ~15% | 60% | diff --git a/debt-docs/TOOLING_PATTERNS_DEBT.md b/debt-docs/TOOLING_PATTERNS_DEBT.md new file mode 100644 index 00000000..7d7ccc98 --- /dev/null +++ b/debt-docs/TOOLING_PATTERNS_DEBT.md @@ -0,0 +1,454 @@ +# Tooling & Pattern Misuse Debt + +## Overview + +The codebase adopts strong libraries (TanStack Query, FastAPI, SQLAlchemy, Alembic, Pydantic, Radix UI) but consistently underutilizes or misuses their core value propositions. The most impactful pattern: the app has a generated OpenAPI client that is almost entirely bypassed — 70+ direct `fetch()` calls exist alongside generated service methods, with duplicate type definitions in 3 separate locations. Similarly, TanStack Query is installed and partially used but many components still use manual `useEffect`+`useState`+`setInterval` patterns. On the backend, Alembic manages migrations but 15+ schema changes happen at runtime via raw SQL, creating untracked drift. These aren't missing features — they're adopted tools whose contracts are being violated. + +--- + +## Items + +### TP-1: Generated OpenAPI Client Largely Bypassed + +**Severity**: CRITICAL +**Location**: +- `client/src/client/services/` — generated client (6 service classes) +- `client/src/pages/JudgeTuningPage.tsx` — 15+ direct `fetch()` calls +- `client/src/hooks/useDatabricksApi.ts` — 7 direct `fetch()` calls +- `client/src/pages/IntakePage.tsx` — 7 direct `fetch()` calls +- `client/src/components/FacilitatorDashboard.tsx` — 4 direct `fetch()` calls +- `client/src/context/UserContext.tsx:202` — manual `fetch('/users/auth/login')` +- `client/src/context/WorkflowContext.tsx:45` — manual `fetch()` + +**Description**: The project generates a TypeScript API client from the OpenAPI spec (`/* generated using openapi-typescript-codegen -- do not edit */`), but **70+ endpoints are called via raw `fetch()`** instead of the generated service methods. Only a handful of call sites use `UsersService` or `WorkshopsService`. + +The generated client exists to provide: +- Type-safe request/response handling +- Automatic base URL management +- Centralized auth header injection +- Request cancellation +- OpenAPI contract enforcement + +All of these benefits are lost when `fetch()` is used directly. + +**Impact**: No compile-time type checking on API calls. URL typos caught at runtime only. Auth headers duplicated manually. When backend endpoints change, broken `fetch()` calls are invisible until users hit them. + +**Remediation**: Replace all direct `fetch()` calls with generated client service methods. If the generated client is missing endpoints (because the backend lacks `response_model`), fix the backend first (see TP-7), then regenerate. + +**Acceptance Criteria**: +- [ ] Zero direct `fetch()` calls to backend API in application code +- [ ] All API calls go through generated client services +- [ ] Generated client regenerated from complete OpenAPI spec + +--- + +### TP-2: Frontend Type Definitions Triplicated + +**Severity**: HIGH +**Location**: +- `client/src/client/models/` — generated TypeScript types (53 model files) +- `client/src/context/UserContext.tsx:5-27` — manual `User` interface +- `client/src/components/AnnotationAssignmentManager.tsx:22-46` — manual `User`, `WorkshopParticipant`, `Trace` interfaces +- Various page components with inline type definitions + +**Description**: Types are defined in three places that can drift: +1. **Backend Pydantic models** (`server/models.py`) — source of truth +2. **Generated TypeScript client** (`client/src/client/models/`) — derived from OpenAPI spec +3. **Manual frontend interfaces** — in context providers and components + +The `User` type in `UserContext.tsx` defines `role: 'facilitator' | 'sme' | 'participant'` while the generated client type may differ. `AnnotationAssignmentManager` defines its own `User`, `WorkshopParticipant`, and `Trace` types instead of importing from the generated client. + +**Impact**: Type drift between frontend and backend. Changes to Pydantic models don't propagate to manual interfaces. False type safety — code compiles but the contract is wrong. + +**Remediation**: Delete all manual type definitions. Import exclusively from `@/client/models`. If the generated types are incomplete, fix the backend `response_model` declarations and regenerate. + +**Acceptance Criteria**: +- [ ] Zero manual interface definitions that duplicate generated types +- [ ] All components import types from `@/client/models` +- [ ] Single source of truth: Pydantic model → OpenAPI spec → generated TypeScript + +--- + +### TP-3: TanStack Query Partially Adopted — Manual Patterns Persist + +**Severity**: HIGH +**Location**: +- `client/src/hooks/useWorkshopApi.ts` — 50+ hooks (good usage) +- `client/src/components/IntakeWaitingView.tsx:19-45` — manual `useState`/`useEffect`/`setInterval` +- `client/src/components/ProductionLogin.tsx:17-70` — manual `useState`/`fetch` with retry +- `client/src/components/RubricSuggestionPanel.tsx:54-77` — manual `useState`/`fetch` +- `client/src/pages/JudgeTuningPage.tsx:62-120` — 49 `useState` hooks for data that should be query state + +**Description**: `useWorkshopApi.ts` has 50+ well-structured TanStack Query hooks. But several components bypass these entirely and use the pre-TanStack pattern: + +```typescript +// IntakeWaitingView.tsx — SHOULD use useQuery with refetchInterval +const [status, setStatus] = useState(null); +const [isLoading, setIsLoading] = useState(true); +useEffect(() => { + const loadStatus = async () => { /* manual fetch */ }; + const interval = setInterval(loadStatus, 5000); // manual polling + return () => clearInterval(interval); +}, []); +``` + +TanStack Query provides `refetchInterval`, automatic loading/error states, caching, and deduplication — all reimplemented manually in these components. + +**Specific bypasses**: +- **IntakeWaitingView**: Manual `setInterval` polling → should use `refetchInterval: 5000` +- **ProductionLogin**: Manual fetch with retry → should use `useQuery` with `retry: 3` +- **JudgeTuningPage**: 49 `useState` hooks managing server state → should be `useQuery`/`useMutation` +- **RubricSuggestionPanel**: Duplicates logic from `useGenerateRubricSuggestions` hook + +**Impact**: Inconsistent loading/error handling. No request deduplication. No cache sharing between components. Polling doesn't respect window focus. Components have 3x the state management code they need. + +**Remediation**: Migrate remaining manual patterns to existing TanStack Query hooks. For JudgeTuningPage, create dedicated hooks for evaluation state. + +**Acceptance Criteria**: +- [ ] Zero `useEffect`+`fetch` patterns for server data +- [ ] Zero `setInterval` polling — all use `refetchInterval` +- [ ] All server state managed through `useQuery`/`useMutation` + +--- + +### TP-4: TanStack Query Misconfigured — staleTime: 0 Causes Over-fetching + +**Severity**: HIGH +**Location**: +- `client/src/main.tsx:7-17` — QueryClient defaults +- `client/src/App.tsx:12` — duplicate QueryClient instance +- `client/src/context/WorkshopContext.tsx:95,108,127,168` — aggressive `queryClient.clear()` + +**Description**: The QueryClient is configured with `staleTime: 0` and `refetchOnMount: true`, meaning every component mount triggers a refetch. Comments in the code reveal this has already caused problems: + +```typescript +// useWorkshopApi.ts — "was 10s — too aggressive for Databricks Apps" +// useAllParticipantNotes — "was 5s, too aggressive for Databricks Apps" +// WorkshopContext — "stale workshopId causes polling on login page, hammering the backend with 503 storms" +``` + +Additionally: +- **Two QueryClient instances**: `main.tsx` creates one with custom defaults; `App.tsx` creates another with defaults (which one wins depends on component tree) +- **`queryClient.clear()` called 4 times**: In `WorkshopContext.tsx`, any workshop ID change clears the *entire* cache, triggering re-fetches of everything +- **Inconsistent staleTime**: Some hooks set it (10s-30s), most don't (inheriting 0) +- **Inconsistent refetchOnWindowFocus**: Some enable it, some disable it "to prevent Chrome hangs" + +**Impact**: Backend receives 3-5x more requests than necessary. Users experience unnecessary loading spinners. Chrome tab hangs reported. 503 error storms traced to polling. + +**Remediation**: +1. Remove duplicate QueryClient — keep one with sensible defaults (`staleTime: 30_000`) +2. Replace `queryClient.clear()` with targeted `invalidateQueries({ queryKey: [...] })` +3. Set consistent staleTime/refetchOnWindowFocus across all hooks +4. Use query key factory consistently for invalidation + +**Acceptance Criteria**: +- [ ] Single QueryClient instance with documented defaults +- [ ] Zero `queryClient.clear()` calls — use targeted invalidation +- [ ] Consistent staleTime across similar query types +- [ ] No comments about "too aggressive" polling + +--- + +### TP-5: Alembic Migrations Bypassed by Runtime Schema Updates + +**Severity**: CRITICAL +**Location**: +- `server/database.py:589-734` — `_apply_schema_updates()` function +- `server/database.py:651-673` — runtime unique index creation +- `server/database.py:689-713` — runtime `CREATE TABLE IF NOT EXISTS` +- `server/app.py:68` — calls `maybe_bootstrap_db_on_startup()` + +**Description**: Alembic manages migrations in `migrations/versions/`, but `_apply_schema_updates()` runs raw SQL at application startup that adds columns, creates tables, and builds indexes — all outside Alembic's tracking: + +```python +# database.py:603 — Column added at runtime, not in any migration +conn.execute(text("ALTER TABLE judge_prompts ADD COLUMN IF NOT EXISTS model_name VARCHAR DEFAULT 'demo'")) + +# database.py:618 — Another runtime column +conn.execute(text('ALTER TABLE annotations ADD COLUMN IF NOT EXISTS ratings JSON')) + +# database.py:689-713 — Entire table created at runtime +conn.execute(text('CREATE TABLE IF NOT EXISTS participant_notes (...)')) + +# database.py:651-653 — Unique indexes at runtime +conn.execute(text('CREATE UNIQUE INDEX IF NOT EXISTS idx_discovery_findings_unique ...')) +``` + +**Runtime changes not tracked by Alembic**: +- `judge_prompts.model_name`, `judge_prompts.model_parameters` +- `annotations.ratings` +- `traces.include_in_alignment`, `traces.sme_feedback` +- `participant_notes` table +- 3 unique indexes on `discovery_findings`, `annotations`, `judge_evaluations` + +Migration `0010_add_participant_notes.py` later creates the same table that `_apply_schema_updates()` already creates — redundant and potentially conflicting. + +**Impact**: `alembic heads` doesn't reflect actual schema. Rollbacks impossible for runtime changes. Multiple workers running startup simultaneously can race on schema changes. Schema drift between environments. + +**Remediation**: Move all `_apply_schema_updates()` logic into proper Alembic migrations. Remove the runtime function. Use `alembic revision --autogenerate` to detect drift. + +**Acceptance Criteria**: +- [ ] `_apply_schema_updates()` removed or empty +- [ ] All schema changes tracked in Alembic migrations +- [ ] `alembic upgrade head` is the only way schema changes +- [ ] Zero `CREATE TABLE IF NOT EXISTS` or `ALTER TABLE ADD COLUMN` in application code + +--- + +### TP-6: SQLAlchemy Uses Deprecated Query API Exclusively + +**Severity**: HIGH +**Location**: `server/services/database_service.py` — 93 occurrences of `.query()` + +**Description**: The entire data access layer uses SQLAlchemy's legacy `Session.query()` API which was deprecated in SQLAlchemy 2.0: + +```python +# Legacy pattern (used throughout): +db_workshop = self.db.query(WorkshopDB).filter(WorkshopDB.id == workshop_id).first() + +# Modern pattern (not used anywhere): +from sqlalchemy import select +stmt = select(WorkshopDB).where(WorkshopDB.id == workshop_id) +db_workshop = self.db.execute(stmt).scalar_one_or_none() +``` + +Additionally: +- **Relationships defined but not used**: `database.py` defines `relationship()` fields, but `database_service.py` does manual JOINs with tuple unpacking instead of accessing `finding.user.name` +- **No eager loading**: Zero uses of `selectinload()`, `joinedload()`, or `lazy=` configuration +- **No `load_only()`**: JSON columns (100KB+) always fetched even when only IDs are needed + +**Impact**: When SQLAlchemy drops the legacy API, the entire service layer breaks. Missing eager loading causes N+1 queries. Manual JOINs defeat the purpose of the ORM relationship system. + +**Remediation**: Migrate to `select()` API. Use relationships for data access. Add `selectinload()` or `joinedload()` where relationships are traversed. + +**Acceptance Criteria**: +- [ ] Zero uses of `Session.query()` — all use `select()` statement API +- [ ] Relationships used for related data access (no manual tuple unpacking) +- [ ] Eager loading strategy specified for relationship-heavy queries + +--- + +### TP-7: Pydantic Models Not Enforced on Responses — 60+ Raw Dict Returns + +**Severity**: HIGH +**Location**: +- `server/routers/workshops.py` — 50+ endpoints return raw dicts, zero use `response_model` +- `server/routers/users.py` — 12+ raw dict returns +- `server/routers/databricks.py` — 2 raw dict returns +- `server/models.py` — no `model_config` on any model + +**Description**: FastAPI's `response_model` parameter enables automatic response validation, serialization, and OpenAPI schema generation. Out of 100+ endpoints, only 11 declare a `response_model`. The rest return ad-hoc dictionaries: + +```python +# Typical pattern in workshops.py (50+ occurrences): +return {"message": "Judge name updated successfully", "judge_name": judge_name} +return {"error": "No traces available for preview"} +return {"status": "deleted"} +``` + +Pydantic models also lack configuration: +- No `model_config = ConfigDict(from_attributes=True)` — can't convert SQLAlchemy objects directly +- No `Field()` validators on emails, names, or descriptions +- Same model used for create and response (no proper Create/Update/Response variants) + +**Downstream effect**: The generated TypeScript client gets `CancelablePromise` return types for these endpoints because the OpenAPI spec has no response schema. + +**Impact**: No response validation. Generated client types are useless (`any`). API documentation incomplete. Frontend must guess at response shapes. + +**Remediation**: +1. Add `model_config = ConfigDict(from_attributes=True)` to all response models +2. Create proper Response variants (separate from Create/Update) +3. Add `response_model=` to all endpoints +4. Add `Field()` validators (email, length constraints, etc.) +5. Regenerate TypeScript client + +**Acceptance Criteria**: +- [ ] All endpoints declare `response_model` +- [ ] Generated TypeScript client has zero `any` return types +- [ ] Response models separate from create/update models +- [ ] `model_config` configured on all models + +--- + +### TP-8: FastAPI Dependency Injection Underutilized + +**Severity**: MEDIUM +**Location**: +- `server/routers/workshops.py:245,258,265,275...` — manual `DatabaseService(db)` instantiation +- `server/routers/users.py:25-27` — correct `Depends()` pattern (but only here) + +**Description**: `users.py` correctly defines a dependency: +```python +def get_database_service(db: Session = Depends(get_db)) -> DatabaseService: + return DatabaseService(db) +``` + +But `workshops.py` (5,229 lines, 100+ endpoints) manually instantiates `DatabaseService(db)` in every handler: +```python +async def create_workshop(workshop_data: WorkshopCreate, db: Session = Depends(get_db)): + db_service = DatabaseService(db) # Repeated 100+ times +``` + +FastAPI's `Depends()` provides: +- Single point of change for service creation +- Testability via dependency overrides +- Middleware-like behavior (logging, metrics, error wrapping) +- Automatic cleanup via generator dependencies + +**Impact**: Can't swap service implementations for testing without patching. No single point to add cross-cutting concerns. Boilerplate repeated 100+ times. + +**Remediation**: Use `Depends(get_database_service)` across all routers. + +**Acceptance Criteria**: +- [ ] `DatabaseService` injected via `Depends()` in all routers +- [ ] Zero manual `DatabaseService(db)` instantiation in handlers +- [ ] Tests use dependency overrides instead of mocks + +--- + +### TP-9: FastAPI BackgroundTasks Unused — Manual Threading Instead + +**Severity**: MEDIUM +**Location**: +- `server/routers/workshops.py:1225` — `threading.Thread(target=run_auto_evaluation_for_new_traces)` +- `server/routers/workshops.py:1302-1640` — `threading.Thread(target=run_auto_evaluation_background)` +- `server/routers/workshops.py:3468` — `threading.Thread(target=run_alignment_background)` +- `server/routers/workshops.py:3723` — `threading.Thread(target=run_evaluation_background)` +- `server/routers/workshops.py:4203` — `threading.Thread(target=run_simple_evaluation_background)` +- `server/routers/dbsql_export.py:25` — `BackgroundTasks` parameter declared but never used + +**Description**: FastAPI provides `BackgroundTasks` for post-response work. Instead, 6 endpoints spawn raw daemon threads: + +```python +eval_thread = threading.Thread(target=run_auto_evaluation_background, daemon=True) +eval_thread.start() +``` + +These threads: +- Contain `time.sleep()` calls that block the thread +- Create their own database sessions (bypassing FastAPI's session lifecycle) +- Have no error reporting back to the caller +- Are invisible to FastAPI's shutdown lifecycle (`daemon=True` means they die silently) +- Cannot be tracked, cancelled, or monitored + +One endpoint (`dbsql_export.py:25`) actually accepts `BackgroundTasks` as a parameter but never calls `background_tasks.add_task()`. + +**Impact**: Thread leaks if exceptions occur. No graceful shutdown. Database sessions held open. No observability into background work. Can't test background execution. + +**Remediation**: Replace `threading.Thread` with `BackgroundTasks` for short tasks, or implement a proper task queue (e.g., Celery, or an async task manager) for long-running evaluations. + +**Acceptance Criteria**: +- [ ] Zero `threading.Thread` usage in router handlers +- [ ] Short background work uses `BackgroundTasks` +- [ ] Long-running jobs use a tracked task system with status reporting +- [ ] All background work respects application shutdown + +--- + +### TP-10: SQLAlchemy Model Constraints Defined at Runtime Instead of Declaratively + +**Severity**: MEDIUM +**Location**: +- `server/database.py:651-673` — runtime unique index creation via raw SQL +- `server/database.py:127,147,150-151,330,381-382` — JSON columns storing relational data + +**Description**: Unique constraints are created via raw SQL at application startup instead of being declared in model definitions: + +```python +# Runtime (current): +conn.execute(text('CREATE UNIQUE INDEX IF NOT EXISTS idx_discovery_findings_unique ON discovery_findings (workshop_id, trace_id, user_id)')) + +# Declarative (should be): +class DiscoveryFindingDB(Base): + __table_args__ = ( + UniqueConstraint('workshop_id', 'trace_id', 'user_id', name='idx_discovery_findings_unique'), + ) +``` + +Additionally, 7 JSON columns store lists of IDs that should be junction tables: +- `assigned_traces = Column(JSON)` — should be many-to-many with traces +- `active_discovery_trace_ids = Column(JSON)` — should be association table +- `active_annotation_trace_ids = Column(JSON)` — should be association table +- `discovery_traces = Column(JSON)` — should be association table +- `annotation_traces = Column(JSON)` — should be association table +- `few_shot_examples = Column(JSON)` — should reference traces via junction + +**Impact**: No referential integrity for JSON-stored IDs. Can't query "which traces are assigned to user X" without parsing JSON. Constraints not version-controlled. Alembic can't manage what it can't see. + +**Remediation**: Move constraints into `__table_args__`. Migrate JSON ID lists to proper junction tables via Alembic. + +**Acceptance Criteria**: +- [ ] All constraints declared in model `__table_args__` +- [ ] JSON columns containing ID lists migrated to junction tables +- [ ] Referential integrity enforced at database level + +--- + +### TP-11: Vite Build Missing Code Splitting + +**Severity**: LOW +**Location**: +- `client/vite.config.ts` — no `manualChunks` or dynamic imports configured +- `client/src/App.tsx` — all routes imported statically + +**Description**: All page components are imported statically, creating a single bundle: + +```typescript +import IntakePage from './pages/IntakePage'; +import JudgeTuningPage from './pages/JudgeTuningPage'; // 2,754 lines +import AnnotationDemo from './pages/AnnotationDemo'; +// ... all loaded upfront +``` + +Vite supports automatic code splitting via `React.lazy()` + dynamic imports: +```typescript +const JudgeTuningPage = React.lazy(() => import('./pages/JudgeTuningPage')); +``` + +This would split the 2,754-line JudgeTuningPage (and its dependencies) into a separate chunk loaded only when that route is visited. + +**Impact**: Larger initial bundle. Slower first paint. Users download code for pages they may never visit. + +**Remediation**: Use `React.lazy()` for route-level code splitting. Configure `manualChunks` to separate vendor code. + +**Acceptance Criteria**: +- [ ] Route-level code splitting with `React.lazy()` +- [ ] Vendor chunk separated via `manualChunks` +- [ ] Bundle size tracked and budgeted + +--- + +## Prioritized Backlog + +| Priority | ID | Title | Effort | Impact | +|----------|-----|-------|--------|--------| +| P0 | TP-5 | Move runtime schema updates into Alembic migrations | M | Critical — untracked schema drift | +| P0 | TP-1 | Replace direct fetch() with generated API client | L | Critical — no type safety on API calls | +| P1 | TP-7 | Add response_model to all endpoints + regenerate client | L | High — enables TP-1 and TP-2 | +| P1 | TP-4 | Fix QueryClient configuration and cache invalidation | S | High — eliminates 503 storms, over-fetching | +| P1 | TP-3 | Migrate remaining manual fetch patterns to TanStack Query | M | High — consistent data fetching | +| P1 | TP-6 | Migrate from deprecated .query() to select() API | L | High — future-proofs data layer | +| P2 | TP-2 | Delete duplicate frontend types, use generated client types | S | High — single source of truth | +| P2 | TP-8 | Use Depends() for DatabaseService across all routers | S | Medium — testability, DRY | +| P2 | TP-9 | Replace threading.Thread with BackgroundTasks or task queue | M | Medium — observability, lifecycle | +| P2 | TP-10 | Move constraints to model declarations, migrate JSON→tables | M | Medium — data integrity | +| P3 | TP-11 | Add route-level code splitting | S | Low — performance | + +**Effort**: S = < 2 hours, M = 2-8 hours, L = 1-3 days + +--- + +## Cross-References + +| This Item | Related Items | +|-----------|---------------| +| TP-1 (fetch bypass) | ARCH-3, CQ-4 (any types) | +| TP-2 (type duplication) | ARCH-4 | +| TP-3 (TanStack partial) | CQ-7 (polling duplication), PERF-4 (no backoff) | +| TP-4 (QueryClient config) | PERF-4, CQ-7 | +| TP-5 (Alembic bypass) | DEPLOY-6 (no rollback), DX-7 (no schema docs) | +| TP-6 (deprecated query API) | PERF-2 (N+1 queries) | +| TP-7 (no response_model) | DX-5 (API docs), SEC-5 (password_hash exposure) | +| TP-8 (DI underused) | CQ-1 (god file workshops.py) | +| TP-9 (manual threading) | PERF-7 (blocking sleep in async) | +| TP-10 (runtime constraints) | PERF-1 (missing indexes) | diff --git a/debt-docs/WORKSHOP_ID_SIMPLIFICATION.md b/debt-docs/WORKSHOP_ID_SIMPLIFICATION.md new file mode 100644 index 00000000..875e791b --- /dev/null +++ b/debt-docs/WORKSHOP_ID_SIMPLIFICATION.md @@ -0,0 +1,213 @@ +# Workshop ID Simplification + +## Overview + +The `workshop_id` parameter is threaded explicitly through every layer of the stack — **1,060 backend references** (71 service methods, 124 routes) and **1,171 frontend references** (33 files). Every call site manually passes it: URL path → React context → API service → route handler → database service → SQL WHERE clause. This creates pervasive coupling where `workshop_id` does double duty as both a **data partitioning key** and an **authorization mechanism**, but is resolved and validated repeatedly instead of once. + +Cross-references: ARCH-1 (god service), ARCH-2 (business logic in routes), CQ-1 (god file workshops.py), REACT_PATTERNS_DEBT (context provider pollution). + +--- + +## Current Flow + +``` +React Component + → useWorkshopContext() # resolve from URL/query/localStorage + → WorkshopsService.method(workshopId, ...) # explicit param in every API call + → GET /workshops/{workshop_id}/... # path param on every route + → db_service.method(workshop_id, ...) # explicit param on every service method + → SELECT ... WHERE workshop_id = ? # explicit filter on every query +``` + +The parameter appears at 5 distinct layers with no implicit resolution at any of them. + +--- + +## Suggestions + +### WSID-1: Backend middleware — resolve workshop_id once via FastAPI dependency + +**Addresses**: Explicit `workshop_id` path param on 124 routes, manual validation in each handler + +**Current pattern**: +```python +@router.get("/{workshop_id}/traces") +async def get_traces(workshop_id: str, db_service=Depends(get_database_service)): + workshop = db_service.get_workshop(workshop_id) + if not workshop: + raise HTTPException(404) + traces = db_service.get_traces(workshop_id, ...) + ... +``` + +**Proposed pattern**: +```python +class WorkshopContext: + """Resolved once per request. Validates existence and access.""" + def __init__(self, workshop: Workshop, workshop_id: str): + self.workshop = workshop + self.workshop_id = workshop_id + +async def get_workshop_context( + workshop_id: str = Path(...), + db_service: DatabaseService = Depends(get_database_service), + current_user: User = Depends(get_current_user), +) -> WorkshopContext: + workshop = db_service.get_workshop(workshop_id) + if not workshop: + raise HTTPException(status_code=404, detail="Workshop not found") + # Centralized access check — participant/SME must belong to this workshop + if current_user.role in ("participant", "sme"): + if not db_service.is_participant(workshop_id, current_user.id): + raise HTTPException(status_code=403, detail="Not a member of this workshop") + return WorkshopContext(workshop=workshop, workshop_id=workshop_id) + +@router.get("/{workshop_id}/traces") +async def get_traces(ctx: WorkshopContext = Depends(get_workshop_context)): + traces = db_service.get_traces(ctx.workshop_id, ...) + ... +``` + +**What this buys**: +- Workshop existence + access validated exactly once per request +- Route handlers no longer repeat the lookup/404 pattern +- Authorization logic centralized instead of scattered across handlers +- Foundation for the service-scoping pattern (WSID-2) + +**Incremental path**: Can be adopted route-by-route without a big-bang migration. Add the dependency, update one router file at a time. + +--- + +### WSID-2: Scoped service instances — eliminate workshop_id from method signatures + +**Addresses**: 71 `DatabaseService` methods that all accept `workshop_id` as first parameter + +**Current pattern**: +```python +db_service.get_traces(workshop_id, user_id) +db_service.get_findings(workshop_id) +db_service.add_annotation(workshop_id, data) +db_service.get_rubric(workshop_id) +db_service.get_mlflow_config(workshop_id) +``` + +**Proposed pattern**: +```python +# Factory on the base service (or on individual domain services after ARCH-1 split) +class WorkshopScopedService: + def __init__(self, db_service: DatabaseService, workshop_id: str): + self._db = db_service + self._workshop_id = workshop_id + + def get_traces(self, user_id: str | None = None) -> list[Trace]: + return self._db.get_traces(self._workshop_id, user_id) + + def get_findings(self) -> list[DiscoveryFinding]: + return self._db.get_findings(self._workshop_id) + + def add_annotation(self, data: AnnotationCreate) -> Annotation: + return self._db.add_annotation(self._workshop_id, data) + ... + +# Usage in route handlers (pairs with WSID-1): +@router.get("/{workshop_id}/traces") +async def get_traces(ctx: WorkshopContext = Depends(get_workshop_context)): + svc = ctx.scoped_service # or Depends(get_scoped_service) + return svc.get_traces(user_id=...) +``` + +**What this buys**: +- Removes `workshop_id` from every method call site +- Makes it impossible to accidentally cross workshop boundaries within a request +- Pairs naturally with the ARCH-1 god service split — each domain service can be independently scoped + +**Relationship to ARCH-1**: This can be done *before* the god service split as a thin wrapper, or *during* the split by building scoping into each new domain service. The latter is cleaner but couples the two efforts. + +--- + +### WSID-3: Frontend — scope the API client from auth state + +**Addresses**: 1,171 frontend references, `useWorkshopContext()` + null-check pattern in 11+ pages, workshopId threaded into every API call and query key + +**Current pattern**: +```typescript +// Repeated in every page component +const { workshopId } = useWorkshopContext(); + +useEffect(() => { + if (!workshopId) return; + loadData(); +}, [workshopId]); + +// Every API call passes it explicitly +WorkshopsService.getTracesWorkshopsWorkshopIdTracesGet(workshopId, userId); + +// Every query key includes it +queryKey: ['traces', workshopId] +``` + +**Proposed pattern**: +```typescript +// Workshop API hook — resolves workshopId internally from auth/context +function useWorkshopApi() { + const { workshopId } = useWorkshopContext(); + + return useMemo(() => ({ + getTraces: (userId?: string) => + WorkshopsService.getTracesWorkshopsWorkshopIdTracesGet(workshopId!, userId), + getFindings: (userId?: string) => + WorkshopsService.getFindingsWorkshopsWorkshopIdFindingsGet(workshopId!, userId), + addAnnotation: (data: AnnotationCreate) => + WorkshopsService.addAnnotationWorkshopsWorkshopIdAnnotationsPost(workshopId!, data), + // ... + }), [workshopId]); +} + +// Page components become cleaner +function AnnotationPage() { + const api = useWorkshopApi(); + const { data: traces } = useQuery({ + queryKey: ['traces'], // workshopId no longer in key — handled by scoped client + queryFn: () => api.getTraces(userId), + }); +} +``` + +**What this buys**: +- Components no longer import `useWorkshopContext` for API calls +- Eliminates the `if (!workshopId) return` guard repeated in 11+ pages +- Query keys simplified (workshopId is implicit in the client scope) +- Centralizes the null-safety check for workshopId + +**Note on query keys**: If multiple workshops could be viewed in a single session, you'd still want workshopId in query keys for cache isolation. But the current app is single-workshop-at-a-time, so scoped keys work. + +--- + +### WSID-4: Store workshop_id in JWT claims (longer-term) + +**Addresses**: The three-source resolution in `WorkshopContext.tsx` (URL path, query param, localStorage), auth-layer complexity + +**Current state**: Workshop ID comes from URL parsing or localStorage. Login validates it but doesn't embed it in the token. + +**Proposed**: When a participant/SME logs in with a `workshop_id`, embed it in the JWT. The backend can then resolve workshop context from the token itself for non-facilitator users, and facilitators can switch workshops explicitly. + +**What this buys**: +- Single source of truth for "which workshop am I in" +- Backend can resolve workshop from auth header alone (no path param needed for most routes) +- Eliminates the three-source resolution logic and its edge cases +- Cleaner facilitator vs. participant auth model + +**Trade-off**: Facilitators manage multiple workshops and need to switch freely. The JWT approach works best for participants (single workshop), while facilitators would continue using explicit workshop selection. This is fine — it matches the actual access pattern. + +--- + +## Sequencing + +| Order | Item | Depends On | Effort | +|-------|------|------------|--------| +| 1 | WSID-1: Backend middleware | Nothing | 2-4 hours | +| 2 | WSID-2: Scoped services | WSID-1, benefits from ARCH-1 | 1-2 days | +| 3 | WSID-3: Frontend scoped client | Nothing (independent) | 4-8 hours | +| 4 | WSID-4: JWT claims | WSID-1 | 1 day | + +WSID-1 and WSID-3 are independent and can be done in parallel. WSID-2 builds on WSID-1. WSID-4 is optional but completes the picture. diff --git a/e2e-test-output.txt b/e2e-test-output.txt deleted file mode 100644 index 3668c2a7..00000000 --- a/e2e-test-output.txt +++ /dev/null @@ -1,37 +0,0 @@ -uv run python -m server.db_bootstrap bootstrap -Running tests in headless mode - -> databricks-app-template-client@0.1.0 test -> npx playwright test tests/e2e --workers=1 - -📦 Creating new database via migrations: /Users/forrest.murray/Documents/ralph-refactor/.e2e-workshop.db -✅ Database created successfully! -🚀 Starting E2E servers - DB : .e2e-workshop.db - API: http://localhost:8000 - UI : http://localhost:3000 - -> databricks-app-template-client@0.1.0 dev -> npx vite --host 127.0.0.1 --port 3000 --strictPort - -error when starting dev server: -Error: Port 3000 is already in use - at Server.onError (file:///Users/forrest.murray/Documents/ralph-refactor/client/node_modules/vite/dist/node/chunks/dep-C6uTJdX2.js:45596:18) - at Server.emit (node:events:519:28) - at emitErrorNT (node:net:1940:8) - at process.processTicksAndRejections (node:internal/process/task_queues:82:21) - -Running 16 tests using 1 worker - -INFO: Started server process [40192] -INFO: Waiting for application startup. -INFO: Application startup complete. -ERROR: [Errno 48] error while attempting to bind on address ('127.0.0.1', 8000): address already in use -INFO: Waiting for application shutdown. -INFO: Application shutdown complete. -🚀 Application startup - lifespan function called! -✅ Application startup complete! -🔄 Application shutting down... - ✘ 1 [chromium] › tests/e2e/annotation-last-trace.spec.ts:16:3 › Annotation - Last Trace Bug › all 10 annotations should be saved when annotating via API (682ms) - ✘ 2 [chromium] › tests/e2e/annotation-last-trace.spec.ts:74:3 › Annotation - Last Trace Bug › last trace annotation should be saved via UI flow (10.6s) - ✘ 3 [chromium] › tests/e2e/annotation-last-trace.spec.ts:134:3 › Annotation - Last Trace Bug › 10th trace save should not be blocked by hasAnnotationChanged check (141ms)