# Source: InferenceX-app/.github/workflows/claude.yml @ 4f6f4f9933d7b68a6e6c80a8bb34e347d2ed17c2 (SemiAnalysisAI/InferenceX-app)
# Display name of the workflow.
name: Claude Code

# Events that can start the workflow; the job-level `if` filters them further.
on:
  issue_comment:
    types:
      - created
  issues:
    types:
      - opened
      - assigned
  pull_request_review_comment:
    types:
      - created

jobs:
  claude:
    # Authorization gate. The job runs only when the triggering text mentions
    # '@frontend-claude' AND its author is an OWNER, MEMBER, or COLLABORATOR:
    # - comment events check the comment body and the comment author;
    # - issue events check the issue body or title and the issue author
    #   (for `assigned` events this is still the issue author, not the assigner).
    # Comments must stay outside the block scalar below, otherwise they would
    # become part of the expression text.
    if: |
      (
        (github.event_name == 'issue_comment' || github.event_name == 'pull_request_review_comment') &&
        contains(github.event.comment.body, '@frontend-claude') &&
        contains(fromJson('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.comment.author_association)
      ) ||
      (
        github.event_name == 'issues' &&
        (contains(github.event.issue.body, '@frontend-claude') || contains(github.event.issue.title, '@frontend-claude')) &&
        contains(fromJson('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)
      )
    runs-on: ubuntu-latest
    # Job-wide environment: these variables are visible to every step below.
    env:
      # The PAT secret is exposed under the GITHUB_TOKEN variable name.
      GITHUB_TOKEN: ${{ secrets.PAT }}
      BLOB_READ_WRITE_TOKEN: ${{ secrets.CLAUDE_BLOB_READ_WRITE_TOKEN }}
      # 'claude/' followed by the ref name of the triggering event.
      VERCEL_GIT_COMMIT_REF: claude/${{ github.ref_name }}
    # Scopes requested for the job's automatic token.
    permissions:
      contents: write
      pull-requests: write
      issues: write
      actions: read

    steps:
      # Clone the repository, authenticated with the PAT secret.
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          token: ${{ secrets.PAT }}
          # 0 = fetch the complete history instead of a shallow clone.
          fetch-depth: 0

      # Toolchain: package manager first, then the Node.js runtime.
      - name: Setup pnpm
        uses: pnpm/action-setup@8912a9102ac27614460f54aedde9e1e7f9aec20d # v6.0.5

      - name: Setup Node.js
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          # Enable the action's dependency cache for pnpm.
          cache: pnpm
          # Quoted so the version is read as a string, not a number.
          node-version: '24'

      # Playwright browser binaries are cached per runner OS and lockfile hash;
      # the restore key allows falling back to any cache for the same OS.
      - id: playwright-cache
        name: Cache Playwright browsers
        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
        with:
          key: playwright-${{ runner.os }}-${{ hashFiles('pnpm-lock.yaml') }}
          restore-keys: |
            playwright-${{ runner.os }}-
          path: ~/.cache/ms-playwright

      - name: Install dependencies
        run: pnpm install --frozen-lockfile

      # No exact cache hit: install Chromium together with its system packages.
      - name: Install Playwright browsers and dependencies
        if: ${{ steps.playwright-cache.outputs.cache-hit != 'true' }}
        run: npx -y playwright install --with-deps chromium

      # Exact cache hit: only the system packages still need installing.
      - name: Install Playwright system dependencies
        if: ${{ steps.playwright-cache.outputs.cache-hit == 'true' }}
        run: npx -y playwright install-deps chromium

      # The Cypress binary is cached per runner OS and lockfile hash.
      - id: cypress-cache
        name: Cache Cypress binary
        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
        with:
          key: cypress-${{ runner.os }}-${{ hashFiles('pnpm-lock.yaml') }}
          restore-keys: |
            cypress-${{ runner.os }}-
          path: ~/.cache/Cypress

      # No exact cache hit: run the Cypress installer from the app workspace package.
      - name: Install Cypress binary
        if: ${{ steps.cypress-cache.outputs.cache-hit != 'true' }}
        run: pnpm --filter @semianalysisai/inferencex-app exec cypress install

      # Best effort: start the dev server in the background and wait for it to
      # answer on port 3000. This step never fails the job; the result is
      # published as step outputs (`log`, `pid`, `up`) for later steps.
      - name: Start dev server
        id: devserver
        continue-on-error: true
        run: |
          set -euo pipefail

          LOG=/tmp/next-dev.log
          echo "log=$LOG" >> "$GITHUB_OUTPUT"

          pnpm run dev -- --hostname 0.0.0.0 --port 3000 > "$LOG" 2>&1 &
          DEV_PID=$!
          echo "pid=$DEV_PID" >> "$GITHUB_OUTPUT"

          # Poll up to 60 times. Each probe is time-boxed so that a server which
          # accepts the connection but never responds cannot stall this step.
          for _ in {1..60}; do
            if curl -sSf --connect-timeout 2 --max-time 30 http://localhost:3000 >/dev/null; then
              echo "Dev server is up"
              echo "up=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi

            # If process died, stop waiting early
            if ! kill -0 "$DEV_PID" 2>/dev/null; then
              echo "Dev server process exited early"
              break
            fi

            sleep 2
          done

          echo "Dev server failed to start (best effort; continuing)."
          echo "up=false" >> "$GITHUB_OUTPUT"
          tail -n 200 "$LOG" || true

          # Avoid leaving a stuck process around holding the port
          kill "$DEV_PID" 2>/dev/null || true

          exit 0

      # Runs the Claude Code agent. `always()` makes the step run even when an
      # earlier step failed (presumably so the agent can still diagnose a broken
      # setup; confirm the intent before changing it).
      - name: Run Claude Code
        id: claude
        if: ${{ always() }}
        uses: anthropics/claude-code-action@fefa07e9c665b7320f08c3b525980457f22f58aa # v1.0.111
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          GITHUB_TOKEN: ${{ secrets.PAT }}
          # Bash timeouts in milliseconds: 30 minutes default, 60 minutes maximum.
          BASH_DEFAULT_TIMEOUT_MS: '1800000'
          BASH_MAX_TIMEOUT_MS: '3600000'

          # Outcome of the best-effort "Start dev server" step.
          DEV_SERVER_UP: ${{ steps.devserver.outputs.up }}
          DEV_SERVER_PID: ${{ steps.devserver.outputs.pid }}
          DEV_SERVER_LOG: ${{ steps.devserver.outputs.log }}
        with:
          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
          github_token: ${{ secrets.PAT }}
          # Must match the phrase checked by the job-level `if` gate.
          trigger_phrase: '@frontend-claude'
          track_progress: true
          allowed_bots: ''
          settings: |
            {"fastMode": true}

          additional_permissions: |
            actions: read

          # Model selection: a "sonnet" or "haiku" keyword placed directly after
          # the mention picks that model; anything else uses the default. Both
          # the workflow's own trigger ('@frontend-claude <model>') and the
          # older '@claude <model>' spelling are accepted.
          claude_args: |
            --model ${{ (contains(github.event.comment.body || github.event.issue.body || '', '@frontend-claude sonnet') || contains(github.event.comment.body || github.event.issue.body || '', '@claude sonnet')) && 'claude-sonnet-4-5-20250929' || (contains(github.event.comment.body || github.event.issue.body || '', '@frontend-claude haiku') || contains(github.event.comment.body || github.event.issue.body || '', '@claude haiku')) && 'claude-haiku-4-5-20251001' || 'claude-opus-4-6' }}
            --mcp-config '{"mcpServers":{"fetch":{"command":"npx","args":["-y","@anthropic-ai/mcp-server-fetch@latest"]},"playwright":{"command":"npx","args":["-y","@playwright/mcp@latest","--headless","--caps=vision"]}}}'
            --allowedTools "Write,Edit,Read,Glob,Grep,WebFetch,mcp__github__*,mcp__github_inline_comment__create_inline_comment,mcp__github_ci__*,mcp__fetch__*,mcp__playwright__*,Bash"

          prompt: |
            You are a Frontend agent for the InferenceX dashboard.

            You can use:
            - Playwright MCP server (name: "playwright") for DOM interactions, screenshots, and **coordinate-based interactions** (mouse wheel + drag) needed for D3 zoom/pan.
            - The app should run locally at http://localhost:3000.
            - For non-localhost URLs (documentation, external sites), you can use WebFetch.

            Whenever you push a commit to a PR, a Vercel deployment is triggered automatically.

            ## Non-negotiables (Definition of Done)
            - Do not mark a task complete until you have verified it via Playwright MCP in a real browser session.
            - If any inference graph shows either:
              - "No data available"
              - "Please change the model, sequence, precision, date range or GPU"
              then the task is NOT complete. Continue debugging until real data points render.

            ## Dev server status (best effort)
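            A best-effort dev server start was attempted before you ran. Its results are listed here and are also exported as environment variables of the same names:
            - DEV_SERVER_UP=${{ steps.devserver.outputs.up || 'unknown' }}
            - DEV_SERVER_LOG=${{ steps.devserver.outputs.log || '/tmp/next-dev.log' }}
            - DEV_SERVER_PID=${{ steps.devserver.outputs.pid || 'unknown' }}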

            If DEV_SERVER_UP is not "true" (or http://localhost:3000 is unreachable):
            1) Inspect the log: `tail -n 200 "$DEV_SERVER_LOG"`
            2) Fix the underlying issue in the repo.
            3) Restart the dev server in the background, then re-check:
               `pnpm run dev -- --hostname 0.0.0.0 --port 3000 > /tmp/next-dev.log 2>&1 &`
               `curl -sSf http://localhost:3000 >/dev/null`
            Only then proceed with Playwright verification.

            ## 0) Grounding checklist (DO THIS FIRST, ONCE)
            This prompt is a guide, not ground truth. Before coding:
            1. Verify the repo tree and key paths exist:
               - `ls -la`
               - `find packages/app/src packages/app/public packages/app/cypress -maxdepth 5 -type f | sort | sed -n '1,160p'`
               - If any referenced file does not exist, locate the real file via `rg`/`find` and follow the actual codebase.
            2. Identify the exact change surface:
               - Start at `packages/app/src/app/page.tsx` (what renders).
               - Then follow into chart contexts → hooks → chart UI components.
            3. If behavior differs between local and Vercel, prefer the Vercel preview as the final verification target.

            ## 1) System overview (how components connect)
            InferenceX = DB-backed API + runtime chart rendering.

            This is a pnpm workspaces monorepo. The Next.js app lives at `packages/app/`.
            All app source paths are under `packages/app/src/`. Run commands from repo root (they delegate via workspace).

            Data pipeline: Neon PostgreSQL → API routes (`/api/v1/*`) → React Query hooks (`src/hooks/api/`) → Context providers → D3 charts.
            - DB layer: `packages/db/` (schema, migrations, ETL, queries)
            - API routes: `packages/app/src/app/api/v1/` (benchmarks, availability, workflow-info, reliability, evaluations, invalidate)
            - React Query hooks: `packages/app/src/hooks/api/` (use-benchmarks, use-availability, use-workflow-info, etc.)
            - UI state lives in per-section Context providers (InferenceChartContext, EvaluationChartContext, ReliabilityChartContext), rendering in D3 components.

            Debug rule of thumb:
            - If the chart has no points → first check API response (browser Network tab or curl /api/v1/benchmarks) → then check filters in context → then check data transformations in hooks.

            ## 2) Repo map (key paths + what they do)
            NOTE: Confirm these exist; if the repo differs, use the actual tree and treat this as a hint.
            Most paths below are under `packages/app/` (the map also lists `packages/db/` and `packages/constants/`). The `@/*` alias maps to `packages/app/src/`.

            ```
            packages/app/src/app/
            ├── page.tsx                           # Main page: tab router
            ├── layout.tsx                         # Root layout: theme provider, global UI shell
            ├── api/cron/route.ts                  # Cron endpoint: validates CRON_SECRET, triggers VERCEL_DEPLOY_HOOK_URL
            └── api/v1/                            # API routes (all DB-backed)
                ├── benchmarks/route.ts            # Benchmark data by model+date
                ├── benchmarks/history/route.ts    # Historical benchmark trends
                ├── availability/route.ts          # Model availability dates
                ├── workflow-info/route.ts          # Workflow runs, changelogs, configs
                ├── reliability/route.ts           # Reliability data
                ├── evaluations/route.ts           # Evaluation data
                ├── invalidate/route.ts            # Cache invalidation (admin)
                └── server-log/route.ts            # Client error logging

            packages/app/src/hooks/api/            # React Query hooks (data fetching layer)
            ├── use-benchmarks.ts
            ├── use-benchmark-history.ts
            ├── use-availability.ts
            ├── use-workflow-info.ts
            ├── use-evaluations.ts
            ├── use-reliability.ts

            packages/app/src/components/
            ├── page-content.tsx                   # Tab layout: VALID_TABS, desktop TabsTrigger + mobile Select
            ├── header/
            │   ├── header.tsx                     # Top nav + layout
            │   └── GithubStars.tsx                # GitHub stars widget
            ├── inference/
            │   ├── InferenceChartContext.tsx       # Source-of-truth state: filters, metric, visible GPUs, overlays
            │   ├── hooks/
            │   │   └── useChartData.ts            # Transforms API data, applies filters
            │   └── ui/
            │       ├── ChartControls.tsx          # Filters/selectors UI; user input → context state
            │       ├── ChartDisplay.tsx           # Layout + error/fallback boundaries
            │       ├── ScatterGraph.tsx           # Main D3 scatter plot; zoom/pan, axes, rooflines, tooltips
            │       └── GPUGraph.tsx               # Alternate D3 view (GPU-focused)
            ├── evaluation/
            │   └── EvaluationChartContext.tsx      # Evaluation tab state
            ├── reliability/
            │   └── ReliabilityChartContext.tsx     # Reliability tab state
            └── ui/                                 # Design system wrappers (Radix + Tailwind)
                ├── d3-chart-wrapper.tsx            # Shared D3 container: SVG ref, resize, tooltip portal
                └── theme-provider.tsx             # Theme toggling

            packages/app/src/hooks/
            ├── useStickyTooltip.ts                # Tooltip pin/dismiss state management
            └── useChartTooltipHandlers.ts         # Mouse/touch → tooltip event wiring

            packages/app/src/lib/
            ├── constants.ts                       # HARDWARE_CONFIG, GPU_COLOR_FAMILIES, MODEL_ORDER, etc.
            ├── data-mappings.ts                   # MODEL_OPTIONS/SEQUENCE_OPTIONS/PRECISION_OPTIONS
            ├── chart-utils.ts                     # Y_AXIS_METRICS, roofline calculations, metric math
            ├── api.ts                             # Thin fetch wrapper for API routes
            ├── api-cache.ts                       # Server-side API response caching (Vercel Blob)
            ├── blob-cache.ts                      # Vercel Blob read/write for cache layer
            └── d3-chart/                          # Shared D3 library
                ├── chart-setup.ts                 # SVG skeleton, axes groups, defs, clip paths
                ├── chart-update.ts                # Data join, bindpoints, zoom setup
                ├── watermark.ts                   # Chart watermark rendering
                └── layers/                        # Rendering layers: points, bars, lines, rooflines, etc.

            packages/db/                           # DB layer (@semianalysisai/inferencex-db)
            └── src/                               # Schema, migrations, ETL, queries

            packages/constants/                    # Shared constants (@semianalysisai/inferencex-constants)
            └── src/                               # GPU keys, model mappings
            ```

            Other high-signal repo files to consult:
            - `packages/app/package.json` (scripts)
            - `packages/app/next.config.*` (Next settings)
            - `packages/app/src/app/globals.css` (GPU CSS variables + global styles)
            - `packages/app/src/components/inference/inference-chart-config.json` (metric definitions, Pareto directions)

            ## Reference docs
            The `docs/` directory contains detailed guides. Always consult these before making changes:
            - `docs/index.md` — index of all docs. **MUST ALWAYS BE READ, in case it contains information relevant to the task.**

            ## 3) Common tasks (where to change what)
            - Chart appearance / D3 behavior:
              - `packages/app/src/components/inference/ui/ScatterGraph.tsx`
              - `packages/app/src/components/inference/ui/GPUGraph.tsx`
              - Layout/error UI: `packages/app/src/components/inference/ui/ChartDisplay.tsx`
              - Shared D3 library: `packages/app/src/lib/d3-chart/`

            - Chart state & filters:
              - Add/change state: `packages/app/src/components/inference/InferenceChartContext.tsx`
              - Wire UI controls: `packages/app/src/components/inference/ui/ChartControls.tsx`
              - Apply filter logic / normalization: `packages/app/src/components/inference/hooks/useChartData.ts`

            - Add/modify a metric:
              - Register in `packages/app/src/lib/chart-utils.ts`: Y_AXIS_METRICS, roofline calculations
              - Add chart config: `packages/app/src/components/inference/inference-chart-config.json`
              - Expose/select metric in UI state: `InferenceChartContext.tsx`
              - Use it in charts: `ScatterGraph.tsx` / `GPUGraph.tsx`

            - Data pipeline changes:
              - DB schema/ETL: `packages/db/src/`
              - API routes: `packages/app/src/app/api/v1/`
              - React Query hooks: `packages/app/src/hooks/api/`

            - Add a new GPU:
              - `packages/app/src/lib/constants.ts`: HARDWARE_CONFIG + ordering + color family
              - `packages/app/src/app/globals.css`: add `--gpu-name` color variable

            - Add a new model:
              - `packages/app/src/lib/data-mappings.ts`: enum/options/order

            ## 4) Playbooks & pitfalls (battle-tested)
            ### A) Schema evolution
            When adding new metric fields:
            - Make new TS fields OPTIONAL in types.
            - Register in `chart-utils.ts` (Y_AXIS_METRICS, roofline calculations).
            - Add runtime computation fallback for historical data missing the field in `useChartData.ts`.
            - Watch for silent failures: checks like `metricKey in filteredData[0]` can cause "No data available".

            ### B) Empty responses and error handling
            - `{}` is truthy; check `Object.keys(obj).length > 0`.
            - API routes may return empty arrays for dates with no data; handle gracefully.
            - React Query hooks handle loading/error states; check `isLoading`/`error` before accessing `data`.

            ### C) Axis stability & legend toggles in D3
            - Compute axis domains from VISIBLE (non-hidden) data only so axes rescale to fill the chart when GPUs are toggled off. Using all points leaves large blank areas.
            - Prefer opacity transitions for hiding points/lines (no DOM removal).
            - Preserve zoom transform across re-renders (save to ref, re-apply after setup).

            ### D) Smooth zoom/pan performance in D3
            If zoom/pan is laggy:
            - Keep the zoom handler critical path cheap (point transforms only).
            - Throttle expensive recalcs (grid, rooflines, paths) with `requestAnimationFrame`.
            - Cache D3 selections outside the zoom handler.
            - Cancel pending rAF on unmount/cleanup.

            ### E) Overlay datasets (unofficial runs, comparisons)
            - Include overlay data in axis domain calculations so it remains visible.
            - Allow zoom-out below 1x if users need to see off-domain data.
            - Visually distinguish overlay (shape/dash/watermark/legend label).

            ### F) Sticky/click-to-pin tooltips
            - Use ref + state to avoid stale closures and rerender cascades.
            - During zoom/pan, dismiss pinned tooltip via rAF to avoid jank.
            - While pinned: disable hover handlers; enable `pointer-events: auto` and `user-select: text`.

            ### G) Verification of D3 zoom/pan
            Use coordinate-based Playwright MCP tools when needed:
            - wheel to zoom, drag to pan, click-by-xy when accessibility targets aren't available.
            Always confirm the chart responds and no console errors appear.

            ## 5) Environment variables (relevant)
            - `GITHUB_TOKEN`: GitHub API access
            - `DATABASE_READONLY_URL`: Neon PostgreSQL connection (read-only, used by API routes)
            - `DATABASE_WRITE_URL`: Neon PostgreSQL connection (admin, used by ETL/migrations)
            - `BLOB_READ_WRITE_TOKEN`: Vercel Blob access for API response caching
            - `BLOB_CACHE_PREFIX`: Prefix for cached API responses in Vercel Blob
            - `CRON_SECRET`: secures /api/cron endpoint
            - `VERCEL_DEPLOY_HOOK_URL`: triggers rebuilds

            ## 6) Testing Requirements (MANDATORY)
            When implementing new features or fixing bugs, you MUST write tests:
            1. **New utility functions** in `packages/app/src/lib/` or `packages/app/src/scripts/` → add colocated unit test (e.g., `packages/app/src/lib/<module>.test.ts`)
            2. **New UI features** → add E2E tests in `packages/app/cypress/e2e/<feature>.cy.ts`
            3. **Bug fixes** → add a regression test that would have caught the bug
            4. Run `pnpm test:unit` to verify unit tests pass before marking the task complete
            5. Follow existing test patterns in `packages/app/src/**/*.test.ts` (Vitest) and `packages/app/cypress/e2e/` (Cypress)

            ### Pre-commit checklist (MANDATORY)
            Before every commit, Claude agents MUST:
            1. Start the dev server (if not already running):
               ```bash
               pnpm run dev -- --hostname 0.0.0.0 --port 3000 &
               curl --retry 10 --retry-delay 2 --retry-connrefused -sSf http://localhost:3000 >/dev/null
               ```
            2. Run unit tests:
               ```bash
               pnpm run test:unit
               ```
            3. Run Cypress E2E tests:
               ```bash
               pnpm run test:e2e
               ```
            4. If any tests fail, fix the issues before committing.
            5. Only after both unit and E2E tests pass, proceed with git add, git commit, and git push.

            A task is NOT complete if:
            - Unit tests fail (`pnpm run test:unit`)
            - Cypress E2E tests fail (`pnpm run test:e2e`)
            - New code was added without corresponding tests

            ## Final instruction
            Do not end a task until you have verified via Playwright MCP that the feature works and charts render real data (not error placeholders).