Skip to content

Commit f58e69b

Browse files
author
Mark Saroufim
committed
vibe coded buildkite prototype
1 parent 7e37b09 commit f58e69b

21 files changed

Lines changed: 2595 additions & 3 deletions

File tree

.buildkite/pipeline.yml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# Buildkite Pipeline for Kernel Submissions
#
# This pipeline runs kernel submissions on GPU-bound Buildkite agents.
# Each agent is configured with:
#   - CUDA_VISIBLE_DEVICES bound to a single GPU
#   - CPU/RAM limits via systemd cgroups
#   - Queue tag for GPU routing (e.g., queue=nvidia-h100-0)
#
# Environment variables passed from BuildkiteLauncher:
#   - SUBMISSION_PAYLOAD: Base64-encoded, zlib-compressed submission config
#   - GPU_QUEUE: Queue name for agent routing
#
# Optional overrides (interpolated at pipeline upload time):
#   - MEMORY_LIMIT: container memory limit (default 32g)
#   - CPU_LIMIT: container CPU limit (default 16)

steps:
  - label: ":gpu: Run Kernel Submission"
    command: "python /opt/kernelbot/buildkite-runner.py"
    env:
      # Payload is passed via BuildkiteLauncher
      SUBMISSION_PAYLOAD: "${SUBMISSION_PAYLOAD}"
    agents:
      # Route to agent with matching queue tag
      queue: "${GPU_QUEUE}"
    timeout_in_minutes: 15
    artifact_paths:
      - "result.json"
      - "profile_data/**/*"
    plugins:
      - docker#v5.11.0:
          image: "ghcr.io/gpu-mode/kernelbot-runner:latest"
          always-pull: true
          # Forwards pipeline-defined variables (e.g. SUBMISSION_PAYLOAD).
          propagate-environment: true
          # propagate-environment does not forward variables that exist only
          # in the agent's own environment, so the per-agent GPU binding has
          # to be passed through by name. If the variable is unset on the
          # agent, nothing is injected into the container.
          environment:
            - "CUDA_VISIBLE_DEVICES"
          # GPU access - all devices are exposed to the container; the
          # forwarded CUDA_VISIBLE_DEVICES restricts CUDA to the agent's GPU.
          gpus: all
          # Resource limits (can be overridden via env vars)
          memory: "${MEMORY_LIMIT:-32g}"
          cpus: "${CPU_LIMIT:-16}"
          # Mount working directory for artifacts
          volumes:
            - ".:/workdir"
          workdir: "/workdir"
    retry:
      automatic:
        - exit_status: -1  # Agent lost connection
          limit: 1
        - exit_status: 255  # SSH error
          limit: 1
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
# Builds the kernelbot-runner image from docker/kernelbot-runner/Dockerfile
# and pushes it to ghcr.io. Pull requests build without pushing; a manual
# dispatch can opt out of pushing through the `push` input.
name: Build Runner Image

on:
  push:
    branches:
      - main
    paths:
      - 'docker/kernelbot-runner/**'
      - 'src/libkernelbot/**'
      - 'src/runners/buildkite-runner.py'
      - '.github/workflows/build-runner-image.yml'
  pull_request:
    paths:
      - 'docker/kernelbot-runner/**'
      - 'src/libkernelbot/**'
      - 'src/runners/buildkite-runner.py'
      # Keep in sync with the push filter so edits to this workflow are
      # exercised on the pull request that makes them.
      - '.github/workflows/build-runner-image.yml'
  workflow_dispatch:
    inputs:
      push:
        description: 'Push image to registry'
        required: false
        # Boolean input, so the default is a real boolean rather than a string.
        default: true
        type: boolean
  schedule:
    # Rebuild weekly on Sundays at 2 AM UTC
    - cron: '0 2 * * 0'

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: gpu-mode/kernelbot-runner

jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Registry credentials are only needed when the image will be pushed.
      - name: Log in to Container Registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=raw,value=latest,enable={{is_default_branch}}
            type=sha,prefix=sha-
            type=ref,event=branch
            type=ref,event=pr

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: docker/kernelbot-runner/Dockerfile
          # Never push from pull requests; a manual dispatch may set push=false.
          push: ${{ github.event_name != 'pull_request' && (github.event.inputs.push != 'false') }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

      - name: Generate build summary
        # Expression values are handed to the script through the environment
        # instead of being spliced into the shell source.
        env:
          IMAGE_REF: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          IMAGE_TAGS: ${{ steps.meta.outputs.tags }}
        run: |
          {
            echo "## Docker Image Build Summary"
            echo ""
            echo "**Image:** \`${IMAGE_REF}\`"
            echo ""
            echo "**Tags:**"
            echo '```'
            echo "${IMAGE_TAGS}"
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"

      # Best-effort notification: a Slack failure must not fail the build.
      - name: Notify vendors (Slack)
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        continue-on-error: true
        uses: slackapi/slack-github-action@v1.25.0
        with:
          payload: |
            {
              "text": "New kernelbot-runner image published",
              "blocks": [
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "*New kernelbot-runner image published* :package:\n\nVendors: run `./scripts/buildkite/update-image.sh` to update your agents.\n\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View build>"
                  }
                }
              ]
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_VENDOR_WEBHOOK }}
          SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK

docker/kernelbot-runner/Dockerfile

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
# Kernelbot Runner Docker Image
#
# This image is used by Buildkite agents to run kernel submissions.
# It matches the Modal runner configuration for consistent behavior.
#
# Build:
#   docker build -t ghcr.io/gpu-mode/kernelbot-runner:latest -f docker/kernelbot-runner/Dockerfile .
#
# Run locally (for testing):
#   docker run --gpus '"device=0"' -e SUBMISSION_PAYLOAD="..." ghcr.io/gpu-mode/kernelbot-runner:latest

FROM nvidia/cuda:13.1.0-devel-ubuntu24.04

LABEL org.opencontainers.image.source="https://github.com/gpu-mode/kernelbot"
LABEL org.opencontainers.image.description="Kernelbot GPU runner for kernel competitions"

# Install system dependencies
# NOTE(review): confirm python3.13 is installable from this base image's
# default apt sources; an extra package source may be required.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.13 \
    python3.13-venv \
    python3-pip \
    git \
    gcc-13 \
    g++-13 \
    clang-18 \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.13 /usr/bin/python3 \
    && ln -sf /usr/bin/python3.13 /usr/bin/python

# Create virtual environment; putting it first on PATH makes `python` and
# `pip` below resolve to the venv.
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Install Python dependencies (matching modal_runner.py)
COPY docker/kernelbot-runner/requirements-runner.txt /tmp/
RUN pip install --upgrade pip && \
    pip install -r /tmp/requirements-runner.txt

# Install PyTorch with CUDA 13.0 support
RUN pip install \
    torch==2.9.1 \
    torchvision \
    torchaudio \
    --index-url https://download.pytorch.org/whl/cu130

# Install additional frameworks
RUN pip install \
    tinygrad~=0.10

# Install NVIDIA CUDA packages
RUN pip install \
    nvidia-cupynumeric~=25.3 \
    nvidia-cutlass-dsl==4.3.5 \
    "cuda-core[cu13]" \
    "cuda-python[all]==13.0"

# Copy kernelbot library and runner
WORKDIR /opt/kernelbot
COPY src/libkernelbot /opt/kernelbot/libkernelbot
COPY src/runners/buildkite-runner.py /opt/kernelbot/

# Set PYTHONPATH so libkernelbot is importable.
# Nothing above defines PYTHONPATH, so appending ":$PYTHONPATH" would leave a
# trailing ":" - an empty entry that Python treats as the current working
# directory. Set the single path explicitly instead.
ENV PYTHONPATH="/opt/kernelbot"

# Default command
CMD ["python", "/opt/kernelbot/buildkite-runner.py"]
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Kernelbot Runner Dependencies (installed with `pip install -r`)
2+
# These should match the Modal runner configuration in modal_runner.py
3+
4+
# Build tools (`~=` is a compatible-release pin; setuptools is left unpinned)
5+
ninja~=1.11
6+
wheel~=0.45
7+
setuptools
8+
9+
# Core dependencies (pytest and PyYAML are left unpinned)
10+
requests~=2.32.4
11+
packaging~=25.0
12+
numpy~=2.3
13+
pytest
14+
PyYAML
15+
16+
# Triton for GPU kernels (unpinned: pip resolves the version at build time)
17+
triton

0 commit comments

Comments
 (0)