Skip to content

Commit bcd4a06

Browse files
committed
Some changes
1 parent 1ac838a commit bcd4a06

10 files changed

Lines changed: 564 additions & 62 deletions

File tree

.github/workflows/benchmarks.yml

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
name: Benchmarks
2+
3+
on:
4+
workflow_dispatch:
5+
inputs:
6+
iterations:
7+
description: "Number of iterations for realistic workload benchmark"
8+
required: false
9+
default: "200"
10+
qps_duration:
11+
description: "Duration in seconds for each QPS level"
12+
required: false
13+
default: "10"
14+
compare_with:
15+
description: "Run ID to compare results against (optional)"
16+
required: false
17+
default: ""
18+
19+
jobs:
20+
benchmark:
21+
name: Run Benchmarks
22+
runs-on: ubuntu-latest
23+
timeout-minutes: 30
24+
25+
steps:
26+
- name: Checkout
27+
uses: actions/checkout@v4
28+
29+
- name: Install uv
30+
uses: astral-sh/setup-uv@v4
31+
with:
32+
version: "latest"
33+
34+
- name: Setup Python
35+
run: uv python install 3.9
36+
37+
- name: Cache uv + Python installs + venv
38+
uses: actions/cache@v4
39+
with:
40+
path: |
41+
~/.cache/uv
42+
~/.local/share/uv/python
43+
.venv
44+
key: ${{ runner.os }}-uv-benchmark-3.9-${{ hashFiles('uv.lock') }}
45+
46+
- name: Install dependencies
47+
run: |
48+
uv sync --all-extras
49+
uv pip install flask requests psutil
50+
51+
- name: Get system info
52+
id: sysinfo
53+
run: |
54+
echo "python_version=$(python --version)" >> $GITHUB_OUTPUT
55+
echo "os=$(uname -s)" >> $GITHUB_OUTPUT
56+
echo "arch=$(uname -m)" >> $GITHUB_OUTPUT
57+
echo "cpu_count=$(nproc)" >> $GITHUB_OUTPUT
58+
echo "memory_gb=$(free -g | awk '/^Mem:/{print $2}')" >> $GITHUB_OUTPUT
59+
60+
- name: Run realistic workload benchmark
61+
id: realistic
62+
run: |
63+
uv run python benchmarks/bench/realistic_workload.py 2>&1 | tee realistic_output.txt
64+
# Print the realistic-workload results JSON file to the job log
65+
cat benchmarks/results/realistic-workload.json
66+
67+
- name: Run fixed QPS latency benchmark
68+
id: fixed_qps
69+
run: |
70+
uv run python benchmarks/bench/fixed_qps_latency.py 2>&1 | tee fixed_qps_output.txt
71+
# Print the fixed-QPS latency results JSON file to the job log
72+
cat benchmarks/results/fixed-qps-latency.json
73+
74+
- name: Generate structured results
75+
id: results
76+
run: |
77+
cat > benchmarks/results/benchmark-summary.json << 'EOF'
78+
{
79+
"metadata": {
80+
"timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
81+
"run_id": "${{ github.run_id }}",
82+
"run_number": "${{ github.run_number }}",
83+
"commit_sha": "${{ github.sha }}",
84+
"branch": "${{ github.ref_name }}",
85+
"triggered_by": "${{ github.actor }}",
86+
"environment": {
87+
"python_version": "${{ steps.sysinfo.outputs.python_version }}",
88+
"os": "${{ steps.sysinfo.outputs.os }}",
89+
"arch": "${{ steps.sysinfo.outputs.arch }}",
90+
"cpu_count": "${{ steps.sysinfo.outputs.cpu_count }}",
91+
"memory_gb": "${{ steps.sysinfo.outputs.memory_gb }}"
92+
}
93+
}
94+
}
95+
EOF
96+
97+
# Create a proper JSON with jq
98+
jq -n \
99+
--slurpfile realistic benchmarks/results/realistic-workload.json \
100+
--slurpfile fixed_qps benchmarks/results/fixed-qps-latency.json \
101+
--arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
102+
--arg run_id "${{ github.run_id }}" \
103+
--arg run_number "${{ github.run_number }}" \
104+
--arg commit_sha "${{ github.sha }}" \
105+
--arg branch "${{ github.ref_name }}" \
106+
--arg triggered_by "${{ github.actor }}" \
107+
--arg python_version "${{ steps.sysinfo.outputs.python_version }}" \
108+
--arg os "${{ steps.sysinfo.outputs.os }}" \
109+
--arg arch "${{ steps.sysinfo.outputs.arch }}" \
110+
--arg cpu_count "${{ steps.sysinfo.outputs.cpu_count }}" \
111+
--arg memory_gb "${{ steps.sysinfo.outputs.memory_gb }}" \
112+
'{
113+
metadata: {
114+
timestamp: $timestamp,
115+
run_id: $run_id,
116+
run_number: ($run_number | tonumber),
117+
commit_sha: $commit_sha,
118+
branch: $branch,
119+
triggered_by: $triggered_by,
120+
environment: {
121+
python_version: $python_version,
122+
os: $os,
123+
arch: $arch,
124+
cpu_count: ($cpu_count | tonumber),
125+
memory_gb: ($memory_gb | tonumber)
126+
}
127+
},
128+
realistic_workload: $realistic[0],
129+
fixed_qps_latency: $fixed_qps[0]
130+
}' > benchmarks/results/benchmark-summary.json
131+
132+
- name: Generate markdown summary
133+
run: |
134+
SUMMARY_FILE="benchmarks/results/benchmark-summary.md"
135+
136+
cat > "$SUMMARY_FILE" << EOF
137+
# Benchmark Results
138+
139+
**Date**: $(date -u +%Y-%m-%d)
140+
**Commit**: ${{ github.sha }}
141+
**Branch**: ${{ github.ref_name }}
142+
**Run ID**: ${{ github.run_id }}
143+
144+
## Environment
145+
- Python: ${{ steps.sysinfo.outputs.python_version }}
146+
- OS: ${{ steps.sysinfo.outputs.os }} (${{ steps.sysinfo.outputs.arch }})
147+
- CPUs: ${{ steps.sysinfo.outputs.cpu_count }}
148+
- Memory: ${{ steps.sysinfo.outputs.memory_gb }} GB
149+
150+
## Realistic Workload Results
151+
152+
EOF
153+
154+
# Parse and format realistic workload results
155+
jq -r '
156+
"| Endpoint | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |",
157+
"|----------|----------|------------|----------|-----------|----------|",
158+
(.comparison_100 | to_entries[] |
159+
"| \(.key) | \(.value.baseline_mean_ms | . * 10 | round / 10)ms | \(.value.sdk_mean_ms | . * 10 | round / 10)ms | +\(.value.mean_overhead_ms | . * 10 | round / 10)ms (\(.value.mean_overhead_pct | round)%) | - | - |"
160+
)
161+
' benchmarks/results/realistic-workload.json >> "$SUMMARY_FILE"
162+
163+
cat >> "$SUMMARY_FILE" << 'EOF'
164+
165+
## Fixed QPS Latency Results
166+
167+
### Mean Latency
168+
169+
EOF
170+
171+
jq -r '
172+
"| QPS | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |",
173+
"|-----|----------|------------|----------|-----------|----------|",
174+
(.baseline | to_entries[] |
175+
. as $b |
176+
($b.key | tostring) as $qps |
177+
"| \($qps) | \($b.value.mean_ms | . * 10 | round / 10)ms | - | - | - | - |"
178+
)
179+
' benchmarks/results/fixed-qps-latency.json >> "$SUMMARY_FILE"
180+
181+
cat >> "$SUMMARY_FILE" << 'EOF'
182+
183+
---
184+
185+
📊 **Full results available in artifacts**
186+
187+
EOF
188+
189+
# Also write to GitHub step summary for UI display
190+
cat "$SUMMARY_FILE" >> $GITHUB_STEP_SUMMARY
191+
192+
- name: Upload benchmark results
193+
uses: actions/upload-artifact@v4
194+
with:
195+
name: benchmark-results-${{ github.run_id }}
196+
path: |
197+
benchmarks/results/*.json
198+
benchmarks/results/*.md
199+
realistic_output.txt
200+
fixed_qps_output.txt
201+
retention-days: 90
202+
203+
- name: Download comparison results (if specified)
204+
if: ${{ inputs.compare_with != '' }}
205+
uses: actions/download-artifact@v4
206+
with:
207+
name: benchmark-results-${{ inputs.compare_with }}
208+
path: benchmarks/results/comparison/
209+
continue-on-error: true
210+
211+
- name: Compare with previous run
212+
if: ${{ inputs.compare_with != '' }}
213+
run: |
214+
if [ -f benchmarks/results/comparison/benchmark-summary.json ]; then
215+
echo "## Comparison with Run ${{ inputs.compare_with }}" >> $GITHUB_STEP_SUMMARY
216+
echo "" >> $GITHUB_STEP_SUMMARY
217+
218+
# Compare realistic workload results
219+
PREV_READ=$(jq '.realistic_workload.comparison_100.typical_read.mean_overhead_ms' benchmarks/results/comparison/benchmark-summary.json)
220+
CURR_READ=$(jq '.realistic_workload.comparison_100.typical_read.mean_overhead_ms' benchmarks/results/benchmark-summary.json)
221+
222+
PREV_WRITE=$(jq '.realistic_workload.comparison_100.typical_write.mean_overhead_ms' benchmarks/results/comparison/benchmark-summary.json)
223+
CURR_WRITE=$(jq '.realistic_workload.comparison_100.typical_write.mean_overhead_ms' benchmarks/results/benchmark-summary.json)
224+
225+
echo "| Metric | Previous | Current | Delta |" >> $GITHUB_STEP_SUMMARY
226+
echo "|--------|----------|---------|-------|" >> $GITHUB_STEP_SUMMARY
227+
echo "| Read API overhead | ${PREV_READ}ms | ${CURR_READ}ms | $(echo "$CURR_READ - $PREV_READ" | bc)ms |" >> $GITHUB_STEP_SUMMARY
228+
echo "| Write API overhead | ${PREV_WRITE}ms | ${CURR_WRITE}ms | $(echo "$CURR_WRITE - $PREV_WRITE" | bc)ms |" >> $GITHUB_STEP_SUMMARY
229+
else
230+
echo "⚠️ Could not find comparison results for run ${{ inputs.compare_with }}" >> $GITHUB_STEP_SUMMARY
231+
fi
232+
233+
- name: Check for performance regression
234+
id: regression
235+
run: |
236+
# Check if overhead exceeds threshold (3ms for 100% sampling)
237+
THRESHOLD_MS=3.0
238+
239+
READ_OVERHEAD=$(jq '.comparison_100.typical_read.mean_overhead_ms' benchmarks/results/realistic-workload.json)
240+
WRITE_OVERHEAD=$(jq '.comparison_100.typical_write.mean_overhead_ms' benchmarks/results/realistic-workload.json)
241+
MIXED_OVERHEAD=$(jq '.comparison_100.realistic_mixed.mean_overhead_ms' benchmarks/results/realistic-workload.json)
242+
243+
REGRESSION=false
244+
245+
if (( $(echo "$READ_OVERHEAD > $THRESHOLD_MS" | bc -l) )); then
246+
echo "⚠️ Read API overhead ($READ_OVERHEAD ms) exceeds threshold ($THRESHOLD_MS ms)" >> $GITHUB_STEP_SUMMARY
247+
REGRESSION=true
248+
fi
249+
250+
if (( $(echo "$WRITE_OVERHEAD > $THRESHOLD_MS" | bc -l) )); then
251+
echo "⚠️ Write API overhead ($WRITE_OVERHEAD ms) exceeds threshold ($THRESHOLD_MS ms)" >> $GITHUB_STEP_SUMMARY
252+
REGRESSION=true
253+
fi
254+
255+
if (( $(echo "$MIXED_OVERHEAD > $THRESHOLD_MS" | bc -l) )); then
256+
echo "⚠️ Mixed API overhead ($MIXED_OVERHEAD ms) exceeds threshold ($THRESHOLD_MS ms)" >> $GITHUB_STEP_SUMMARY
257+
REGRESSION=true
258+
fi
259+
260+
if [ "$REGRESSION" = true ]; then
261+
echo "" >> $GITHUB_STEP_SUMMARY
262+
echo "### ⚠️ Performance regression detected" >> $GITHUB_STEP_SUMMARY
263+
echo "regression=true" >> $GITHUB_OUTPUT
264+
else
265+
echo "" >> $GITHUB_STEP_SUMMARY
266+
echo "### ✅ No performance regression detected" >> $GITHUB_STEP_SUMMARY
267+
echo "regression=false" >> $GITHUB_OUTPUT
268+
fi
269+
270+
- name: Output JSON results
271+
run: |
272+
echo "### Structured Results (JSON)"
273+
echo ""
274+
echo '```json'
275+
cat benchmarks/results/benchmark-summary.json
276+
echo '```'

benchmarks/README.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,36 @@ After running benchmarks, compare the results:
5656
python benchmarks/compare_benchmarks.py
5757
```
5858

59+
### Comparing Different Runs
60+
61+
To compare results from different benchmark runs:
62+
63+
```bash
64+
# Compare two result files
65+
python benchmarks/scripts/compare_runs.py results/old-results.json results/new-results.json
66+
67+
# Output as JSON only (for programmatic use)
68+
python benchmarks/scripts/compare_runs.py --json results/old.json results/new.json
69+
```
70+
71+
### GitHub Actions Workflow
72+
73+
Benchmarks can be run via GitHub Actions for consistent, reproducible results:
74+
75+
1. Go to **Actions** → **Benchmarks**
76+
2. Click **Run workflow**
77+
3. Optionally specify:
78+
- `iterations`: Number of iterations for realistic workload (default: 200)
79+
- `qps_duration`: Duration per QPS level in seconds (default: 10)
80+
- `compare_with`: Run ID to compare against (optional)
81+
82+
The workflow outputs:
83+
84+
- Structured JSON with all results and metadata
85+
- Markdown summary in the workflow run
86+
- Artifacts containing full results (retained for 90 days)
87+
- Regression detection if overhead exceeds 3ms threshold
88+
5989
### Configuration
6090

6191
You can configure benchmarks via environment variables:
@@ -151,11 +181,14 @@ benchmarks/
151181
│ ├── profile.sh # Profiler runner script
152182
│ ├── simple_profile.py # Profiling workload
153183
│ └── results/ # Profile output (gitignored)
184+
├── scripts/
185+
│ └── compare_runs.py # Compare benchmark results across runs
154186
├── server/
155187
│ └── test_server.py # Flask test server
156188
├── results/ # JSON output (gitignored)
157189
├── compare_benchmarks.py # Result comparison script
158190
├── run_benchmarks.sh # Runner script
159191
├── PROFILING.md # Profiling documentation
192+
├── RESULTS.md # Historical results
160193
└── README.md
161194
```

benchmarks/bench/common.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,12 @@ def run_benchmarks(
4949
"""Run all benchmarks with the given label."""
5050
opts = {**DEFAULT_OPTIONS, **(options or {})}
5151

52-
enable_memory_tracking = os.environ.get("BENCHMARK_ENABLE_MEMORY", "true").lower() != "false"
52+
# Check environment variable first, then fall back to options
53+
env_memory = os.environ.get("BENCHMARK_ENABLE_MEMORY")
54+
if env_memory is not None:
55+
enable_memory_tracking = env_memory.lower() != "false"
56+
else:
57+
enable_memory_tracking = bool(opts.get("enable_memory_tracking", True))
5358

5459
# Start test server
5560
server = TestServer()

benchmarks/bench/resource_monitor.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,13 @@ def _monitor_loop(self) -> None:
147147

148148
def _collect_memory_sample(self) -> None:
149149
"""Collect a memory usage sample."""
150-
if not self._is_running or not self._current_task_stats:
150+
if not self._is_running:
151+
return
152+
153+
# Take a local snapshot to avoid race condition where another thread
154+
# sets _current_task_stats to None between check and usage
155+
task_stats = self._current_task_stats
156+
if task_stats is None:
151157
return
152158

153159
# Get RSS from resource module (in bytes on macOS, kilobytes on Linux)
@@ -169,9 +175,9 @@ def _collect_memory_sample(self) -> None:
169175
except (FileNotFoundError, PermissionError):
170176
pass # Use ru_maxrss fallback
171177

172-
self._current_task_stats.rss_sum += rss
173-
self._current_task_stats.rss_max = max(self._current_task_stats.rss_max, rss)
174-
self._current_task_stats.sample_count += 1
178+
task_stats.rss_sum += rss
179+
task_stats.rss_max = max(task_stats.rss_max, rss)
180+
task_stats.sample_count += 1
175181

176182
def get_task_stats(self, task_name: str) -> TaskResourceStats | None:
177183
"""Get resource statistics for a completed task."""

benchmarks/bench/result_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import statistics
99
import uuid
1010
from dataclasses import dataclass, field
11-
from datetime import UTC, datetime
11+
from datetime import datetime, timezone
1212
from pathlib import Path
1313
from typing import Any
1414

@@ -189,7 +189,7 @@ def create_benchmark_result(
189189
return BenchmarkRunResult(
190190
id=str(uuid.uuid4()),
191191
label=label,
192-
timestamp=datetime.now(UTC).isoformat(),
192+
timestamp=datetime.now(timezone.utc).isoformat(),
193193
duration_ms=duration_ms,
194194
options=options,
195195
system=get_system_info(),

0 commit comments

Comments
 (0)