Use-Tusk
diff --git a/‎.github/workflows/benchmarks.yml‎
Lines changed: 276 additions & 0 deletions b/‎.github/workflows/benchmarks.yml‎
Lines changed: 276 additions & 0 deletions
diff --git a/‎benchmarks/README.md‎
Lines changed: 33 additions & 0 deletions b/‎benchmarks/README.md‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎benchmarks/bench/common.py‎
Lines changed: 6 additions & 1 deletion b/‎benchmarks/bench/common.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎benchmarks/bench/resource_monitor.py‎
Lines changed: 10 additions & 4 deletions b/‎benchmarks/bench/resource_monitor.py‎
Lines changed: 10 additions & 4 deletions
diff --git a/‎benchmarks/bench/result_utils.py‎
Lines changed: 2 additions & 2 deletions b/‎benchmarks/bench/result_utils.py‎
Lines changed: 2 additions & 2 deletions
@@ -0,0 +1,276 @@
+# Manually triggered workflow: runs the benchmark scripts, summarizes the
+# results, and uploads them as a run artifact.
+name: Benchmarks
+
+on:
+  workflow_dispatch:
+    # NOTE(review): no step in this workflow reads `inputs.iterations` or
+    # `inputs.qps_duration`; only `inputs.compare_with` is consumed. Wire the
+    # first two into the benchmark steps or drop them.
+    inputs:
+      iterations:
+        description: "Number of iterations for realistic workload benchmark"
+        required: false
+        default: "200"
+      qps_duration:
+        description: "Duration in seconds for each QPS level"
+        required: false
+        default: "10"
+      # Run ID whose `benchmark-results-<run id>` artifact is used as the baseline.
+      compare_with:
+        description: "Run ID to compare results against (optional)"
+        required: false
+        default: ""
+
+jobs:
+  # Single job: set up the toolchain, run both benchmark scripts, then
+  # summarize, upload, and check the results.
+  benchmark:
+    name: Run Benchmarks
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      # Installs the uv CLI used by the later `uv ...` steps.
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          # Tracks the newest uv release rather than a pinned version.
+          version: "latest"
+
+      # Restore the cache before installing Python: a cache restored after
+      # `uv python install` can never save the download it is meant to avoid.
+      - name: Cache uv + Python installs + venv
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/uv
+            ~/.local/share/uv/python
+            .venv
+          # Keyed on OS and lockfile contents, so a dependency change starts a fresh cache.
+          key: ${{ runner.os }}-uv-benchmark-3.9-${{ hashFiles('uv.lock') }}
+
+      - name: Setup Python
+        run: uv python install 3.9
+
+      # Syncs the project environment with every extra, then installs the
+      # additional packages listed on the second command.
+      - name: Install dependencies
+        run: |
+          uv sync --all-extras
+          uv pip install flask requests psutil
+
+      # Record the benchmark environment as step outputs (`steps.sysinfo.outputs.*`).
+      - name: Get system info
+        id: sysinfo
+        run: |
+          # Ask the interpreter that `uv run` uses for the benchmarks; a bare
+          # `python` resolves to whatever is first on PATH and may be a
+          # different version.
+          echo "python_version=$(uv run python --version)" >> "$GITHUB_OUTPUT"
+          echo "os=$(uname -s)" >> "$GITHUB_OUTPUT"
+          echo "arch=$(uname -m)" >> "$GITHUB_OUTPUT"
+          echo "cpu_count=$(nproc)" >> "$GITHUB_OUTPUT"
+          # `free -g` reports whole gibibytes (rounded down).
+          echo "memory_gb=$(free -g | awk '/^Mem:/{print $2}')" >> "$GITHUB_OUTPUT"
+
+      - name: Run realistic workload benchmark
+        id: realistic
+        run: |
+          # The default `bash -e` shell does not enable pipefail, so without
+          # this a failing benchmark would be masked by the exit status of `tee`.
+          set -o pipefail
+          uv run python benchmarks/bench/realistic_workload.py 2>&1 | tee realistic_output.txt
+          # Show the results JSON in the job log
+          cat benchmarks/results/realistic-workload.json
+
+      - name: Run fixed QPS latency benchmark
+        id: fixed_qps
+        run: |
+          # The default `bash -e` shell does not enable pipefail, so without
+          # this a failing benchmark would be masked by the exit status of `tee`.
+          set -o pipefail
+          uv run python benchmarks/bench/fixed_qps_latency.py 2>&1 | tee fixed_qps_output.txt
+          # Show the results JSON in the job log
+          cat benchmarks/results/fixed-qps-latency.json
+
+      # Merge both benchmark result files and the run metadata into
+      # benchmarks/results/benchmark-summary.json.
+      - name: Generate structured results
+        id: results
+        # Context values reach the script through the environment instead of
+        # being expanded into it, so a value such as the branch name is never
+        # parsed as shell code.
+        env:
+          RUN_ID: "${{ github.run_id }}"
+          RUN_NUMBER: "${{ github.run_number }}"
+          COMMIT_SHA: "${{ github.sha }}"
+          BRANCH: "${{ github.ref_name }}"
+          TRIGGERED_BY: "${{ github.actor }}"
+          PYTHON_VERSION: "${{ steps.sysinfo.outputs.python_version }}"
+          OS_NAME: "${{ steps.sysinfo.outputs.os }}"
+          ARCH: "${{ steps.sysinfo.outputs.arch }}"
+          CPU_COUNT: "${{ steps.sysinfo.outputs.cpu_count }}"
+          MEMORY_GB: "${{ steps.sysinfo.outputs.memory_gb }}"
+        run: |
+          # `--slurpfile` binds each file as an array of its JSON documents,
+          # hence the `[0]` when reading them back.
+          jq -n \
+            --slurpfile realistic benchmarks/results/realistic-workload.json \
+            --slurpfile fixed_qps benchmarks/results/fixed-qps-latency.json \
+            --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+            --arg run_id "$RUN_ID" \
+            --arg run_number "$RUN_NUMBER" \
+            --arg commit_sha "$COMMIT_SHA" \
+            --arg branch "$BRANCH" \
+            --arg triggered_by "$TRIGGERED_BY" \
+            --arg python_version "$PYTHON_VERSION" \
+            --arg os "$OS_NAME" \
+            --arg arch "$ARCH" \
+            --arg cpu_count "$CPU_COUNT" \
+            --arg memory_gb "$MEMORY_GB" \
+            '{
+              metadata: {
+                timestamp: $timestamp,
+                run_id: $run_id,
+                run_number: ($run_number | tonumber),
+                commit_sha: $commit_sha,
+                branch: $branch,
+                triggered_by: $triggered_by,
+                environment: {
+                  python_version: $python_version,
+                  os: $os,
+                  arch: $arch,
+                  cpu_count: ($cpu_count | tonumber),
+                  memory_gb: ($memory_gb | tonumber)
+                }
+              },
+              realistic_workload: $realistic[0],
+              fixed_qps_latency: $fixed_qps[0]
+            }' > benchmarks/results/benchmark-summary.json
+
+      # Build benchmarks/results/benchmark-summary.md and append it to the
+      # run's step summary.
+      - name: Generate markdown summary
+        # The first heredoc below is unquoted so `$(date ...)` expands; context
+        # values therefore come in as environment variables, which the shell
+        # substitutes as plain text instead of parsing as code.
+        env:
+          COMMIT_SHA: "${{ github.sha }}"
+          BRANCH: "${{ github.ref_name }}"
+          RUN_ID: "${{ github.run_id }}"
+          PYTHON_VERSION: "${{ steps.sysinfo.outputs.python_version }}"
+          OS_NAME: "${{ steps.sysinfo.outputs.os }}"
+          ARCH: "${{ steps.sysinfo.outputs.arch }}"
+          CPU_COUNT: "${{ steps.sysinfo.outputs.cpu_count }}"
+          MEMORY_GB: "${{ steps.sysinfo.outputs.memory_gb }}"
+        run: |
+          SUMMARY_FILE="benchmarks/results/benchmark-summary.md"
+
+          cat > "$SUMMARY_FILE" << EOF
+          # Benchmark Results
+
+          **Date**: $(date -u +%Y-%m-%d)
+          **Commit**: ${COMMIT_SHA}
+          **Branch**: ${BRANCH}
+          **Run ID**: ${RUN_ID}
+
+          ## Environment
+          - Python: ${PYTHON_VERSION}
+          - OS: ${OS_NAME} (${ARCH})
+          - CPUs: ${CPU_COUNT}
+          - Memory: ${MEMORY_GB} GB
+
+          ## Realistic Workload Results
+
+          EOF
+
+          # One table row per entry of `.comparison_100`, values rounded to
+          # 0.1 ms; the SDK (10%) columns are emitted as "-" placeholders.
+          jq -r '
+            "| Endpoint | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |",
+            "|----------|----------|------------|----------|-----------|----------|",
+            (.comparison_100 | to_entries[] |
+              "| \(.key) | \(.value.baseline_mean_ms | . * 10 | round / 10)ms | \(.value.sdk_mean_ms | . * 10 | round / 10)ms | +\(.value.mean_overhead_ms | . * 10 | round / 10)ms (\(.value.mean_overhead_pct | round)%) | - | - |"
+            )
+          ' benchmarks/results/realistic-workload.json >> "$SUMMARY_FILE"
+
+          cat >> "$SUMMARY_FILE" << 'EOF'
+
+          ## Fixed QPS Latency Results
+
+          ### Mean Latency
+
+          EOF
+
+          # One row per entry of `.baseline`; only the baseline mean is
+          # filled in, the SDK columns are "-" placeholders.
+          jq -r '
+            "| QPS | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |",
+            "|-----|----------|------------|----------|-----------|----------|",
+            (.baseline | to_entries[] |
+              . as $b |
+              ($b.key | tostring) as $qps |
+              "| \($qps) | \($b.value.mean_ms | . * 10 | round / 10)ms | - | - | - | - |"
+            )
+          ' benchmarks/results/fixed-qps-latency.json >> "$SUMMARY_FILE"
+
+          cat >> "$SUMMARY_FILE" << 'EOF'
+
+          ---
+
+          📊 **Full results available in artifacts**
+
+          EOF
+
+          # Also write to GitHub step summary for UI display
+          cat "$SUMMARY_FILE" >> "$GITHUB_STEP_SUMMARY"
+
+      # Publish the JSON/markdown results and the raw benchmark logs as a run artifact.
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        with:
+          # The run ID suffix is what a later run passes as `compare_with`.
+          name: benchmark-results-${{ github.run_id }}
+          path: |
+            benchmarks/results/*.json
+            benchmarks/results/*.md
+            realistic_output.txt
+            fixed_qps_output.txt
+          retention-days: 90
+
+      # Fetch the artifact uploaded by the run given in `compare_with`.
+      - name: Download comparison results (if specified)
+        if: ${{ inputs.compare_with != '' }}
+        uses: actions/download-artifact@v4
+        with:
+          name: benchmark-results-${{ inputs.compare_with }}
+          path: benchmarks/results/comparison/
+          # download-artifact@v4 only searches the current run unless both
+          # `run-id` and `github-token` are set; the token needs `actions: read`.
+          run-id: ${{ inputs.compare_with }}
+          github-token: ${{ github.token }}
+        # A missing or expired artifact must not fail the run; the compare
+        # step reports it instead.
+        continue-on-error: true
+
+      # Append a previous-vs-current overhead table to the step summary.
+      - name: Compare with previous run
+        if: ${{ inputs.compare_with != '' }}
+        # The user-supplied run ID is passed through the environment so it is
+        # never expanded into the script text.
+        env:
+          COMPARE_WITH: "${{ inputs.compare_with }}"
+        run: |
+          PREV_FILE=benchmarks/results/comparison/benchmark-summary.json
+          CURR_FILE=benchmarks/results/benchmark-summary.json
+
+          if [ -f "$PREV_FILE" ]; then
+            {
+              echo "## Comparison with Run ${COMPARE_WITH}"
+              echo ""
+              echo "| Metric | Previous | Current | Delta |"
+              echo "|--------|----------|---------|-------|"
+            } >> "$GITHUB_STEP_SUMMARY"
+
+            # "label:key" pairs; key is looked up under
+            # .realistic_workload.comparison_100 in both summaries.
+            for entry in "Read API:typical_read" "Write API:typical_write"; do
+              label="${entry%%:*}"
+              key="${entry##*:}"
+              prev=$(jq --arg k "$key" '.realistic_workload.comparison_100[$k].mean_overhead_ms' "$PREV_FILE")
+              curr=$(jq --arg k "$key" '.realistic_workload.comparison_100[$k].mean_overhead_ms' "$CURR_FILE")
+              # jq computes the delta: bc would silently read a missing
+              # (`null`) metric as 0 and print a misleading number.
+              delta=$(jq -rn --argjson p "$prev" --argjson c "$curr" '
+                if ($p | type) == "number" and ($c | type) == "number"
+                then "\(($c - $p) * 1000 | round / 1000)ms"
+                else "n/a" end')
+              echo "| ${label} overhead | ${prev}ms | ${curr}ms | ${delta} |" >> "$GITHUB_STEP_SUMMARY"
+            done
+          else
+            echo "⚠️ Could not find comparison results for run ${COMPARE_WITH}" >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+      # Flag any 100%-sampling mean overhead above the threshold in the step
+      # summary and expose the verdict as the `regression` step output.
+      # The output is informational: no later step in this workflow reads it,
+      # and the job does not fail on a regression.
+      - name: Check for performance regression
+        id: regression
+        run: |
+          # Check if overhead exceeds threshold (3ms for 100% sampling)
+          THRESHOLD_MS=3.0
+          RESULTS_FILE=benchmarks/results/realistic-workload.json
+
+          REGRESSION=false
+
+          # "label:key" pairs; key is looked up under .comparison_100.
+          for entry in "Read:typical_read" "Write:typical_write" "Mixed:realistic_mixed"; do
+            label="${entry%%:*}"
+            key="${entry##*:}"
+            overhead=$(jq --arg k "$key" '.comparison_100[$k].mean_overhead_ms' "$RESULTS_FILE")
+            # jq does the comparison: bc would read a missing (`null`) metric
+            # as 0 and report it as passing.
+            verdict=$(jq -rn --argjson v "$overhead" --argjson t "$THRESHOLD_MS" '
+              if ($v | type) != "number" then "missing"
+              elif $v > $t then "over"
+              else "ok" end')
+
+            case "$verdict" in
+              over)
+                echo "⚠️ ${label} API overhead ($overhead ms) exceeds threshold ($THRESHOLD_MS ms)" >> "$GITHUB_STEP_SUMMARY"
+                REGRESSION=true
+                ;;
+              missing)
+                echo "⚠️ ${label} API overhead is missing from the results; threshold not checked" >> "$GITHUB_STEP_SUMMARY"
+                ;;
+            esac
+          done
+
+          if [ "$REGRESSION" = true ]; then
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo "### ⚠️ Performance regression detected" >> "$GITHUB_STEP_SUMMARY"
+            echo "regression=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo "### ✅ No performance regression detected" >> "$GITHUB_STEP_SUMMARY"
+            echo "regression=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Print the combined summary JSON to the job log, wrapped in a fenced
+      # ```json block (stdout only; nothing is added to the step summary here).
+      - name: Output JSON results
+        run: |
+          echo "### Structured Results (JSON)"
+          echo ""
+          echo '```json'
+          cat benchmarks/results/benchmark-summary.json
+          echo '```'
@@ -56,6 +56,36 @@ After running benchmarks, compare the results:
 python benchmarks/compare_benchmarks.py
 ```
 
+### Comparing Different Runs
+
+To compare results from different benchmark runs:
+
+```bash
+# Compare two result files
+python benchmarks/scripts/compare_runs.py results/old-results.json results/new-results.json
+
+# Output as JSON only (for programmatic use)
+python benchmarks/scripts/compare_runs.py --json results/old.json results/new.json
+```
+
+### GitHub Actions Workflow
+
+Benchmarks can be run via GitHub Actions for consistent, reproducible results:
+
+1. Go to **Actions** → **Benchmarks**
+2. Click **Run workflow**
+3. Optionally specify:
+   - `iterations`: Number of iterations for realistic workload (default: 200)
+   - `qps_duration`: Duration per QPS level in seconds (default: 10)
+   - `compare_with`: Run ID to compare against (optional)
+
+The workflow outputs:
+
+- Structured JSON with all results and metadata
+- Markdown summary in the workflow run
+- Artifacts containing full results (retained for 90 days)
+- A regression warning when mean overhead at 100% sampling exceeds the 3 ms threshold
+
 ### Configuration
 
 You can configure benchmarks via environment variables:
@@ -151,11 +181,14 @@ benchmarks/
 │   ├── profile.sh             # Profiler runner script
 │   ├── simple_profile.py      # Profiling workload
 │   └── results/               # Profile output (gitignored)
+├── scripts/
+│   └── compare_runs.py        # Compare benchmark results across runs
 ├── server/
 │   └── test_server.py         # Flask test server
 ├── results/                   # JSON output (gitignored)
 ├── compare_benchmarks.py      # Result comparison script
 ├── run_benchmarks.sh          # Runner script
 ├── PROFILING.md               # Profiling documentation
+├── RESULTS.md                 # Historical results
 └── README.md
 ```
@@ -49,7 +49,12 @@ def run_benchmarks(
     """Run all benchmarks with the given label."""
     opts = {**DEFAULT_OPTIONS, **(options or {})}
 
-    enable_memory_tracking = os.environ.get("BENCHMARK_ENABLE_MEMORY", "true").lower() != "false"
+    # Check environment variable first, then fall back to options
+    env_memory = os.environ.get("BENCHMARK_ENABLE_MEMORY")
+    if env_memory is not None:
+        enable_memory_tracking = env_memory.lower() != "false"
+    else:
+        enable_memory_tracking = bool(opts.get("enable_memory_tracking", True))
 
     # Start test server
     server = TestServer()
 
@@ -147,7 +147,13 @@ def _monitor_loop(self) -> None:
 
     def _collect_memory_sample(self) -> None:
         """Collect a memory usage sample."""
-        if not self._is_running or not self._current_task_stats:
+        if not self._is_running:
+            return
+
+        # Take a local snapshot to avoid race condition where another thread
+        # sets _current_task_stats to None between check and usage
+        task_stats = self._current_task_stats
+        if task_stats is None:
             return
 
         # Get RSS from resource module (in bytes on macOS, kilobytes on Linux)
@@ -169,9 +175,9 @@ def _collect_memory_sample(self) -> None:
         except (FileNotFoundError, PermissionError):
             pass  # Use ru_maxrss fallback
 
-        self._current_task_stats.rss_sum += rss
-        self._current_task_stats.rss_max = max(self._current_task_stats.rss_max, rss)
-        self._current_task_stats.sample_count += 1
+        task_stats.rss_sum += rss
+        task_stats.rss_max = max(task_stats.rss_max, rss)
+        task_stats.sample_count += 1
 
     def get_task_stats(self, task_name: str) -> TaskResourceStats | None:
         """Get resource statistics for a completed task."""
 
@@ -8,7 +8,7 @@
 import statistics
 import uuid
 from dataclasses import dataclass, field
-from datetime import UTC, datetime
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 
@@ -189,7 +189,7 @@ def create_benchmark_result(
     return BenchmarkRunResult(
         id=str(uuid.uuid4()),
         label=label,
-        timestamp=datetime.now(UTC).isoformat(),
+        timestamp=datetime.now(timezone.utc).isoformat(),
         duration_ms=duration_ms,
         options=options,
         system=get_system_info(),