fix: remove virtual allocation reference from DeepSeek key takeaways … #283
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Continuous-integration workflow: display name, triggers, and run de-duplication.
name: CI Pipeline

# Run on pushes to main and on pull requests that target main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# One active run per ref: a newer run on the same ref cancels the older one.
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Builds the release binary, runs the C++ and Swift unit tests, and uploads
  # .build/release/ as the `swiftlm-architecture` artifact for later jobs.
  build_and_unit_test:
    runs-on: macos-15
    timeout-minutes: 40
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive

      # Best effort: `|| true` keeps the job going if the download fails.
      - name: Install Metal Toolchain
        run: xcodebuild -downloadComponent MetalToolchain || true

      - name: Cache Swift packages
        uses: actions/cache@v4
        with:
          path: .build
          key: ${{ runner.os }}-spm-SwiftLM-v3-${{ hashFiles('Package.resolved') }}
          restore-keys: |
            ${{ runner.os }}-spm-SwiftLM-v3-

      # The restored .build may carry ModuleCache directories from an older run.
      - name: Clear stale module cache
        run: find .build -type d -name ModuleCache -exec rm -rf {} + 2>/dev/null || true

      - name: Resolve dependencies
        run: swift package resolve

      - name: Build (Release)
        run: swift build -c release

      - name: Verify binary
        run: |
          ls -lh .build/release/SwiftLM
          file .build/release/SwiftLM

      - name: TurboQuant unit tests
        run: |
          clang++ -std=c++17 -O2 -o /tmp/tq_test tests/test_turbo_quant.cpp
          /tmp/tq_test

      - name: Build Test Harness
        run: swift build --build-tests

      # Copies the prebuilt mlx.metallib from the pip wheel next to the release
      # binary and into every "MacOS" directory found under .build.
      - name: Install MLX Metal library
        run: |
          python3 -m venv /tmp/mlx_venv
          /tmp/mlx_venv/bin/pip install --quiet mlx
          cp /tmp/mlx_venv/lib/python*/site-packages/mlx/lib/mlx.metallib .build/release/
          find .build -type d -name "MacOS" -exec cp /tmp/mlx_venv/lib/python*/site-packages/mlx/lib/mlx.metallib {}/ \;

      - name: SwiftBuddy Tests (MemPalace & Lifecycle)
        run: swift test --skip-build --filter SwiftBuddyTests --disable-swift-testing

      - name: SwiftLM Server Tests (Streaming & SSE)
        run: swift test --skip-build --filter SwiftLMTests --disable-swift-testing

      - name: Upload Binary Artifact
        uses: actions/upload-artifact@v4
        with:
          name: swiftlm-architecture
          path: .build/release/
          retention-days: 1

  # Runs one E2E script per modality against the binary built above.
  # Only the `opencode` leg is allowed to fail without failing the workflow.
  integration_matrix:
    needs: build_and_unit_test
    runs-on: macos-15
    timeout-minutes: 30
    continue-on-error: ${{ matrix.modality == 'opencode' }}
    strategy:
      fail-fast: false
      matrix:
        modality: [server, vision, audio, graph, omni, opencode]
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Download Binary Artifact
        uses: actions/download-artifact@v4
        with:
          name: swiftlm-architecture
          path: .build/release/

      # The binary is made executable again after the artifact download.
      - name: Restore Architecture Privileges
        run: chmod +x .build/release/SwiftLM

      - name: Cache MLX model
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: mlx-model-qwen2.5-0.5b-4bit

      # Up to three attempts, 10 s apart; the first passing attempt ends the step.
      - name: Run E2E tests (${{ matrix.modality }})
        env:
          HF_HUB_DOWNLOAD_TIMEOUT: "600"
        run: |
          chmod +x tests/test-${{ matrix.modality }}.sh
          for attempt in 1 2 3; do
            echo "Attempt $attempt of 3..."
            if tests/test-${{ matrix.modality }}.sh .build/release/SwiftLM 15413; then exit 0; fi
            if [ "$attempt" -eq 3 ]; then echo "All attempts failed"; exit 1; fi
            sleep 10
          done

      - name: Upload test logs on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: ci-test-logs-${{ matrix.modality }}
          path: /tmp/SwiftLM-test-*.log
          retention-days: 7

  # ── Speculative Decoding E2E (dual-model: 0.8B draft + main) ──
  # Uses the standard macos-15 runner (7 GB RAM).
  # NOTE(review): this comment previously described a 4B main model, but the
  # job only pre-downloads and caches Qwen3.5-2B-4bit (plus the 0.8B draft).
  # The main model actually loaded is decided by tests/test-speculative.sh,
  # which is not visible here — confirm which size is intended.
  speculative-decoding:
    runs-on: macos-15
    timeout-minutes: 45
    needs: build_and_unit_test  # Run in parallel with integration_matrix
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Install Metal Toolchain
        run: xcodebuild -downloadComponent MetalToolchain || true

      - name: Cache Swift packages
        uses: actions/cache@v4
        with:
          path: .build
          key: ${{ runner.os }}-spm-SwiftLM-v3-${{ hashFiles('Package.resolved') }}
          restore-keys: |
            ${{ runner.os }}-spm-SwiftLM-v3-

      - name: Clear stale module cache
        run: find .build -type d -name ModuleCache -exec rm -rf {} + 2>/dev/null || true

      - name: Resolve dependencies
        run: swift package resolve

      - name: Build (Release)
        run: swift build -c release

      - name: Compile and install custom MLX Metal library
        run: |
          # cmake-based build from SharpAI fork — mirrors build.sh (PR #58)
          if [ -d "mlx-swift/Source/Cmlx/mlx" ]; then
            MLX_SRC="mlx-swift/Source/Cmlx/mlx"
          else
            MLX_SRC=".build/checkouts/mlx-swift/Source/Cmlx/mlx"
          fi
          mkdir -p .build/metallib_build
          pushd .build/metallib_build
          # The default step shell is `bash -e` without pipefail, so a failing
          # cmake/make would be hidden behind `tail`. Because .build is cached,
          # a stale mlx.metallib could then be picked up silently. Enable
          # pipefail only for the build commands.
          set -o pipefail
          cmake "../../$MLX_SRC" \
            -DMLX_BUILD_TESTS=OFF \
            -DMLX_BUILD_EXAMPLES=OFF \
            -DMLX_BUILD_BENCHMARKS=OFF \
            -DMLX_BUILD_PYTHON_BINDINGS=OFF \
            -DMLX_METAL_JIT=OFF \
            -DMLX_ENABLE_NAX=1 \
            -DCMAKE_BUILD_TYPE=Release 2>&1 | tail -20
          make mlx-metallib -j$(sysctl -n hw.ncpu) 2>&1 | tail -20
          set +o pipefail
          popd
          BUILT=$(find .build/metallib_build -name "mlx.metallib" | head -1)
          if [ -z "$BUILT" ]; then
            echo "mlx.metallib was not produced by the cmake build" >&2
            exit 1
          fi
          cp "$BUILT" .build/release/mlx.metallib
          # Install hf for model pre-download
          python3 -m venv /tmp/mlx_venv
          /tmp/mlx_venv/bin/pip install --quiet huggingface_hub hf

      - name: Cache MLX models (draft + main)
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: mlx-speculative-qwen35-2b-0.8b

      # Best effort: a failed download is tolerated here (`|| true`).
      - name: Pre-download HuggingFace models
        run: |
          source /tmp/mlx_venv/bin/activate
          hf download mlx-community/Qwen3.5-2B-4bit || true
          hf download mlx-community/Qwen3.5-0.8B-MLX-4bit || true

      # Up to three attempts, 10 s apart; fails only if all three fail.
      - name: Run speculative decoding E2E
        env:
          HF_HUB_DOWNLOAD_TIMEOUT: "900"
          SWIFTLM_TOP_K: "4"
        run: |
          chmod +x tests/test-speculative.sh
          for attempt in 1 2 3; do
            echo "Attempt $attempt of 3..."
            if tests/test-speculative.sh .build/release/SwiftLM 15414; then
              exit 0
            fi
            if [ "$attempt" -lt 3 ]; then
              echo "Test failed, retrying in 10s..."
              sleep 10
            fi
          done
          echo "All attempts failed"
          exit 1

      - name: Upload speculative test logs on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: speculative-test-logs
          path: /tmp/SwiftLM-test-speculative.log
          retention-days: 7

  # ── Speculative Decoding Memory Evaluation ──
  # Runs the 2B model with NUM_DRAFT_TOKENS=2 to check peak
  # memory compression/efficiency. Emits vm_stat readings as step summary.
  # Informational only: the job is marked continue-on-error.
  speculative-decoding-eval:
    runs-on: macos-15
    timeout-minutes: 45
    needs: build_and_unit_test
    continue-on-error: true
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Install Metal Toolchain
        run: xcodebuild -downloadComponent MetalToolchain || true

      - name: Cache Swift packages
        uses: actions/cache@v4
        with:
          path: .build
          key: ${{ runner.os }}-spm-SwiftLM-v3-${{ hashFiles('Package.resolved') }}
          restore-keys: |
            ${{ runner.os }}-spm-SwiftLM-v3-

      - name: Clear stale module cache
        run: find .build -type d -name ModuleCache -exec rm -rf {} + 2>/dev/null || true

      - name: Resolve dependencies
        run: swift package resolve

      - name: Build (Release)
        run: swift build -c release

      - name: Compile and install custom MLX Metal library
        run: |
          # cmake-based build from SharpAI fork — mirrors build.sh (PR #58)
          if [ -d "mlx-swift/Source/Cmlx/mlx" ]; then
            MLX_SRC="mlx-swift/Source/Cmlx/mlx"
          else
            MLX_SRC=".build/checkouts/mlx-swift/Source/Cmlx/mlx"
          fi
          mkdir -p .build/metallib_build
          pushd .build/metallib_build
          # The default step shell is `bash -e` without pipefail, so a failing
          # cmake/make would be hidden behind `tail`. Because .build is cached,
          # a stale mlx.metallib could then be picked up silently. Enable
          # pipefail only for the build commands.
          set -o pipefail
          cmake "../../$MLX_SRC" \
            -DMLX_BUILD_TESTS=OFF \
            -DMLX_BUILD_EXAMPLES=OFF \
            -DMLX_BUILD_BENCHMARKS=OFF \
            -DMLX_BUILD_PYTHON_BINDINGS=OFF \
            -DMLX_METAL_JIT=OFF \
            -DMLX_ENABLE_NAX=1 \
            -DCMAKE_BUILD_TYPE=Release 2>&1 | tail -20
          make mlx-metallib -j$(sysctl -n hw.ncpu) 2>&1 | tail -20
          set +o pipefail
          popd
          BUILT=$(find .build/metallib_build -name "mlx.metallib" | head -1)
          if [ -z "$BUILT" ]; then
            echo "mlx.metallib was not produced by the cmake build" >&2
            exit 1
          fi
          cp "$BUILT" .build/release/mlx.metallib
          # Install hf for model pre-download
          python3 -m venv /tmp/mlx_venv
          /tmp/mlx_venv/bin/pip install --quiet huggingface_hub hf

      - name: Cache MLX models (draft + 2B)
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: mlx-speculative-eval-qwen35-2b-0.8b

      - name: Pre-download HuggingFace models
        run: |
          source /tmp/mlx_venv/bin/activate
          hf download mlx-community/Qwen3.5-2B-4bit || true
          hf download mlx-community/Qwen3.5-0.8B-MLX-4bit || true

      # Used memory in GB = (active + wired + compressor pages) * page size.
      # This is a system-wide vm_stat reading, not a per-process figure.
      - name: Snapshot RAM before test
        id: ram_before
        run: |
          PAGE_SIZE=$(sysctl -n hw.pagesize)
          RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
            /Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
            /Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
            /Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
            END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
          ')
          echo "ram_before=$RAM" >> "$GITHUB_OUTPUT"
          echo "RAM before eval: ${RAM} GB"

      - name: Run speculative evaluation E2E
        env:
          HF_HUB_DOWNLOAD_TIMEOUT: "900"
          SWIFTLM_TOP_K: "2"
          MAIN_MODEL: "mlx-community/Qwen3.5-2B-4bit"
          NUM_DRAFT_TOKENS: "2"
        run: |
          chmod +x tests/test-speculative-eval.sh
          for attempt in 1 2 3; do
            echo "Attempt $attempt of 3..."
            if tests/test-speculative-eval.sh .build/release/SwiftLM 15414; then
              exit 0
            fi
            if [ "$attempt" -lt 3 ]; then
              echo "Test failed, retrying in 10s..."
              sleep 10
            fi
          done
          echo "All attempts failed"
          exit 1

      # `always()` so the reading is still taken when the eval step fails.
      - name: Snapshot RAM after test
        if: always()
        id: ram_after
        run: |
          PAGE_SIZE=$(sysctl -n hw.pagesize)
          RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
            /Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
            /Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
            /Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
            END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
          ')
          echo "ram_after=$RAM" >> "$GITHUB_OUTPUT"
          echo "RAM after eval: ${RAM} GB"

      - name: Emit memory summary
        if: always()
        run: |
          BEFORE="${{ steps.ram_before.outputs.ram_before }}"
          AFTER="${{ steps.ram_after.outputs.ram_after }}"
          TOTAL=$(sysctl -n hw.memsize | awk '{printf "%.1f", $1/1073741824}')
          {
            echo "## 📊 Speculative Eval — Memory Readings"
            echo "| Metric | Value |"
            echo "|--------|-------|"
            echo "| Runner physical RAM | ${TOTAL} GB |"
            echo "| RAM before test | ${BEFORE} GB |"
            echo "| RAM after test | ${AFTER} GB |"
            echo "| Delta | $(echo "$AFTER $BEFORE" | awk '{printf "%.2f", $1-$2}') GB |"
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Upload speculative eval logs on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: speculative-eval-logs
          path: /tmp/SwiftLM-test-speculative-eval.log
          # Same retention as the other failure-log uploads in this workflow.
          retention-days: 7

  # ── Issue #72 Regression: SSD streaming + draft model RAM guard ──────────────
  # Mandatory (not continue-on-error). Enforces the auto-cap-to-1 fix and the
  # memoryLimit sentinel on every PR. Uses tiny models (2B main + 0.8B draft)
  # sized for the 7 GB macos-15 runner.
  #
  # Three checks mirror the local Test 10 in run_benchmark.sh:
  # [1] Auto-cap warning present in server log
  # [2] Peak RAM ≤ 85% of runner physical RAM during inference
  # [3] /v1/chat/completions returns valid content
  ssd-draft-memory-guard:
    runs-on: macos-15
    timeout-minutes: 45
    needs: build_and_unit_test
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Download Binary Artifact
        uses: actions/download-artifact@v4
        continue-on-error: true  # fall back to building if artifact expired
        with:
          name: swiftlm-architecture
          path: .build/release/

      - name: Build (Release) if artifact missing
        run: |
          if [ ! -f ".build/release/SwiftLM" ]; then
            swift build -c release
          fi
          chmod +x .build/release/SwiftLM

      - name: Install MLX Metal library
        run: |
          python3 -m venv /tmp/mlx_venv
          /tmp/mlx_venv/bin/pip install --quiet mlx huggingface_hub hf
          cp /tmp/mlx_venv/lib/python*/site-packages/mlx/lib/mlx.metallib .build/release/

      - name: Cache MLX models (2B main + 0.8B draft)
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: mlx-ssd-draft-guard-qwen35-2b-0.8b

      - name: Pre-download models
        run: |
          source /tmp/mlx_venv/bin/activate
          hf download mlx-community/Qwen3.5-2B-4bit || true
          hf download mlx-community/Qwen3.5-0.8B-MLX-4bit || true

      # Records baseline used RAM, the runner's physical RAM (whole GB), and
      # the 85% threshold that the inference check compares against.
      - name: Snapshot RAM baseline
        id: ram_base
        run: |
          PAGE_SIZE=$(sysctl -n hw.pagesize)
          RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
            /Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
            /Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
            /Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
            END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
          ')
          TOTAL=$(sysctl -n hw.memsize | awk '{printf "%.0f", $1/1073741824}')
          # Keep two decimals. Truncating to whole GB turned 85% of a 7 GB
          # runner (5.95 GB) into 5 GB, i.e. roughly a 71% limit. The later
          # comparison uses `bc -l`, which handles fractional values.
          LIMIT=$(awk -v total="$TOTAL" 'BEGIN { printf "%.2f", total * 0.85 }')
          echo "ram_base=$RAM" >> "$GITHUB_OUTPUT"
          echo "runner_ram=$TOTAL" >> "$GITHUB_OUTPUT"
          echo "ram_limit=$LIMIT" >> "$GITHUB_OUTPUT"
          echo "Baseline RAM: ${RAM} GB | Runner: ${TOTAL} GB | Limit: ${LIMIT} GB"

      # The name is quoted because an unquoted " #72" would start a YAML
      # comment and cut the step name off after "(Issue".
      - name: "Start SSD + draft server (Issue #72 scenario)"
        id: server
        run: |
          # Launch with --num-draft-tokens 4 intentionally — the auto-cap should
          # silently reduce it to 1 and log the advisory message.
          .build/release/SwiftLM \
            --model mlx-community/Qwen3.5-2B-4bit \
            --draft-model mlx-community/Qwen3.5-0.8B-MLX-4bit \
            --stream-experts \
            --num-draft-tokens 4 \
            --port 15473 \
            --max-tokens 64 \
            > /tmp/ssd_draft_guard.log 2>&1 &
          PID=$!
          echo "server_pid=$PID" >> "$GITHUB_OUTPUT"
          echo "Waiting for server (up to 300s)..."
          for i in $(seq 1 300); do
            if ! kill -0 $PID 2>/dev/null; then
              echo "Server died early:"
              cat /tmp/ssd_draft_guard.log
              exit 1
            fi
            if curl -sf http://127.0.0.1:15473/health >/dev/null 2>&1; then
              echo "Server ready after ${i}s"
              break
            fi
            sleep 1
            if [ "$i" -eq 300 ]; then echo "Timeout"; exit 1; fi
          done

      - name: Snapshot RAM after model load
        id: ram_loaded
        run: |
          PAGE_SIZE=$(sysctl -n hw.pagesize)
          RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
            /Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
            /Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
            /Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
            END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
          ')
          echo "ram_loaded=$RAM" >> "$GITHUB_OUTPUT"
          echo "RAM after load: ${RAM} GB"

      - name: "[1/3] Verify auto-cap warning in server log"
        run: |
          if grep -q "auto-capping" /tmp/ssd_draft_guard.log; then
            echo "✅ Auto-cap warning found — numDraftTokens correctly reduced to 1"
          else
            echo "❌ Auto-cap warning NOT found in server log"
            echo "--- Last 20 lines of server log ---"
            tail -20 /tmp/ssd_draft_guard.log
            exit 1
          fi

      # The reading is taken once, after the request returns. It is a
      # post-inference snapshot rather than a sampled peak.
      - name: "[2/3] Run inference and snapshot peak RAM"
        id: ram_peak
        run: |
          RESULT=$(curl -sf --max-time 90 http://127.0.0.1:15473/v1/chat/completions \
            -H "Content-Type: application/json" \
            -d '{"model":"test","messages":[{"role":"user","content":"What is 2+2? One word."}],"max_tokens":32,"stream":false}' \
            2>/dev/null || echo "{}")
          echo "$RESULT" > /tmp/inf_result.json
          PAGE_SIZE=$(sysctl -n hw.pagesize)
          RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
            /Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
            /Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
            /Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
            END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
          ')
          echo "ram_peak=$RAM" >> "$GITHUB_OUTPUT"
          echo "RAM after inference: ${RAM} GB"
          LIMIT="${{ steps.ram_base.outputs.ram_limit }}"
          OK=$(echo "$RAM <= $LIMIT" | bc -l)
          if [ "$OK" = "1" ]; then
            echo "✅ RAM=${RAM}GB ≤ ${LIMIT}GB (85% of ${{ steps.ram_base.outputs.runner_ram }}GB runner RAM)"
          else
            echo "❌ RAM=${RAM}GB EXCEEDS limit ${LIMIT}GB — Issue #72 regression detected"
            echo " (memoryLimit sentinel or auto-cap may have regressed)"
            exit 1
          fi

      # Reads the response saved by the previous step. A failed request was
      # stored as "{}", which has no "content" key and fails this check.
      - name: "[3/3] Validate inference response"
        run: |
          RESULT=$(cat /tmp/inf_result.json)
          if echo "$RESULT" | grep -q '"content"'; then
            TEXT=$(echo "$RESULT" | python3 -c \
              "import sys,json;d=json.load(sys.stdin);print(d['choices'][0]['message']['content'])" \
              2>/dev/null || echo "(parse error)")
            echo "✅ Response: $TEXT"
          else
            echo "❌ No content in response — server may have crashed or returned empty"
            echo "Raw: ${RESULT:0:300}"
            exit 1
          fi

      - name: Stop server
        if: always()
        run: kill ${{ steps.server.outputs.server_pid }} 2>/dev/null || true

      - name: Emit memory summary to step summary
        if: always()
        run: |
          BASE="${{ steps.ram_base.outputs.ram_base }}"
          LOADED="${{ steps.ram_loaded.outputs.ram_loaded }}"
          PEAK="${{ steps.ram_peak.outputs.ram_peak }}"
          TOTAL="${{ steps.ram_base.outputs.runner_ram }}"
          LIMIT="${{ steps.ram_base.outputs.ram_limit }}"
          {
            echo "## 🛡️ Issue #72 — SSD + Draft Model RAM Guard"
            echo "| Metric | Value | Threshold |"
            echo "|--------|-------|-----------|"
            echo "| Runner physical RAM | ${TOTAL} GB | — |"
            echo "| RAM baseline (before server) | ${BASE} GB | — |"
            echo "| RAM after model load | ${LOADED} GB | — |"
            echo "| RAM after inference (peak) | ${PEAK} GB | ≤ ${LIMIT} GB (85%) |"
            echo "| Load delta | $(echo "$LOADED $BASE" | awk '{printf "%.2f", $1-$2}') GB | — |"
            echo "| Inference delta | $(echo "$PEAK $LOADED" | awk '{printf "%.2f", $1-$2}') GB | — |"
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Upload server log on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: ssd-draft-guard-log
          path: /tmp/ssd_draft_guard.log
          retention-days: 7