# CI workflow — captured from the GitHub Actions "Workflow file for this run" view.
# Associated change: PR #283 — "fix: remove virtual allocation reference from
# DeepSeek key takeaways".

name: CI Pipeline
on:
push:
branches: [main]
pull_request:
branches: [main]
concurrency:
group: ci-${{ github.ref }}
cancel-in-progress: true
jobs:
build_and_unit_test:
runs-on: macos-15
timeout-minutes: 40
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Install Metal Toolchain
run: xcodebuild -downloadComponent MetalToolchain || true
- name: Cache Swift packages
uses: actions/cache@v4
with:
path: .build
key: ${{ runner.os }}-spm-SwiftLM-v3-${{ hashFiles('Package.resolved') }}
restore-keys: |
${{ runner.os }}-spm-SwiftLM-v3-
- name: Clear stale module cache
run: find .build -type d -name ModuleCache -exec rm -rf {} + 2>/dev/null || true
- name: Resolve dependencies
run: swift package resolve
- name: Build (Release)
run: swift build -c release
- name: Verify binary
run: |
ls -lh .build/release/SwiftLM
file .build/release/SwiftLM
- name: TurboQuant unit tests
run: |
clang++ -std=c++17 -O2 -o /tmp/tq_test tests/test_turbo_quant.cpp
/tmp/tq_test
- name: Build Test Harness
run: swift build --build-tests
- name: Install MLX Metal library
run: |
python3 -m venv /tmp/mlx_venv
/tmp/mlx_venv/bin/pip install --quiet mlx
cp /tmp/mlx_venv/lib/python*/site-packages/mlx/lib/mlx.metallib .build/release/
find .build -type d -name "MacOS" -exec cp /tmp/mlx_venv/lib/python*/site-packages/mlx/lib/mlx.metallib {}/ \;
- name: SwiftBuddy Tests (MemPalace & Lifecycle)
run: swift test --skip-build --filter SwiftBuddyTests --disable-swift-testing
- name: SwiftLM Server Tests (Streaming & SSE)
run: swift test --skip-build --filter SwiftLMTests --disable-swift-testing
- name: Upload Binary Artifact
uses: actions/upload-artifact@v4
with:
name: swiftlm-architecture
path: .build/release/
retention-days: 1
integration_matrix:
needs: build_and_unit_test
runs-on: macos-15
timeout-minutes: 30
continue-on-error: ${{ matrix.modality == 'opencode' }}
strategy:
fail-fast: false
matrix:
modality: [server, vision, audio, graph, omni, opencode]
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Download Binary Artifact
uses: actions/download-artifact@v4
with:
name: swiftlm-architecture
path: .build/release/
- name: Restore Architecture Privileges
run: chmod +x .build/release/SwiftLM
- name: Cache MLX model
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: mlx-model-qwen2.5-0.5b-4bit
- name: Run E2E tests (${{ matrix.modality }})
env:
HF_HUB_DOWNLOAD_TIMEOUT: "600"
run: |
chmod +x tests/test-${{ matrix.modality }}.sh
for attempt in 1 2 3; do
echo "Attempt $attempt of 3..."
if tests/test-${{ matrix.modality }}.sh .build/release/SwiftLM 15413; then exit 0; fi
if [ "$attempt" -eq 3 ]; then echo "All attempts failed"; exit 1; fi
sleep 10
done
- name: Upload test logs on failure
if: failure()
uses: actions/upload-artifact@v4
with:
name: ci-test-logs-${{ matrix.modality }}
path: /tmp/SwiftLM-test-*.log
retention-days: 7
# ── Speculative Decoding E2E (dual-model: 0.8B draft + 4B main) ──
# Uses the standard macos-15 runner (7 GB RAM).
# We test the 4B main model which safely fits within memory.
speculative-decoding:
runs-on: macos-15
timeout-minutes: 45
needs: build_and_unit_test # Run in parallel with integration_matrix
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Install Metal Toolchain
run: xcodebuild -downloadComponent MetalToolchain || true
- name: Cache Swift packages
uses: actions/cache@v4
with:
path: .build
key: ${{ runner.os }}-spm-SwiftLM-v3-${{ hashFiles('Package.resolved') }}
restore-keys: |
${{ runner.os }}-spm-SwiftLM-v3-
- name: Clear stale module cache
run: find .build -type d -name ModuleCache -exec rm -rf {} + 2>/dev/null || true
- name: Resolve dependencies
run: swift package resolve
- name: Build (Release)
run: swift build -c release
- name: Compile and install custom MLX Metal library
run: |
# cmake-based build from SharpAI fork — mirrors build.sh (PR #58)
if [ -d "mlx-swift/Source/Cmlx/mlx" ]; then
MLX_SRC="mlx-swift/Source/Cmlx/mlx"
else
MLX_SRC=".build/checkouts/mlx-swift/Source/Cmlx/mlx"
fi
mkdir -p .build/metallib_build
pushd .build/metallib_build
cmake "../../$MLX_SRC" \
-DMLX_BUILD_TESTS=OFF \
-DMLX_BUILD_EXAMPLES=OFF \
-DMLX_BUILD_BENCHMARKS=OFF \
-DMLX_BUILD_PYTHON_BINDINGS=OFF \
-DMLX_METAL_JIT=OFF \
-DMLX_ENABLE_NAX=1 \
-DCMAKE_BUILD_TYPE=Release 2>&1 | tail -20
make mlx-metallib -j$(sysctl -n hw.ncpu) 2>&1 | tail -20
popd
BUILT=$(find .build/metallib_build -name "mlx.metallib" | head -1)
cp "$BUILT" .build/release/mlx.metallib
# Install hf for model pre-download
python3 -m venv /tmp/mlx_venv
/tmp/mlx_venv/bin/pip install --quiet huggingface_hub hf
- name: Cache MLX models (draft + main)
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: mlx-speculative-qwen35-2b-0.8b
- name: Pre-download HuggingFace models
run: |
source /tmp/mlx_venv/bin/activate
hf download mlx-community/Qwen3.5-2B-4bit || true
hf download mlx-community/Qwen3.5-0.8B-MLX-4bit || true
- name: Run speculative decoding E2E
env:
HF_HUB_DOWNLOAD_TIMEOUT: "900"
SWIFTLM_TOP_K: "4"
run: |
chmod +x tests/test-speculative.sh
for attempt in 1 2 3; do
echo "Attempt $attempt of 3..."
if tests/test-speculative.sh .build/release/SwiftLM 15414; then
exit 0
fi
if [ "$attempt" -lt 3 ]; then
echo "Test failed, retrying in 10s..."
sleep 10
fi
done
echo "All attempts failed"
exit 1
- name: Upload speculative test logs on failure
if: failure()
uses: actions/upload-artifact@v4
with:
name: speculative-test-logs
path: /tmp/SwiftLM-test-speculative.log
retention-days: 7
# ── Speculative Decoding Memory Evaluation ──
# Runs the 2B model with NUM_DRAFT_TOKENS=2 to check peak
# memory compression/efficiency. Emits vm_stat readings as step summary.
speculative-decoding-eval:
runs-on: macos-15
timeout-minutes: 45
needs: build_and_unit_test
continue-on-error: true
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Install Metal Toolchain
run: xcodebuild -downloadComponent MetalToolchain || true
- name: Cache Swift packages
uses: actions/cache@v4
with:
path: .build
key: ${{ runner.os }}-spm-SwiftLM-v3-${{ hashFiles('Package.resolved') }}
restore-keys: |
${{ runner.os }}-spm-SwiftLM-v3-
- name: Clear stale module cache
run: find .build -type d -name ModuleCache -exec rm -rf {} + 2>/dev/null || true
- name: Resolve dependencies
run: swift package resolve
- name: Build (Release)
run: swift build -c release
- name: Compile and install custom MLX Metal library
run: |
# cmake-based build from SharpAI fork — mirrors build.sh (PR #58)
if [ -d "mlx-swift/Source/Cmlx/mlx" ]; then
MLX_SRC="mlx-swift/Source/Cmlx/mlx"
else
MLX_SRC=".build/checkouts/mlx-swift/Source/Cmlx/mlx"
fi
mkdir -p .build/metallib_build
pushd .build/metallib_build
cmake "../../$MLX_SRC" \
-DMLX_BUILD_TESTS=OFF \
-DMLX_BUILD_EXAMPLES=OFF \
-DMLX_BUILD_BENCHMARKS=OFF \
-DMLX_BUILD_PYTHON_BINDINGS=OFF \
-DMLX_METAL_JIT=OFF \
-DMLX_ENABLE_NAX=1 \
-DCMAKE_BUILD_TYPE=Release 2>&1 | tail -20
make mlx-metallib -j$(sysctl -n hw.ncpu) 2>&1 | tail -20
popd
BUILT=$(find .build/metallib_build -name "mlx.metallib" | head -1)
cp "$BUILT" .build/release/mlx.metallib
# Install hf for model pre-download
python3 -m venv /tmp/mlx_venv
/tmp/mlx_venv/bin/pip install --quiet huggingface_hub hf
- name: Cache MLX models (draft + 2B)
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: mlx-speculative-eval-qwen35-2b-0.8b
- name: Pre-download HuggingFace models
run: |
source /tmp/mlx_venv/bin/activate
hf download mlx-community/Qwen3.5-2B-4bit || true
hf download mlx-community/Qwen3.5-0.8B-MLX-4bit || true
- name: Snapshot RAM before test
id: ram_before
run: |
PAGE_SIZE=$(sysctl -n hw.pagesize)
RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
/Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
/Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
/Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
')
echo "ram_before=$RAM" >> $GITHUB_OUTPUT
echo "RAM before eval: ${RAM} GB"
- name: Run speculative evaluation E2E
env:
HF_HUB_DOWNLOAD_TIMEOUT: "900"
SWIFTLM_TOP_K: "2"
MAIN_MODEL: "mlx-community/Qwen3.5-2B-4bit"
NUM_DRAFT_TOKENS: "2"
run: |
chmod +x tests/test-speculative-eval.sh
for attempt in 1 2 3; do
echo "Attempt $attempt of 3..."
if tests/test-speculative-eval.sh .build/release/SwiftLM 15414; then
exit 0
fi
if [ "$attempt" -lt 3 ]; then
echo "Test failed, retrying in 10s..."
sleep 10
fi
done
echo "All attempts failed"
exit 1
- name: Snapshot RAM after test
if: always()
id: ram_after
run: |
PAGE_SIZE=$(sysctl -n hw.pagesize)
RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
/Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
/Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
/Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
')
echo "ram_after=$RAM" >> $GITHUB_OUTPUT
echo "RAM after eval: ${RAM} GB"
- name: Emit memory summary
if: always()
run: |
BEFORE="${{ steps.ram_before.outputs.ram_before }}"
AFTER="${{ steps.ram_after.outputs.ram_after }}"
TOTAL=$(sysctl -n hw.memsize | awk '{printf "%.1f", $1/1073741824}')
{
echo "## 📊 Speculative Eval — Memory Readings"
echo "| Metric | Value |"
echo "|--------|-------|"
echo "| Runner physical RAM | ${TOTAL} GB |"
echo "| RAM before test | ${BEFORE} GB |"
echo "| RAM after test | ${AFTER} GB |"
echo "| Delta | $(echo "$AFTER $BEFORE" | awk '{printf "%.2f", $1-$2}') GB |"
} >> $GITHUB_STEP_SUMMARY
- name: Upload speculative eval logs on failure
if: failure()
uses: actions/upload-artifact@v4
with:
name: speculative-eval-logs
path: /tmp/SwiftLM-test-speculative-eval.log
# ── Issue #72 Regression: SSD streaming + draft model RAM guard ──────────────
# Mandatory (not continue-on-error). Enforces the auto-cap-to-1 fix and the
# memoryLimit sentinel on every PR. Uses tiny models (2B main + 0.8B draft)
# sized for the 7 GB macos-15 runner.
#
# Three checks mirror the local Test 10 in run_benchmark.sh:
# [1] Auto-cap warning present in server log
# [2] Peak RAM ≤ 85% of runner physical RAM during inference
# [3] /v1/chat/completions returns valid content
ssd-draft-memory-guard:
runs-on: macos-15
timeout-minutes: 45
needs: build_and_unit_test
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Download Binary Artifact
uses: actions/download-artifact@v4
continue-on-error: true # fall back to building if artifact expired
with:
name: swiftlm-architecture
path: .build/release/
- name: Build (Release) if artifact missing
run: |
if [ ! -f ".build/release/SwiftLM" ]; then
swift build -c release
fi
chmod +x .build/release/SwiftLM
- name: Install MLX Metal library
run: |
python3 -m venv /tmp/mlx_venv
/tmp/mlx_venv/bin/pip install --quiet mlx huggingface_hub hf
cp /tmp/mlx_venv/lib/python*/site-packages/mlx/lib/mlx.metallib .build/release/
- name: Cache MLX models (2B main + 0.8B draft)
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: mlx-ssd-draft-guard-qwen35-2b-0.8b
- name: Pre-download models
run: |
source /tmp/mlx_venv/bin/activate
hf download mlx-community/Qwen3.5-2B-4bit || true
hf download mlx-community/Qwen3.5-0.8B-MLX-4bit || true
- name: Snapshot RAM baseline
id: ram_base
run: |
PAGE_SIZE=$(sysctl -n hw.pagesize)
RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
/Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
/Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
/Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
')
TOTAL=$(sysctl -n hw.memsize | awk '{printf "%.0f", $1/1073741824}')
LIMIT=$(echo "$TOTAL * 0.85" | bc | cut -d. -f1)
echo "ram_base=$RAM" >> $GITHUB_OUTPUT
echo "runner_ram=$TOTAL" >> $GITHUB_OUTPUT
echo "ram_limit=$LIMIT" >> $GITHUB_OUTPUT
echo "Baseline RAM: ${RAM} GB | Runner: ${TOTAL} GB | Limit: ${LIMIT} GB"
- name: Start SSD + draft server (Issue #72 scenario)
id: server
run: |
# Launch with --num-draft-tokens 4 intentionally — the auto-cap should
# silently reduce it to 1 and log the advisory message.
.build/release/SwiftLM \
--model mlx-community/Qwen3.5-2B-4bit \
--draft-model mlx-community/Qwen3.5-0.8B-MLX-4bit \
--stream-experts \
--num-draft-tokens 4 \
--port 15473 \
--max-tokens 64 \
> /tmp/ssd_draft_guard.log 2>&1 &
PID=$!
echo "server_pid=$PID" >> $GITHUB_OUTPUT
echo "Waiting for server (up to 300s)..."
for i in $(seq 1 300); do
if ! kill -0 $PID 2>/dev/null; then
echo "Server died early:"
cat /tmp/ssd_draft_guard.log
exit 1
fi
if curl -sf http://127.0.0.1:15473/health >/dev/null 2>&1; then
echo "Server ready after ${i}s"
break
fi
sleep 1
if [ "$i" -eq 300 ]; then echo "Timeout"; exit 1; fi
done
- name: Snapshot RAM after model load
id: ram_loaded
run: |
PAGE_SIZE=$(sysctl -n hw.pagesize)
RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
/Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
/Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
/Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
')
echo "ram_loaded=$RAM" >> $GITHUB_OUTPUT
echo "RAM after load: ${RAM} GB"
- name: "[1/3] Verify auto-cap warning in server log"
run: |
if grep -q "auto-capping" /tmp/ssd_draft_guard.log; then
echo "✅ Auto-cap warning found — numDraftTokens correctly reduced to 1"
else
echo "❌ Auto-cap warning NOT found in server log"
echo "--- Last 20 lines of server log ---"
tail -20 /tmp/ssd_draft_guard.log
exit 1
fi
- name: "[2/3] Run inference and snapshot peak RAM"
id: ram_peak
run: |
RESULT=$(curl -sf --max-time 90 http://127.0.0.1:15473/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model":"test","messages":[{"role":"user","content":"What is 2+2? One word."}],"max_tokens":32,"stream":false}' \
2>/dev/null || echo "{}")
echo "$RESULT" > /tmp/inf_result.json
PAGE_SIZE=$(sysctl -n hw.pagesize)
RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
/Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
/Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
/Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
')
echo "ram_peak=$RAM" >> $GITHUB_OUTPUT
echo "RAM after inference: ${RAM} GB"
LIMIT="${{ steps.ram_base.outputs.ram_limit }}"
OK=$(echo "$RAM <= $LIMIT" | bc -l)
if [ "$OK" = "1" ]; then
echo "✅ RAM=${RAM}GB ≤ ${LIMIT}GB (85% of ${{ steps.ram_base.outputs.runner_ram }}GB runner RAM)"
else
echo "❌ RAM=${RAM}GB EXCEEDS limit ${LIMIT}GB — Issue #72 regression detected"
echo " (memoryLimit sentinel or auto-cap may have regressed)"
exit 1
fi
- name: "[3/3] Validate inference response"
run: |
RESULT=$(cat /tmp/inf_result.json)
if echo "$RESULT" | grep -q '"content"'; then
TEXT=$(echo "$RESULT" | python3 -c \
"import sys,json;d=json.load(sys.stdin);print(d['choices'][0]['message']['content'])" \
2>/dev/null || echo "(parse error)")
echo "✅ Response: $TEXT"
else
echo "❌ No content in response — server may have crashed or returned empty"
echo "Raw: ${RESULT:0:300}"
exit 1
fi
- name: Stop server
if: always()
run: kill ${{ steps.server.outputs.server_pid }} 2>/dev/null || true
- name: Emit memory summary to step summary
if: always()
run: |
BASE="${{ steps.ram_base.outputs.ram_base }}"
LOADED="${{ steps.ram_loaded.outputs.ram_loaded }}"
PEAK="${{ steps.ram_peak.outputs.ram_peak }}"
TOTAL="${{ steps.ram_base.outputs.runner_ram }}"
LIMIT="${{ steps.ram_base.outputs.ram_limit }}"
{
echo "## 🛡️ Issue #72 — SSD + Draft Model RAM Guard"
echo "| Metric | Value | Threshold |"
echo "|--------|-------|-----------|"
echo "| Runner physical RAM | ${TOTAL} GB | — |"
echo "| RAM baseline (before server) | ${BASE} GB | — |"
echo "| RAM after model load | ${LOADED} GB | — |"
echo "| RAM after inference (peak) | ${PEAK} GB | ≤ ${LIMIT} GB (85%) |"
echo "| Load delta | $(echo "$LOADED $BASE" | awk '{printf "%.2f", $1-$2}') GB | — |"
echo "| Inference delta | $(echo "$PEAK $LOADED" | awk '{printf "%.2f", $1-$2}') GB | — |"
} >> $GITHUB_STEP_SUMMARY
- name: Upload server log on failure
if: failure()
uses: actions/upload-artifact@v4
with:
name: ssd-draft-guard-log
path: /tmp/ssd_draft_guard.log
retention-days: 7