PR-B2 (ADR 0008 Phase B): AppendTokens + byte-exact prefill-incremental contract #72
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: CI

# Triggered by every push to main, every PR that targets main, and manual
# dispatch.
#
# Scope: platform-neutral unit tests, held to 100% line coverage, over the
# library modules shipped for this commit. The following suites are
# deliberately NOT run here:
#
#   * tests/core/                       - needs HuggingFace weights
#   * tests/system/                     - needs the same weights and is slow
#   * tests/inference_engine/proposer/  - uses the real Qwen3 sparse
#                                         proposer; HF-cache-bound
#   * tests/backends/mlx/test_{verifier,proposer,cache,torch_bridge}.py
#                                       - Apple-Silicon only
#
# Mac and CUDA contributors run the full suite locally through
# scripts/run_platform_tests.sh and push the resulting platform-test
# reports to the PR branch as evidence. This workflow guards the
# platform-neutral surface so that a regression there cannot land on main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch: {}
# Cancel superseded runs on the same branch — saves CI time on
# rapid-fire pushes. Runs are grouped per workflow + ref, so a new push
# only cancels an older run of this same workflow on the same ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Least privilege for GITHUB_TOKEN: the jobs visible in this workflow only
# check out the repository, build, and test — none of them pushes, comments,
# or publishes — so read access to repository contents is sufficient.
# Without this key the token inherits the repository/organisation default,
# which may be read-write.
permissions:
  contents: read
jobs:
  # Runs the platform-neutral pytest targets and fails the job unless the
  # listed --cov packages reach 100% coverage. The coverage and JUnit
  # reports are uploaded as an artifact afterwards.
  unit-tests:
    name: unit tests + 100% coverage
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        # Quoted so the version stays a string (an unquoted 3.10 would be
        # read as the float 3.1).
        python-version: ["3.12"]
    steps:
      - name: Check out
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: pip

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Prints the resolved versions of the key dependencies into the job
      # log. prometheus_client is read through importlib.metadata rather
      # than a module attribute.
      - name: Show installed key versions
        run: |
          python -c "import torch, fastapi, pydantic, prometheus_client, transformers; \
            print('torch', torch.__version__); \
            print('fastapi', fastapi.__version__); \
            print('pydantic', pydantic.VERSION); \
            print('transformers', transformers.__version__); \
            print('prometheus_client', __import__('importlib.metadata', fromlist=['version']).version('prometheus_client'))"

      - name: Run platform-neutral test suite with 100% coverage
        env:
          # Make the repository root importable for the test run.
          PYTHONPATH: .
        run: |
          pytest \
            tests/inference_engine/server/ \
            tests/inference_engine/memory/ \
            tests/inference_engine/scheduler/ \
            tests/inference_engine/pipeline/ \
            tests/inference_engine/session/ \
            tests/training/repr_align/ \
            tests/backends/mlx/test_env.py \
            --cov=inference_engine.server \
            --cov=inference_engine.memory \
            --cov=inference_engine.scheduler \
            --cov=inference_engine.pipeline \
            --cov=inference_engine.session \
            --cov=training.repr_align \
            --cov-report=term \
            --cov-report=xml:coverage.xml \
            --cov-report=xml:coverage.xml \
            --cov-fail-under=100 \
            --junitxml=junit.xml \
            -v

      # Upload the reports even when the tests failed (that is when they
      # are most useful), but not when the run was cancelled: this
      # workflow cancels superseded runs, and `always()` would keep a
      # cancelled job alive just to upload stale reports.
      - name: Upload coverage artifact
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: coverage-py${{ matrix.python-version }}
          path: |
            coverage.xml
            junit.xml
          if-no-files-found: warn
          retention-days: 14
| package-import-smoke: | |
| name: package import smoke | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Check out | |
| uses: actions/checkout@v4 | |
| - name: Set up Python 3.12 | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: "3.12" | |
| cache: pip | |
| - name: Install dependencies | |
| run: | | |
| python -m pip install --upgrade pip | |
| pip install -r requirements.txt | |
| - name: Import every shipping subpackage | |
| env: | |
| PYTHONPATH: . | |
| run: | | |
| python -c "import inference_engine; \ | |
| import inference_engine.server; \ | |
| import inference_engine.server.app; \ | |
| import inference_engine.server.config; \ | |
| import inference_engine.server.engine; \ | |
| import inference_engine.server.metrics; \ | |
| import inference_engine.server.errors; \ | |
| import inference_engine.server.auth; \ | |
| import inference_engine.server.tokenizer; \ | |
| import inference_engine.server.streaming; \ | |
| import inference_engine.server.schemas; \ | |
| import inference_engine.memory; \ | |
| import inference_engine.memory.slab; \ | |
| import inference_engine.memory.pool; \ | |
| import inference_engine.scheduler; \ | |
| import inference_engine.scheduler.config; \ | |
| import inference_engine.scheduler.scheduler; \ | |
| import inference_engine.scheduler.session; \ | |
| import inference_engine.pipeline; \ | |
| import inference_engine.pipeline.coordinator; \ | |
| import inference_engine.session; \ | |
| import inference_engine.session.store; \ | |
| import inference_engine.server.grpc_app; \ | |
| import inference_engine.server.proto_gen.kakeya.v1.runtime_pb2; \ | |
| import inference_engine.server.proto_gen.kakeya.v1.runtime_pb2_grpc; \ | |
| import inference_engine.proposer; \ | |
| import inference_engine.proposer.sparse_logits; \ | |
| import inference_engine.backends.mlx.env; \ | |
| import training.repr_align; \ | |
| print('all imports succeeded')" | |
| docker-build: | |
| name: docker build + import smoke | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Check out | |
| uses: actions/checkout@v4 | |
| - name: Set up Docker Buildx | |
| uses: docker/setup-buildx-action@v3 | |
| - name: Build image | |
| uses: docker/build-push-action@v6 | |
| with: | |
| context: . | |
| file: Dockerfile | |
| tags: kakeya:ci-${{ github.sha }} | |
| push: false | |
| load: true | |
| cache-from: type=gha | |
| cache-to: type=gha,mode=max | |
| - name: Smoke-test image (import only — weights would require HF cache) | |
| run: | | |
| # Confirm the package imports inside the image without | |
| # touching HF cache. | |
| docker run --rm \ | |
| --entrypoint python \ | |
| kakeya:ci-${{ github.sha }} \ | |
| -c "import inference_engine.server.app; print('image imports clean')" | |
| # Confirm the server CLI parses (--help exits 0 before any | |
| # model load, so we don't need HF cache). | |
| docker run --rm \ | |
| kakeya:ci-${{ github.sha }} \ | |
| --help | |
| proto-lint: | |
| name: proto lint (buf) | |
| runs-on: ubuntu-latest | |
| # ADR 0008 PR-A1: the proto/ schema is the wire-contract source of | |
| # truth for the gRPC runtime + Python / TypeScript SDKs. `buf lint` | |
| # under the STANDARD ruleset enforces public-style conventions | |
| # (service / message naming, enum zero-value naming, file-package | |
| # alignment). `buf format --diff` enforces canonical formatting so | |
| # the source of truth is also character-stable across editors. | |
| steps: | |
| - name: Check out | |
| uses: actions/checkout@v4 | |
| - name: Install buf | |
| uses: bufbuild/buf-setup-action@v1 | |
| with: | |
| version: 1.50.0 | |
| github_token: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Lint .proto files | |
| run: buf lint | |
| - name: Verify .proto formatting is canonical | |
| run: | | |
| # `buf format` is an in-place rewriter; --diff makes it print | |
| # what it would change instead of editing files (and always | |
| # exits 0). A non-empty diff is a CI failure. | |
| diff_output="$(buf format --diff)" | |
| if [ -n "$diff_output" ]; then | |
| echo "::error::buf format would have rewritten one or more .proto files." | |
| echo "Run 'buf format -w' locally and commit the result." | |
| printf '%s\n' "$diff_output" | |
| exit 1 | |
| fi | |
| proto-stub-drift: | |
| name: proto stub drift | |
| runs-on: ubuntu-latest | |
| # ADR 0008 PR-B1: the committed Python stubs under | |
| # inference_engine/server/proto_gen/ MUST be byte-identical to what | |
| # scripts/regenerate_proto_stubs.sh produces from the .proto. This | |
| # job re-runs the script, then `git diff --exit-code` fails CI if | |
| # anything changed. Catches drift between the .proto contract and | |
| # the SDK-consumed Python API. | |
| steps: | |
| - name: Check out | |
| uses: actions/checkout@v4 | |
| - name: Set up Python 3.12 | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: "3.12" | |
| cache: pip | |
| - name: Install grpcio-tools | |
| run: | | |
| python -m pip install --upgrade pip | |
| # Pin the same grpcio-tools range as requirements.txt so the | |
| # stubs we regenerate match what production uses. If | |
| # grpcio-tools updates and starts producing different stub | |
| # bytes, this job catches it as a drift before merge. | |
| pip install 'grpcio>=1.65,<2.0' 'grpcio-tools>=1.65,<2.0' | |
| - name: Regenerate stubs | |
| run: bash scripts/regenerate_proto_stubs.sh | |
| - name: Fail if regenerated stubs differ from committed stubs | |
| run: | | |
| if ! git diff --exit-code -- inference_engine/server/proto_gen/; then | |
| echo "::error::Committed stubs are out of date with proto/." | |
| echo "Run scripts/regenerate_proto_stubs.sh locally and commit." | |
| exit 1 | |
| fi |