diff --git a/.github/workflows/tests-integration-nightly.yml b/.github/workflows/tests-integration-nightly.yml new file mode 100644 index 000000000..8474f862d --- /dev/null +++ b/.github/workflows/tests-integration-nightly.yml @@ -0,0 +1,435 @@ +name: Integration Tests (Downstream) + +# Nightly smoke tests: run the bnb-specific test suites from transformers, +# diffusers, axolotl, and peft against the latest main-branch bnb wheel. Catches +# downstream breakage before it reaches users. +# +# bnb is installed from the `continuous-release_main` pre-release which +# python-package.yml publishes on every push to main — no duplicate build. +# +# See agents/integration_tests_guide.md for background. + +on: + workflow_dispatch: + pull_request: + paths: + - '.github/workflows/tests-integration-nightly.yml' + - 'scripts/integration_test_report.py' + # schedule: + # - cron: "30 3 * * *" # enable once stable; runs after python-package + tests-nightly + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + PYTHON_VERSION: "3.10" + TORCH_VERSION: "2.9.1" + PYPI_INDEX: "https://download.pytorch.org/whl/cu128" + BNB_WHEEL_URL: "https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl" + CUDA_VISIBLE_DEVICES: "0,1" + +jobs: + # ─── Downstream test jobs ───────────────────────────────────────────────── + # Each job: + # 1. Installs torch, then bnb from the continuous-release wheel + # 2. Installs the downstream lib (latest release from PyPI) + # 3. Clones the matching version tag for the test files + # 4. Runs the library's bnb-specific tests with --junitxml + # 5. 
Uploads the XML + full log as an artifact for the report job + # + # Runner matching rationale (see integration_tests_guide.md): + # transformers CI runs on T4 → we use T4 + # accelerate / peft CI runs on L4 → closest bnb equivalent is A10 + # This reduces spurious failures from expected values calibrated on their runners. + + test-transformers: + name: Transformers bnb tests (single GPU) + if: github.repository == 'bitsandbytes-foundation/bitsandbytes' + runs-on: bandb-aws-g5-4xlarge-plus-use1-public-80 # A10G (matches transformers CI) + steps: + - name: Show GPU information + run: nvidia-smi + + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install torch + bnb (from continuous-release) + run: | + pip install torch==${TORCH_VERSION} --index-url ${PYPI_INDEX} + pip install "bitsandbytes[test] @ ${BNB_WHEEL_URL}" + + - name: Install transformers and clone matching tag + run: | + pip install -U transformers accelerate + TRANSFORMERS_VERSION=$(pip show transformers | awk '/^Version:/ {print $2}') + echo "Installed transformers v${TRANSFORMERS_VERSION}" + git clone --depth=1 --branch "v${TRANSFORMERS_VERSION}" \ + https://github.com/huggingface/transformers.git /tmp/transformers + + - name: Show environment + run: | + pip list + python -m torch.utils.collect_env + + - name: Run transformers bnb tests + working-directory: /tmp/transformers + env: + RUN_SLOW: "1" + shell: bash -o pipefail {0} + run: | + mkdir -p ${GITHUB_WORKSPACE}/reports + python -m pytest tests/quantization/bnb/ \ + -v \ + -k "not MultiGpu and not multi_gpu" \ + --junitxml=${GITHUB_WORKSPACE}/reports/transformers.xml \ + -o junit_logging=all \ + 2>&1 | tee ${GITHUB_WORKSPACE}/reports/transformers.log + + - name: Upload JUnit XML and log + if: always() + uses: actions/upload-artifact@v4 + with: + name: reports-transformers + path: reports/ + retention-days: 7 + + test-transformers-multigpu: + name: 
Transformers bnb tests (multi GPU) + if: false # disabled until bandb-aws-g6-12xlarge-plus runner is provisioned + runs-on: bandb-aws-g6-12xlarge-plus-use1-public-80 # 4× L4 (2 used) + steps: + - name: Show GPU information + run: nvidia-smi + + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install torch + bnb (from continuous-release) + run: | + pip install torch==${TORCH_VERSION} --index-url ${PYPI_INDEX} + pip install "bitsandbytes[test] @ ${BNB_WHEEL_URL}" + + - name: Install transformers and clone matching tag + run: | + pip install -U transformers accelerate + TRANSFORMERS_VERSION=$(pip show transformers | awk '/^Version:/ {print $2}') + echo "Installed transformers v${TRANSFORMERS_VERSION}" + git clone --depth=1 --branch "v${TRANSFORMERS_VERSION}" \ + https://github.com/huggingface/transformers.git /tmp/transformers + + - name: Show environment + run: | + pip list + python -m torch.utils.collect_env + + - name: Run transformers bnb tests (multi-GPU only) + working-directory: /tmp/transformers + env: + RUN_SLOW: "1" + shell: bash -o pipefail {0} + run: | + mkdir -p ${GITHUB_WORKSPACE}/reports + python -m pytest tests/quantization/bnb/ \ + -v \ + -k "MultiGpu or multi_gpu" \ + --junitxml=${GITHUB_WORKSPACE}/reports/transformers-multigpu.xml \ + -o junit_logging=all \ + 2>&1 | tee ${GITHUB_WORKSPACE}/reports/transformers-multigpu.log + + - name: Upload JUnit XML and log + if: always() + uses: actions/upload-artifact@v4 + with: + name: reports-transformers-multigpu + path: reports/ + retention-days: 7 + + test-diffusers: + name: Diffusers bnb tests + if: github.repository == 'bitsandbytes-foundation/bitsandbytes' + runs-on: bandb-aws-g6e-4xlarge-plus-use1-public-80 # L40S (matches diffusers CI) + container: + image: diffusers/diffusers-pytorch-cuda + options: --gpus all --shm-size "16gb" --ipc host + steps: + - name: Show GPU information + run: nvidia-smi + + - 
uses: actions/checkout@v4 + + - name: Install bnb + diffusers from PyPI (overriding image versions) + run: | + pip install "bitsandbytes[test] @ ${BNB_WHEEL_URL}" + pip install --force-reinstall --no-deps diffusers + + - name: Clone diffusers matching installed version + run: | + DIFFUSERS_VERSION=$(pip show diffusers | awk '/^Version:/ {print $2}') + echo "Installed diffusers v${DIFFUSERS_VERSION}" + git clone --depth=1 --branch "v${DIFFUSERS_VERSION}" \ + https://github.com/huggingface/diffusers.git /tmp/diffusers + + - name: Show environment + run: | + pip list + python -m torch.utils.collect_env + + - name: Run diffusers bnb tests + working-directory: /tmp/diffusers + env: + RUN_SLOW: "1" + CUBLAS_WORKSPACE_CONFIG: ":16:8" + shell: bash -o pipefail {0} + run: | + mkdir -p ${GITHUB_WORKSPACE}/reports + python -m pytest \ + -m bitsandbytes \ + tests/ \ + -v \ + --junitxml=${GITHUB_WORKSPACE}/reports/diffusers.xml \ + -o junit_logging=all \ + 2>&1 | tee ${GITHUB_WORKSPACE}/reports/diffusers.log + + - name: Upload JUnit XML and log + if: always() + uses: actions/upload-artifact@v4 + with: + name: reports-diffusers + path: reports/ + retention-days: 7 + + test-axolotl: + name: Axolotl bnb kernel tests + if: github.repository == 'bitsandbytes-foundation/bitsandbytes' + runs-on: bandb-aws-g5-4xlarge-plus-use1-public-80 # A10G + steps: + - name: Show GPU information + run: nvidia-smi + + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install torch + bnb (from continuous-release) + run: | + pip install torch==${TORCH_VERSION} --index-url ${PYPI_INDEX} + pip install "bitsandbytes[test] @ ${BNB_WHEEL_URL}" + + - name: Install axolotl and clone matching tag + run: | + pip install axolotl transformers accelerate peft + AXOLOTL_VERSION=$(pip show axolotl | awk '/^Version:/ {print $2}') + echo "Installed axolotl v${AXOLOTL_VERSION}" + git clone --depth=1 --branch 
"v${AXOLOTL_VERSION}" \ + https://github.com/axolotl-ai-cloud/axolotl.git /tmp/axolotl + + - name: Show environment + run: | + pip list + python -m torch.utils.collect_env + + - name: Run axolotl bnb kernel tests + working-directory: /tmp/axolotl + shell: bash -o pipefail {0} + run: | + mkdir -p ${GITHUB_WORKSPACE}/reports + python -m pytest \ + tests/e2e/kernels/test_quantize.py \ + tests/e2e/kernels/test_lora.py \ + "tests/e2e/kernels/test_lora_features.py::TestQuantizedModels" \ + -v \ + --junitxml=${GITHUB_WORKSPACE}/reports/axolotl.xml \ + -o junit_logging=all \ + 2>&1 | tee ${GITHUB_WORKSPACE}/reports/axolotl.log + + - name: Upload JUnit XML and log + if: always() + uses: actions/upload-artifact@v4 + with: + name: reports-axolotl + path: reports/ + retention-days: 7 + + test-peft: + name: PEFT bnb tests (single GPU) + if: github.repository == 'bitsandbytes-foundation/bitsandbytes' + runs-on: bandb-aws-g6-4xlarge-plus-use1-public-80 # L4 (matches peft CI) + steps: + - name: Show GPU information + run: nvidia-smi + + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install torch + bnb (from continuous-release) + run: | + pip install torch==${TORCH_VERSION} --index-url ${PYPI_INDEX} + pip install "bitsandbytes[test] @ ${BNB_WHEEL_URL}" + + - name: Install peft and clone matching tag + run: | + pip install "peft[test]" transformers accelerate + PEFT_VERSION=$(pip show peft | awk '/^Version:/ {print $2}') + echo "Installed peft v${PEFT_VERSION}" + git clone --depth=1 --branch "v${PEFT_VERSION}" \ + https://github.com/huggingface/peft.git /tmp/peft + + - name: Show environment + run: | + pip list + python -m torch.utils.collect_env + + - name: Run peft bnb tests + working-directory: /tmp/peft + env: + IS_GITHUB_CI: "1" + shell: bash -o pipefail {0} + run: | + mkdir -p ${GITHUB_WORKSPACE}/reports + python -m pytest \ + -m single_gpu_tests \ + -k PeftBnbGPUExampleTests 
\ + tests/test_gpu_examples.py \ + -v \ + --junitxml=${GITHUB_WORKSPACE}/reports/peft.xml \ + -o junit_logging=all \ + 2>&1 | tee ${GITHUB_WORKSPACE}/reports/peft.log + + - name: Upload JUnit XML and log + if: always() + uses: actions/upload-artifact@v4 + with: + name: reports-peft + path: reports/ + retention-days: 7 + + test-peft-multigpu: + name: PEFT bnb tests (multi GPU) + if: false # disabled until bandb-aws-g6-12xlarge-plus runner is provisioned + runs-on: bandb-aws-g6-12xlarge-plus-use1-public-80 # 4× L4 + steps: + - name: Show GPU information + run: nvidia-smi + + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install torch + bnb (from continuous-release) + run: | + pip install torch==${TORCH_VERSION} --index-url ${PYPI_INDEX} + pip install "bitsandbytes[test] @ ${BNB_WHEEL_URL}" + + - name: Install peft and clone matching tag + run: | + pip install "peft[test]" transformers accelerate + PEFT_VERSION=$(pip show peft | awk '/^Version:/ {print $2}') + echo "Installed peft v${PEFT_VERSION}" + git clone --depth=1 --branch "v${PEFT_VERSION}" \ + https://github.com/huggingface/peft.git /tmp/peft + + - name: Show environment + run: | + pip list + python -m torch.utils.collect_env + + - name: Run peft bnb tests + working-directory: /tmp/peft + env: + IS_GITHUB_CI: "1" + shell: bash -o pipefail {0} + run: | + mkdir -p ${GITHUB_WORKSPACE}/reports + python -m pytest \ + -m multi_gpu_tests \ + -k PeftBnbGPUExampleTests \ + tests/test_gpu_examples.py \ + -v \ + --junitxml=${GITHUB_WORKSPACE}/reports/peft-multigpu.xml \ + -o junit_logging=all \ + 2>&1 | tee ${GITHUB_WORKSPACE}/reports/peft-multigpu.log + + - name: Upload JUnit XML and log + if: always() + uses: actions/upload-artifact@v4 + with: + name: reports-peft-multigpu + path: reports/ + retention-days: 7 + + # ─── Consolidated report ────────────────────────────────────────────────── + # Runs after all test jobs 
finish (success or failure). + # Downloads the JUnit XMLs, runs our report script, writes to the job + # summary, uploads artifacts, and posts a consolidated message to + # #bnb-daily-ci-collab on Slack. + + report: + name: Consolidated report + needs: [test-transformers, test-transformers-multigpu, test-diffusers, test-axolotl, test-peft, test-peft-multigpu] + if: always() && github.repository == 'bitsandbytes-foundation/bitsandbytes' + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Download all report artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts + pattern: reports-* + + - name: Consolidate XMLs into reports/ + run: | + mkdir -p reports + # Each artifact lands in artifacts/reports-<name>/ — flatten to reports/<name>.xml + find artifacts -name '*.xml' -exec cp {} reports/ \; + find artifacts -name '*.log' -exec cp {} reports/ \; + ls -la reports/ + + - name: Generate consolidated report + post to Slack + env: + SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + run: | + pip install slack_sdk + python scripts/integration_test_report.py \ + --reports-dir reports/ \ + --output consolidated_report.md \ + --slack-channel bnb-daily-ci-collab + + - name: Write to job summary + if: always() + run: | + cat consolidated_report.md >> $GITHUB_STEP_SUMMARY + + - name: Upload consolidated report + if: always() + uses: actions/upload-artifact@v4 + with: + name: consolidated-report + path: | + consolidated_report.md + reports/ + retention-days: 14 diff --git a/agents/integration_tests_guide.md b/agents/integration_tests_guide.md new file mode 100644 index 000000000..3fea9d4ae --- /dev/null +++ b/agents/integration_tests_guide.md @@ -0,0 +1,630 @@ +# Downstream Integration Tests Guide + +This document captures the research and implementation plan for adding nightly integration tests +that run the bitsandbytes-related test 
suites from downstream libraries (transformers, accelerate, +peft) against the current bnb build. The goal is to catch breakage before it reaches users. + +## Design Principles + +From discussion with the team: + +- **Smoke test, not overkill** — single GPU, single config, no large matrix +- Latest torch + bnb from main + downstream libs from latest tagged release +- One GPU: **A10** (see GPU selection rationale below) +- Run the relevant tests that already exist in the downstream repos rather than writing new ones from scratch + +--- + +## Downstream Test Suites + +### 1. Transformers (`huggingface/transformers`) + +**Local clone:** `~/src/transformers` (Titus's fork: `Titus-von-Koeller/transformers`) + +**Test files:** +- `tests/quantization/bnb/test_4bit.py` (~666 lines, 7 test classes) +- `tests/quantization/bnb/test_mixed_int8.py` (~400 lines) +- `tests/quantization/bnb/README.md` (troubleshooting guide, not a test) +- `tests/quantization/bnb/__init__.py` + +**Test classes and what they cover:** + +`test_4bit.py`: +- `Bnb4BitTest` — 4-bit model loading (bloom-1b7), memory footprint, parameter counting, config serialization, generation quality, device/dtype assignment guards +- `Bnb4BitT5Test` — 4-bit T5 inference with/without `keep_in_fp32_modules` (t5-small, flan-t5-small) +- `Classes4BitModelTest` — AutoModel, AutoModelForSequenceClassification, AutoModelForCausalLM, AutoModelForSeq2SeqLM all with 4-bit (bloom-560m, t5-small) +- `Pipeline4BitTest` — `pipeline("text-generation")` with 4-bit (bloom-1b7) +- `Bnb4bitTestMultiGpu` — multi-GPU balanced loading (bloom-1b7) — **skip for single-GPU smoke test** +- `Bnb4BitTestTraining` — freeze base + LoRA adapters + forward/backward pass (opt-350m) +- `BaseSerializationTest` / `ExtendedSerializationTest` / `BloomSerializationTest` / `GPTSerializationTest` — save/load round-trips with all quant_type/double_quant/safe_serialization combos (opt-125m, bloom-560m, gpt2-xl) +- `Bnb4BitTestBasicConfigTest` — config 
validation edge cases (opt-125m) +- `Bnb4BitGPT2Test` — same as Bnb4BitTest but with gpt2-xl + +`test_mixed_int8.py`: +- `MixedInt8Test` — 8-bit model loading (bloom-1b7), memory footprint, generation quality, config serialization, `get_keys_to_not_convert` for various architectures +- `MixedInt8TestMultiGpu` — multi-GPU 8-bit loading — **skip for single-GPU smoke test** +- `MixedInt8TestTraining` — 8-bit training with LoRA adapters (opt-350m) +- `MixedInt8TestPipeline` — pipeline compatibility with 8-bit +- `MixedInt8T5Test` — 8-bit T5 loading/inference + +**Models downloaded:** bloom-1b7, bloom-560m, opt-125m, opt-350m, t5-small, flan-t5-small, gpt2-xl, rwkv-4-169m-pile, (config-only: mpt-7b, blip2-opt-2.7b, roberta-large) + +**Decorators/markers:** `@slow`, `@require_bitsandbytes`, `@require_accelerate`, `@require_torch_gpu`, `@require_torch_multi_gpu` + +**How to run:** +```bash +RUN_SLOW=1 python -m pytest tests/quantization/bnb/ -v +``` + +**Dependencies:** transformers + accelerate + bitsandbytes + torch + +**Their CI GPU:** NVIDIA A10G. The quantization-CI job is defined in `.github/workflows/self-scheduled-caller.yml` → `quantization-ci:` and runs as a matrix over `aws-g5-4xlarge-cache` (1× A10G) and `aws-g5-12xlarge-cache` (4× A10G), so both single- and multi-GPU get coverage. Uses the docker image `huggingface/transformers-quantization-latest-gpu`. Reports go to `#transformers-ci-daily-quantization`. + +--- + +### 2. 
Accelerate (`huggingface/accelerate`) + +**Local clone:** `~/src/accelerate` + +**Test file:** +- `tests/test_quantization.py` (~400 lines) + +**Test classes and what they cover:** + +- `BitsAndBytesConfigIntegration` — unit test for `BnbQuantizationConfig` validation (no GPU needed) +- `MixedInt8EmptyModelTest` — the main test class: + - Creates model from empty weights via `init_empty_weights()` + `load_and_quantize_model()` + - Tests: memory footprint, 8-bit linear verification, `llm_int8_skip_modules`, inference correctness, `keep_in_fp32_modules`, serialization round-trip + - Uses `marcsun13/bloom-1b7_with_lm_head` (bloom-1b7 variant with pre-uploaded pytorch_model.bin) + - Has `@require_multi_device` tests for CPU+GPU dispatch with custom device maps — **skip for single-GPU** +- `MixedInt8TestPipeline` — pipeline integration +- `MixedInt4EmptyModelTest` — 4-bit equivalent of the above (similar coverage pattern) + +**Models downloaded:** marcsun13/bloom-1b7_with_lm_head + +**Decorators/markers:** `@slow`, `@require_bnb`, `@require_cuda_or_xpu`, `@require_huggingface_suite`, `@require_non_torch_xla` + +**How to run:** +```bash +RUN_SLOW=1 python -m pytest tests/test_quantization.py -s -v +``` + +**Dependencies:** accelerate + bitsandbytes + transformers + torch + huggingface_hub + +**Their CI GPU:** NVIDIA L4 (AWS `g6` instances: `aws-g6-4xlarge-plus`). Expected values calibrated on Quadro RTX 8000 / RTX Titan. + +**Their Makefile:** `test_quantization.py` is included in the `test_core` target (runs all tests/ except big_modeling, examples, cli). + +--- + +### 3. 
PEFT (`huggingface/peft`) + +**Local clone:** `~/src/peft` + +**Test files:** +- `tests/test_gpu_examples.py` (~3200 lines) — `PeftBnbGPUExampleTests` class with 30+ bnb tests +- `tests/test_common_gpu.py` (~1800 lines) — `PeftGPUCommonTests` class with 30+ bnb tests + +**What `test_gpu_examples.py` covers (bnb subset):** +- `test_causal_lm_training_4bit` / `test_causal_lm_training_multi_gpu_4bit` — QLoRA training end-to-end +- `test_4bit_adalora_causalLM` / `test_8bit_adalora_causalLM` — AdaLoRA + bnb +- `test_4bit_non_default_adapter_name` / `test_8bit_non_default_adapter_name` +- `test_causal_lm_training_4bit_dora` / `test_causal_lm_training_8bit_dora` — DoRA + bnb +- `test_causal_lm_training_4bit_vera` / `test_causal_lm_training_4bit_pvera` — VeRA + bnb +- `test_causal_lm_training_8bit_randlora` / `test_causal_lm_training_4bit_randlora` — RandLoRA + bnb +- `test_causal_lm_training_8bit_road` / `test_causal_lm_training_4bit_road` — ROAD + bnb +- `test_initialize_dora_with_bnb_on_cpu` — DoRA CPU init with quantized model +- PiSSA 4-bit/8-bit tests, OLoRA 4-bit/8-bit tests, LoftQ tests +- Multi-GPU variants of the above (marked `@pytest.mark.multi_gpu_tests`) + +**What `test_common_gpu.py` covers (bnb subset):** +- `test_lora_bnb_8bit_quantization` / `test_lora_bnb_4bit_quantization` +- `test_vera_bnb_8bit_quantization` / `test_vera_bnb_4bit_quantization` +- `test_randlora_bnb_8bit_quantization` / `test_randlora_bnb_4bit_quantization` +- `test_ia3_bnb_8bit_quantization` / `test_ia3_bnb_4bit_quantization` +- `test_road_bnb_8bit_quantization` / `test_road_bnb_4bit_quantization` +- `test_*_bnb_quantization_from_pretrained_safetensors` for lora, adalora, vera, randlora, ia3 +- `test_8bit_merge_lora` / `test_4bit_merge_lora` / `test_*_merge_and_disable_lora` +- `test_4bit_dora_inference` / `test_8bit_dora_inference` / `test_4bit_dora_merging` / `test_8bit_dora_merging` +- `test_4bit_lora_mixed_adapter_batches_lora` / `test_8bit_lora_mixed_adapter_batches_lora` +- 
`test_adaption_prompt_8bit` / `test_adaption_prompt_4bit` + +**Models downloaded:** opt-350m, opt-125m, bloomz-560m, flan-t5-base, gpt2, whisper-large, TinyLlama-1.1B, various `peft-internal-testing/tiny-random-*` models + +**Marker system:** +- `@require_bitsandbytes` — class-level decorator that also applies `pytest.mark.bitsandbytes` +- `@pytest.mark.single_gpu_tests` / `@pytest.mark.multi_gpu_tests` — per-test method +- The `bitsandbytes` marker is defined in `pyproject.toml`: `"bitsandbytes: select bitsandbytes integration tests"` +- The `require_bitsandbytes` decorator is in `tests/testing_utils.py`: applies both skip-if-missing and the `bitsandbytes` marker + +**How to run (bnb-only, single GPU):** +```bash +python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_gpu_examples.py tests/test_common_gpu.py +``` + +**Dedicated Makefile targets (already exist in peft):** +```makefile +tests_examples_single_gpu_bnb: + python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_gpu_examples.py + +tests_core_single_gpu_bnb: + python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py +``` + +**Dependencies:** peft + bitsandbytes + transformers + accelerate + datasets + torch + +**Their CI GPU:** NVIDIA L4 (AWS `g6` instances). Tests say "designed for 1x NVIDIA T4 16GB" but actually run on L4 24GB. + +**Notable:** PEFT already has a `transformers_tests` Makefile target that clones transformers and runs its bnb tests: +```makefile +transformers_tests: + RUN_SLOW=1 python -m pytest transformers-clone/tests/quantization/bnb +``` +This is prior art for the exact pattern we want to adopt. + +--- + +## GPU Selection: match each downstream project's own CI + +To minimise spurious failures from GPU-calibrated expected values (memory +footprint ratios, generation outputs), each test job runs on the same GPU +class as that project's own CI. 
+ +| Downstream project | Their CI GPU | Our runner | +|---|---|---| +| Transformers | A10G (`aws-g5-4xlarge-cache` / `aws-g5-12xlarge-cache`) | `bandb-aws-g5-4xlarge-plus-use1-public-80` | +| Accelerate | L4 (`aws-g6-4xlarge-plus`) | `bandb-aws-g5-4xlarge-plus-use1-public-80` (A10G — close enough for now) | +| PEFT single-GPU | L4 (`aws-g6-4xlarge-plus`) | `bandb-aws-g6-4xlarge-plus-use1-public-80` | +| PEFT multi-GPU | 4× L4 (`aws-g6-12xlarge-plus`, uses 2) | `bandb-aws-g6-12xlarge-plus-use1-public-80` (being provisioned by infra) | + +### Available bnb self-hosted runners + +All are single-GPU `4xlarge-plus` unless noted: + +| Runner | AWS instance | GPU | VRAM | +|---|---|---|---| +| `bandb-aws-g4dn-4xlarge-plus-use1-public-80` | g4dn.4xlarge | 1× T4 | 16 GB | +| `bandb-aws-g5-4xlarge-plus-use1-public-80` | g5.4xlarge | 1× A10G | 24 GB | +| `bandb-aws-g5g-4xlarge-plus-use1-public-80` | g5g.4xlarge | 1× T4 (ARM) | 16 GB | +| `bandb-aws-g6-4xlarge-plus-use1-public-80` | g6.4xlarge | 1× L4 | 24 GB | +| `bandb-aws-g6e-4xlarge-plus-use1-public-80` | g6e.4xlarge | 1× L40S | 48 GB | +| `bandb-aws-g6-12xlarge-plus-use1-public-80` | g6.12xlarge | 4× L4 | 4× 24 GB | +| `banb-aws-general-8-plus-use1-public-80` | general-purpose | (no GPU) | — | + +--- + +## Implementation Plan + +### 1. Workflow Structure + +Add a new job to `.github/workflows/tests-nightly.yml` (or a standalone `.github/workflows/tests-integration-nightly.yml`). + +Single matrix entry: +- **Platform:** linux-x64 +- **GPU:** A10 +- **CUDA:** 12.8.1 (maps to torch 2.9.1 in current nightly config) +- **Torch:** latest (2.9.1 per current mapping, or just use latest stable) + +### 2. 
CI Steps (pseudocode) + +```yaml +test-integration: + name: Integration + if: github.repository == 'bitsandbytes-foundation/bitsandbytes' + runs-on: bandb-aws-g5-4xlarge-plus-use1-public-80 # A10 runner + steps: + - uses: actions/checkout@v4 + + # Build and install bnb from this commit + - name: Build bnb + run: | + pip install torch==2.9.1 --index-url https://download.pytorch.org/whl/cu128 + pip install -e ".[test]" -v + + # Install downstream libs (latest release, not main) + - name: Install downstream dependencies + run: | + pip install transformers accelerate peft datasets + + # Clone downstream repos (for their test files) + - name: Clone downstream test suites + run: | + # Use the version tags matching the installed pip versions + TRANSFORMERS_VERSION=$(pip show transformers | grep Version | cut -d' ' -f2) + ACCELERATE_VERSION=$(pip show accelerate | grep Version | cut -d' ' -f2) + PEFT_VERSION=$(pip show peft | grep Version | cut -d' ' -f2) + + git clone --depth=1 --branch v${TRANSFORMERS_VERSION} \ + https://github.com/huggingface/transformers.git /tmp/transformers + git clone --depth=1 --branch v${ACCELERATE_VERSION} \ + https://github.com/huggingface/accelerate.git /tmp/accelerate + git clone --depth=1 --branch v${PEFT_VERSION} \ + https://github.com/huggingface/peft.git /tmp/peft + + # Run transformers bnb tests + - name: Transformers bnb tests + run: | + cd /tmp/transformers + RUN_SLOW=1 python -m pytest tests/quantization/bnb/ -v \ + -k "not MultiGpu and not multi_gpu" \ + --timeout=600 + + # Run accelerate quantization tests + - name: Accelerate quantization tests + run: | + cd /tmp/accelerate + RUN_SLOW=1 python -m pytest tests/test_quantization.py -s -v \ + -k "not multi_device" \ + --timeout=600 + + # Run peft bnb tests (single GPU only) + - name: PEFT bnb tests + run: | + cd /tmp/peft + python -m pytest \ + -m "single_gpu_tests and bitsandbytes" \ + tests/test_gpu_examples.py tests/test_common_gpu.py -v \ + --timeout=600 +``` + +### 3. 
Key Implementation Details + +**Version pinning strategy:** Install downstream libs from PyPI (latest release), then clone the matching tag for test files. This ensures the test code matches the installed library version. The `-k "not MultiGpu"` filter excludes multi-GPU tests since we run on a single GPU. + +**Tag naming conventions:** +- transformers: `v4.XX.X` (e.g. `v4.46.0`) +- accelerate: `v0.XX.X` or `v1.X.X` (e.g. `v0.34.0`) +- peft: `v0.XX.X` (e.g. `v0.13.0`) + +**Environment variables:** +- `RUN_SLOW=1` — required for transformers and accelerate to run `@slow`-marked tests +- `CUDA_VISIBLE_DEVICES=0` — single GPU +- `IS_GITHUB_CI=1` — some peft test utilities check this + +**Model caching:** These tests download ~10GB+ of models. Consider: +- Persistent HF_HOME cache on the runner +- Or accept the download time since this is nightly, not PR CI + +**Timeout:** Individual test suites should have a timeout. 10 min per suite is generous. Total job timeout ~45 min. + +**Failure handling:** Use `continue-on-error: true` per step with a summary step at the end, so all three suites run even if one fails. Alternatively, keep it strict — a failure in any downstream suite is a real signal. + +### 4. Changes to pyproject.toml + +No changes needed to `pyproject.toml` extras. The downstream libs and their dependencies are installed directly in CI, not as part of bnb's package metadata. This keeps bnb's dependency footprint clean. + +### 5. Changes to test-runner.yml (if reusing it) + +The current `test-runner.yml` installs `.[test]` and runs `pytest --durations=100` against the bnb test directory. For integration tests, you'd either: + +**Option A:** Add a new reusable workflow `integration-test-runner.yml` that handles downstream cloning and selective test execution. Cleaner separation. + +**Option B:** Add parameters to `test-runner.yml` (e.g. `run_integration: true`) that trigger additional steps. Risks making the runner complex. 
+ +**Recommendation:** Option A — a separate workflow is simpler and won't interfere with the well-tested existing runner. + +--- + +## Test Counts (estimated) + +| Library | Single-GPU bnb tests | Multi-GPU (excluded) | +|---|---|---| +| Transformers test_4bit.py | ~15 tests | ~1 test | +| Transformers test_mixed_int8.py | ~15 tests | ~2 tests | +| Accelerate test_quantization.py | ~10 tests | ~3 tests | +| PEFT test_gpu_examples.py | ~20 tests | ~10 tests | +| PEFT test_common_gpu.py | ~30 tests | ~5 tests | +| **Total** | **~90 tests** | ~21 excluded | + +--- + +## Potential Issues + +1. **Expected value mismatches:** Some tests hardcode memory footprint ratios or generation outputs calibrated on specific GPUs (RTX Titan, Quadro RTX 8000). These may flake on A10. If so, those specific tests can be excluded via `-k "not test_name"`. + +2. **transformers testing_utils imports:** The transformers test files import from `transformers.testing_utils` which requires the transformers package to be installed — this is satisfied by the pip install step. + +3. **peft testing_utils imports:** Similarly, peft tests import from `tests.testing_utils` which is in the peft repo's test directory — running from the cloned repo directory handles this. + +4. **accelerate test_utils imports:** accelerate tests use `from accelerate.test_utils import ...` — these are part of the installed accelerate package, so this works. + +5. **Downstream version compatibility:** If a new bnb release changes APIs, the latest tagged release of downstream libs may not have caught up yet. This is actually the desired behavior — it tells you the release would break current users. For testing against main (to check if downstream has already adapted), that could be a separate optional job. + +6. **HuggingFace Hub rate limits / model availability:** Model downloads could fail due to transient Hub issues. Consider retry logic or cached models. 
+ +--- + +## Reference: Existing bnb Nightly Config + +From `.github/workflows/tests-nightly.yml`, the A10 CUDA entries: + +```yaml +# A10 runs on: bandb-aws-g5-4xlarge-plus-use1-public-80 +- platform: linux-x64 + gpu_type: A10 + cuda_version: "12.8.1" + torch_version: "2.9.1" + pypi_index: "https://download.pytorch.org/whl/cu128" +``` + +This configuration should be reused for the integration test job. + +--- + +## Reference: Downstream Repo Locations + +| Repo | Local path | Remote | +|---|---|---| +| transformers | `~/src/transformers` | `Titus-von-Koeller/transformers` (fork of `huggingface/transformers`) | +| accelerate | `~/src/accelerate` | `huggingface/accelerate` | +| peft | `~/src/peft` | `huggingface/peft` | +| diffusers | `~/src/diffusers` | `huggingface/diffusers` | + +--- + +## Slack Reporting + +### Decision: Diffusers-style consolidated reports via JUnit XML + +We evaluated Slack reporting across HF projects: + +| Approach | Used by | Complexity | Multi-job consolidation | +|---|---|---|---| +| `pytest-reportlog` JSONL + simple parser | peft, accelerate | Low | No (one msg per job) | +| Custom `--make-reports` + consolidated parser | diffusers | Medium | Yes | +| Full `notification_service.py` | transformers | High | Yes | + +**Chosen approach:** Diffusers-style consolidated reporting, adapted to use **JUnit XML** (`--junitxml`, +built into pytest) instead of diffusers' custom `--make-reports` plugin. JUnit XML works everywhere — +we can't rely on `--make-reports` being available in downstream repos. + +### Architecture + +``` +Job: transformers-bnb-tests + → pytest ... --junitxml=reports/transformers.xml + → upload-artifact: reports/ + +Job: accelerate-bnb-tests + → pytest ... --junitxml=reports/accelerate.xml + → upload-artifact: reports/ + +Job: peft-bnb-tests + → pytest ... 
--junitxml=reports/peft.xml + → upload-artifact: reports/ + +Job: report (needs: all three, if: always()) + → download all artifacts into reports/ + → python scripts/integration_test_report.py --reports-dir reports/ --slack-channel bnb-ci-nightly + → also writes markdown to $GITHUB_STEP_SUMMARY +``` + +### Report script: `scripts/integration_test_report.py` + +**Status: implemented and tested locally with synthetic fixtures.** + +Parses JUnit XML files, produces: +1. **Markdown report** (for `$GITHUB_STEP_SUMMARY` and `--output` file) +2. **Slack main message** — summary line + per-suite table (tests, failed, duration, success rate) +3. **Slack thread replies** — one per failing suite with test names and error messages + +Success rate is calculated as `passed / (passed + failed)`, excluding skipped tests. + +Key flags: +- `--reports-dir` — directory containing `*.xml` files (suite name derived from filename) +- `--slack-channel` — Slack channel to post to (omit to skip Slack) +- `--dry-run` — print Slack payload as JSON instead of posting +- `--output` — write markdown report to file + +Dependencies: `slack_sdk` (only needed if posting to Slack), `tabulate` is NOT required +(we use manual string formatting for the Slack table). + +### Slack setup needed (not yet done) + +1. Create a Slack bot with `chat:write` scope +2. Add bot token as `SLACK_API_TOKEN` repository secret +3. Create a `#bnb-ci-nightly` channel (or similar) and invite the bot +4. Ask HF colleagues about preferred conventions — see draft message below + +### Draft Slack message to colleagues + +``` +CI results Slack posting tool? best practice? + +Hey, + +I'm currently looking into revamping the BNB x HF integration tests pipeline and +just wanted to check what are the best practices for Slack reporting rn? + +The plan is to run downstream bnb test suites from transformers, accelerate, and peft +as a nightly smoke test (single A10, latest torch, bnb from main, downstream libs from +latest release). 
I want to post results to Slack so we catch breakage early. + +I looked at how the HF repos handle this today: + +- transformers has a full-blown notification_service.py (~500 lines) that parses + artifacts, categorizes by model/GPU, compares against previous runs — probably more + than we need +- peft and accelerate share a much simpler pattern: pytest-reportlog writes JSONL logs, + then a ~140-line Python script parses them and posts a Block Kit message via + slack_sdk.WebClient to a dedicated channel (#peft-ci-daily / #accelerate-ci-daily) +- diffusers has a middle ground: multiple test jobs upload report artifacts, a + consolidation job downloads them all and posts a unified summary with threaded + failure details + +The diffusers approach seems like the right fit for us (multiple downstream suites → +one consolidated report). Before I go ahead and set that up: + +1. Is there a shared HF tool or recommended approach for this now, or is it still + per-repo scripts? +2. Any preferred Slack workspace / channel naming convention for bnb CI? +3. Anything I should know about bot token provisioning — who do I ask to set up the + secret? + +Thanks! +``` + +--- + +## Implementation Progress + +### Completed + +1. **Research** — all downstream test suites identified with exact file paths, test commands, + markers, models, and dependencies +2. **GPU selection** — A10 chosen (rationale documented above) +3. **Report script** — `scripts/integration_test_report.py` written and tested with synthetic + JUnit XML fixtures +4. 
**NixOS CUDA fix** — `~/dotfiles/home/shell.nix` updated to include `/run/opengl-driver/lib` + in `LD_LIBRARY_PATH` (required for NVIDIA driver runtime `libcuda.so.1` on NixOS) + +### Next steps (iterative approach) + +**Phase 1: transformers only, manual dispatch** +- Write workflow YAML with `workflow_dispatch` trigger (no cron yet) +- Single job: transformers bnb tests only + report job +- Validate: cloning, pip installs, pytest command, artifact upload, report parsing +- Iterate via push → manual trigger → check logs → fix + +**Phase 2: add accelerate + peft** +- Add remaining two test jobs (same pattern, most issues already solved) +- Verify all three produce valid JUnit XML and report consolidates correctly + +**Phase 3: enable Slack + cron** +- Add `SLACK_API_TOKEN` secret +- Remove `--dry-run` from report step +- Uncomment cron schedule + +### Local testing notes + +- **GPU available:** dual RTX 4090 (24GB each), CUDA 13.0, driver 580.119.02 +- **Pixi test env:** has `torch 2.11.0+cu130` and `transformers 4.57.6` installed +- **CUDA in pixi:** requires `LD_LIBRARY_PATH` to include `/run/opengl-driver/lib` + (NixOS-specific — the pixi env has CUDA-enabled torch but the driver libs are in a + non-standard NixOS path). Fixed in dotfiles, also exportable per-session: + `export LD_LIBRARY_PATH="/run/opengl-driver/lib:$LD_LIBRARY_PATH"` +- **Running transformers tests locally:** + ```bash + export LD_LIBRARY_PATH="/run/opengl-driver/lib:$LD_LIBRARY_PATH" + cd ~/src/transformers + ~/src/bnb/.pixi/envs/test/bin/python -m pytest tests/quantization/bnb/ \ + --junitxml=reports/transformers.xml -v \ + -k "not MultiGpu and not multi_gpu" + ``` + Note: needs `accelerate` installed (`pip install accelerate` in the test env). + The `RUN_SLOW` env var may be needed for some tests. + +--- + +## Current state (2026-04-13) + +This section supersedes the older **Implementation Plan** and **Implementation +Progress** sections above. 
PR #1923 on the `ci/nightly-integration-tests` branch +contains the working implementation. + +### Workflow structure + +`.github/workflows/tests-integration-nightly.yml` — 5 test jobs + 1 report job: + +| Job | Runner | GPU | Filter / command | +|---|---|---|---| +| `test-transformers` | `bandb-aws-g5-4xlarge-plus` | 1× A10G | `pytest tests/quantization/bnb/ -k "not MultiGpu and not multi_gpu"` | +| `test-transformers-multigpu` | `bandb-aws-g6-12xlarge-plus` | 2× L4 (of 4) | `pytest tests/quantization/bnb/ -k "MultiGpu or multi_gpu"` | +| `test-accelerate` | `bandb-aws-g5-4xlarge-plus` | 1× A10G | `pytest tests/test_quantization.py -k "not multi_device" -rs` | +| `test-peft` | `bandb-aws-g6-4xlarge-plus` | 1× L4 | `pytest -m single_gpu_tests -k PeftBnbGPUExampleTests tests/test_gpu_examples.py` | +| `test-peft-multigpu` | `bandb-aws-g6-12xlarge-plus` | 2× L4 (of 4) | `pytest -m multi_gpu_tests -k PeftBnbGPUExampleTests tests/test_gpu_examples.py` | +| `report` | `ubuntu-22.04` | — | consolidates all `*.xml` artifacts, writes markdown, uploads | + +Triggers: `workflow_dispatch` + `pull_request` scoped to changes in the workflow +file and the report script. `schedule:` is commented out — flip on in a follow-up +PR once the workflow is stable. + +### PEFT filter: Benjamin Bossan's recommendation + +For PEFT, use the narrower filter (20 tests) rather than the legacy +`-m "single_gpu_tests and bitsandbytes"` (86 tests across two files). The +narrower one focuses on `PeftBnbGPUExampleTests` — end-to-end QLoRA/QAdaLoRA/ +QDoRA/QVeRA training on bnb-quantized models, the highest-signal integration +regression tests. 
The excluded 66 tests are either: +- In `test_common_gpu.py`: per-tuner bnb API surface tests (bnb-facing but more + unit-test-like — already well covered elsewhere) +- In `test_gpu_examples.py` non-`PeftBnb*` classes (`TestPiSSA`, `TestOLoRA`, + `TestLoftQ`): these exercise bnb incidentally but failures there are usually + about those algorithms, not bnb itself + +Same shape for multi-GPU: `-m multi_gpu_tests -k PeftBnbGPUExampleTests`. + +### Legacy peft bnb CI (deleted 2025-10-21 in peft#2858) + +The legacy nightly lived in peft as `.github/workflows/nightly-bnb.yml`. Worth +preserving the design notes since our new workflow is effectively its +replacement: + +- Single-GPU runner: `aws-g6-4xlarge-plus` (1× L4) +- Multi-GPU runner: `aws-g6-12xlarge-plus` (2 of 4 L4s via `CUDA_VISIBLE_DEVICES=0,1`) +- Docker image matrix: `peft-gpu-bnb-source:latest` (bnb from main) **and** + `peft-gpu-bnb-latest:latest` (bnb release). Our workflow only tests bnb main + via the continuous-release wheel; testing against last-released bnb would + require a second job per downstream. +- Makefile targets: + - `tests_examples_single_gpu_bnb` / `tests_examples_multi_gpu_bnb` + - `tests_core_single_gpu_bnb` / `tests_core_multi_gpu_bnb` + - `transformers_tests` (cloned transformers, ran `tests/quantization/bnb/` — + no `-k` filter, so multi-GPU-marked transformers tests ran on the multi-GPU + job). This is what our `test-transformers-multigpu` recreates. + - `tests_gpu_bnb_regression` — already commented out before deletion. The + file it ran (`tests/bnb/test_bnb_regression.py`, 258 lines) was + byte-exact regression tests for bnb with a docstring saying it should + eventually live in bnb. **Opportunity:** move those tests into the bnb + repo as a separate follow-up. 
+- Slack channel: `#bnb-daily-ci-collab` (still exists) +- Used `huggingface/hf-workflows/.github/actions/post-slack` reusable action + (requires bot token — not viable if we only have a webhook; see Slack section) + +### Build reuse + +No dedicated build job. Each test job installs bnb from the +`continuous-release_main` GitHub pre-release wheel published by +`python-package.yml` on every push to main: + +``` +BNB_WHEEL_URL: "https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl" +``` + +Install command: `pip install "bitsandbytes[test] @ ${BNB_WHEEL_URL}"` — the +`[test]` extras pull pytest + other test deps declared in `pyproject.toml`. +Matches the convention in the existing `test-runner.yml`. + +### Slack + +Infra provisioned a **webhook URL** (`SLACK_WEBHOOK_URL` secret) for +`#bnb-daily-ci-collab`. Open question: accept webhook limitations, or request a +bot token? + +- **Webhook** — single POST, rich Block Kit supported, no threading, single + channel. Simplest to implement. +- **Bot token** (`chat:write` scope) — supports threaded replies (diffusers / + peft / accelerate all use this pattern). Cleaner UX when failures are + expanded in a thread rather than a large main message. + +Current path: start with webhook, fold per-suite failure details into the main +message (truncated, with a link to the GitHub Action). Can upgrade to a bot +token later if the noise becomes a problem. + +### Report script + +`scripts/integration_test_report.py` — parses JUnit XML files, produces a +markdown summary for `$GITHUB_STEP_SUMMARY`, and (pending webhook/bot-token +decision) posts to Slack. Sort by success rate ascending (worst first). Current +code uses `slack_sdk.WebClient` (bot-token flow); needs adaptation to a webhook +POST if we stay on webhook. + +### Follow-up work + +1. **Implement Slack posting** — once webhook vs bot token is decided. +2. 
**Iterate on transformers test failures** — sampling flakes in pipeline tests + (non-deterministic generation with `temperature=0.7`); `mosaicml/mpt-7b` is + gated (needs an `HF_TOKEN` secret). +3. **Update `test_bnb_regression.py` home** — move the 258-line byte-exact + regression file from legacy peft into bnb. Separate effort. +4. **Enable nightly `schedule:` cron** — last step, in its own PR. diff --git a/pyproject.toml b/pyproject.toml index f448a079e..2bfe1ba0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ test = [ "lion-pytorch==0.2.3", "pytest~=8.3", "scipy>=1.11.4,<2", - "transformers>=4.30.1,<5" + "transformers>=4.30.1,<6" ] [tool.setuptools] diff --git a/scripts/integration_test_report.py b/scripts/integration_test_report.py new file mode 100644 index 000000000..3286042ce --- /dev/null +++ b/scripts/integration_test_report.py @@ -0,0 +1,336 @@ +#!/usr/bin/env python +"""Parse JUnit XML test reports and post a consolidated summary to Slack. + +Designed for the bitsandbytes nightly integration tests that run downstream +test suites (transformers, accelerate, peft) against the current bnb build. + +Usage: + # Dry-run (print to stdout, no Slack): + python scripts/integration_test_report.py --reports-dir reports/ + + # Post to Slack: + python scripts/integration_test_report.py --reports-dir reports/ --slack-channel bnb-ci-nightly +""" + +import argparse +from datetime import date +import glob +import os +import sys +from xml.etree import ElementTree + + +def parse_junit_xml(file_path): + """Parse a JUnit XML file and return structured results.""" + tree = ElementTree.parse(file_path) + root = tree.getroot() + + # Handle both ... and bare ... 
+ if root.tag == "testsuites": + suites = root.findall("testsuite") + else: + suites = [root] + + tests = 0 + passed = 0 + failed = 0 + skipped = 0 + errors = 0 + total_time = 0.0 + failures = [] + + for suite in suites: + tests += int(suite.get("tests", 0)) + skipped += int(suite.get("skipped", 0)) + errors += int(suite.get("errors", 0)) + failed += int(suite.get("failures", 0)) + total_time += float(suite.get("time", 0)) + + for testcase in suite.findall("testcase"): + failure = testcase.find("failure") + error = testcase.find("error") + if failure is not None: + failures.append( + { + "test": f"{testcase.get('classname', '')}::{testcase.get('name', '')}", + "message": failure.get("message", ""), + } + ) + elif error is not None: + failures.append( + { + "test": f"{testcase.get('classname', '')}::{testcase.get('name', '')}", + "message": error.get("message", ""), + } + ) + + passed = tests - failed - skipped - errors + + return { + "tests": tests, + "passed": passed, + "failed": failed + errors, + "skipped": skipped, + "time": total_time, + "failures": failures, + } + + +def format_duration(seconds): + """Format seconds into a human-readable string.""" + m, s = divmod(int(seconds), 60) + if m > 0: + return f"{m}m{s:02d}s" + return f"{s}s" + + +def consolidate_reports(reports_dir): + """Find and parse all JUnit XML files in the reports directory.""" + xml_files = sorted(glob.glob(os.path.join(reports_dir, "**", "*.xml"), recursive=True)) + + if not xml_files: + print(f"No XML report files found in {reports_dir}", file=sys.stderr) + return {} + + results = {} + for xml_file in xml_files: + # Derive suite name from filename: "transformers.xml" -> "transformers" + suite_name = os.path.splitext(os.path.basename(xml_file))[0] + results[suite_name] = parse_junit_xml(xml_file) + + return results + + +def _success_rate(r): + """Success rate: passed / (passed + failed), ignoring skipped.""" + run = r["passed"] + r["failed"] + return (r["passed"] / run) if run > 0 else 1.0 
+ + +def generate_markdown(results): + """Generate a markdown summary report.""" + if not results: + return "No test results found." + + total_passed = sum(r["passed"] for r in results.values()) + total_failed = sum(r["failed"] for r in results.values()) + total_skipped = sum(r["skipped"] for r in results.values()) + total_time = sum(r["time"] for r in results.values()) + + lines = [] + lines.append("# BNB Integration Test Report") + lines.append("") + + total_run = total_passed + total_failed + if total_failed == 0: + lines.append(f"All {total_run} tests passed in {format_duration(total_time)}.") + else: + lines.append(f"**{total_failed} failures** out of {total_run} tests in {format_duration(total_time)}.") + if total_skipped > 0: + lines.append(f"({total_skipped} skipped)") + + lines.append("") + lines.append("| Suite | Tests | Passed | Failed | Skipped | Duration | Success Rate |") + lines.append("|-------|------:|-------:|-------:|--------:|---------:|-------------:|") + + # Sort by success rate ascending (worst first) + sorted_results = sorted(results.items(), key=lambda x: _success_rate(x[1])) + + for suite_name, r in sorted_results: + run = r["passed"] + r["failed"] + rate = f"{r['passed'] / run * 100:.1f}%" if run > 0 else "N/A" + lines.append( + f"| {suite_name} | {r['tests']} | {r['passed']} | {r['failed']} " + f"| {r['skipped']} | {format_duration(r['time'])} | {rate} |" + ) + + # Failure details + any_failures = any(r["failures"] for r in results.values()) + if any_failures: + lines.append("") + lines.append("## Failures") + for suite_name, r in sorted_results: + if r["failures"]: + lines.append(f"### {suite_name}") + lines.append("```") + for f in r["failures"]: + if f["message"]: + lines.append(f"FAILED {f['test']} - {f['message']}") + else: + lines.append(f"FAILED {f['test']}") + lines.append("```") + lines.append("") + + return "\n".join(lines) + + +def create_slack_payload(results): + """Create Slack Block Kit payload from results.""" + 
total_passed = sum(r["passed"] for r in results.values())
+    total_failed = sum(r["failed"] for r in results.values())
+    total_skipped = sum(r["skipped"] for r in results.values())
+
+    total_run = total_passed + total_failed
+
+    if total_run == 0:
+        emoji = "⚠️"
+        rate_str = "N/A"
+    elif total_failed == 0:
+        emoji = "✅"
+        rate_str = "100%"
+    elif total_failed / total_run < 0.1:
+        emoji = "⚠️"
+        rate_str = f"{total_passed / total_run * 100:.1f}%"
+    else:
+        emoji = "❌"
+        rate_str = f"{total_passed / total_run * 100:.1f}%"
+
+    summary = f"{emoji} *BNB Integration Tests:* {rate_str} success ({total_passed}/{total_run} tests"
+    if total_skipped > 0:
+        summary += f", {total_skipped} skipped"
+    if total_failed > 0:
+        summary += f", {total_failed} failed"
+    summary += ")"
+
+    # Build table — sorted by success rate ascending (worst first)
+    sorted_results = sorted(results.items(), key=lambda x: _success_rate(x[1]))
+
+    table_lines = ["```"]
+    header = f"{'Suite':<15} {'Tests':>6} {'Failed':>7} {'Duration':>10} {'Success':>8}"
+    table_lines.append(header)
+    table_lines.append("-" * len(header))
+
+    for suite_name, r in sorted_results:
+        run = r["passed"] + r["failed"]
+        rate = f"{r['passed'] / run * 100:.1f}%" if run > 0 else "N/A"
+        table_lines.append(f"{suite_name:<15} {run:>6} {r['failed']:>7} {format_duration(r['time']):>10} {rate:>8}")
+
+    table_lines.append("```")
+
+    payload = [
+        {"type": "section", "text": {"type": "mrkdwn", "text": summary}},
+        {"type": "section", "text": {"type": "mrkdwn", "text": "\n".join(table_lines)}},
+    ]
+
+    # GitHub Actions link
+    run_id = os.environ.get("GITHUB_RUN_ID")
+    repo = os.environ.get("GITHUB_REPOSITORY", "bitsandbytes-foundation/bitsandbytes")
+    if run_id:
+        payload.append(
+            {
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": f"*<https://github.com/{repo}/actions/runs/{run_id}|View full run on GitHub Actions>*",
+                },
+            }
+        )
+
+    payload.append(
+        {
+            "type": "context",
+            "elements": [{"type": "plain_text", "text": f"Nightly integration test results for {date.today()}"}],
+        }
+    )
+
+    return 
payload + + +def create_failure_thread_payloads(results): + """Create per-suite Slack thread replies for failures.""" + threads = [] + + for suite_name, r in results.items(): + if not r["failures"]: + continue + + run = r["passed"] + r["failed"] + rate = f"{r['passed'] / run * 100:.1f}%" if run > 0 else "N/A" + lines = [f"*{suite_name}* (Success Rate: {rate})"] + lines.append("```") + for f in r["failures"]: + if f["message"]: + lines.append(f"FAILED {f['test']}") + lines.append(f" {f['message'][:200]}") + else: + lines.append(f"FAILED {f['test']}") + lines.append("```") + + threads.append("\n".join(lines)) + + return threads + + +def post_to_slack(channel, payload, thread_payloads): + """Post the report to Slack.""" + from slack_sdk import WebClient + + token = os.environ.get("SLACK_API_TOKEN") + if not token: + print("SLACK_API_TOKEN not set, skipping Slack post", file=sys.stderr) + return + + client = WebClient(token=token) + + # Main message + response = client.chat_postMessage( + channel=f"#{channel}", + text="BNB Integration Test Results", + blocks=payload, + ) + print(f"Posted to #{channel}") + + # Threaded failure details + ts = response["ts"] + for thread_msg in thread_payloads: + client.chat_postMessage( + channel=f"#{channel}", + thread_ts=ts, + text=thread_msg, + ) + + if thread_payloads: + print(f"Posted {len(thread_payloads)} failure thread replies") + + +def main(): + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--reports-dir", default="reports", help="Directory containing JUnit XML files") + parser.add_argument("--slack-channel", default=None, help="Slack channel name (omit to skip Slack)") + parser.add_argument("--output", default=None, help="Write markdown report to file") + parser.add_argument("--dry-run", action="store_true", help="Print Slack payload as JSON instead of posting") + args = parser.parse_args() + + results = consolidate_reports(args.reports_dir) + 
if not results: + sys.exit(1) + + # Markdown report + markdown = generate_markdown(results) + + if args.output: + with open(args.output, "w") as f: + f.write(markdown) + print(f"Report written to {args.output}") + + # Always print markdown (for $GITHUB_STEP_SUMMARY piping) + print(markdown) + + # Slack + payload = create_slack_payload(results) + thread_payloads = create_failure_thread_payloads(results) + + if args.dry_run: + import json + + print("\n--- Slack main payload ---") + print(json.dumps(payload, indent=2)) + for i, tp in enumerate(thread_payloads): + print(f"\n--- Thread reply {i + 1} ---") + print(tp) + elif args.slack_channel: + post_to_slack(args.slack_channel, payload, thread_payloads) + + +if __name__ == "__main__": + main()