# Source: .github/workflows/build_kernel_windows.yaml
# (huggingface/kernels @ ffee275f99f8863a01b7e9bbc8d2ca99840a6907)
name: "Build and test kernel - Windows"

# Runs on pushes to main, on pull requests that target main, and on manual
# dispatch.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
    # Activity types that start a run: a PR being opened, receiving new
    # commits, or being reopened.
    types: [opened, synchronize, reopened]
    # Pull requests that only touch docs/ or root-level Markdown files do not
    # start this workflow. The push trigger above has no such filter.
    paths-ignore:
      - "docs/**"
      - "*.md"
  workflow_dispatch:

# Least privilege for GITHUB_TOKEN: none of the steps visible in this workflow
# push commits, comment on PRs, or publish releases, so read access to the
# repository contents is sufficient.
permissions:
  contents: read

jobs:
  build:
    strategy:
      matrix:
        os: [windows-2022]
        # Quoted so the version stays a string: an unquoted 3.10 would be
        # parsed as the number 3.1.
        python: ['3.12']
        # Each entry describes one PyTorch/CUDA combination:
        #   cuda  — CUDA toolkit version to install (also part of the cache key)
        #   wheel — suffix of the PyTorch wheel index (.../whl/cu<wheel>)
        # Only the CUDA 12.8 combination is currently enabled.
        torch:
          # - { version: '2.9.1', cuda: '12.6.3', wheel: '126' }
          - { version: '2.9.1', cuda: '12.8.1', wheel: '128' }
          # - { version: '2.9.1', cuda: '13.0.1', wheel: '130' }

    name: Build kernel
    runs-on: ${{ matrix.os }}

    steps:
      # Check out the repository; later steps build kernel-builder and the
      # example kernels from this working tree. The action is pinned to a full
      # commit SHA, with the human-readable tag kept in the trailing comment.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # ---- CUDA toolkit (cache + skip installer on hit) ----
      # The toolkit tree under C:\Program Files\NVIDIA GPU Computing Toolkit is
      # cached. On a hit, the cuda-toolkit action is skipped entirely (it
      # otherwise spends ~7 min running the MSI even when the files are
      # already on disk), and the "Restore CUDA env vars" step replicates the
      # small bit of env setup the action would have done.
      - id: cuda-cache
        name: Cache CUDA toolkit
        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
        with:
          # Key components:
          #  - matrix.torch.cuda — each CUDA version gets its own cache
          #  - 714c97b3 — pinned SHA of huggingface/cuda-toolkit; bump it when
          #    the action changes so the toolkit is re-downloaded instead of
          #    reusing a stale tree
          #  - matrix.os — each runner image gets its own cache
          key: cuda-toolkit-${{ matrix.torch.cuda }}-714c97b3-${{ matrix.os }}
          path: 'C:\Program Files\NVIDIA GPU Computing Toolkit'

      # Cache miss only: run the real CUDA installer for the matrix version.
      # The SHA pinned here is the one abbreviated as 714c97b3 in the cache
      # key of the "Cache CUDA toolkit" step; keep the two in sync.
      - name: Install CUDA toolkit
        uses: huggingface/cuda-toolkit@714c97b32958862237b96401fb253a4261453c3b # v0.1.0
        if: ${{ steps.cuda-cache.outputs.cache-hit != 'true' }}
        with:
          cuda: '${{ matrix.torch.cuda }}'

      - name: Restore CUDA env vars (cache hit only)
        # When the installer step is skipped, nothing sets up the environment
        # that huggingface/cuda-toolkit's updatePath normally provides:
        # CUDA_PATH, CUDA_PATH_VX_Y, and <CUDA_PATH>\bin at the front of PATH.
        # This step recreates them so nvcc and the downstream builds find the
        # toolkit.
        #
        # It also re-installs the MSBuild integration. The CUDA installer
        # normally copies CUDA <ver>.{props,targets,xml} from the toolkit's
        # extras\visual_studio_integration\MSBuildExtensions\ into the VS
        # BuildCustomizations dir; without them CMake's CUDA language detection
        # fails with "No CUDA toolset found". The cache only restores the
        # toolkit tree, so the files are copied by hand on cache hits.
        if: ${{ steps.cuda-cache.outputs.cache-hit == 'true' }}
        shell: pwsh
        run: |
          # The install directory is named v<major>.<minor>; the patch
          # component of the matrix version is not part of the path.
          $major, $minor = "${{ matrix.torch.cuda }}".Split('.')[0..1]
          $cudaRoot = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$major.$minor"

          # Export the variables and the PATH entry to the following steps.
          @(
            "CUDA_PATH=$cudaRoot"
            "CUDA_PATH_V${major}_${minor}=$cudaRoot"
          ) | Out-File $env:GITHUB_ENV -Append -Encoding utf8
          "$cudaRoot\bin" | Out-File $env:GITHUB_PATH -Append -Encoding utf8

          $msBuildExt = Join-Path $cudaRoot 'extras\visual_studio_integration\MSBuildExtensions'
          if (-not (Test-Path $msBuildExt)) {
            throw "MSBuild integration not found in cached toolkit at $msBuildExt — cache may be incomplete."
          }

          # GitHub-hosted windows-2022 ships VS 2022 Enterprise; every edition
          # directory is enumerated anyway so a change of edition on the image
          # does not silently break the build.
          $vsEditions = Get-ChildItem 'C:\Program Files\Microsoft Visual Studio\2022' -Directory -ErrorAction SilentlyContinue
          if (-not $vsEditions) { throw "Visual Studio 2022 not found on runner." }
          $vsEditions | ForEach-Object {
            $dest = Join-Path $_.FullName 'MSBuild\Microsoft\VC\v170\BuildCustomizations'
            New-Item -ItemType Directory -Force -Path $dest | Out-Null
            Copy-Item -Path (Join-Path $msBuildExt '*') -Destination $dest -Force -Recurse
            Write-Host "Installed CUDA MSBuild integration into $dest"
          }

      # Sanity check: the job fails here if nvcc cannot be run from PATH,
      # whichever of the two CUDA setup paths above was taken.
      - name: NVCC checks
        run: nvcc -V

      # ---- Rust toolchain + cached kernel-builder build ----
      # Requests the stable toolchain with the minimal profile and
      # `override: true`.
      # NOTE(review): actions-rs/toolchain looks unmaintained upstream —
      # confirm, and consider migrating to a maintained toolchain action.
      - uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
        with:
          profile: minimal
          toolchain: stable
          override: true

      # Caches the workspace target/ directory together with
      # ~/.cargo/{registry,git}. The key follows Cargo.lock, so a change to
      # the dependency graph invalidates the artifact cache while unrelated
      # edits reuse it incrementally. This cuts the kernel-builder build from
      # ~8 min cold to ~30s warm.
      #
      # `workspaces` has to name the real workspace root (the root Cargo.toml
      # has `[workspace] members = [..., "kernel-builder", ...]`). Cargo always
      # writes target/ at the workspace root, so caching
      # ./kernel-builder/target would restore to a path cargo never reads.
      - name: Cache cargo + kernel-builder target
        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
        with:
          shared-key: kernel-builder-${{ matrix.os }}
          workspaces: '.'

      # Build the kernel-builder crate in release mode. `working-directory`
      # replaces the former `( cd … && … )` group: it does not depend on the
      # shell supporting `&&`, and without the PowerShell grouping operator
      # cargo's stdout is streamed to the log instead of being collected until
      # the command finishes.
      - name: Build kernel-builder
        working-directory: kernel-builder
        run: cargo build --release

      # ---- Python environment ----
      # Installs the interpreter selected by the matrix and enables the
      # action's pip cache.
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          cache: pip
          python-version: '${{ matrix.python }}'

      # Install the PyTorch release named in the matrix from the wheel index
      # matching the CUDA toolkit (cu<wheel>). Pinning to matrix.torch.version
      # keeps the build reproducible; an unpinned `pip install torch` would
      # silently pick up whatever release is newest on the index.
      - name: Install PyTorch
        run: >-
          pip install torch==${{ matrix.torch.version }}
          --index-url https://download.pytorch.org/whl/cu${{ matrix.torch.wheel }}

      # ---- Example kernel builds ----
      # Each step runs the Windows builder script against one example kernel
      # with the same flags (Release configuration, CUDA backend). The script
      # is invoked directly rather than inside `( … )`: PowerShell's grouping
      # operator collects a command's output until it has finished, which
      # would hide build progress in the log while the step is running.
      - name: Build cutlass GEMM kernel
        run: >-
          nix-builder\scripts\windows\builder.ps1
          -SourceFolder examples/kernels/cutlass-gemm
          -BuildConfig Release -Backend cuda -Build -Force

      - name: Build relu kernel
        run: >-
          nix-builder\scripts\windows\builder.ps1
          -SourceFolder examples/kernels/relu
          -BuildConfig Release -Backend cuda -Build -Force

      - name: Build relu-backprop-compile kernel
        run: >-
          nix-builder\scripts\windows\builder.ps1
          -SourceFolder examples/kernels/relu-backprop-compile
          -BuildConfig Release -Backend cuda -Build -Force

      - name: Build silu-and-mul kernel
        run: >-
          nix-builder\scripts\windows\builder.ps1
          -SourceFolder examples/kernels/silu-and-mul
          -BuildConfig Release -Backend cuda -Build -Force