Skip to content

Commit 8b250fb

Browse files
committed
Fix HIP FA launch bounds and Windows CPU packaging
Parenthesize the type-aware vec FlashAttention launch-bound expression so HIP's variadic __launch_bounds__ macro does not split the template argument list. Replace the Windows CPU release package's hard-coded VS2022 Enterprise OpenMP runtime lookup with a Visual Studio install-root search that works on newer runner images.
1 parent df8933c commit 8b250fb

4 files changed

Lines changed: 94 additions & 11 deletions

File tree

.github/workflows/release.yml

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -801,15 +801,8 @@ jobs:
801801
$ErrorActionPreference = "Stop"
802802
$bin = ".\build\bin\Release"
803803
Copy-Item ".\LICENSE" $bin
804-
$redistRoot = "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC"
805-
$omp = Get-ChildItem $redistRoot -Recurse -Filter "libomp140.x86_64.dll" |
806-
Where-Object { $_.FullName -like "*\debug_nonredist\x64\Microsoft.VC143.OpenMP.LLVM\libomp140.x86_64.dll" } |
807-
Sort-Object FullName -Descending |
808-
Select-Object -First 1
809-
if (-not $omp) {
810-
throw "VC143 x64 libomp140.x86_64.dll not found under $redistRoot"
811-
}
812-
Copy-Item $omp.FullName $bin -Force
804+
$omp = & ".\scripts\find-msvc-openmp-runtime.ps1" -Architecture x64
805+
Copy-Item -LiteralPath $omp -Destination $bin -Force
813806
python scripts\verify-windows-package.py $bin
814807
& "$bin\llama-server.exe" --version
815808
& "$bin\llama-cli.exe" --version

ggml/src/ggml-cuda/fattn-vec.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ static constexpr __host__ __device__ int ggml_cuda_fattn_vec_get_min_blocks() {
2828
#pragma clang diagnostic ignored "-Wpass-failed"
2929
#endif // __clang__
3030
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
31-
__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), ggml_cuda_fattn_vec_get_min_blocks<type_K, type_V>())
31+
__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), (ggml_cuda_fattn_vec_get_min_blocks<type_K, type_V>()))
3232
static __global__ void flash_attn_ext_vec(
3333
const char * Q_ptr,
3434
const char * K_ptr,
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
param(
2+
[ValidateSet("x64")]
3+
[string] $Architecture = "x64"
4+
)
5+
6+
$ErrorActionPreference = "Stop"
7+
8+
function Get-VersionOrZero($File) {
9+
try {
10+
return [Version] $File.VersionInfo.FileVersion
11+
} catch {
12+
return [Version] "0.0.0.0"
13+
}
14+
}
15+
16+
$programFilesX86 = [Environment]::GetEnvironmentVariable("ProgramFiles(x86)")
17+
$programRoots = @($env:ProgramFiles, $programFilesX86) | Where-Object { $_ }
18+
19+
$vsRoots = @()
20+
$vswhereCandidates = foreach ($programRoot in $programRoots) {
21+
Join-Path $programRoot "Microsoft Visual Studio\Installer\vswhere.exe"
22+
}
23+
$vswhereCandidates = $vswhereCandidates |
24+
Where-Object { Test-Path -LiteralPath $_ } |
25+
Sort-Object -Unique
26+
27+
$vswhereArgSets = @(
28+
,@("-all", "-products", "*", "-requires", "Microsoft.VisualStudio.Component.VC.Redist.14.Latest", "-property", "installationPath"),
29+
,@("-all", "-products", "*", "-requires", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", "-property", "installationPath"),
30+
,@("-all", "-products", "*", "-property", "installationPath")
31+
)
32+
33+
foreach ($vswhere in $vswhereCandidates) {
34+
foreach ($argSet in $vswhereArgSets) {
35+
$vsRoots += & $vswhere @argSet 2>$null
36+
}
37+
}
38+
39+
foreach ($programRoot in $programRoots) {
40+
$visualStudioRoot = Join-Path $programRoot "Microsoft Visual Studio"
41+
if (-not (Test-Path -LiteralPath $visualStudioRoot)) {
42+
continue
43+
}
44+
45+
foreach ($versionRoot in Get-ChildItem -LiteralPath $visualStudioRoot -Directory -ErrorAction SilentlyContinue) {
46+
foreach ($editionRoot in Get-ChildItem -LiteralPath $versionRoot.FullName -Directory -ErrorAction SilentlyContinue) {
47+
$vsRoots += $editionRoot.FullName
48+
}
49+
}
50+
}
51+
52+
$vsRoots = $vsRoots |
53+
Where-Object { $_ -and (Test-Path -LiteralPath $_) } |
54+
Sort-Object -Unique
55+
56+
$searchRoots = foreach ($vsRoot in $vsRoots) {
57+
foreach ($subdir in @("VC\Redist\MSVC", "VC\Tools\MSVC")) {
58+
$path = Join-Path $vsRoot $subdir
59+
if (Test-Path -LiteralPath $path) {
60+
$path
61+
}
62+
}
63+
}
64+
$searchRoots = $searchRoots | Sort-Object -Unique
65+
66+
$candidates = foreach ($searchRoot in $searchRoots) {
67+
Get-ChildItem -LiteralPath $searchRoot -Recurse -File -Filter "libomp140.x86_64.dll" -ErrorAction SilentlyContinue |
68+
Where-Object {
69+
$_.FullName -match "\\$Architecture\\" -and (
70+
$_.FullName -match "\\Microsoft\.VC\d+\.OpenMP\.LLVM\\" -or
71+
$_.FullName -match "\\bin\\Host(?:x86|x64)\\$Architecture\\"
72+
)
73+
}
74+
}
75+
76+
$runtime = $candidates |
77+
Sort-Object `
78+
@{ Expression = { Get-VersionOrZero $_ } },
79+
@{ Expression = { $_.FullName } } `
80+
-Descending |
81+
Select-Object -First 1
82+
83+
if (-not $runtime) {
84+
$searched = if ($searchRoots) { $searchRoots -join "; " } else { "<none>" }
85+
throw "MSVC $Architecture libomp140.x86_64.dll not found. Searched: $searched"
86+
}
87+
88+
$runtime.FullName

tests/test-cuda-fattn-vec-policy.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,10 @@ int main(int argc, char ** argv) {
7171
ok &= expect(helpers.find("? 2 : 1") != std::string::npos,
7272
"VEC FA min blocks must be 2 only for Turbo/TCQ pairs and 1 otherwise");
7373

74-
ok &= expect(kernel.find("ggml_cuda_fattn_vec_get_min_blocks<type_K, type_V>()") != std::string::npos,
74+
ok &= expect(kernel.find("ggml_cuda_fattn_vec_get_nthreads_device(), (ggml_cuda_fattn_vec_get_min_blocks<type_K, type_V>())") != std::string::npos,
7575
"VEC FA launch_bounds must use the K/V type-aware min-block policy");
76+
ok &= expect(kernel.find("ggml_cuda_fattn_vec_get_nthreads_device(), ggml_cuda_fattn_vec_get_min_blocks<type_K, type_V>()") == std::string::npos,
77+
"VEC FA launch_bounds min-block expression must be parenthesized for HIP's variadic macro parser");
7678
ok &= expect(kernel.find("ggml_cuda_fattn_vec_get_nthreads_device(), 2") == std::string::npos,
7779
"VEC FA launch_bounds must not force all cache types to minBlocksPerSM=2");
7880

0 commit comments

Comments
 (0)