Skip to content

Commit 9b0e765

Browse files
committed
num_ctas > 1 requires Blackwell (sm_100+)
1 parent 9eb95d0 commit 9b0e765

2 files changed

Lines changed: 39 additions & 8 deletions

File tree

ext/CUDAExt.jl

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,16 @@ using CUDA_Compiler_jll
1313

1414
public launch
1515

16+
# Run `cmd` and capture its output instead of letting it print directly.
# The command is wrapped in `ignorestatus`, so a failing exit status does not
# make `run` throw; the caller inspects the returned process instead.
# Both stdout and stderr are redirected into the same `Pipe`.
# Returns `(proc, log)`: the process object and the captured output with
# surrounding whitespace stripped.
# Note: the locals `stdout` and `log` shadow the Base names inside this function.
function run_and_collect(cmd)
17+
stdout = Pipe()
18+
# `wait=false` starts the process without blocking so output can be drained below.
proc = run(pipeline(ignorestatus(cmd); stdout, stderr=stdout), wait=false)
19+
# Close this side's handle on the pipe input; presumably needed so that
# `read(stdout)` reaches end-of-stream once the child exits — TODO confirm.
close(stdout.in)
20+
# Read the whole pipe in a separate task while this task waits on the process.
reader = Threads.@spawn String(read(stdout))
21+
Base.wait(proc)
22+
log = strip(fetch(reader))
23+
return proc, log
24+
end
25+
1626
"""
1727
check_tile_ir_support()
1828
@@ -61,14 +71,29 @@ function emit_binary(cache::CacheView, mi::Core.MethodInstance;
6171
# Run tileiras to produce CUBIN
6272
input_path = tempname() * ".tile"
6373
output_path = tempname() * ".cubin"
74+
compiled = false
6475
try
6576
write(input_path, bytecode)
6677
cmd = addenv(`$(CUDA_Compiler_jll.tileiras()) $input_path -o $output_path --gpu-name $(opts.sm_arch) -O$(opts.opt_level)`,
6778
"CUDA_ROOT" => CUDA_Compiler_jll.artifact_dir)
68-
run(cmd)
79+
proc, log = run_and_collect(cmd)
80+
if !success(proc)
81+
reason = proc.termsignal > 0 ? "tileiras received signal $(proc.termsignal)" :
82+
"tileiras exited with code $(proc.exitcode)"
83+
msg = "Failed to compile Tile IR ($reason)"
84+
if !isempty(log)
85+
msg *= "\n" * log
86+
end
87+
msg *= "\nIf you think this is a bug, please file an issue and attach $(input_path)"
88+
if parse(Bool, get(ENV, "BUILDKITE", "false"))
89+
run(`buildkite-agent artifact upload $(input_path)`)
90+
end
91+
error(msg)
92+
end
93+
compiled = true
6994
res.cuda_bin = read(output_path)
7095
finally
71-
rm(input_path, force=true)
96+
compiled && rm(input_path, force=true)
7297
rm(output_path, force=true)
7398
end
7499

test/execution/hints.jl

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,12 @@ using CUDA
1818
b = CUDA.ones(Float32, n) .* 2
1919
c = CUDA.zeros(Float32, n)
2020

21-
ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2)
22-
23-
@test Array(c) ≈ ones(Float32, n) .* 3
21+
if capability(device()) >= v"10"
22+
ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2)
23+
@test Array(c) ≈ ones(Float32, n) .* 3
24+
else
25+
@test_throws "num_cta_in_cga" ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2)
26+
end
2427
end
2528

2629
@testset "launch with occupancy" begin
@@ -60,9 +63,12 @@ end
6063
b = CUDA.ones(Float32, n) .* 2
6164
c = CUDA.zeros(Float32, n)
6265

63-
ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8)
64-
65-
@test Array(c) ≈ ones(Float32, n) .* 3
66+
if capability(device()) >= v"10"
67+
ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8)
68+
@test Array(c) ≈ ones(Float32, n) .* 3
69+
else
70+
@test_throws "num_cta_in_cga" ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8)
71+
end
6672
end
6773

6874
end

0 commit comments

Comments
 (0)