Skip to content

Commit 64a0be6

Browse files
MagicalTux and claude committed
Skip cubins newer than the GPU to avoid a driver SIGILL
engine.zip ships cubins up to sm_120 (Blackwell). load_first tried them highest-arch-first and handed the sm_120 image to cuModuleLoadData first. An older driver (550.x / CUDA 12.4) doesn't reject an architecture it has never heard of — it faults with SIGILL inside libcuda, killing the worker right after "claimed" and before any kernel ran. Carry each cubin's sm arch (from its filename) down to load_first, query the device's compute capability up front, and skip any cubin newer than the GPU. The highest compatible cubin then loads normally (sm_89 on an RTX 4080 SUPER, verified end-to-end). Bump version to 0.1.6. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 1a611ea commit 64a0be6

4 files changed

Lines changed: 45 additions & 13 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "decryptd"
3-
version = "0.1.5"
3+
version = "0.1.6"
44
edition = "2024"
55
license = "Proprietary"
66
authors = ["Karpeles Lab Inc"]

src/cuda.rs

Lines changed: 31 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -127,17 +127,42 @@ pub struct Gpu {
127127
}
128128

129129
impl Gpu {
130-
/// Init device 0 and load the first cubin the driver accepts (callers pass them
131-
/// highest-arch-first; a cubin for sm_X.y loads only on CC X.z, z≥y).
132-
pub fn load_first(cubins: &[Vec<u8>]) -> Result<Gpu, String> {
130+
/// Init device 0 and load the best cubin for it. Callers pass `(arch, bytes)`
131+
/// pairs highest-arch-first, where arch is CC `X.Y` encoded as `X*10+Y`.
132+
///
133+
/// Cubins newer than the device are skipped rather than tried: an old driver
134+
/// (e.g. 550.x / CUDA 12.4) doesn't cleanly reject a cubin for an architecture
135+
/// it has never heard of — `cuModuleLoadData` faults with SIGILL *inside*
136+
/// libcuda. So we query the GPU's compute capability first and never hand the
137+
/// driver anything above it. Same-major-lower cubins that still don't load
138+
/// (a known arch the driver rejects) fall through to the next candidate.
139+
pub fn load_first(cubins: &[(u32, Vec<u8>)]) -> Result<Gpu, String> {
133140
unsafe {
134141
check(cuInit(0), "cuInit")?;
135142
let mut dev: CuDevice = 0;
136143
check(cuDeviceGet(&mut dev, 0), "cuDeviceGet")?;
144+
145+
// Device compute capability, encoded to match the `smNN` tags.
146+
let (mut maj, mut min) = (0i32, 0i32);
147+
check(
148+
cuDeviceGetAttribute(&mut maj, CU_DEV_ATTR_CC_MAJOR, dev),
149+
"cuDeviceGetAttribute(CC_MAJOR)",
150+
)?;
151+
check(
152+
cuDeviceGetAttribute(&mut min, CU_DEV_ATTR_CC_MINOR, dev),
153+
"cuDeviceGetAttribute(CC_MINOR)",
154+
)?;
155+
let gpu_arch = (maj.max(0) as u32) * 10 + (min.max(0) as u32);
156+
137157
let mut ctx: CuContext = ptr::null_mut();
138158
check(cuCtxCreate_v2(&mut ctx, 0, dev), "cuCtxCreate")?;
139-
let mut last = String::from("no cubins provided");
140-
for cubin in cubins {
159+
let mut last = format!("no cubin for sm_{gpu_arch} or older in engine.zip");
160+
for (arch, cubin) in cubins {
161+
// Never feed the driver an arch newer than the GPU — it can't run
162+
// here anyway, and a beyond-driver arch can hard-crash libcuda.
163+
if *arch > gpu_arch {
164+
continue;
165+
}
141166
let mut module: CuModule = ptr::null_mut();
142167
let r = cuModuleLoadData(&mut module, cubin.as_ptr() as *const c_void);
143168
if r == 0 {
@@ -146,7 +171,7 @@ impl Gpu {
146171
last = check(r, "cuModuleLoadData").unwrap_err();
147172
}
148173
cuCtxDestroy_v2(ctx);
149-
Err(format!("no cubin loaded on this GPU ({last})"))
174+
Err(format!("no cubin loaded on sm_{gpu_arch} ({last})"))
150175
}
151176
}
152177

src/main.rs

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,10 @@ fn de_u64<'de, D: serde::Deserializer<'de>>(d: D) -> Result<u64, D::Error> {
121121
}
122122

123123
// ----------------------------------------------------------------- engine.zip
124+
/// An arch-tagged cubin from `engine.zip`: compute capability `X.Y` encoded as
125+
/// `X*10+Y` (matching the `smNN` filename tag), paired with the raw cubin bytes.
126+
type Cubin = (u32, Vec<u8>);
127+
124128
/// `manifest.json` shipped inside `engine.zip` — the generic kernel launch
125129
/// parameters that the platform's Decrypt/Job row does not carry.
126130
#[derive(Deserialize)]
@@ -301,9 +305,11 @@ fn percent_decode(s: &str) -> Result<Vec<u8>> {
301305
Ok(out)
302306
}
303307

304-
/// Unpack engine.zip: parse `manifest.json` and collect every `*.sm<NN>.cubin`'s
305-
/// bytes, highest compute-capability first.
306-
fn unpack_engine(zip_bytes: &[u8]) -> Result<(Manifest, Vec<Vec<u8>>)> {
308+
/// Unpack engine.zip: parse `manifest.json` and collect every `*.sm<NN>.cubin` as
309+
/// an `(arch, bytes)` pair, highest compute-capability first. The arch tag rides
310+
/// along so the GPU loader can skip cubins newer than the device (see
311+
/// [`cuda::Gpu::load_first`]) instead of handing them to a driver that may crash.
312+
fn unpack_engine(zip_bytes: &[u8]) -> Result<(Manifest, Vec<Cubin>)> {
307313
let mut zip = zip::ZipArchive::new(Cursor::new(zip_bytes)).context("opening engine.zip")?;
308314
let mut manifest: Option<Manifest> = None;
309315
let mut cubins: Vec<(u32, Vec<u8>)> = Vec::new();
@@ -328,7 +334,7 @@ fn unpack_engine(zip_bytes: &[u8]) -> Result<(Manifest, Vec<Vec<u8>>)> {
328334
bail!("engine.zip contains no *.sm<NN>.cubin files");
329335
}
330336
cubins.sort_by_key(|c| std::cmp::Reverse(c.0));
331-
Ok((manifest, cubins.into_iter().map(|(_, b)| b).collect()))
337+
Ok((manifest, cubins))
332338
}
333339

334340
// --------------------------------------------------------------------------- run
@@ -338,7 +344,8 @@ struct ReadyJob {
338344
start: u64,
339345
end: u64,
340346
manifest: Manifest,
341-
cubins: Vec<Vec<u8>>,
347+
/// Arch-tagged cubins, highest arch first (see [`Cubin`]).
348+
cubins: Vec<Cubin>,
342349
data: Vec<u8>,
343350
}
344351

0 commit comments

Comments
 (0)