Skip cubins newer than the GPU to avoid a driver SIGILL

MagicalTux · claude · MagicalTux · commit 64a0be6f9003 · 2026-07-01T15:06:55.000+09:00
engine.zip ships cubins up to sm_120 (Blackwell). load_first tried them
highest-arch-first and handed the sm_120 image to cuModuleLoadData first.
An older driver (550.x / CUDA 12.4) doesn't reject an architecture it has
never heard of — it faults with SIGILL inside libcuda, killing the worker
right after "claimed" and before any kernel ran.

Carry each cubin's sm arch (from its filename) down to load_first, query
the device's compute capability up front, and skip any cubin newer than
the GPU. The highest compatible cubin then loads normally (sm_89 on an
RTX 4080 SUPER, verified end-to-end). Bump version to 0.1.6.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "decryptd"
-version = "0.1.5"
+version = "0.1.6"
 edition = "2024"
 license = "Proprietary"
 authors = ["Karpeles Lab Inc"]
diff --git a/src/cuda.rs b/src/cuda.rs
@@ -127,17 +127,42 @@ pub struct Gpu {
 }
 
 impl Gpu {
-    /// Init device 0 and load the first cubin the driver accepts (callers pass them
-    /// highest-arch-first; a cubin for sm_X.y loads only on CC X.z, z≥y).
-    pub fn load_first(cubins: &[Vec<u8>]) -> Result<Gpu, String> {
+    /// Init device 0 and load the best cubin for it. Callers pass `(arch, bytes)`
+    /// pairs highest-arch-first, where arch is CC `X.Y` encoded as `X*10+Y`.
+    ///
+    /// Cubins newer than the device are skipped rather than tried: an old driver
+    /// (e.g. 550.x / CUDA 12.4) doesn't cleanly reject a cubin for an architecture
+    /// it has never heard of — `cuModuleLoadData` faults with SIGILL *inside*
+    /// libcuda. So we query the GPU's compute capability first and never hand the
+    /// driver anything above it. Cubins at or below it that still don't load
+    /// (a known arch the driver rejects) fall through to the next candidate.
+    pub fn load_first(cubins: &[(u32, Vec<u8>)]) -> Result<Gpu, String> {
         unsafe {
             check(cuInit(0), "cuInit")?;
             let mut dev: CuDevice = 0;
             check(cuDeviceGet(&mut dev, 0), "cuDeviceGet")?;
+
+            // Device compute capability, encoded to match the `smNN` tags.
+            let (mut maj, mut min) = (0i32, 0i32);
+            check(
+                cuDeviceGetAttribute(&mut maj, CU_DEV_ATTR_CC_MAJOR, dev),
+                "cuDeviceGetAttribute(CC_MAJOR)",
+            )?;
+            check(
+                cuDeviceGetAttribute(&mut min, CU_DEV_ATTR_CC_MINOR, dev),
+                "cuDeviceGetAttribute(CC_MINOR)",
+            )?;
+            let gpu_arch = (maj.max(0) as u32) * 10 + (min.max(0) as u32);
+
             let mut ctx: CuContext = ptr::null_mut();
             check(cuCtxCreate_v2(&mut ctx, 0, dev), "cuCtxCreate")?;
-            let mut last = String::from("no cubins provided");
-            for cubin in cubins {
+            let mut last = format!("no cubin for sm_{gpu_arch} or older in engine.zip");
+            for (arch, cubin) in cubins {
+                // Never feed the driver an arch newer than the GPU — it can't run
+                // here anyway, and a beyond-driver arch can hard-crash libcuda.
+                if *arch > gpu_arch {
+                    continue;
+                }
                 let mut module: CuModule = ptr::null_mut();
                 let r = cuModuleLoadData(&mut module, cubin.as_ptr() as *const c_void);
                 if r == 0 {
@@ -146,7 +171,7 @@ impl Gpu {
                 last = check(r, "cuModuleLoadData").unwrap_err();
             }
             cuCtxDestroy_v2(ctx);
-            Err(format!("no cubin loaded on this GPU ({last})"))
+            Err(format!("no cubin loaded on sm_{gpu_arch} ({last})"))
         }
     }
 
diff --git a/src/main.rs b/src/main.rs
@@ -121,6 +121,10 @@ fn de_u64<'de, D: serde::Deserializer<'de>>(d: D) -> Result<u64, D::Error> {
 }
 
 // ----------------------------------------------------------------- engine.zip
+/// An arch-tagged cubin from `engine.zip`: compute capability `X.Y` encoded as
+/// `X*10+Y` (matching the `smNN` filename tag), paired with the raw cubin bytes.
+type Cubin = (u32, Vec<u8>);
+
 /// `manifest.json` shipped inside `engine.zip` — the generic kernel launch
 /// parameters that the platform's Decrypt/Job row does not carry.
 #[derive(Deserialize)]
@@ -301,9 +305,11 @@ fn percent_decode(s: &str) -> Result<Vec<u8>> {
     Ok(out)
 }
 
-/// Unpack engine.zip: parse `manifest.json` and collect every `*.sm<NN>.cubin`'s
-/// bytes, highest compute-capability first.
-fn unpack_engine(zip_bytes: &[u8]) -> Result<(Manifest, Vec<Vec<u8>>)> {
+/// Unpack engine.zip: parse `manifest.json` and collect every `*.sm<NN>.cubin` as
+/// an `(arch, bytes)` pair, highest compute-capability first. The arch tag rides
+/// along so the GPU loader can skip cubins newer than the device (see
+/// [`cuda::Gpu::load_first`]) instead of handing them to a driver that may crash.
+fn unpack_engine(zip_bytes: &[u8]) -> Result<(Manifest, Vec<Cubin>)> {
     let mut zip = zip::ZipArchive::new(Cursor::new(zip_bytes)).context("opening engine.zip")?;
     let mut manifest: Option<Manifest> = None;
     let mut cubins: Vec<(u32, Vec<u8>)> = Vec::new();
@@ -328,7 +334,7 @@ fn unpack_engine(zip_bytes: &[u8]) -> Result<(Manifest, Vec<Vec<u8>>)> {
         bail!("engine.zip contains no *.sm<NN>.cubin files");
     }
     cubins.sort_by_key(|c| std::cmp::Reverse(c.0));
-    Ok((manifest, cubins.into_iter().map(|(_, b)| b).collect()))
+    Ok((manifest, cubins))
 }
 
 // --------------------------------------------------------------------------- run
@@ -338,7 +344,8 @@ struct ReadyJob {
     start: u64,
     end: u64,
     manifest: Manifest,
-    cubins: Vec<Vec<u8>>,
+    /// Arch-tagged cubins, highest arch first (see [`Cubin`]).
+    cubins: Vec<Cubin>,
     data: Vec<u8>,
 }