diff --git a/.cargo/config.toml b/.cargo/config.toml
index cdd03a35..b41f6ee1 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -1,2 +1,3 @@
 [alias]
 xtask = "run -p xtask --bin xtask --"
+compiletest = "run --release -p compiletests --"
diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml
index 18c05def..fe12895a 100644
--- a/.github/workflows/ci_linux.yml
+++ b/.github/workflows/ci_linux.yml
@@ -153,3 +153,18 @@ jobs:
           MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
         run: |
           echo "Stubbed out"
+  compiletest:
+    name: Compile tests
+    runs-on: ubuntu-latest
+    container:
+      image: "ghcr.io/rust-gpu/rust-cuda-ubuntu24-cuda12:latest"
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Run cargo version
+        run: cargo --version
+      - name: Rustfmt compiletests
+        shell: bash
+        run: shopt -s globstar && rustfmt --check tests/compiletests/ui/**/*.rs
+      - name: Compiletest
+        run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90
diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml
index a5e1055a..654e24ec 100644
--- a/.github/workflows/ci_windows.yml
+++ b/.github/workflows/ci_windows.yml
@@ -26,7 +26,18 @@ jobs:
             target: x86_64-pc-windows-msvc
             cuda: "12.8.1"
             linux-local-args: []
-            sub-packages: ["nvcc", "nvrtc", "nvrtc_dev", "cuda_profiler_api", "cudart", "cublas", "cublas_dev", "curand", "curand_dev"]
+            sub-packages:
+              [
+                "nvcc",
+                "nvrtc",
+                "nvrtc_dev",
+                "cuda_profiler_api",
+                "cudart",
+                "cublas",
+                "cublas_dev",
+                "curand",
+                "curand_dev",
+              ]
 
     steps:
       - name: Checkout repository
@@ -41,7 +52,7 @@ jobs:
           linux-local-args: ${{ toJson(matrix.linux-local-args) }}
           use-local-cache: false
           sub-packages: ${{ toJson(matrix.sub-packages) }}
-          log-file-suffix: '${{matrix.os}}-${{matrix.cuda}}'
+          log-file-suffix: "${{matrix.os}}-${{matrix.cuda}}"
 
       - name: Verify CUDA installation
         run: nvcc --version
@@ -76,3 +87,6 @@ jobs:
         env:
           RUSTDOCFLAGS: -Dwarnings
         run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
+      # Disabled due to DLL issues; someone with Windows knowledge is needed to re-enable this.
+      # - name: Compiletest
+      #  run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90
diff --git a/Cargo.toml b/Cargo.toml
index 4c495879..c10feff4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,8 @@ members = [
   "examples/cuda/path_tracer/kernels",
 
   "examples/optix/*",
+  "tests/compiletests",
+  "tests/compiletests/deps-helper",
 ]
 
 exclude = [
@@ -24,3 +26,7 @@ exclude = [
 
 [profile.dev.package.rustc_codegen_nvvm]
 opt-level = 3
+
+[workspace.dependencies]
+cuda_std = { path = "crates/cuda_std" }
+cuda_builder = { path = "crates/cuda_builder" }
diff --git a/crates/rustc_codegen_nvvm/src/context.rs b/crates/rustc_codegen_nvvm/src/context.rs
index 6392705e..253bb457 100644
--- a/crates/rustc_codegen_nvvm/src/context.rs
+++ b/crates/rustc_codegen_nvvm/src/context.rs
@@ -551,25 +551,40 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
     }
 }
 
+#[derive(Clone)] // Selects which part of the generated PTX is printed when disassembly is requested
+pub enum DisassembleMode {
+    All, // set by `--disassemble`
+    Function(String), // set by `--disassemble-fn <name>` or `--disassemble-fn=<name>`
+    Entry(String), // set by `--disassemble-entry <name>` or `--disassemble-entry=<name>`
+    Globals, // set by `--disassemble-globals`
+}
+
 #[derive(Default, Clone)]
 pub struct CodegenArgs {
     pub nvvm_options: Vec<NvvmOption>,
     pub override_libm: bool,
     pub use_constant_memory_space: bool,
     pub final_module_path: Option<PathBuf>,
+    pub disassemble: Option<DisassembleMode>,
 }
 
 impl CodegenArgs {
     pub fn from_session(sess: &Session) -> Self {
-        Self::parse(&sess.opts.cg.llvm_args)
+        Self::parse(&sess.opts.cg.llvm_args, sess)
     }
 
     // we may want to use rustc's own option parsing facilities to have better errors in the future.
-    pub fn parse(args: &[String]) -> Self {
+    pub fn parse(args: &[String], sess: &Session) -> Self {
         // TODO: replace this with a "proper" arg parser.
         let mut cg_args = Self::default();
 
+        let mut skip_next = false;
         for (idx, arg) in args.iter().enumerate() {
+            if skip_next {
+                skip_next = false;
+                continue;
+            }
+
             if let Ok(flag) = NvvmOption::from_str(arg) {
                 cg_args.nvvm_options.push(flag);
             } else if arg == "--override-libm" {
@@ -577,9 +592,40 @@ impl CodegenArgs {
             } else if arg == "--use-constant-memory-space" {
                 cg_args.use_constant_memory_space = true;
             } else if arg == "--final-module-path" {
-                cg_args.final_module_path = Some(PathBuf::from(
-                    args.get(idx + 1).expect("No path for --final-module-path"),
-                ));
+                let path = match args.get(idx + 1) {
+                    Some(p) => p,
+                    None => sess
+                        .dcx()
+                        .fatal("--final-module-path requires a path argument"),
+                };
+                cg_args.final_module_path = Some(PathBuf::from(path));
+                skip_next = true;
+            } else if arg == "--disassemble" {
+                cg_args.disassemble = Some(DisassembleMode::All);
+            } else if arg == "--disassemble-globals" {
+                cg_args.disassemble = Some(DisassembleMode::Globals);
+            } else if arg == "--disassemble-fn" {
+                let func_name = match args.get(idx + 1) {
+                    Some(name) => name.clone(),
+                    None => sess
+                        .dcx()
+                        .fatal("--disassemble-fn requires a function name argument"),
+                };
+                cg_args.disassemble = Some(DisassembleMode::Function(func_name));
+                skip_next = true;
+            } else if let Some(func) = arg.strip_prefix("--disassemble-fn=") {
+                cg_args.disassemble = Some(DisassembleMode::Function(func.to_string()));
+            } else if arg == "--disassemble-entry" {
+                let entry_name = match args.get(idx + 1) {
+                    Some(name) => name.clone(),
+                    None => sess
+                        .dcx()
+                        .fatal("--disassemble-entry requires an entry name argument"),
+                };
+                cg_args.disassemble = Some(DisassembleMode::Entry(entry_name));
+                skip_next = true;
+            } else if let Some(entry) = arg.strip_prefix("--disassemble-entry=") {
+                cg_args.disassemble = Some(DisassembleMode::Entry(entry.to_string()));
             }
         }
 
diff --git a/crates/rustc_codegen_nvvm/src/lib.rs b/crates/rustc_codegen_nvvm/src/lib.rs
index a0656a69..825748ca 100644
--- a/crates/rustc_codegen_nvvm/src/lib.rs
+++ b/crates/rustc_codegen_nvvm/src/lib.rs
@@ -52,6 +52,7 @@ mod lto;
 mod mono_item;
 mod nvvm;
 mod override_fns;
+mod ptx_filter;
 mod target;
 mod ty;
 
@@ -216,7 +217,7 @@ impl CodegenBackend for NvvmCodegenBackend {
         let cmdline = sess.opts.cg.target_feature.split(',');
         let cfg = sess.target.options.features.split(',');
 
-        let target_features: Vec<_> = cfg
+        let mut target_features: Vec<_> = cfg
             .chain(cmdline)
             .filter(|l| l.starts_with('+'))
             .map(|l| &l[1..])
@@ -224,6 +225,22 @@ impl CodegenBackend for NvvmCodegenBackend {
             .map(rustc_span::Symbol::intern)
             .collect();
 
+        // Add backend-synthesized features (e.g., hierarchical compute capabilities)
+        // Parse CodegenArgs to get the architecture from llvm-args
+        let args = context::CodegenArgs::from_session(sess);
+        for opt in &args.nvvm_options {
+            if let ::nvvm::NvvmOption::Arch(arch) = opt {
+                // Add all features up to and including the current architecture
+                let backend_features = arch.all_target_features();
+                target_features.extend(
+                    backend_features
+                        .iter()
+                        .map(|f| rustc_span::Symbol::intern(f)),
+                );
+                break;
+            }
+        }
+
         // For NVPTX, all target features are stable
         let unstable_target_features = target_features.clone();
 
diff --git a/crates/rustc_codegen_nvvm/src/link.rs b/crates/rustc_codegen_nvvm/src/link.rs
index 9cfe1e5a..b84812c2 100644
--- a/crates/rustc_codegen_nvvm/src/link.rs
+++ b/crates/rustc_codegen_nvvm/src/link.rs
@@ -30,6 +30,7 @@ use tracing::{debug, trace};
 
 use crate::LlvmMod;
 use crate::context::CodegenArgs;
+use crate::ptx_filter::{PtxFilter, PtxFilterConfig};
 
 pub(crate) struct NvvmMetadataLoader;
 
@@ -305,6 +306,31 @@ fn codegen_into_ptx_file(
         }
     };
 
+    // If disassembly is requested, print PTX to stderr
+    if args.disassemble.is_some()
+        && let Ok(ptx_str) = std::str::from_utf8(&ptx_bytes)
+    {
+        let config = PtxFilterConfig::from_codegen_args(&args);
+        let filter = PtxFilter::new(config);
+        let output = filter.filter(ptx_str);
+        if !output.is_empty() {
+            // Check if we're in JSON mode by checking the error format
+            use rustc_session::config::ErrorOutputType;
+            match sess.opts.error_format {
+                ErrorOutputType::Json { .. } => {
+                    sess.dcx()
+                        .err("PTX disassembly output in JSON mode is not supported");
+                }
+                _ => {
+                    // In normal mode, just print to stderr
+                    // Replace tabs with spaces for cleaner output
+                    let output = output.replace('\t', "    ");
+                    eprintln!("{output}");
+                }
+            }
+        }
+    }
+
     std::fs::write(out_filename, ptx_bytes)
 }
 
diff --git a/crates/rustc_codegen_nvvm/src/ptx_filter.rs b/crates/rustc_codegen_nvvm/src/ptx_filter.rs
new file mode 100644
index 00000000..4b695600
--- /dev/null
+++ b/crates/rustc_codegen_nvvm/src/ptx_filter.rs
@@ -0,0 +1,454 @@
+/// Selects how much of each PTX function `PtxFilter` emits
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PtxOutputMode {
+    /// Return the input PTX text unchanged, without parsing it
+    All,
+    /// Emit every function's declaration line followed by ` { ... }` instead of its body
+    DeclarationsOnly,
+    /// Emit declaration and body of the functions accepted by the `FunctionFilter`
+    Filtered,
+}
+
+/// Selects which parsed functions are emitted in `PtxOutputMode::Filtered`
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum FunctionFilter {
+    /// Include all functions
+    All,
+    /// Include functions whose parsed name contains this string (substring match)
+    ByName(String),
+    /// Include only `.entry` functions whose parsed name contains this string
+    EntryPoint(String),
+}
+
+/// Configuration for filtering PTX output
+#[derive(Debug, Clone)]
+pub struct PtxFilterConfig {
+    /// How much of each function to emit; `All` bypasses every other field
+    pub mode: PtxOutputMode,
+
+    /// Filter for selecting functions (only used when mode is Filtered)
+    pub function_filter: FunctionFilter,
+
+    /// Emit the `.version` / `.target` / `.address_size` lines (ignored when mode is All)
+    pub include_header: bool,
+    pub include_globals: bool, // emit the lines recognized as globals (ignored when mode is All)
+}
+
+impl Default for PtxFilterConfig {
+    fn default() -> Self {
+        Self {
+            mode: PtxOutputMode::Filtered, // with FunctionFilter::All: every function, with body
+            function_filter: FunctionFilter::All,
+            include_header: true,
+            include_globals: true,
+        }
+    }
+}
+
+impl PtxFilterConfig {
+    /// Create a config that makes `PtxFilter::filter` return the PTX unchanged
+    pub fn all() -> Self {
+        Self {
+            mode: PtxOutputMode::All,
+            function_filter: FunctionFilter::All,
+            include_header: true,
+            include_globals: true,
+        }
+    }
+
+    /// Create a config that emits header, globals and every declaration without its body
+    pub fn declarations_only() -> Self {
+        Self {
+            mode: PtxOutputMode::DeclarationsOnly,
+            function_filter: FunctionFilter::All,
+            include_header: true,
+            include_globals: true,
+        }
+    }
+
+    /// Create a config that emits only functions whose name contains `name` (no header/globals)
+    pub fn by_function_name(name: impl Into<String>) -> Self {
+        Self {
+            mode: PtxOutputMode::Filtered,
+            function_filter: FunctionFilter::ByName(name.into()),
+            include_header: false,
+            include_globals: false,
+        }
+    }
+
+    /// Create a config that emits only entry points whose name contains `name` (no header/globals)
+    pub fn by_entry_point(name: impl Into<String>) -> Self {
+        Self {
+            mode: PtxOutputMode::Filtered,
+            function_filter: FunctionFilter::EntryPoint(name.into()),
+            include_header: false,
+            include_globals: false,
+        }
+    }
+
+    /// Create a config from CodegenArgs: one constructor per `DisassembleMode`, default if unset
+    pub fn from_codegen_args(args: &crate::context::CodegenArgs) -> Self {
+        use crate::context::DisassembleMode;
+        match &args.disassemble {
+            Some(DisassembleMode::All) => Self::all(),
+            Some(DisassembleMode::Globals) => Self::declarations_only(),
+            Some(DisassembleMode::Function(func_name)) => Self::by_function_name(func_name),
+            Some(DisassembleMode::Entry(entry_name)) => Self::by_entry_point(entry_name),
+            None => Self::default(),
+        }
+    }
+}
+
+/// Applies a `PtxFilterConfig` to PTX assembly text (see `PtxFilter::filter`)
+pub struct PtxFilter {
+    config: PtxFilterConfig,
+}
+
+impl PtxFilter {
+    pub fn new(config: PtxFilterConfig) -> Self {
+        Self { config } // only stores the config; no work happens until `filter`
+    }
+
+    /// Return `ptx` verbatim in `All` mode; otherwise parse it and re-emit the configured parts
+    pub fn filter(&self, ptx: &str) -> String {
+        // All mode skips parsing entirely, so the text is returned exactly as given
+        if self.config.mode == PtxOutputMode::All {
+            return ptx.to_string();
+        }
+
+        let parsed = PtxContent::parse(ptx);
+        parsed.format(&self.config)
+    }
+}
+
+/// PTX text split line by line into header, globals and functions (see `PtxContent::parse`)
+#[derive(Debug, Default)]
+struct PtxContent {
+    header_lines: Vec<String>, // lines starting with `.version` / `.target` / `.address_size`
+    globals: Vec<String>, // lines outside functions containing `.global`, `.const` or `.shared`
+    functions: Vec<PtxFunction>, // in order of appearance
+}
+
+/// One `.func` / `.entry` definition collected by `PtxContent::parse`
+#[derive(Debug)]
+struct PtxFunction {
+    name: String, // result of `extract_function_name`; empty when no name was found
+    is_entry: bool, // true when the declaration line contains `.entry`
+    declaration_line: String, // the line containing `.func` / `.entry`, stored verbatim
+    body_lines: Vec<String>, // following lines (params, braces, code) through the closing `}`
+}
+
+impl PtxContent {
+    /// Parse PTX line by line into header, globals and functions; other lines are dropped
+    fn parse(ptx: &str) -> Self {
+        let mut content = Self::default();
+        let mut in_function = false;
+        let mut current_function: Option<PtxFunction> = None;
+
+        for line in ptx.lines() {
+            if Self::is_header_line(line) {
+                content.header_lines.push(line.to_string());
+            } else if Self::is_global_line(line) && !in_function {
+                content.globals.push(line.to_string());
+            } else if let Some(func) = Self::parse_function_start(line) {
+                // A new declaration while one is still open: flush the unterminated one first
+                if let Some(f) = current_function.take() {
+                    content.functions.push(f);
+                }
+                current_function = Some(func);
+                in_function = true; // NOTE(review): also set for `;`-terminated prototypes - verify
+            } else if in_function && let Some(ref mut func) = current_function {
+                func.body_lines.push(line.to_string()); // the closing `}` is kept in the body
+                if line.trim() == "}" {
+                    content.functions.push(current_function.take().unwrap());
+                    in_function = false; // NOTE(review): a bare `}` of a nested scope lands here too
+                }
+            }
+        }
+
+        // Input ended before a closing `}`: keep the partially collected function
+        if let Some(func) = current_function {
+            content.functions.push(func);
+        }
+
+        content
+    }
+
+    fn is_header_line(line: &str) -> bool {
+        line.starts_with(".version")
+            || line.starts_with(".target")
+            || line.starts_with(".address_size") // prefix match: indented lines never qualify
+    }
+
+    fn is_global_line(line: &str) -> bool {
+        line.contains(".global") || line.contains(".const") || line.contains(".shared") // substring match; `parse` excludes function bodies
+    }
+
+    fn parse_function_start(line: &str) -> Option<PtxFunction> {
+        if line.contains(".func") || line.contains(".entry") {
+            Some(PtxFunction {
+                name: Self::extract_function_name(line),
+                is_entry: line.contains(".entry"),
+                declaration_line: line.to_string(),
+                body_lines: vec![],
+            })
+        } else {
+            None
+        }
+    }
+
+    fn extract_function_name(line: &str) -> String {
+        // Look for patterns like:
+        // .visible .entry kernel_main(
+        // .func (.reg .u32 %ret) helper_func()
+        // .entry simple_kernel (
+
+        // Strategy: collect every identifier-like word that ends outside parentheses
+        // and return the last one, i.e. the word closest to the parameter list's '('
+        let mut potential_names = Vec::new();
+        let mut current_word = String::new();
+        let mut paren_depth = 0;
+
+        for ch in line.chars() {
+            match ch {
+                '(' => {
+                    // A word directly followed by '(' at depth 0 is a candidate name
+                    if !current_word.is_empty()
+                        && current_word
+                            .chars()
+                            .all(|c| c.is_alphanumeric() || c == '_')
+                        && paren_depth == 0
+                    {
+                        potential_names.push(current_word.clone());
+                    }
+                    current_word.clear();
+                    paren_depth += 1;
+                }
+                ')' => {
+                    current_word.clear();
+                    if paren_depth > 0 {
+                        paren_depth -= 1;
+                    }
+                }
+                ' ' | '\t' | ',' | '.' => {
+                    if !current_word.is_empty()
+                        && current_word
+                            .chars()
+                            .all(|c| c.is_alphanumeric() || c == '_')
+                        && paren_depth == 0
+                    {
+                        // This is a word at depth 0 (not inside parentheses)
+                        potential_names.push(current_word.clone());
+                    }
+                    current_word.clear();
+                }
+                _ => {
+                    if ch.is_alphanumeric() || ch == '_' {
+                        current_word.push(ch);
+                    } else {
+                        current_word.clear();
+                    }
+                }
+            }
+        }
+
+        // Line ended inside a word (no '(' on this line); note paren depth is not checked here
+        if !current_word.is_empty()
+            && current_word
+                .chars()
+                .all(|c| c.is_alphanumeric() || c == '_')
+        {
+            potential_names.push(current_word);
+        }
+
+        // Return the last potential name found, or empty string
+        potential_names.into_iter().last().unwrap_or_default()
+    }
+
+    /// Render the parsed content as text: optional header, optional globals, then functions
+    fn format(&self, config: &PtxFilterConfig) -> String {
+        let mut output = String::new();
+
+        // Add header if requested
+        if config.include_header {
+            for line in &self.header_lines {
+                output.push_str(line);
+                output.push('\n');
+            }
+        }
+
+        // Add globals if requested
+        if config.include_globals {
+            for line in &self.globals {
+                output.push_str(line);
+                output.push('\n');
+            }
+        }
+
+        // Add functions based on mode
+        match config.mode {
+            PtxOutputMode::All => {
+                // `PtxFilter::filter` returns early for All and never calls `format`
+                unreachable!()
+            }
+            PtxOutputMode::DeclarationsOnly => {
+                for func in &self.functions {
+                    output.push_str(&func.declaration_line);
+                    output.push_str(" { ... }\n\n");
+                }
+            }
+            PtxOutputMode::Filtered => {
+                for func in &self.functions {
+                    if self.should_include_function(func, &config.function_filter) {
+                        output.push_str(&func.declaration_line);
+                        output.push('\n');
+                        for line in &func.body_lines {
+                            output.push_str(line);
+                            output.push('\n');
+                        }
+                        output.push('\n');
+                    }
+                }
+            }
+        }
+
+        output
+    }
+
+    fn should_include_function(&self, func: &PtxFunction, filter: &FunctionFilter) -> bool {
+        match filter {
+            FunctionFilter::All => true,
+            FunctionFilter::ByName(name) => func.name.contains(name), // substring match
+            FunctionFilter::EntryPoint(name) => func.is_entry && func.name.contains(name),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const SAMPLE_PTX: &str = r#".version 8.7
+.target sm_61, debug
+.address_size 64
+
+.global .align 4 .u32 global_var = 42;
+
+.visible .entry kernel_main(
+    .param .u64 kernel_main_param_0
+)
+{
+    .reg .u64 %r1;
+    ld.param.u64 %r1, [kernel_main_param_0];
+    ret;
+}
+
+.func (.reg .u32 %ret) helper_func()
+{
+    .reg .u32 %r1;
+    mov.u32 %r1, 10;
+    mov.u32 %ret, %r1;
+    ret;
+}
+
+.visible .entry another_kernel()
+{
+    ret;
+}
+"#;
+
+    #[test]
+    fn test_filter_all() {
+        let config = PtxFilterConfig::all();
+        let filter = PtxFilter::new(config);
+        let result = filter.filter(SAMPLE_PTX);
+        assert_eq!(result, SAMPLE_PTX);
+    }
+
+    #[test]
+    fn test_filter_by_entry_point() {
+        let config = PtxFilterConfig::by_entry_point("kernel_main");
+        let filter = PtxFilter::new(config);
+        let result = filter.filter(SAMPLE_PTX);
+
+        // by_entry_point disables header and globals, so neither may appear
+        assert!(!result.contains(".version 8.7"));
+        assert!(!result.contains(".target sm_61"));
+        assert!(!result.contains(".address_size 64"));
+        assert!(!result.contains(".global .align 4 .u32 global_var"));
+
+        // Should include kernel_main
+        assert!(result.contains(".visible .entry kernel_main"));
+        assert!(result.contains("ld.param.u64 %r1"));
+
+        // Should NOT include helper_func or another_kernel
+        assert!(!result.contains("helper_func"));
+        assert!(!result.contains("another_kernel"));
+    }
+
+    #[test]
+    fn test_filter_by_function_name() {
+        let config = PtxFilterConfig::by_function_name("helper_func");
+        let filter = PtxFilter::new(config);
+        let result = filter.filter(SAMPLE_PTX);
+
+        // by_function_name disables the header, so it may not appear
+        assert!(!result.contains(".version 8.7"));
+
+        // Should include helper_func
+        assert!(result.contains(".func (.reg .u32 %ret) helper_func"));
+        assert!(result.contains("mov.u32 %r1, 10"));
+
+        // Should NOT include kernels
+        assert!(!result.contains("kernel_main"));
+        assert!(!result.contains("another_kernel"));
+    }
+
+    #[test]
+    fn test_declarations_only() {
+        let config = PtxFilterConfig::declarations_only();
+        let filter = PtxFilter::new(config);
+        let result = filter.filter(SAMPLE_PTX);
+
+        // Should include header
+        assert!(result.contains(".version 8.7"));
+
+        // Should include globals
+        assert!(result.contains(".global .align 4 .u32 global_var"));
+
+        // Should include function declarations but not bodies
+        assert!(result.contains(".visible .entry kernel_main"));
+        assert!(result.contains(" { ... }"));
+        assert!(!result.contains("ld.param.u64"));
+    }
+
+    #[test]
+    fn test_partial_name_match() {
+        let config = PtxFilterConfig::by_entry_point("kernel");
+        let filter = PtxFilter::new(config);
+        let result = filter.filter(SAMPLE_PTX);
+
+        // Names are matched by substring: both entry points contain "kernel"
+        assert!(result.contains("kernel_main"));
+        assert!(result.contains("another_kernel"));
+
+        // Should NOT include helper_func
+        assert!(!result.contains("helper_func"));
+    }
+
+    #[test]
+    fn test_extract_function_name() {
+        assert_eq!(
+            PtxContent::extract_function_name(".visible .entry kernel_main("),
+            "kernel_main"
+        );
+        assert_eq!(
+            PtxContent::extract_function_name(".func (.reg .u32 %ret) helper_func()"),
+            "helper_func"
+        );
+        assert_eq!(
+            PtxContent::extract_function_name(".entry simple_kernel ("),
+            "simple_kernel"
+        );
+    }
+}
diff --git a/tests/compiletests/Cargo.toml b/tests/compiletests/Cargo.toml
new file mode 100644
index 00000000..f8470e58
--- /dev/null
+++ b/tests/compiletests/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "compiletests"
+version = "0.1.0"
+edition = "2021"
+
+[[bin]]
+name = "compiletests"
+path = "src/main.rs"
+
+[dependencies]
+compiletest_rs = "0.11"
+clap = { version = "4.5", features = ["derive"] }
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+cuda_builder = { workspace = true }
\ No newline at end of file
diff --git a/tests/compiletests/README.md b/tests/compiletests/README.md
new file mode 100644
index 00000000..f07f9db0
--- /dev/null
+++ b/tests/compiletests/README.md
@@ -0,0 +1,54 @@
+# Compiletests for Rust-CUDA
+
+This directory contains compile tests for the Rust-CUDA project using the `compiletest` framework.
+
+The code in these tests is never executed; the tests only check that it compiles
+as expected. Tests in `dis/` additionally verify the generated PTX output.
+
+## Running Tests
+
+You can run the tests using the cargo alias:
+
+```bash
+cargo compiletest
+```
+
+Or run directly from this directory:
+
+```bash
+cargo run --release
+```
+
+### Options
+
+- `--bless` - Update expected output files
+- `--target-arch=compute_61,compute_70,compute_90` - Test multiple CUDA compute capabilities (comma-separated)
+- Filter by test name: `cargo compiletest simple`
+- `RUST_LOG=info` - Enable progress logging
+- `RUST_LOG=debug` - Enable detailed debug logging
+
+### Architecture-Specific Tests
+
+Tests can target specific architectures using stage IDs:
+
+```rust
+// only-compute_70   - Only run on compute_70
+// only-compute_90   - Only run on compute_90
+// ignore-compute_61 - Skip on compute_61
+```
+
+## Multi-Architecture Testing
+
+Test against multiple CUDA architectures:
+
+```bash
+cargo compiletest --target-arch=compute_61,compute_70,compute_90
+```
+
+Each test runs for all specified architectures.
+
+## Debugging
+
+- Use `RUST_LOG=debug` for detailed test execution
+- Check generated PTX in `target/compiletest-results/`
+- Filter specific tests: `cargo compiletest simple`
diff --git a/tests/compiletests/deps-helper/Cargo.toml b/tests/compiletests/deps-helper/Cargo.toml
new file mode 100644
index 00000000..b5167c77
--- /dev/null
+++ b/tests/compiletests/deps-helper/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "compiletests-deps-helper"
+description = "Shared dependencies of all the compiletest tests"
+version = "0.1.0"
+edition = "2024"
+publish = false
+
+[lib]
+crate-type = ["cdylib", "rlib"]
+
+[dependencies]
+cuda_std = { workspace = true }
diff --git a/tests/compiletests/deps-helper/src/lib.rs b/tests/compiletests/deps-helper/src/lib.rs
new file mode 100644
index 00000000..d93d0978
--- /dev/null
+++ b/tests/compiletests/deps-helper/src/lib.rs
@@ -0,0 +1,6 @@
+#[cfg_attr(target_os = "cuda", panic_handler)] // panic handler only when compiling for CUDA
+#[allow(dead_code)] // presumably unused on other targets, where the attribute above is inactive
+fn panic(_: &core::panic::PanicInfo) -> ! {
+    #[allow(clippy::empty_loop)]
+    loop {}
+}
diff --git a/tests/compiletests/src/main.rs b/tests/compiletests/src/main.rs
new file mode 100644
index 00000000..9fb2ed4f
--- /dev/null
+++ b/tests/compiletests/src/main.rs
@@ -0,0 +1,621 @@
+use clap::Parser;
+use std::env;
+use std::io;
+use std::path::{Path, PathBuf};
+use std::process::Command;
+
+#[derive(Parser)]
+#[command(bin_name = "cargo compiletest")]
+struct Opt {
+    /// Automatically update stderr/stdout files.
+    #[arg(long)]
+    bless: bool,
+
+    /// The CUDA compute capability to target (e.g., compute_70, compute_80, compute_90).
+    /// Can specify multiple architectures comma-separated.
+    #[arg(long, default_value = "compute_70", value_delimiter = ',')]
+    target_arch: Vec<String>,
+
+    /// Only run tests that match these filters.
+    #[arg(name = "FILTER")]
+    filters: Vec<String>,
+}
+
+impl Opt {
+    pub fn architectures(&self) -> impl Iterator<Item = &str> {
+        self.target_arch.iter().map(|s| s.as_str())
+    }
+}
+
+const CUDA_TARGET: &str = "nvptx64-nvidia-cuda";
+
+#[derive(Copy, Clone)]
+enum DepKind {
+    CudaLib,
+    ProcMacro,
+}
+
+impl DepKind {
+    fn prefix_and_extension(self) -> (&'static str, &'static str) {
+        match self {
+            Self::CudaLib => ("lib", "rlib"),
+            Self::ProcMacro => (env::consts::DLL_PREFIX, env::consts::DLL_EXTENSION),
+        }
+    }
+
+    fn target_dir_suffix(self, target: &str) -> String {
+        match self {
+            Self::CudaLib => format!("{target}/release/deps"),
+            Self::ProcMacro => "release/deps".into(),
+        }
+    }
+}
+
+fn main() {
+    let opt = Opt::parse();
+
+    let tests_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    let workspace_root = tests_dir.parent().unwrap().parent().unwrap().to_path_buf();
+    let original_target_dir = workspace_root.join("target");
+    let deps_target_dir = original_target_dir.join("compiletest-deps");
+    let compiletest_build_dir = original_target_dir.join("compiletest-results");
+
+    // Find the rustc_codegen_nvvm backend before changing directory
+    let codegen_backend_path = find_rustc_codegen_nvvm(&workspace_root);
+
+    // HACK(eddyb) force `compiletest` to pass `ui/...` relative paths to `rustc`,
+    // which should always end up being the same regardless of the path that the
+    // Rust-CUDA repo is checked out at (among other things, this avoids hardcoded
+    // `compiletest` limits being hit by e.g. users with slightly longer paths).
+    std::env::set_current_dir(tests_dir).unwrap();
+    let tests_dir = PathBuf::from("");
+
+    let runner = Runner {
+        opt,
+        tests_dir,
+        compiletest_build_dir,
+        deps_target_dir,
+        codegen_backend_path,
+    };
+
+    runner.run_mode("ui");
+}
+
+struct Runner {
+    opt: Opt,
+    tests_dir: PathBuf,
+    compiletest_build_dir: PathBuf,
+    deps_target_dir: PathBuf,
+    codegen_backend_path: PathBuf,
+}
+
+impl Runner {
+    /// Runs the given `mode` on the directory that matches that name, using the
+    /// backend provided by `codegen_backend_path`.
+    #[allow(clippy::string_add)]
+    fn run_mode(&self, mode: &'static str) {
+        /// RUSTFLAGS passed to all test files.
+        fn test_rustc_flags(
+            codegen_backend_path: &Path,
+            deps: &TestDeps,
+            indirect_deps_dirs: &[&Path],
+            target_arch: &str,
+        ) -> String {
+            [
+                &*rust_flags(codegen_backend_path, target_arch),
+                &*indirect_deps_dirs
+                    .iter()
+                    .map(|dir| format!("-L dependency={}", dir.display()))
+                    .fold(String::new(), |a, b| b + " " + &a),
+                "--edition 2021",
+                &*format!("--extern noprelude:core={}", deps.core.display()),
+                &*format!(
+                    "--extern noprelude:compiler_builtins={}",
+                    deps.compiler_builtins.display()
+                ),
+                &*format!(
+                    "--extern cuda_std_macros={}",
+                    deps.cuda_std_macros.display()
+                ),
+                &*format!("--extern cuda_std={}", deps.cuda_std.display()),
+                "--crate-type cdylib",
+                "-Zunstable-options",
+                "-Zcrate-attr=no_std",
+                "-Zcrate-attr=feature(abi_ptx)",
+            ]
+            .join(" ")
+        }
+
+        struct Variation {
+            name: &'static str,
+            extra_flags: &'static str,
+        }
+        const VARIATIONS: &[Variation] = &[Variation {
+            name: "default",
+            extra_flags: "",
+        }];
+
+        for (arch, variation) in self
+            .opt
+            .architectures()
+            .flat_map(|arch| VARIATIONS.iter().map(move |variation| (arch, variation)))
+        {
+            // HACK(eddyb) in order to allow *some* tests to have separate output
+            // in different testing variations (i.e. experimental features), while
+            // keeping *most* of the tests unchanged, we make use of "stage IDs",
+            // which offer `// only-S` and `// ignore-S` for any stage ID `S`.
+            let stage_id = if variation.name == "default" {
+                // Use the architecture name as the stage ID.
+                arch.to_string()
+            } else {
+                // Include the variation name in the stage ID.
+                format!("{}-{}", arch, variation.name)
+            };
+
+            println!("Testing arch: {stage_id}\n");
+
+            let libs = build_deps(
+                &self.deps_target_dir,
+                &self.codegen_backend_path,
+                CUDA_TARGET,
+                arch,
+            );
+            let mut flags = test_rustc_flags(
+                &self.codegen_backend_path,
+                &libs,
+                &[
+                    &self
+                        .deps_target_dir
+                        .join(DepKind::CudaLib.target_dir_suffix(CUDA_TARGET)),
+                    &self
+                        .deps_target_dir
+                        .join(DepKind::ProcMacro.target_dir_suffix(CUDA_TARGET)),
+                ],
+                arch,
+            );
+            flags += variation.extra_flags;
+
+            let config = compiletest_rs::Config {
+                stage_id,
+                target_rustcflags: Some(flags),
+                mode: mode.parse().expect("Invalid mode"),
+                target: CUDA_TARGET.to_string(),
+                src_base: self.tests_dir.join(mode),
+                build_base: self.compiletest_build_dir.clone(),
+                bless: self.opt.bless,
+                filters: self.opt.filters.clone(),
+                ..compiletest_rs::Config::default()
+            };
+            // FIXME(eddyb) do we need this? shouldn't `compiletest` be independent?
+            config.clean_rmeta();
+
+            // Set up CUDA environment
+            setup_cuda_environment();
+
+            compiletest_rs::run_tests(&config);
+        }
+    }
+}
+
+/// Runs the processes needed to build `cuda-std` & other deps.
+fn build_deps(
+    deps_target_dir: &Path,
+    codegen_backend_path: &Path,
+    target: &str,
+    arch: &str,
+) -> TestDeps {
+    // Build compiletests-deps-helper using the same approach as cuda_builder
+    let rustflags = vec![
+        format!("-Zcodegen-backend={}", codegen_backend_path.display()),
+        "-Zcrate-attr=feature(register_tool)".into(),
+        "-Zcrate-attr=register_tool(nvvm_internal)".into(),
+        "-Zcrate-attr=no_std".into(),
+        "-Zcrate-attr=feature(abi_ptx)".into(),
+        "-Zsaturating_float_casts=false".into(),
+        "-Cembed-bitcode=no".into(),
+        "-Cdebuginfo=0".into(),
+        "-Coverflow-checks=off".into(),
+        "-Copt-level=3".into(),
+        "-Cpanic=abort".into(),
+        "-Cno-redzone=yes".into(),
+        format!("-Cllvm-args=-arch={} --override-libm", arch),
+        format!("-Ctarget-feature=+{}", arch),
+    ];
+
+    let cargo_encoded_rustflags = rustflags.join("\x1f");
+
+    std::process::Command::new("cargo")
+        .args([
+            "build",
+            "--lib",
+            "-p",
+            "compiletests-deps-helper",
+            "--release",
+            "-Zbuild-std=core,alloc",
+            "-Zbuild-std-features=panic_immediate_abort",
+            &*format!("--target={target}"),
+        ])
+        .arg("--target-dir")
+        .arg(deps_target_dir)
+        .env("CARGO_ENCODED_RUSTFLAGS", cargo_encoded_rustflags)
+        .env("CUDA_ARCH", "70")
+        .stderr(std::process::Stdio::inherit())
+        .stdout(std::process::Stdio::inherit())
+        .status()
+        .and_then(map_status_to_result)
+        .unwrap();
+
+    let compiler_builtins = find_lib(
+        deps_target_dir,
+        "compiler_builtins",
+        DepKind::CudaLib,
+        target,
+    );
+    let core = find_lib(deps_target_dir, "core", DepKind::CudaLib, target);
+    let cuda_std = find_lib(deps_target_dir, "cuda_std", DepKind::CudaLib, target);
+    let cuda_std_macros = find_lib(
+        deps_target_dir,
+        "cuda_std_macros",
+        DepKind::ProcMacro,
+        target,
+    );
+
+    let all_libs = [&compiler_builtins, &core, &cuda_std, &cuda_std_macros];
+    if all_libs.iter().any(|r| r.is_err()) {
+        // FIXME(eddyb) `missing_count` should always be `0` anyway.
+        // FIXME(eddyb) use `--message-format=json-render-diagnostics` to
+        // avoid caring about duplicates (or search within files at all).
+        let missing_count = all_libs
+            .iter()
+            .filter(|r| matches!(r, Err(FindLibError::Missing)))
+            .count();
+        let duplicate_count = all_libs
+            .iter()
+            .filter(|r| matches!(r, Err(FindLibError::Duplicate)))
+            .count();
+        eprintln!(
+            "warning: cleaning deps ({missing_count} missing libs, {duplicate_count} duplicated libs)"
+        );
+        clean_deps(deps_target_dir);
+        build_deps(deps_target_dir, codegen_backend_path, target, arch)
+    } else {
+        TestDeps {
+            core: core.ok().unwrap(),
+            compiler_builtins: compiler_builtins.ok().unwrap(),
+            cuda_std: cuda_std.ok().unwrap(),
+            cuda_std_macros: cuda_std_macros.ok().unwrap(),
+        }
+    }
+}
+
+fn clean_deps(deps_target_dir: &Path) {
+    std::process::Command::new("cargo")
+        .arg("clean")
+        .arg("--target-dir")
+        .arg(deps_target_dir)
+        .stderr(std::process::Stdio::inherit())
+        .stdout(std::process::Stdio::inherit())
+        .status()
+        .and_then(map_status_to_result)
+        .unwrap();
+}
+
+enum FindLibError {
+    Missing,
+    Duplicate,
+}
+
+/// Attempts to find the library artifact that matches `base`. If several match, a
+/// clean build is required and `Err(FindLibError::Duplicate)` is returned.
+fn find_lib(
+    deps_target_dir: &Path,
+    base: impl AsRef<Path>,
+    dep_kind: DepKind,
+    target: &str,
+) -> Result<PathBuf, FindLibError> {
+    let base = base.as_ref();
+    let (expected_prefix, expected_extension) = dep_kind.prefix_and_extension();
+    let expected_name = format!("{}{}", expected_prefix, base.display());
+
+    let dir = deps_target_dir.join(dep_kind.target_dir_suffix(target));
+
+    let matching_paths: Vec<PathBuf> = std::fs::read_dir(dir)
+        .unwrap()
+        .map(|entry| entry.unwrap().path())
+        .filter(|path| {
+            // A stem that is missing or not valid UTF-8 cannot be one of our
+            // artifacts, so skip it rather than panicking on `to_str()`.
+            let Some(name) = path.file_stem().and_then(|stem| stem.to_str()) else {
+                return false;
+            };
+
+            // We expect our name, '-', and then 16 hexadecimal digits (the length
+            // check rules out longer crate names that merely share the prefix).
+            let name_matches = name.starts_with(&expected_name)
+                && name.len() == expected_name.len() + 17
+                && ends_with_dash_hash(name);
+            let extension_matches = path
+                .extension()
+                .is_some_and(|ext| ext == expected_extension);
+
+            name_matches && extension_matches
+        })
+        .collect();
+
+    match matching_paths.len() {
+        0 => Err(FindLibError::Missing),
+        1 => Ok(matching_paths.into_iter().next().unwrap()),
+        _ => Err(FindLibError::Duplicate),
+    }
+}
+
+/// Returns whether `s` ends with a dash (`-`) followed by exactly 16 lowercase
+/// hexadecimal digits, i.e. the `-<hash>` suffix `find_lib` expects on file stems.
+fn ends_with_dash_hash(s: &str) -> bool {
+    // Work on bytes: a multi-byte character in the last 17 bytes simply fails
+    // the checks below instead of risking a char-boundary panic.
+    let bytes = s.as_bytes();
+    let Some(split) = bytes.len().checked_sub(17) else {
+        return false;
+    };
+    let (dash, hash) = (bytes[split], &bytes[split + 1..]);
+    // `is_ascii_hexdigit` would also accept `A-F`, which the contract excludes.
+    dash == b'-' && hash.iter().all(|&b| matches!(b, b'0'..=b'9' | b'a'..=b'f'))
+}
+
+/// Paths to all of the library artifacts of dependencies needed to compile tests.
+struct TestDeps {
+    core: PathBuf,
+    compiler_builtins: PathBuf,
+    cuda_std: PathBuf,
+    cuda_std_macros: PathBuf,
+}
+
+/// The RUSTFLAGS passed to all CUDA builds.
+// FIXME(eddyb) expose most of these from `cuda-builder`.
+fn rust_flags(codegen_backend_path: &Path, target_arch: &str) -> String {
+    [
+        &*format!("-Zcodegen-backend={}", codegen_backend_path.display()),
+        // Ensure the codegen backend is emitted in `.d` files to force Cargo
+        // to rebuild crates compiled with it when it changes (this used to be
+        // the default until https://github.com/rust-lang/rust/pull/93969).
+        "-Zbinary-dep-depinfo",
+        "-Csymbol-mangling-version=v0",
+        "-Zcrate-attr=feature(register_tool)",
+        "-Zcrate-attr=register_tool(nvvm_internal)",
+        // HACK(eddyb) this is the same configuration that we test with, and
+        // ensures no unwanted surprises from e.g. `core` debug assertions.
+        "-Coverflow-checks=off",
+        "-Cdebug-assertions=off",
+        // HACK(eddyb) we need this for `core::fmt::rt::Argument::new_*` calls
+        // to *never* be inlined, so we can pattern-match the calls themselves.
+        "-Zinline-mir=off",
+        // HACK(eddyb) avoid ever reusing instantiations from `compiler_builtins`
+        // which is special-cased to turn calls to functions that never return,
+        // into aborts, and this applies to the panics of UB-checking helpers
+        // (https://github.com/rust-lang/rust/pull/122580#issuecomment-3033026194)
+        // but while upstream that only loses the panic message, for us it's even
+        // worse, as we lose the chance to remove otherwise-dead `fmt::Arguments`.
+        "-Zshare-generics=off",
+        // NOTE(eddyb) flags copied from `cuda-builder` are all above this line.
+        "-Cdebuginfo=2",
+        "-Cembed-bitcode=no",
+        &format!("-Ctarget-feature=+{target_arch}"),
+        "-Cpanic=abort",
+        "-Cno-redzone=yes",
+        &format!("-Cllvm-args=-arch={target_arch}"),
+        "-Cllvm-args=--override-libm",
+    ]
+    .join(" ")
+}
+
+/// Convenience function to map process failure to results in Rust.
+fn map_status_to_result(status: std::process::ExitStatus) -> io::Result<()> {
+    match status.success() {
+        true => Ok(()),
+        false => Err(io::Error::other(format!(
+            "process terminated with non-zero code: {}",
+            status.code().unwrap_or(0)
+        ))),
+    }
+}
+
+// https://github.com/rust-lang/cargo/blob/1857880b5124580c4aeb4e8bc5f1198f491d61b1/src/cargo/util/paths.rs#L29-L52
+fn dylib_path_envvar() -> &'static str {
+    if cfg!(windows) {
+        "PATH"
+    } else if cfg!(target_os = "macos") {
+        "DYLD_FALLBACK_LIBRARY_PATH"
+    } else {
+        "LD_LIBRARY_PATH"
+    }
+}
+
+fn dylib_path() -> Vec<PathBuf> {
+    match env::var_os(dylib_path_envvar()) {
+        Some(var) => env::split_paths(&var).collect(),
+        None => Vec::new(),
+    }
+}
+
+#[cfg(windows)]
+fn setup_windows_dll_path(codegen_backend_path: &Path) {
+    fn add_to_dylib_path(dir: &Path) {
+        let lib_path_var = dylib_path_envvar();
+        let existing_path = env::var(lib_path_var).unwrap_or_default();
+        let separator = ";";
+
+        let dir_str = dir.to_string_lossy();
+        // Check if the directory is already in the path
+        if !existing_path
+            .split(separator)
+            .any(|p| p == dir_str.as_ref())
+        {
+            let new_path = if existing_path.is_empty() {
+                dir_str.to_string()
+            } else {
+                format!("{dir_str}{separator}{existing_path}")
+            };
+            env::set_var(lib_path_var, new_path);
+        }
+    }
+
+    // Add the directory containing the codegen backend
+    if let Some(dir) = codegen_backend_path.parent() {
+        add_to_dylib_path(dir);
+    }
+
+    // Try to find LLVM directories and add them to PATH
+    // Look for llvm-config to find LLVM installation
+    let llvm_config_paths = vec![
+        "llvm-config",
+        "llvm-config-7",
+        "llvm-config.exe",
+        "llvm-config-7.exe",
+    ];
+
+    for llvm_config in &llvm_config_paths {
+        if let Ok(output) = Command::new(llvm_config).arg("--bindir").output() {
+            if output.status.success() {
+                if let Ok(bindir) = String::from_utf8(output.stdout) {
+                    let bindir = bindir.trim();
+                    let bindir_path = Path::new(bindir);
+                    if bindir_path.exists() {
+                        add_to_dylib_path(bindir_path);
+                        // Also add the lib directory which might contain DLLs
+                        if let Some(parent) = bindir_path.parent() {
+                            let libdir = parent.join("lib");
+                            if libdir.exists() {
+                                add_to_dylib_path(&libdir);
+                            }
+                        }
+                    }
+                }
+                break;
+            }
+        }
+    }
+
+    // Also check common LLVM installation directories on Windows
+    let common_llvm_paths = vec![
+        "C:\\Program Files\\LLVM\\bin",
+        "C:\\Program Files (x86)\\LLVM\\bin",
+        "C:\\Tools\\LLVM\\bin",
+        "C:\\llvm\\bin",
+    ];
+
+    for path in &common_llvm_paths {
+        let path = Path::new(path);
+        if path.exists() {
+            add_to_dylib_path(path);
+        }
+    }
+}
+
+fn find_rustc_codegen_nvvm(workspace_root: &Path) -> PathBuf {
+    let filename = format!(
+        "{}rustc_codegen_nvvm{}",
+        env::consts::DLL_PREFIX,
+        env::consts::DLL_SUFFIX
+    );
+
+    // First check if it's already built
+    let target_dir = workspace_root.join("target");
+    let search_paths = vec![
+        target_dir.join("debug").join(&filename),
+        target_dir.join("release").join(&filename),
+    ];
+
+    for path in &search_paths {
+        if path.is_file() {
+            // On Windows, ensure the directory containing the DLL is in PATH
+            // so that its dependencies can be found
+            #[cfg(windows)]
+            setup_windows_dll_path(&path);
+
+            return path.clone();
+        }
+    }
+
+    // If not found, try to build it
+    println!("Building rustc_codegen_nvvm...");
+    let status = Command::new("cargo")
+        .args(["build", "-p", "rustc_codegen_nvvm"])
+        .current_dir(workspace_root)
+        .status()
+        .expect("Failed to execute cargo build");
+
+    if !status.success() {
+        panic!("Failed to build rustc_codegen_nvvm");
+    }
+
+    // Try to find it again after building
+    for path in &search_paths {
+        if path.is_file() {
+            // On Windows, ensure the directory containing the DLL is in PATH
+            #[cfg(windows)]
+            setup_windows_dll_path(&path);
+
+            return path.clone();
+        }
+    }
+
+    // Last resort: check library path
+    for mut path in dylib_path() {
+        path.push(&filename);
+        if path.is_file() {
+            return path;
+        }
+    }
+    panic!("Could not find {filename} in library path or target directory");
+}
+
+/// Prepends any CUDA NVVM library directories that exist to the dynamic library search path.
+fn setup_cuda_environment() {
+    let lib_path_var = dylib_path_envvar();
+
+    // Try to find CUDA installation
+    let cuda_paths = vec![
+        "/usr/local/cuda/nvvm/lib64",
+        "/usr/local/cuda-12/nvvm/lib64",
+        "/usr/local/cuda-11/nvvm/lib64",
+        "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.8\\nvvm\\lib\\x64",
+        "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.0\\nvvm\\lib\\x64",
+        "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.8\\nvvm\\lib\\x64",
+    ];
+
+    let mut found_cuda_paths = Vec::new();
+
+    // Check CUDA_PATH environment variable
+    if let Ok(cuda_path) = env::var("CUDA_PATH") {
+        let nvvm_path = Path::new(&cuda_path).join("nvvm").join("lib64");
+        if nvvm_path.exists() {
+            found_cuda_paths.push(nvvm_path.to_string_lossy().to_string());
+        }
+        let nvvm_path_win = Path::new(&cuda_path).join("nvvm").join("lib").join("x64");
+        if nvvm_path_win.exists() {
+            found_cuda_paths.push(nvvm_path_win.to_string_lossy().to_string());
+        }
+    }
+
+    // Check standard paths
+    for path in &cuda_paths {
+        if Path::new(path).exists() {
+            found_cuda_paths.push(path.to_string());
+        }
+    }
+
+    // Skip entries already on the path so per-architecture calls don't duplicate them.
+    let existing_path = env::var(lib_path_var).unwrap_or_default();
+    let separator = if cfg!(windows) { ";" } else { ":" };
+    found_cuda_paths.retain(|p| !existing_path.split(separator).any(|e| e == p.as_str()));
+    if !found_cuda_paths.is_empty() {
+        let new_paths = found_cuda_paths.join(separator);
+        let new_lib_path = if existing_path.is_empty() {
+            new_paths
+        } else {
+            format!("{new_paths}{separator}{existing_path}")
+        };
+        env::set_var(lib_path_var, new_lib_path);
+    }
+}
diff --git a/tests/compiletests/ui/core/ops/logical_and.rs b/tests/compiletests/ui/core/ops/logical_and.rs
new file mode 100644
index 00000000..9fe0f052
--- /dev/null
+++ b/tests/compiletests/ui/core/ops/logical_and.rs
@@ -0,0 +1,13 @@
+// Test using `&&` operator.
+// build-pass
+
+use cuda_std::kernel;
+
+fn f(x: bool, y: bool) -> bool {
+    x && y
+}
+
+#[kernel]
+pub unsafe fn main() {
+    f(false, true);
+}
diff --git a/tests/compiletests/ui/core/ops/logical_or.rs b/tests/compiletests/ui/core/ops/logical_or.rs
new file mode 100644
index 00000000..f0515254
--- /dev/null
+++ b/tests/compiletests/ui/core/ops/logical_or.rs
@@ -0,0 +1,13 @@
+// Test using `||` operator.
+// build-pass
+
+use cuda_std::kernel;
+
+fn f(x: bool, y: bool) -> bool {
+    x || y
+}
+
+#[kernel]
+pub unsafe fn main() {
+    f(false, true);
+}
diff --git a/tests/compiletests/ui/core/ops/range_contains.rs b/tests/compiletests/ui/core/ops/range_contains.rs
new file mode 100644
index 00000000..f7f733b7
--- /dev/null
+++ b/tests/compiletests/ui/core/ops/range_contains.rs
@@ -0,0 +1,12 @@
+// build-pass
+
+use cuda_std::kernel;
+
+fn has_two_decimal_digits(x: u32) -> bool {
+    (10..100).contains(&x)
+}
+
+#[kernel]
+pub unsafe fn main(i: u32, o: *mut u32) {
+    *o = has_two_decimal_digits(i) as u32;
+}
diff --git a/tests/compiletests/ui/dis/simple_add.rs b/tests/compiletests/ui/dis/simple_add.rs
new file mode 100644
index 00000000..15b6b179
--- /dev/null
+++ b/tests/compiletests/ui/dis/simple_add.rs
@@ -0,0 +1,12 @@
+// build-pass
+// compile-flags: -Cllvm-args=--disassemble-entry=simple_add_kernel --error-format=human
+
+// This test verifies PTX generation for a simple kernel
+
+use cuda_std::kernel;
+
+#[kernel]
+pub unsafe fn simple_add_kernel(a: *const f32, b: *const f32, c: *mut f32) {
+    let sum = *a + *b;
+    *c = sum;
+}
diff --git a/tests/compiletests/ui/dis/simple_add.stderr b/tests/compiletests/ui/dis/simple_add.stderr
new file mode 100644
index 00000000..0386db78
--- /dev/null
+++ b/tests/compiletests/ui/dis/simple_add.stderr
@@ -0,0 +1,39 @@
+.visible .entry simple_add_kernel(
+    .param .u64 simple_add_kernel_param_0,
+    .param .u64 simple_add_kernel_param_1,
+    .param .u64 simple_add_kernel_param_2
+)
+{
+    .reg .f32     %f<4>;
+    .reg .b64     %rd<7>;
+    .loc    1 9 0
+$L__func_begin0:
+    .loc    1 9 0
+
+
+    ld.param.u64     %rd1, [simple_add_kernel_param_0];
+    ld.param.u64     %rd2, [simple_add_kernel_param_1];
+    ld.param.u64     %rd3, [simple_add_kernel_param_2];
+$L__tmp0:
+    .loc    1 9 33
+    cvta.to.global.u64     %rd4, %rd3;
+    cvta.to.global.u64     %rd5, %rd2;
+    cvta.to.global.u64     %rd6, %rd1;
+    .loc    1 10 15
+    ld.global.f32     %f1, [%rd6];
+    .loc    1 10 20
+    ld.global.f32     %f2, [%rd5];
+    .loc    1 10 15
+    add.f32     %f3, %f1, %f2;
+$L__tmp1:
+    .loc    1 11 5
+    st.global.f32     [%rd4], %f3;
+$L__tmp2:
+    .loc    1 12 2
+    ret;
+$L__tmp3:
+$L__func_end0:
+
+}
+
+
diff --git a/tests/compiletests/ui/dis/target_feature_arch_cc.rs b/tests/compiletests/ui/dis/target_feature_arch_cc.rs
new file mode 100644
index 00000000..7e1cf88c
--- /dev/null
+++ b/tests/compiletests/ui/dis/target_feature_arch_cc.rs
@@ -0,0 +1,55 @@
+// only-compute_120a
+// build-fail
+// compile-flags: -Cllvm-args=--disassemble-entry=test_arch_cc_120a --error-format=human
+
+// This test verifies feature inheritance for compute_120a (architecture capability)
+// FIXME: This currently fails because NVVM doesn't support architecture suffixes like compute_120a
+// This test is ignored until we use a later NVVM that supports architecture suffixes
+
+use cuda_std::kernel;
+
+#[kernel]
+pub unsafe fn test_arch_cc_120a(result: *mut f32) {
+    let mut val = 0.0f32;
+
+    // compute_120a should have compute_120 enabled (base of architecture)
+    #[cfg(target_feature = "compute_120")]
+    {
+        val += 120.0;
+    }
+
+    // compute_120a should also have all lower capabilities enabled
+    #[cfg(target_feature = "compute_100")]
+    {
+        val += 100.0;
+    }
+
+    #[cfg(target_feature = "compute_90")]
+    {
+        val += 90.0;
+    }
+
+    #[cfg(target_feature = "compute_80")]
+    {
+        val += 80.0;
+    }
+
+    #[cfg(target_feature = "compute_70")]
+    {
+        val += 70.0;
+    }
+
+    // compute_120a should NOT have family features from lower versions
+    #[cfg(target_feature = "compute_100f")]
+    {
+        val += 100.5;
+    }
+
+    #[cfg(target_feature = "compute_101f")]
+    {
+        val += 101.5;
+    }
+
+    // Prevent DCE - expected value should be 460.0 (120 + 100 + 90 + 80 + 70)
+    core::ptr::write_volatile(result, val);
+}
diff --git a/tests/compiletests/ui/dis/target_feature_arch_cc.stderr b/tests/compiletests/ui/dis/target_feature_arch_cc.stderr
new file mode 100644
index 00000000..8db06a35
--- /dev/null
+++ b/tests/compiletests/ui/dis/target_feature_arch_cc.stderr
@@ -0,0 +1 @@
+error: libnvvm returned an error that was not previously caught by the verifier: InvalidOption
\ No newline at end of file
diff --git a/tests/compiletests/ui/dis/target_feature_base_cc.rs b/tests/compiletests/ui/dis/target_feature_base_cc.rs
new file mode 100644
index 00000000..7a30cd5e
--- /dev/null
+++ b/tests/compiletests/ui/dis/target_feature_base_cc.rs
@@ -0,0 +1,39 @@
+// build-pass
+// only-compute_70
+// compile-flags: -Cllvm-args=--disassemble-entry=test_base_cc_70 --error-format=human
+
+// This test verifies feature inheritance for compute_70 (base capability)
+
+use cuda_std::kernel;
+
+#[kernel]
+pub unsafe fn test_base_cc_70(result: *mut f32) {
+    let mut val = 0.0f32;
+
+    // arch=compute_70 should have target_feature=compute_70 enabled
+    #[cfg(target_feature = "compute_70")]
+    {
+        val += 70.0;
+    }
+
+    // arch=compute_70 should also have target_feature=compute_60 enabled (lower capability)
+    #[cfg(target_feature = "compute_60")]
+    {
+        val += 60.0;
+    }
+
+    // arch=compute_70 should NOT have target_feature=compute_80 enabled (higher capability)
+    #[cfg(target_feature = "compute_80")]
+    {
+        val += 80.0;
+    }
+
+    // arch=compute_70 should NOT have target_feature=compute_90 enabled
+    #[cfg(target_feature = "compute_90")]
+    {
+        val += 90.0;
+    }
+
+    // Prevent DCE - expected value should be 130.0 (70 + 60)
+    core::ptr::write_volatile(result, val);
+}
diff --git a/tests/compiletests/ui/dis/target_feature_base_cc.stderr b/tests/compiletests/ui/dis/target_feature_base_cc.stderr
new file mode 100644
index 00000000..8d6ad01b
--- /dev/null
+++ b/tests/compiletests/ui/dis/target_feature_base_cc.stderr
@@ -0,0 +1,31 @@
+.visible .entry test_base_cc_70(
+    .param .u64 test_base_cc_70_param_0
+)
+{
+    .reg .b32     %r<2>;
+    .reg .b64     %rd<3>;
+    .loc    1 10 0
+$L__func_begin0:
+    .loc    1 10 0
+
+
+    ld.param.u64     %rd1, [test_base_cc_70_param_0];
+$L__tmp0:
+    .loc    1 10 31
+    cvta.to.global.u64     %rd2, %rd1;
+$L__tmp1:
+    .loc    1 38 5
+    bra.uni    $L__tmp2;
+$L__tmp2:
+    .loc    2 2180 9
+    mov.u32     %r1, 1124204544;
+    st.volatile.global.u32     [%rd2], %r1;
+$L__tmp3:
+    .loc    1 39 2
+    ret;
+$L__tmp4:
+$L__func_end0:
+
+}
+
+
diff --git a/tests/compiletests/ui/dis/target_feature_family_cc.rs b/tests/compiletests/ui/dis/target_feature_family_cc.rs
new file mode 100644
index 00000000..14745810
--- /dev/null
+++ b/tests/compiletests/ui/dis/target_feature_family_cc.rs
@@ -0,0 +1,57 @@
+// only-compute_101f
+// build-fail
+// compile-flags: -Cllvm-args=--disassemble-entry=test_family_cc_101f --error-format=human
+
+// This test verifies feature inheritance for compute_101f (family capability)
+// FIXME: This currently fails because NVVM doesn't support family suffixes like compute_101f
+// This test is ignored until we use a later NVVM that supports family suffixes
+
+use cuda_std::kernel;
+
+#[kernel]
+pub unsafe fn test_family_cc_101f(result: *mut f32) {
+    let mut val = 0.0f32;
+
+    // compute_101f should have compute_101 enabled
+    #[cfg(target_feature = "compute_101")]
+    {
+        val += 101.0;
+    }
+
+    // compute_101f should have compute_100 enabled (lower family version)
+    #[cfg(target_feature = "compute_100")]
+    {
+        val += 100.0;
+    }
+
+    // compute_101f should NOT have compute_100f enabled (same family, lower minor)
+    #[cfg(target_feature = "compute_100f")]
+    {
+        val += 100.5;
+    }
+
+    // compute_101f should also have lower capabilities enabled
+    #[cfg(target_feature = "compute_90")]
+    {
+        val += 90.0;
+    }
+
+    #[cfg(target_feature = "compute_80")]
+    {
+        val += 80.0;
+    }
+
+    #[cfg(target_feature = "compute_70")]
+    {
+        val += 70.0;
+    }
+
+    // compute_101f should NOT have architecture-specific features
+    #[cfg(target_feature = "compute_120a")]
+    {
+        val += 120.0;
+    }
+
+    // Prevent DCE - expected value should be 441.0 (101 + 100 + 90 + 80 + 70)
+    core::ptr::write_volatile(result, val);
+}
diff --git a/tests/compiletests/ui/dis/target_feature_family_cc.stderr b/tests/compiletests/ui/dis/target_feature_family_cc.stderr
new file mode 100644
index 00000000..8db06a35
--- /dev/null
+++ b/tests/compiletests/ui/dis/target_feature_family_cc.stderr
@@ -0,0 +1 @@
+error: libnvvm returned an error that was not previously caught by the verifier: InvalidOption
\ No newline at end of file
diff --git a/tests/compiletests/ui/glam/mat4_operations.rs b/tests/compiletests/ui/glam/mat4_operations.rs
new file mode 100644
index 00000000..e54d2d10
--- /dev/null
+++ b/tests/compiletests/ui/glam/mat4_operations.rs
@@ -0,0 +1,51 @@
+// build-pass
+
+// This test verifies that glam Mat4 operations compile in CUDA kernels (build-only check)
+
+use cuda_std::glam::{Mat4, Vec3, Vec4};
+use cuda_std::kernel;
+
+#[kernel]
+pub unsafe fn mat4_transform_operations(
+    matrix: Mat4,
+    point: Vec3,
+    vector: Vec4,
+    result_point: *mut Vec3,
+    result_vector: *mut Vec4,
+    result_determinant: *mut f32,
+) {
+    // Transform `point` as a 3D position (w=1 implied) and store it via `result_point`
+    let transformed_point = matrix.transform_point3(point);
+    *result_point = transformed_point;
+
+    // Multiply `matrix` by the 4D `vector` and store the product via `result_vector`
+    let transformed_vector = matrix * vector;
+    *result_vector = transformed_vector;
+
+    // Compute the determinant of `matrix` and store it via `result_determinant`
+    let det = matrix.determinant();
+    *result_determinant = det;
+}
+
+#[kernel]
+pub unsafe fn mat4_construction(
+    translation: Vec3,
+    scale: Vec3,
+    angle_radians: f32,
+    axis: Vec3,
+    result_translation: *mut Mat4,
+    result_scale: *mut Mat4,
+    result_rotation: *mut Mat4,
+) {
+    // Build a translation matrix from `translation` and store it via `result_translation`
+    let trans_mat = Mat4::from_translation(translation);
+    *result_translation = trans_mat;
+
+    // Build a scale matrix from `scale` and store it via `result_scale`
+    let scale_mat = Mat4::from_scale(scale);
+    *result_scale = scale_mat;
+
+    // Build a rotation matrix from `axis` and `angle_radians`; store it via `result_rotation`
+    let rot_mat = Mat4::from_axis_angle(axis, angle_radians);
+    *result_rotation = rot_mat;
+}
diff --git a/tests/compiletests/ui/glam/vec3_operations.rs b/tests/compiletests/ui/glam/vec3_operations.rs
new file mode 100644
index 00000000..9ab43513
--- /dev/null
+++ b/tests/compiletests/ui/glam/vec3_operations.rs
@@ -0,0 +1,42 @@
+// build-pass
+
+// This test verifies that glam Vec3 operations compile in CUDA kernels (build-only check)
+
+use cuda_std::glam::Vec3;
+use cuda_std::kernel;
+
+#[kernel]
+pub unsafe fn vec3_basic_ops(
+    a: Vec3,
+    b: Vec3,
+    result_add: *mut Vec3,
+    result_dot: *mut f32,
+    result_cross: *mut Vec3,
+) {
+    // Component-wise sum `a + b`, stored via `result_add`
+    let sum = a + b;
+    *result_add = sum;
+
+    // Dot product of `a` and `b`, stored via `result_dot`
+    let dot = a.dot(b);
+    *result_dot = dot;
+
+    // Cross product of `a` and `b` (in that order), stored via `result_cross`
+    let cross = a.cross(b);
+    *result_cross = cross;
+}
+
+#[kernel]
+pub unsafe fn vec3_normalization(
+    input: Vec3,
+    result_normalized: *mut Vec3,
+    result_length: *mut f32,
+) {
+    // Length of `input`, stored via `result_length`
+    let len = input.length();
+    *result_length = len;
+
+    // Normalized copy of `input`, stored via `result_normalized` (no zero-length guard here)
+    let normalized = input.normalize();
+    *result_normalized = normalized;
+}
diff --git a/tests/compiletests/ui/hello_world.rs b/tests/compiletests/ui/hello_world.rs
new file mode 100644
index 00000000..82174c88
--- /dev/null
+++ b/tests/compiletests/ui/hello_world.rs
@@ -0,0 +1,8 @@
+// build-pass
+
+use cuda_std::kernel;
+
+#[kernel]
+pub unsafe fn add_one(x: *mut f32) {
+    *x = *x + 1.0; // read the f32 behind `x`, add one, and write it back in place
+}
diff --git a/tests/compiletests/ui/lang/compile_fail.rs b/tests/compiletests/ui/lang/compile_fail.rs
new file mode 100644
index 00000000..eb0d1d36
--- /dev/null
+++ b/tests/compiletests/ui/lang/compile_fail.rs
@@ -0,0 +1,6 @@
+use cuda_std::kernel;
+
+#[kernel]
+pub unsafe fn bad_kernel() { // expected to fail: `std` does not resolve (see compile_fail.stderr)
+    let _s = std::fs::File::create("foo.txt");
+}
diff --git a/tests/compiletests/ui/lang/compile_fail.stderr b/tests/compiletests/ui/lang/compile_fail.stderr
new file mode 100644
index 00000000..16555a74
--- /dev/null
+++ b/tests/compiletests/ui/lang/compile_fail.stderr
@@ -0,0 +1,11 @@
+error[E0433]: failed to resolve: use of unresolved module or unlinked crate `std`
+ --> $DIR/compile_fail.rs:5:14
+  |
+5 |     let _s = std::fs::File::create("foo.txt");
+  |              ^^^ use of unresolved module or unlinked crate `std`
+  |
+  = help: you might be missing a crate named `std`
+
+error: aborting due to 1 previous error
+
+For more information about this error, try `rustc --explain E0433`.
diff --git a/tests/compiletests/ui/thread/thread_functions.rs b/tests/compiletests/ui/thread/thread_functions.rs
new file mode 100644
index 00000000..5f732a4b
--- /dev/null
+++ b/tests/compiletests/ui/thread/thread_functions.rs
@@ -0,0 +1,17 @@
+// build-pass
+
+// This test verifies that CUDA thread functions are available and compile (build-only check)
+
+use cuda_std::kernel;
+use cuda_std::thread;
+
+#[kernel]
+pub unsafe fn thread_functions_test() {
+    // Query the x components of thread index, block index and block dimension (values unused)
+    let _tid = thread::thread_idx_x();
+    let _bid = thread::block_idx_x();
+    let _bdim = thread::block_dim_x();
+
+    // Also call the synchronization function so that it is compiled into the kernel
+    thread::sync_threads();
+}