diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml
index 2e548ec9..1e74d754 100644
--- a/.github/workflows/ci_linux.yml
+++ b/.github/workflows/ci_linux.yml
@@ -163,7 +163,6 @@ jobs:
--exclude cust
'
- # Exclude cust_raw because it triggers hundreds of warnings.
- name: Check documentation
run: |
docker exec "$CONTAINER_NAME" bash -lc 'set -euo pipefail
diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml
index 63082087..7923fb66 100644
--- a/.github/workflows/ci_windows.yml
+++ b/.github/workflows/ci_windows.yml
@@ -106,7 +106,6 @@ jobs:
run: cargo test --workspace --exclude blastoff --exclude cudnn --exclude cudnn-sys --exclude cust
# Exclude crates that require cuDNN, not available on Windows CI: cudnn, cudnn-sys.
- # Exclude cust_raw because it triggers hundreds of warnings.
- name: Check documentation
env:
RUSTDOCFLAGS: -Dwarnings
diff --git a/.github/workflows/container_images.yml b/.github/workflows/container_images.yml
index 7485812e..4fe650bd 100644
--- a/.github/workflows/container_images.yml
+++ b/.github/workflows/container_images.yml
@@ -33,9 +33,15 @@ jobs:
- name: Ubuntu-24.04/CUDA-12.8.1
image: "rust-gpu/rust-cuda-ubuntu24-cuda12"
dockerfile: ./container/ubuntu24-cuda12/Dockerfile
+ - name: Ubuntu-24.04/CUDA-13.0.2
+ image: "rust-gpu/rust-cuda-ubuntu24-cuda13"
+ dockerfile: ./container/ubuntu24-cuda13/Dockerfile
- name: RockyLinux-9/CUDA-12.8.1
image: "rust-gpu/rust-cuda-rockylinux9-cuda12"
dockerfile: ./container/rockylinux9-cuda12/Dockerfile
+ - name: RockyLinux-9/CUDA-13.0.2
+ image: "rust-gpu/rust-cuda-rockylinux9-cuda13"
+ dockerfile: ./container/rockylinux9-cuda13/Dockerfile
steps:
- name: Free up space
# Without this the job will likely run out of disk space.
@@ -153,8 +159,12 @@ jobs:
variance:
- name: Ubuntu-24.04/CUDA-12.8.1
image: "rust-gpu/rust-cuda-ubuntu24-cuda12"
+ - name: Ubuntu-24.04/CUDA-13.0.2
+ image: "rust-gpu/rust-cuda-ubuntu24-cuda13"
- name: RockyLinux-9/CUDA-12.8.1
image: "rust-gpu/rust-cuda-rockylinux9-cuda12"
+ - name: RockyLinux-9/CUDA-13.0.2
+ image: "rust-gpu/rust-cuda-rockylinux9-cuda13"
steps:
- name: Set artifact name
run: |
diff --git a/README.md b/README.md
index c069a024..1e410303 100644
--- a/README.md
+++ b/README.md
@@ -2,116 +2,36 @@
The Rust CUDA Project
- An ecosystem of libraries and tools for writing and executing extremely fast GPU code fully in
- Rust
+ An ecosystem of libraries and tools for writing and executing extremely fast GPU code
+ fully in Rust
-
-
-⚠️ The project is still in early development, expect bugs, safety issues, and things that don't work ⚠️
-
-
> [!IMPORTANT]
> This project is no longer dormant and is [being
> rebooted](https://rust-gpu.github.io/blog/2025/01/27/rust-cuda-reboot). Read the [latest status update](https://rust-gpu.github.io/blog/2025/08/11/rust-cuda-update).
> Please contribute!
+>
+> The project is still in early development, however. Expect bugs, safety issues, and things that
+> don't work.
-## Goal
-
-The Rust CUDA Project is a project aimed at making Rust a tier-1 language for extremely fast GPU computing
-using the CUDA Toolkit. It provides tools for compiling Rust to extremely fast PTX code as well as libraries
-for using existing CUDA libraries with it.
-
-## Background
-
-Historically, general purpose high performance GPU computing has been done using the CUDA toolkit. The CUDA toolkit primarily
-provides a way to use Fortran/C/C++ code for GPU computing in tandem with CPU code with a single source. It also provides
-many libraries, tools, forums, and documentation to supplement the single-source CPU/GPU code.
-
-CUDA is exclusively an NVIDIA-only toolkit. Many tools have been proposed for cross-platform GPU computing such as
-OpenCL, Vulkan Computing, and HIP. However, CUDA remains the most used toolkit for such tasks by far. This is why it is
-imperative to make Rust a viable option for use with the CUDA toolkit.
-
-However, CUDA with Rust has been a historically very rocky road. The only viable option until now has been to use the LLVM PTX
-backend, however, the LLVM PTX backend does not always work and would generate invalid PTX for many common Rust operations, and
-in recent years it has been shown time and time again that a specialized solution is needed for Rust on the GPU with the advent
-of projects such as rust-gpu (for Rust -> SPIR-V).
-
-Our hope is that with this project we can push the Rust GPU computing industry forward and make Rust an excellent language
-for such tasks. Rust offers plenty of benefits such as `__restrict__` performance benefits for every kernel, An excellent module/crate system,
-delimiting of unsafe areas of CPU/GPU code with `unsafe`, high level wrappers to low level CUDA libraries, etc.
-
-## Structure
-
-The scope of the Rust CUDA Project is quite broad, it spans the entirety of the CUDA ecosystem, with libraries and tools to make it
-usable using Rust. Therefore, the project contains many crates for all corners of the CUDA ecosystem.
-
-The current line-up of libraries is the following:
-
-- `rustc_codegen_nvvm` Which is a rustc backend that targets NVVM IR (a subset of LLVM IR) for the [libnvvm](https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html) library.
- - Generates highly optimized PTX code which can be loaded by the CUDA Driver API to execute on the GPU.
- - For the near future it will be CUDA-only, but it may be used to target amdgpu in the future.
-- `cuda_std` for GPU-side functions and utilities, such as thread index queries, memory allocation, warp intrinsics, etc.
- - _Not_ a low level library, provides many utility functions to make it easier to write cleaner and more reliable GPU kernels.
- - Closely tied to `rustc_codegen_nvvm` which exposes GPU features through it internally.
-- [`cudnn`](https://github.com/Rust-GPU/rust-cuda/tree/master/crates/cudnn) for a collection of GPU-accelerated primitives for deep neural networks.
-- `cust` for CPU-side CUDA features such as launching GPU kernels, GPU memory allocation, device queries, etc.
- - High level with features such as RAII and Rust Results that make it easier and cleaner to manage the interface to the GPU.
- - A high level wrapper for the CUDA Driver API, the lower level version of the more common CUDA Runtime API used from C++.
- - Provides much more fine grained control over things like kernel concurrency and module loading than the C++ Runtime API.
-- `gpu_rand` for GPU-friendly random number generation, currently only implements xoroshiro RNGs from `rand_xoshiro`.
-- `optix` for CPU-side hardware raytracing and denoising using the CUDA OptiX library.
-
-In addition to many "glue" crates for things such as high level wrappers for certain smaller CUDA libraries.
-
-## Related Projects
-
-Other projects related to using Rust on the GPU:
-
-- 2016: [glassful](https://github.com/kmcallister/glassful) Subset of Rust that compiles to GLSL.
-- 2017: [inspirv-rust](https://github.com/msiglreith/inspirv-rust) Experimental Rust MIR -> SPIR-V Compiler.
-- 2018: [nvptx](https://github.com/japaric-archived/nvptx) Rust to PTX compiler using the `nvptx` target for rustc (using the LLVM PTX backend).
-- 2020: [accel](https://github.com/termoshtt/accel) Higher-level library that relied on the same mechanism that `nvptx` does.
-- 2020: [rlsl](https://github.com/MaikKlein/rlsl) Experimental Rust -> SPIR-V compiler (predecessor to rust-gpu)
-- 2020: [rust-gpu](https://github.com/Rust-GPU/rust-gpu) `rustc` compiler backend to compile Rust to SPIR-V for use in shaders, similar mechanism as our project.
-
-## Usage
-```bash
-## setup your environment like:
-### export OPTIX_ROOT=/opt/NVIDIA-OptiX-SDK-9.0.0-linux64-x86_64
-### export OPTIX_ROOT_DIR=/opt/NVIDIA-OptiX-SDK-9.0.0-linux64-x86_64
-
-## build proj
-cargo build
-```
-
-## Use Rust CUDA in Container Environments
-
-The distribution related Dockerfile are located in `container` folder.
-Taking ubuntu 24.04 as an example, run the following command in repository root:
-```bash
-docker build -f ./container/ubuntu24-cuda12/Dockerfile -t rust-cuda-ubuntu24 .
-docker run --rm --runtime=nvidia --gpus all -it rust-cuda-ubuntu24
-```
+## Documentation
-A sample `.devcontainer.json` file is also included, configured for Ubuntu 24.02. Copy this to `.devcontainer/devcontainer.json` to make additional customizations.
+Please see [The Rust CUDA Guide](https://rust-gpu.github.io/rust-cuda/) for documentation on Rust
+CUDA.
## License
Licensed under either of
-- Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+- Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
+ http://www.apache.org/licenses/LICENSE-2.0)
- MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
at your discretion.
### Contribution
-Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.
+Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in
+the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any
+additional terms or conditions.
diff --git a/container/rockylinux9-cuda13/Dockerfile b/container/rockylinux9-cuda13/Dockerfile
new file mode 100644
index 00000000..dc428186
--- /dev/null
+++ b/container/rockylinux9-cuda13/Dockerfile
@@ -0,0 +1,92 @@
+FROM nvcr.io/nvidia/cuda:13.0.2-cudnn-devel-rockylinux9 AS llvm-builder
+
+RUN dnf -y install \
+ --nobest \
+ --allowerasing \
+ --setopt=install_weak_deps=False \
+ openssl-devel \
+ pkgconfig \
+ which \
+ xz \
+ zlib-devel \
+ libffi-devel \
+ ncurses-devel \
+ libxml2-devel \
+ libedit-devel \
+ python3 \
+ make \
+ cmake && \
+ dnf clean all
+
+WORKDIR /data/llvm7
+
+# Download and build LLVM 7.1.0 for all architectures.
+RUN curl -sSf -L -O https://github.com/llvm/llvm-project/releases/download/llvmorg-7.1.0/llvm-7.1.0.src.tar.xz && \
+ tar -xf llvm-7.1.0.src.tar.xz && \
+ cd llvm-7.1.0.src && \
+ mkdir build && cd build && \
+ ARCH=$(uname -m) && \
+ if [ "$ARCH" = "x86_64" ]; then \
+ TARGETS="X86;NVPTX"; \
+ else \
+ TARGETS="AArch64;NVPTX"; \
+ fi && \
+ cmake \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DLLVM_TARGETS_TO_BUILD="$TARGETS" \
+ -DLLVM_BUILD_LLVM_DYLIB=ON \
+ -DLLVM_LINK_LLVM_DYLIB=ON \
+ -DLLVM_ENABLE_ASSERTIONS=OFF \
+ -DLLVM_ENABLE_BINDINGS=OFF \
+ -DLLVM_INCLUDE_EXAMPLES=OFF \
+ -DLLVM_INCLUDE_TESTS=OFF \
+ -DLLVM_INCLUDE_BENCHMARKS=OFF \
+ -DLLVM_ENABLE_ZLIB=ON \
+ -DLLVM_ENABLE_TERMINFO=ON \
+ -DCMAKE_INSTALL_PREFIX=/opt/llvm-7 \
+ .. && \
+ make -j$(nproc) && \
+ make install && \
+ cd ../.. && \
+ rm -rf llvm-7.1.0.src* && \
+ dnf clean all
+
+FROM nvcr.io/nvidia/cuda:13.0.2-cudnn-devel-rockylinux9
+
+RUN dnf -y install \
+ --nobest \
+ --allowerasing \
+ --setopt=install_weak_deps=False \
+ clang \
+ openssl-devel \
+ fontconfig-devel \
+ libX11-devel \
+ libXcursor-devel \
+ libXi-devel \
+ libXrandr-devel \
+ libxml2-devel \
+ ncurses-devel \
+ pkgconfig \
+ which \
+ xz \
+ zlib-devel \
+ cmake && \
+ dnf clean all
+
+COPY --from=llvm-builder /opt/llvm-7 /opt/llvm-7
+RUN ln -s /opt/llvm-7/bin/llvm-config /usr/bin/llvm-config && \
+ ln -s /opt/llvm-7/bin/llvm-config /usr/bin/llvm-config-7
+
+# Get Rust (install rustup; toolchain installed from rust-toolchain.toml below)
+RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y --profile minimal --default-toolchain none
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Setup the workspace
+WORKDIR /data/rust-cuda
+RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
+ rustup show
+
+# Add nvvm to LD_LIBRARY_PATH.
+ENV LD_LIBRARY_PATH="/usr/local/cuda/nvvm/lib64:${LD_LIBRARY_PATH}"
+ENV LLVM_LINK_STATIC=1
+ENV RUST_LOG=info
diff --git a/container/ubuntu24-cuda13/Dockerfile b/container/ubuntu24-cuda13/Dockerfile
new file mode 100644
index 00000000..be2a2f73
--- /dev/null
+++ b/container/ubuntu24-cuda13/Dockerfile
@@ -0,0 +1,89 @@
+FROM nvcr.io/nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04 AS llvm-builder
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \
+ build-essential \
+ clang \
+ curl \
+ libffi-dev \
+ libedit-dev \
+ libncurses5-dev \
+ libssl-dev \
+ libtinfo-dev \
+ libxml2-dev \
+ cmake \
+ ninja-build \
+ pkg-config \
+ python3 \
+ xz-utils \
+ zlib1g-dev && \
+ rm -rf /var/lib/apt/lists/*
+
+WORKDIR /data/llvm7
+
+# Download and build LLVM 7.1.0 for all architectures.
+RUN curl -sSf -L -O https://github.com/llvm/llvm-project/releases/download/llvmorg-7.1.0/llvm-7.1.0.src.tar.xz && \
+ tar -xf llvm-7.1.0.src.tar.xz && \
+ cd llvm-7.1.0.src && \
+ mkdir build && cd build && \
+ ARCH=$(dpkg --print-architecture) && \
+ if [ "$ARCH" = "amd64" ]; then \
+ TARGETS="X86;NVPTX"; \
+ else \
+ TARGETS="AArch64;NVPTX"; \
+ fi && \
+ cmake -G Ninja \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DLLVM_TARGETS_TO_BUILD="$TARGETS" \
+ -DLLVM_BUILD_LLVM_DYLIB=ON \
+ -DLLVM_LINK_LLVM_DYLIB=ON \
+ -DLLVM_ENABLE_ASSERTIONS=OFF \
+ -DLLVM_ENABLE_BINDINGS=OFF \
+ -DLLVM_INCLUDE_EXAMPLES=OFF \
+ -DLLVM_INCLUDE_TESTS=OFF \
+ -DLLVM_INCLUDE_BENCHMARKS=OFF \
+ -DLLVM_ENABLE_ZLIB=ON \
+ -DLLVM_ENABLE_TERMINFO=ON \
+ -DCMAKE_INSTALL_PREFIX=/opt/llvm-7 \
+ .. && \
+ ninja -j$(nproc) && \
+ ninja install && \
+ cd ../.. && \
+ rm -rf llvm-7.1.0.src*
+
+FROM nvcr.io/nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \
+ build-essential \
+ clang \
+ curl \
+ libssl-dev \
+ libtinfo-dev \
+ pkg-config \
+ xz-utils \
+ zlib1g-dev \
+ cmake \
+ libfontconfig-dev \
+ libx11-xcb-dev \
+ libxcursor-dev \
+ libxi-dev \
+ libxinerama-dev \
+ libxrandr-dev && \
+ rm -rf /var/lib/apt/lists/*
+
+COPY --from=llvm-builder /opt/llvm-7 /opt/llvm-7
+RUN ln -s /opt/llvm-7/bin/llvm-config /usr/bin/llvm-config && \
+ ln -s /opt/llvm-7/bin/llvm-config /usr/bin/llvm-config-7
+
+# Get Rust (install rustup; toolchain installed from rust-toolchain.toml below)
+RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y --profile minimal --default-toolchain none
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Setup the workspace
+WORKDIR /data/rust-cuda
+RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
+ rustup show
+
+# Add nvvm to LD_LIBRARY_PATH.
+ENV LD_LIBRARY_PATH="/usr/local/cuda/nvvm/lib64:${LD_LIBRARY_PATH}"
+ENV LLVM_LINK_STATIC=1
+ENV RUST_LOG=info
diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs
index cb19c6e7..c6706ea0 100644
--- a/crates/nvvm/src/lib.rs
+++ b/crates/nvvm/src/lib.rs
@@ -312,8 +312,10 @@ pub enum NvvmArch {
/// This default value of 7.5 corresponds to Turing and later devices. We default to this
/// because it is the minimum supported by CUDA 13.0 while being in the middle of the range
/// supported by CUDA 12.x.
- // WARNING: If you change the default, consider updating the `--target-arch` values used for
- // compiletests in `ci_linux.yml` and `.github/workflows/ci_{linux,windows}.yml`.
+ // WARNING: If you change the default, consider updating:
+ // - The `--target-arch` values used for compiletests in `ci_linux.yml` and
+ // `.github/workflows/ci_{linux,windows}.yml`.
+ // - The CUDA versions used in `setup_cuda_environment` in `compiletests`.
#[default]
Compute75,
Compute80,
diff --git a/guide/src/README.md b/guide/src/README.md
deleted file mode 100644
index c2bce261..00000000
--- a/guide/src/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Introduction
-
-Welcome to the Rust CUDA guide! Let's dive right in.
diff --git a/guide/src/SUMMARY.md b/guide/src/SUMMARY.md
index 21cd8d26..4c5fe3d8 100644
--- a/guide/src/SUMMARY.md
+++ b/guide/src/SUMMARY.md
@@ -1,8 +1,6 @@
# Summary
-- [Introduction](README.md)
-- [Supported Features](features.md)
-- [Frequently Asked Questions](faq.md)
+- [Introduction](introduction.md)
- [Guide](guide/README.md)
- [Getting Started](guide/getting_started.md)
- [Compute Capability Gating](guide/compute_capabilities.md)
@@ -18,3 +16,9 @@
- [Types](nvvm/types.md)
- [PTX Generation](nvvm/ptxgen.md)
- [Debugging](nvvm/debugging.md)
+
+----
+
+[Supported Features](features.md)
+[Frequently Asked Questions](faq.md)
+
diff --git a/guide/assets/nsight.png b/guide/src/assets/nsight.png
similarity index 100%
rename from guide/assets/nsight.png
rename to guide/src/assets/nsight.png
diff --git a/guide/assets/streams.svg b/guide/src/assets/streams.svg
similarity index 100%
rename from guide/assets/streams.svg
rename to guide/src/assets/streams.svg
diff --git a/guide/src/guide/getting_started.md b/guide/src/guide/getting_started.md
index 07b872d5..762e17cb 100644
--- a/guide/src/guide/getting_started.md
+++ b/guide/src/guide/getting_started.md
@@ -276,12 +276,24 @@ There are two ways to build and run this example: natively, and with docker.
### Native
-If you have all the required libraries installed, try building with `cargo build`. If you get an
-error "libnvvm.so.4: cannot open shared object file", you will need to adjust `LD_LIBRARY_PATH`,
-something like this:
+If you have all the required libraries installed, try building with `cargo build`.
+
+If you get an error "libnvvm.so.4: cannot open shared object file", you will need to adjust
+`LD_LIBRARY_PATH`, something like this:
```
export LD_LIBRARY_PATH="/usr/local/cuda/nvvm/lib64:${LD_LIBRARY_PATH}"
```
+
+If you get an error "error: couldn't load codegen backend" on Windows, you will need to adjust
+`PATH`, something like this with CUDA 12:
+```
+$env:PATH += ";C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\nvvm\bin"
+```
+or this with CUDA 13:
+```
+$env:PATH += ";C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\nvvm\bin\x64"
+```
+
You should then be able to `cargo run`, and see the expected output:
```
c = [3.0, 5.0, 7.0, 9.0]
@@ -328,6 +340,9 @@ is recognized.
`make`ing and running the [`deviceQuery`] sample. If all is well it will print various details
about your GPU.
+A sample `.devcontainer.json` file is also included, configured for Ubuntu 24.04. Copy this to
+`.devcontainer/devcontainer.json` to make additional customizations.
+
[`deviceQuery`]: https://github.com/NVIDIA/cuda-samples/tree/ba04faaf7328dbcc87bfc9acaf17f951ee5ddcf3/Samples/deviceQuery
## More examples
diff --git a/guide/src/guide/kernel_abi.md b/guide/src/guide/kernel_abi.md
index 5330d207..1ab5cd04 100644
--- a/guide/src/guide/kernel_abi.md
+++ b/guide/src/guide/kernel_abi.md
@@ -52,15 +52,15 @@ by reference (by allocating a device box):
```rs
let foo = Foo {
- a: 5,
- b: 6,
- c: 7
+ a: 5,
+ b: 6,
+ c: 7
};
unsafe {
- launch!(
- module.kernel<<<1, 1, 0, stream>>>(foo)
- )?;
+ launch!(
+ module.kernel<<<1, 1, 0, stream>>>(foo)
+ )?;
}
```
@@ -68,15 +68,15 @@ And not
```rs
let foo = DeviceBox::new(Foo {
- a: 5,
- b: 6,
- c: 7
+ a: 5,
+ b: 6,
+ c: 7
});
unsafe {
- launch!(
- module.kernel<<<1, 1, 0, stream>>>(foo.as_device_ptr())
- )?;
+ launch!(
+ module.kernel<<<1, 1, 0, stream>>>(foo.as_device_ptr())
+ )?;
}
```
diff --git a/guide/src/introduction.md b/guide/src/introduction.md
new file mode 100644
index 00000000..8fc1eeae
--- /dev/null
+++ b/guide/src/introduction.md
@@ -0,0 +1,81 @@
+# Introduction
+
+Welcome to the Rust CUDA Guide!
+
+## Goal
+
+The Rust CUDA Project aims to make Rust a tier-1 language for GPU computing using
+the CUDA Toolkit. It provides tools for compiling Rust to fast PTX code as well as libraries for
+using existing CUDA libraries with it.
+
+## Background
+
+Historically, general-purpose high-performance GPU computing has been done using the CUDA toolkit.
+The CUDA toolkit primarily provides a way to use Fortran/C/C++ code for GPU computing in tandem
+with CPU code with a single source. It also provides many libraries, tools, forums, and
+documentation to supplement the single-source CPU/GPU code.
+
+CUDA is exclusively an NVIDIA-only toolkit. Many tools have been proposed for cross-platform GPU
+computing such as OpenCL, Vulkan Computing, and HIP. However, CUDA remains the most used toolkit
+for such tasks by far. This is why it is imperative to make Rust a viable option for use with the
+CUDA toolkit.
+
+However, CUDA with Rust has been a historically very rocky road. The only viable option until now
+has been to use the LLVM PTX backend. However, the LLVM PTX backend does not always work and would
+generate invalid PTX for many common Rust operations. In recent years it has been shown time and
+time again that a specialized solution is needed for Rust on the GPU with the advent of projects
+such as rust-gpu (for translating Rust to SPIR-V).
+
+Our hope is that with this project we can push Rust on GPUs forward and make Rust an excellent
+language for such tasks. Rust offers plenty of benefits such as `__restrict__` performance benefits
+for every kernel, an excellent module/crate system, delimiting of unsafe areas of CPU/GPU code with
+`unsafe`, high-level wrappers to low-level CUDA libraries, etc.
+
+## Structure
+
+The scope of the Rust CUDA Project is broad, spanning the entirety of the CUDA ecosystem, with
+libraries and tools to make it usable using Rust. Therefore, the project contains many crates for
+all corners of the CUDA ecosystem.
+
+- `rustc_codegen_nvvm` is a rustc backend that targets NVVM IR (a subset of LLVM IR) for the
+ [libnvvm](https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html) library.
+ - Generates highly optimized PTX code which can be loaded by the CUDA Driver API to execute on
+ the GPU.
+ - For now it is CUDA-only, but it may be used to target AMD GPUs in the future.
+- `cuda_std` contains GPU-side functions and utilities, such as thread index queries, memory
+ allocation, warp intrinsics, etc.
+  - It is _not_ a low-level library. It provides many utility functions to make it easier to write
+ cleaner and more reliable GPU kernels.
+  - It is closely tied to `rustc_codegen_nvvm` which exposes GPU features through it internally.
+- `cust` contains CPU-side CUDA features such as launching GPU kernels, GPU memory allocation,
+ device queries, etc.
+ - It is a high-level wrapper for the CUDA Driver API, the lower level alternative to the more
+ common CUDA Runtime API used from C++. It provides more fine-grained control over things like
+ kernel concurrency and module loading than the Runtime API.
+ - High-level Rust features such as RAII and `Result` make it easier and cleaner to manage
+ the interface to the GPU.
+- `cudnn` is a collection of GPU-accelerated primitives for deep neural networks.
+- `gpu_rand` does GPU-friendly random number generation. It currently only implements xoroshiro
+ RNGs from `rand_xoshiro`.
+- `optix` provides CPU-side hardware raytracing and denoising using the CUDA OptiX library.
+ (This library is currently commented out because the OptiX SDK is difficult to install.)
+
+There are also several "glue" crates for things such as high level wrappers for certain smaller
+CUDA libraries.
+
+## Related Projects
+
+Other projects related to using Rust on the GPU:
+
+- 2016: [glassful](https://github.com/kmcallister/glassful) translates a subset of Rust to GLSL.
+- 2017: [inspirv-rust](https://github.com/msiglreith/inspirv-rust) is an experimental
+ Rust-MIR-to-SPIR-V compiler.
+- 2018: [nvptx](https://github.com/japaric-archived/nvptx) is a Rust-to-PTX compiler using the
+ `nvptx` target for rustc (using the LLVM PTX backend).
+- 2020: [accel](https://github.com/termoshtt/accel) is a higher-level library that relied on the
+ same mechanism that `nvptx` does.
+- 2020: [rlsl](https://github.com/MaikKlein/rlsl) is an experimental Rust-to-SPIR-V compiler
+ (and a predecessor to rust-gpu).
+- 2020: [rust-gpu](https://github.com/Rust-GPU/rust-gpu) is a `rustc` compiler backend to compile
+ Rust to SPIR-V for use in shaders. Like Rust CUDA, it is part of the broader [Rust
+ GPU](https://rust-gpu.github.io/) project.
diff --git a/guide/src/nvvm/debugging.md b/guide/src/nvvm/debugging.md
index 6c0491a9..23b3578d 100644
--- a/guide/src/nvvm/debugging.md
+++ b/guide/src/nvvm/debugging.md
@@ -33,8 +33,10 @@ which I will add to the project soon.
## Miscompilations
Miscompilations are rare but annoying. They usually result in one of two things happening:
-- CUDA rejecting the PTX as a whole (throwing an InvalidPtx error). This is rare but the most common cause is declaring invalid
-extern functions (just grep for `extern` in the PTX file and check if it's odd functions that aren't CUDA syscalls like vprintf, malloc, free, etc).
+- CUDA rejecting the PTX as a whole (throwing an `InvalidPtx` error). Run `ptxas` on the `.ptx`
+ file to get a more informative error message. This is rare but the most common cause is declaring
+ invalid extern functions (just grep for `extern` in the PTX file and check if it's odd functions
+ that aren't CUDA syscalls like vprintf, malloc, free, etc).
- The PTX containing invalid behavior. This is very specific and rare but if you find this, the best way to debug it is:
- Try to get a minimal working example so we don't have to search through megabytes of LLVM IR/PTX.
- Use `RUSTFLAGS="--emit=llvm-ir"` and find `crate_name.ll` in `target/nvptx64-nvidia-cuda//deps/` and attach it in any bug report.
@@ -51,4 +53,4 @@ If you set up the codegen backend for debug, it should give you a mapping from R
Here is an example of the screen you should see:
-
+
diff --git a/tests/compiletests/src/main.rs b/tests/compiletests/src/main.rs
index 0e345813..b6888e95 100644
--- a/tests/compiletests/src/main.rs
+++ b/tests/compiletests/src/main.rs
@@ -576,30 +576,41 @@ fn setup_cuda_environment() {
// Set library path to include CUDA NVVM libraries
let lib_path_var = dylib_path_envvar();
- // Try to find CUDA installation
- let cuda_paths = vec![
- "/usr/local/cuda/nvvm/lib64",
- "/usr/local/cuda-12/nvvm/lib64",
- "/usr/local/cuda-11/nvvm/lib64",
- "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.8\\nvvm\\lib\\x64",
- "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.0\\nvvm\\lib\\x64",
- "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.8\\nvvm\\lib\\x64",
- ];
-
let mut found_cuda_paths = Vec::new();
// Check CUDA_PATH environment variable
if let Ok(cuda_path) = env::var("CUDA_PATH") {
- let nvvm_path = Path::new(&cuda_path).join("nvvm").join("lib64");
- if nvvm_path.exists() {
- found_cuda_paths.push(nvvm_path.to_string_lossy().to_string());
+ #[cfg(unix)]
+ {
+ let nvvm_path = Path::new(&cuda_path).join("nvvm").join("lib64");
+ if nvvm_path.exists() {
+ found_cuda_paths.push(nvvm_path.to_string_lossy().to_string());
+ }
}
- let nvvm_path_win = Path::new(&cuda_path).join("nvvm").join("lib").join("x64");
- if nvvm_path_win.exists() {
- found_cuda_paths.push(nvvm_path_win.to_string_lossy().to_string());
+ #[cfg(windows)]
+ {
+ let nvvm_path = Path::new(&cuda_path).join("nvvm").join("lib").join("x64");
+ if nvvm_path.exists() {
+ found_cuda_paths.push(nvvm_path.to_string_lossy().to_string());
+ }
}
}
+ // Try to find CUDA installation
+ #[cfg(unix)]
+ let cuda_paths = vec![
+ "/usr/local/cuda/nvvm/lib64",
+ "/usr/local/cuda-13/nvvm/lib64",
+ "/usr/local/cuda-12/nvvm/lib64",
+ ];
+ #[cfg(windows)]
+ let cuda_paths = vec![
+ "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\nvvm\\lib\\x64",
+ "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.9\\nvvm\\lib\\x64",
+ "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.8\\nvvm\\lib\\x64",
+ "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.0\\nvvm\\lib\\x64",
+ ];
+
// Check standard paths
for path in &cuda_paths {
if Path::new(path).exists() {