Commit 3b0730e

Merge remote-tracking branch 'origin/main' into ci/suppress-pip-warnings

2 parents: e7b7766 + c93623b


46 files changed (+13087, −569 lines)

.github/workflows/test-wheel-linux.yml

Lines changed: 2 additions & 2 deletions
@@ -70,12 +70,12 @@ jobs:
       echo "OLD_BRANCH=${OLD_BRANCH}" >> "$GITHUB_OUTPUT"

   test:
-    name: py${{ matrix.PY_VER }}, ${{ matrix.CUDA_VER }}, ${{ (matrix.LOCAL_CTK == '1' && 'local') || 'wheels' }}, ${{ matrix.GPU }}${{ matrix.GPU_COUNT != '1' && format('(x{0})', matrix.GPU_COUNT) || '' }}
+    name: py${{ matrix.PY_VER }}, ${{ matrix.CUDA_VER }}, ${{ (matrix.LOCAL_CTK == '1' && 'local') || 'wheels' }}, ${{ matrix.GPU }}${{ matrix.GPU_COUNT != '1' && format('(x{0})', matrix.GPU_COUNT) || '' }}${{ matrix.FLAVOR && format(', {0}', matrix.FLAVOR) || '' }}
     needs: compute-matrix
     strategy:
       fail-fast: false
       matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
-    runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
+    runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
     # The build stage could fail but we want the CI to keep moving.
     if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
     # Our self-hosted runners require a container
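The `runs-on` change above prepends the new `FLAVOR` matrix key to the runner label, falling back to `linux` when the key is absent. A minimal sketch of that fallback, emulating GitHub Actions' `||` operator with Python's `or` (the matrix dicts below are illustrative entries in the shape of `ci/test-matrix.yml`, not exact CI data):

```python
def runner_label(matrix):
    # GitHub's `${{ matrix.FLAVOR || 'linux' }}` falls back when FLAVOR is
    # empty or unset; Python's `or` behaves the same way for missing keys.
    flavor = matrix.get("FLAVOR") or "linux"
    return (
        f"{flavor}-{matrix['ARCH']}-gpu-{matrix['GPU']}"
        f"-{matrix['DRIVER']}-{matrix['GPU_COUNT']}"
    )

# An entry without FLAVOR keeps the pre-existing label shape...
print(runner_label({"ARCH": "amd64", "GPU": "t4", "DRIVER": "latest", "GPU_COUNT": "1"}))
# → linux-amd64-gpu-t4-latest-1

# ...while a 'wsl' entry is routed to a WSL-flavored runner pool.
print(runner_label({"ARCH": "amd64", "GPU": "t4", "DRIVER": "latest", "GPU_COUNT": "1", "FLAVOR": "wsl"}))
# → wsl-amd64-gpu-t4-latest-1
```

Because unspecified `FLAVOR` resolves to the old `linux-...` label, existing matrix entries are unaffected by the change.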

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -120,6 +120,7 @@ instance/
 # Sphinx documentation
 docs_src/_build/
 */docs/source/generated/
+*/docs/source/module/generated/

 # PyBuilder
 .pybuilder/

ci/test-matrix.yml

Lines changed: 2 additions & 0 deletions
@@ -60,6 +60,8 @@ linux:
   - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.2.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '1', DRIVER: 'latest' }
   - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.2.0', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest' }
   - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' }
+  - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest', FLAVOR: 'wsl' }
+  - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.0', LOCAL_CTK: '0', GPU: 'rtx4090', GPU_COUNT: '1', DRIVER: 'latest', FLAVOR: 'wsl' }
 nightly: []

 windows:

cuda_bindings/docs/build_docs.sh

Lines changed: 4 additions & 0 deletions
@@ -25,6 +25,10 @@ if [[ -z "${SPHINX_CUDA_BINDINGS_VER}" ]]; then
     | awk -F'+' '{print $1}')
 fi

+if [[ "${LATEST_ONLY}" == "1" && -z "${BUILD_PREVIEW:-}" && -z "${BUILD_LATEST:-}" ]]; then
+    export BUILD_LATEST=1
+fi
+
 # build the docs (in parallel)
 SPHINXOPTS="-j 4 -d build/.doctrees" make html
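This shell hunk defaults `BUILD_LATEST=1` for latest-only doc builds, which in turn drives the git ref that `conf.py`'s `_github_examples_ref()` embeds in example links. A sketch of the combined logic in Python (the `"12.9.0"` release string is a made-up example value, not a real cuda-bindings version):

```python
def effective_examples_ref(release, env):
    """Which git ref will example links point at, given the build env?"""
    env = dict(env)  # don't mutate the caller's mapping
    # Shell hunk: LATEST_ONLY=1 implies BUILD_LATEST=1 unless the caller
    # already set BUILD_PREVIEW or BUILD_LATEST explicitly.
    if env.get("LATEST_ONLY") == "1" and not env.get("BUILD_PREVIEW") and not env.get("BUILD_LATEST"):
        env["BUILD_LATEST"] = "1"
    # conf.py's _github_examples_ref(): preview/latest builds track main,
    # versioned builds link to the matching release tag.
    if int(env.get("BUILD_PREVIEW", 0)) or int(env.get("BUILD_LATEST", 0)):
        return "main"
    return f"v{release}"

print(effective_examples_ref("12.9.0", {"LATEST_ONLY": "1"}))  # → main
print(effective_examples_ref("12.9.0", {}))                    # → v12.9.0
```

The upshot: docs built for a tagged release link to examples frozen at that tag, while latest/preview builds always link to `main`.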

cuda_bindings/docs/source/conf.py

Lines changed: 53 additions & 1 deletion
@@ -9,6 +9,7 @@

 # -- Path setup --------------------------------------------------------------

+import inspect
 import os
 import sys
 from pathlib import Path
@@ -26,6 +27,15 @@
 release = os.environ["SPHINX_CUDA_BINDINGS_VER"]


+def _github_examples_ref():
+    if int(os.environ.get("BUILD_PREVIEW", 0)) or int(os.environ.get("BUILD_LATEST", 0)):
+        return "main"
+    return f"v{release}"
+
+
+GITHUB_EXAMPLES_REF = _github_examples_ref()
+
+
 # -- General configuration ---------------------------------------------------

 # Add any Sphinx extension module names here, as strings. They can be
@@ -94,11 +104,15 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
+html_static_path = []  # ["_static"] does not exist in our environment

 # skip cmdline prompts
 copybutton_exclude = ".linenos, .gp"

+rst_epilog = f"""
+.. |cuda_bindings_github_ref| replace:: {GITHUB_EXAMPLES_REF}
+"""
+
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
@@ -107,7 +121,45 @@
     "cufile": ("https://docs.nvidia.com/gpudirect-storage/api-reference-guide/", None),
 }

+
+def _sanitize_generated_docstring(lines):
+    doc_lines = inspect.cleandoc("\n".join(lines)).splitlines()
+    if not doc_lines:
+        return
+
+    if "(" in doc_lines[0] and ")" in doc_lines[0]:
+        doc_lines = doc_lines[1:]
+        while doc_lines and not doc_lines[0].strip():
+            doc_lines.pop(0)
+
+    if not doc_lines:
+        lines[:] = []
+        return
+
+    lines[:] = [".. code-block:: text", ""]
+    lines.extend(f" {line}" if line else " " for line in doc_lines)
+
+
+def autodoc_process_docstring(app, what, name, obj, options, lines):
+    if name.startswith("cuda.bindings."):
+        _sanitize_generated_docstring(lines)
+
+
+def rewrite_source(app, docname, source):
+    text = source[0]
+
+    if docname.startswith("release/"):
+        text = text.replace(".. module:: cuda.bindings\n\n", "", 1)
+
+    source[0] = text
+
+
 suppress_warnings = [
     # for warnings about multiple possible targets, see NVIDIA/cuda-python#152
     "ref.python",
 ]
+
+
+def setup(app):
+    app.connect("autodoc-process-docstring", autodoc_process_docstring)
+    app.connect("source-read", rewrite_source)
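The `_sanitize_generated_docstring` hook above drops a leading signature line and re-emits the rest of an auto-generated docstring as a literal `code-block` so Sphinx does not parse it as reST. A standalone sketch of that behavior, reimplemented from the hunk with a made-up docstring (`myFunc` is hypothetical):

```python
import inspect

def sanitize(lines):
    # Reimplementation of _sanitize_generated_docstring from the hunk above.
    doc_lines = inspect.cleandoc("\n".join(lines)).splitlines()
    if not doc_lines:
        return
    # Drop a leading C-style signature line such as "myFunc(int x)".
    if "(" in doc_lines[0] and ")" in doc_lines[0]:
        doc_lines = doc_lines[1:]
        while doc_lines and not doc_lines[0].strip():
            doc_lines.pop(0)
    if not doc_lines:
        lines[:] = []  # nothing left: blank the docstring entirely
        return
    # Wrap the remaining text in a literal block, indenting each line
    # so it becomes directive content.
    lines[:] = [".. code-block:: text", ""]
    lines.extend(f" {line}" if line else " " for line in doc_lines)

doc = ["myFunc(int x)", "", "Does a thing.", "Returns an error code."]
sanitize(doc)
print(doc)
# → ['.. code-block:: text', '', ' Does a thing.', ' Returns an error code.']
```

Mutating `lines` in place (via `lines[:] = ...`) matters here: Sphinx's `autodoc-process-docstring` event passes the docstring as a list and only in-place edits are seen by the builder.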

cuda_bindings/docs/source/contribute.rst

Lines changed: 14 additions & 9 deletions
@@ -4,12 +4,17 @@
 Contributing
 ============

-Thank you for your interest in contributing to ``cuda-bindings``! Based on the type of contribution, it will fall into two categories:
-
-1. You want to report a bug, feature request, or documentation issue
-   - File an `issue <https://github.com/NVIDIA/cuda-python/issues/new/choose>`_ describing what you encountered or what you want to see changed.
-   - The NVIDIA team will evaluate the issues and triage them, scheduling
-     them for a release. If you believe the issue needs priority attention
-     comment on the issue to notify the team.
-2. You want to implement a feature, improvement, or bug fix:
-   - At this time we do not accept code contributions.
+Thank you for your interest in contributing to ``cuda-bindings``! Based on the
+type of contribution, it will fall into two categories:
+
+1. You want to report a bug, feature request, or documentation issue.
+
+   File an `issue <https://github.com/NVIDIA/cuda-python/issues/new/choose>`_
+   describing what you encountered or what you want to see changed. The NVIDIA
+   team will evaluate the issue, triage it, and schedule it for a release. If
+   you believe the issue needs priority attention, comment on the issue to
+   notify the team.
+
+2. You want to implement a feature, improvement, or bug fix.
+
+   At this time we do not accept code contributions.
cuda_bindings/docs/source/examples.rst (new file)

Lines changed: 68 additions & 0 deletions

.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

Examples
========

This page links to the ``cuda.bindings`` examples shipped in the
`cuda-python repository <https://github.com/NVIDIA/cuda-python/tree/|cuda_bindings_github_ref|/cuda_bindings/examples>`_.
Use it as a quick index when you want a runnable sample for a specific API area
or CUDA feature.

Introduction
------------

- `clock_nvrtc.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/clock_nvrtc.py>`_
  uses NVRTC-compiled CUDA code and the device clock to time a reduction
  kernel.
- `simple_cubemap_texture.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/simple_cubemap_texture.py>`_
  demonstrates cubemap texture sampling and transformation.
- `simple_p2p.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/simple_p2p.py>`_
  shows peer-to-peer memory access and transfers between multiple GPUs.
- `simple_zero_copy.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/simple_zero_copy.py>`_
  uses zero-copy mapped host memory for vector addition.
- `system_wide_atomics.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/system_wide_atomics.py>`_
  demonstrates system-wide atomic operations on managed memory.
- `vector_add_drv.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/vector_add_drv.py>`_
  uses the CUDA Driver API and unified virtual addressing for vector addition.
- `vector_add_mmap.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/vector_add_mmap.py>`_
  uses virtual memory management APIs such as ``cuMemCreate`` and
  ``cuMemMap`` for vector addition.

Concepts and techniques
-----------------------

- `stream_ordered_allocation.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/2_Concepts_and_Techniques/stream_ordered_allocation.py>`_
  demonstrates ``cudaMallocAsync`` and ``cudaFreeAsync`` together with
  memory-pool release thresholds.

CUDA features
-------------

- `global_to_shmem_async_copy.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/3_CUDA_Features/global_to_shmem_async_copy.py>`_
  compares asynchronous global-to-shared-memory copy strategies in matrix
  multiplication kernels.
- `simple_cuda_graphs.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/3_CUDA_Features/simple_cuda_graphs.py>`_
  shows both manual CUDA graph construction and stream-capture-based replay.

Libraries and tools
-------------------

- `conjugate_gradient_multi_block_cg.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/4_CUDA_Libraries/conjugate_gradient_multi_block_cg.py>`_
  implements a conjugate-gradient solver with cooperative groups and
  multi-block synchronization.
- `nvidia_smi.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/4_CUDA_Libraries/nvidia_smi.py>`_
  uses NVML to implement a Python subset of ``nvidia-smi``.

Advanced and interoperability
-----------------------------

- `iso_fd_modelling.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/extra/iso_fd_modelling.py>`_
  runs isotropic finite-difference wave propagation across multiple GPUs with
  peer-to-peer halo exchange.
- `jit_program.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/extra/jit_program.py>`_
  JIT-compiles a SAXPY kernel with NVRTC and launches it through the Driver
  API.
- `numba_emm_plugin.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/extra/numba_emm_plugin.py>`_
  shows how to back Numba's EMM interface with the NVIDIA CUDA Python Driver
  API.

cuda_bindings/docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@
    release
    install
    overview
+   examples
    motivation
    environment_variables
    api

cuda_bindings/docs/source/install.rst

Lines changed: 2 additions & 2 deletions
@@ -78,7 +78,7 @@ Installing from Source
 ----------------------

 Requirements
-^^^^^^^^^^^^
+~~~~~~~~~~~~

 * CUDA Toolkit headers[^1]
 * CUDA Runtime static library[^2]
@@ -100,7 +100,7 @@ See `Environment Variables <environment_variables.rst>`_ for a description of ot
 Only ``cydriver``, ``cyruntime`` and ``cynvrtc`` are impacted by the header requirement.

 Editable Install
-^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~

 You can use:

cuda_bindings/docs/source/overview.rst

Lines changed: 9 additions & 5 deletions
@@ -25,13 +25,14 @@ code into
 `PTX <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html>`_ and
 then extract the function to be called at a later point in the application. You
 construct your device code in the form of a string and compile it with
-`NVRTC <http://docs.nvidia.com/cuda/nvrtc/index.html>`_, a runtime compilation
+`NVRTC <https://docs.nvidia.com/cuda/nvrtc/index.html>`_, a runtime compilation
 library for CUDA C++. Using the NVIDIA `Driver
-API <http://docs.nvidia.com/cuda/cuda-driver-api/index.html>`_, manually create a
+API <https://docs.nvidia.com/cuda/cuda-driver-api/index.html>`_, manually create a
 CUDA context and all required resources on the GPU, then launch the compiled
 CUDA C++ code and retrieve the results from the GPU. Now that you have an
 overview, jump into a commonly used example for parallel programming:
-`SAXPY <https://developer.nvidia.com/blog/six-ways-saxpy/>`_.
+`SAXPY <https://developer.nvidia.com/blog/six-ways-saxpy/>`_. For more
+end-to-end samples, see the :doc:`examples` page.

 The first thing to do is import the `Driver
 API <https://docs.nvidia.com/cuda/cuda-driver-api/index.html>`_ and
@@ -427,7 +428,7 @@ Putting it all together:
 )

 The final step is to construct a ``kernelParams`` argument that fulfills all of the launch API conditions. This is made easy because each array object comes
-with a `ctypes <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html#numpy.ndarray.ctypes>`_ data attribute that returns the underlying ``void*`` pointer value.
+with NumPy's `ctypes data attribute <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html#numpy.ndarray.ctypes>`_ that returns the underlying ``void*`` pointer value.

 By having the final array object contain all pointers, we fulfill the contiguous array requirement:
@@ -520,7 +521,10 @@ CUDA objects

 Certain CUDA kernels use native CUDA types as their parameters such as ``cudaTextureObject_t``. These types require special handling since they're neither a primitive ctype nor a custom user type. Since ``cuda.bindings`` exposes each of them as Python classes, they each implement ``getPtr()`` and ``__int__()``. These two callables used to support the NumPy and ctypes approach. The difference between each call is further described under `Tips and Tricks <https://nvidia.github.io/cuda-python/cuda-bindings/latest/tips_and_tricks.html#>`_.

-For this example, lets use the ``transformKernel`` from `examples/0_Introduction/simpleCubemapTexture_test.py <https://github.com/NVIDIA/cuda-python/blob/main/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py>`_:
+For this example, lets use the ``transformKernel`` from
+`simple_cubemap_texture.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/simple_cubemap_texture.py>`_.
+The :doc:`examples` page links to more samples covering textures, graphs,
+memory mapping, and multi-GPU workflows.

 .. code-block:: python
