pytorch · gchalump · Apr 20, 2026
diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
@@ -315,7 +315,7 @@ __configure_fbgemm_gpu_build_cuda () {
         local arch_list="9.0a"
       fi
 
-    elif  [[ $cuda_version_nvcc == *"V13.0"* ]] ||
+    elif  [[ $cuda_version_nvcc == *"V13"* ]] ||
           [[ $cuda_version_nvcc == *"V12.9"* ]] ||
           [[ $cuda_version_nvcc == *"V12.8"* ]]; then
       # NOTE: If we reach this point, then we are building the package for
@@ -524,6 +524,34 @@ __build_fbgemm_gpu_set_run_multicore () {
     export run_multicore=""
     if [[ $core =~ $re && $sockets =~ $re ]]; then
       local n_core=$((core * sockets))
+
+      # Cap parallelism based on available memory to avoid OOM (exit code 137)
+      # on memory-constrained CI runners.  Each NVCC compilation job can use
+      # 2-4 GB when targeting multiple GPU architectures (e.g. 8.0;9.0a;10.0a).
+      #
+      # mem_kb stays at 0 ("unknown") when MemAvailable cannot be read from
+      # /proc/meminfo, in which case n_core is left untouched.
+      local mem_kb=0
+      if [ -f /proc/meminfo ]; then
+        mem_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo 2>/dev/null || echo "0")
+      fi
+
+      # NOTE: Test the raw kB value instead of the truncated GB value; otherwise
+      # a host with less than 1 GB available would round down to 0 GB, be
+      # treated as "unknown", and skip the cap exactly when it matters most.
+      if [[ ${mem_kb:-0} -gt 0 ]]; then
+        local mem_gb=$((mem_kb / 1024 / 1024))
+        # Allow ~4 GB per parallel compilation job, but never fewer than 1 job
+        local mem_jobs=$((mem_gb / 4))
+        if [[ $mem_jobs -lt 1 ]]; then
+          mem_jobs=1
+        fi
+        if [[ $mem_jobs -lt $n_core ]]; then
+          echo "[BUILD] Capping parallelism from ${n_core} to ${mem_jobs} (available memory: ~${mem_gb} GB)"
+          n_core=$mem_jobs
+        fi
+      fi
+
       export run_multicore="-j ${n_core}"
     fi
   fi

diff --git a/.github/scripts/fbgemm_gpu_integration.bash b/.github/scripts/fbgemm_gpu_integration.bash
@@ -284,12 +284,14 @@ integration_fbgemm_gpu_install_matrix_run () {
       12.8.1
       12.9.1
       13.0.2
+      13.2.0
     )
   elif [ "$variant_type" == "genai" ]; then
     local variant_versions=(
       12.6.3
       12.8.1
       13.0.2
+      13.2.0
     )
   elif [ "$variant_type" == "rocm" ]; then
     local variant_versions=(

diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py
@@ -304,10 +304,10 @@ def cuda_versions(self) -> List[str]:
             # FBGEMM HSTU is expensive, so conserve CI resources
             return ["12.8.1"]
         elif self.target == TARGET_GENAI:
-            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0"]
 
     def rocm_versions(self) -> List[str]:
         if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:

diff --git a/.github/scripts/nova_dir.bash b/.github/scripts/nova_dir.bash
@@ -22,7 +22,8 @@ fi
 ## Overwrite existing ENV VAR in Nova
 if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-output -p ${CONDA_ENV}" && echo "$CONDA_RUN"; fi
 
-if [[ "$CU_VERSION" == "cu130" ]] ||
+if [[ "$CU_VERSION" == "cu132" ]] ||
+     [[ "$CU_VERSION" == "cu130" ]] ||
      [[ "$CU_VERSION" == "cu129" ]] ||
      [[ "$CU_VERSION" == "cu128" ]]; then
     export TORCH_CUDA_ARCH_LIST="8.0;9.0a;10.0a;12.0a"

diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash
@@ -35,9 +35,21 @@ __set_cuda_symlinks_envvars () {
 
     echo "[INSTALL] Copying nvtx3 headers ..."
     # shellcheck disable=SC2086
-    print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${conda_prefix}/include/
-    # shellcheck disable=SC2086
-    print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${new_cuda_home}/include/
+    if compgen -G "${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/*" > /dev/null 2>&1; then
+      # Copy nvtx3 headers from nsight-compute if available
+      # shellcheck disable=SC2086
+      print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${conda_prefix}/include/
+      # shellcheck disable=SC2086
+      print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${new_cuda_home}/include/
+    elif [ -d "${conda_prefix}/include/nvtx3" ]; then
+      # nvtx3 headers already available from cuda-nvtx package
+      echo "[INSTALL] nvtx3 headers already present in ${conda_prefix}/include/nvtx3 (from cuda-nvtx)"
+      if [ ! -d "${new_cuda_home}/include/nvtx3" ]; then
+        print_exec cp -r "${conda_prefix}/include/nvtx3" "${new_cuda_home}/include/"
+      fi
+    else
+      echo "[INSTALL] WARNING: nvtx3 headers not found in nsight-compute or cuda-nvtx"
+    fi
   fi
 
   echo "[INSTALL] Appending libcuda.so path to LD_LIBRARY_PATH ..."
@@ -220,8 +232,24 @@ install_cuda () {
       cuda-nvrtc-dev \
       cuda-cupti-dev \
       cuda-profiler-api \
-      cuda-opencl-dev \
-      nsight-compute) || return 1
+      cuda-opencl-dev) || return 1
+
+    # NOTE: nsight-compute is installed separately as best-effort because for
+    # newer CUDA versions (e.g. 13.2+), it may have unresolvable dependency
+    # conflicts on conda-forge (libxkbcommon -> libxml2-16 vs clangxx ->
+    # libllvm16 -> libxml2 <2.14).  The nvtx3 headers it provides are handled
+    # in __set_cuda_symlinks_envvars with a fallback to cuda-nvtx.
+    #
+    # Skip the install entirely for known-broken versions (13.2 through 13.9) to
+    # avoid the conda solver OOM (exit code 137) on memory-constrained CI runners.
+    if [[ "$cuda_version" =~ ^13\.[2-9].*$ ]]; then
+      echo "[INSTALL] Skipping nsight-compute for CUDA ${cuda_version} (known dependency conflict on conda-forge)"
+    else
+      # shellcheck disable=SC2086
+      (exec_with_retries 3 conda install ${env_prefix} -c conda-forge --override-channels -y \
+        "cuda-version=${cuda_version%.*}" \
+        nsight-compute) || echo "[INSTALL] WARNING: nsight-compute could not be installed, skipping (nvtx3 headers will be sourced from cuda-nvtx)"
+    fi
   fi
 
   # Set the symlinks and environment variables not covered by conda install

diff --git a/.github/workflows/fbgemm_gpu_release_cuda.yml b/.github/workflows/fbgemm_gpu_release_cuda.yml
@@ -34,7 +34,7 @@ on:
         description: CUDA Version to Use for Building Artifact
         type: choice
         required: false
-        options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2" ]
+        options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0" ]
         default: "13.0.2"
       publish-to-pypi:
         description: Publish Artifact to PyPI

diff --git a/.github/workflows/fbgemm_gpu_release_genai.yml b/.github/workflows/fbgemm_gpu_release_genai.yml
@@ -34,7 +34,7 @@ on:
         description: CUDA Version to Use for Building Artifact
         type: choice
         required: false
-        options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2" ]
+        options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0" ]
         default: "13.0.2"
       publish-to-pypi:
         description: Publish Artifact to PyPI
@@ -72,7 +72,7 @@ jobs:
           { arch: x86, instance: "linux.12xlarge.memory" },
         ]
         python-version: [ "3.10", "3.11", "3.12", "3.13", "3.14" ]
-        cuda-version: [ "12.6.3", "12.8.1", "13.0.2" ]
+        cuda-version: [ "12.6.3", "12.8.1", "13.0.2", "13.2.0" ]
 
     steps:
     - name: Setup Build Container
@@ -146,7 +146,7 @@ jobs:
           { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
         python-version: [ "3.10", "3.11", "3.12", "3.13", "3.14" ]
-        cuda-version: [ "12.6.3", "12.8.1", "13.0.2" ]
+        cuda-version: [ "12.6.3", "12.8.1", "13.0.2", "13.2.0" ]
     needs: build_artifact
 
     steps: