Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ __configure_fbgemm_gpu_build_cuda () {
local arch_list="9.0a"
fi

elif [[ $cuda_version_nvcc == *"V13.0"* ]] ||
elif [[ $cuda_version_nvcc == *"V13"* ]] ||
[[ $cuda_version_nvcc == *"V12.9"* ]] ||
[[ $cuda_version_nvcc == *"V12.8"* ]]; then
# NOTE: If we reach this point, then we are building the package for
Expand Down Expand Up @@ -524,6 +524,29 @@ __build_fbgemm_gpu_set_run_multicore () {
export run_multicore=""
if [[ $core =~ $re && $sockets =~ $re ]]; then
local n_core=$((core * sockets))

# Cap parallelism based on available memory to avoid OOM (exit code 137)
# on memory-constrained CI runners. Each NVCC compilation job can use
# 2-4 GB when targeting multiple GPU architectures (e.g. 8.0;9.0a;10.0a).
local mem_gb=0
if [ -f /proc/meminfo ]; then
# shellcheck disable=SC2155
local mem_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo 2>/dev/null || echo "0")
mem_gb=$((mem_kb / 1024 / 1024))
fi

if [[ $mem_gb -gt 0 ]]; then
# Allow ~4 GB per parallel compilation job
local mem_jobs=$((mem_gb / 4))
if [[ $mem_jobs -lt 1 ]]; then
mem_jobs=1
fi
if [[ $mem_jobs -lt $n_core ]]; then
echo "[BUILD] Capping parallelism from ${n_core} to ${mem_jobs} (available memory: ~${mem_gb} GB)"
n_core=$mem_jobs
fi
fi

export run_multicore="-j ${n_core}"
fi
fi
Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/fbgemm_gpu_integration.bash
Original file line number Diff line number Diff line change
Expand Up @@ -284,12 +284,14 @@ integration_fbgemm_gpu_install_matrix_run () {
12.8.1
12.9.1
13.0.2
13.2.0
)
elif [ "$variant_type" == "genai" ]; then
local variant_versions=(
12.6.3
12.8.1
13.0.2
13.2.0
)
elif [ "$variant_type" == "rocm" ]; then
local variant_versions=(
Expand Down
4 changes: 2 additions & 2 deletions .github/scripts/generate_ci_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,10 +304,10 @@ def cuda_versions(self) -> List[str]:
# FBGEMM HSTU is expensive, so conserve CI resources
return ["12.8.1"]
elif self.target == TARGET_GENAI:
return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]
return ["12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0"]
else:
# GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]
return ["12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0"]

def rocm_versions(self) -> List[str]:
if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:
Expand Down
3 changes: 2 additions & 1 deletion .github/scripts/nova_dir.bash
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ fi
## Overwrite existing ENV VAR in Nova
if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-output -p ${CONDA_ENV}" && echo "$CONDA_RUN"; fi

if [[ "$CU_VERSION" == "cu130" ]] ||
if [[ "$CU_VERSION" == "cu132" ]] ||
[[ "$CU_VERSION" == "cu130" ]] ||
[[ "$CU_VERSION" == "cu129" ]] ||
[[ "$CU_VERSION" == "cu128" ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0a;10.0a;12.0a"
Expand Down
38 changes: 33 additions & 5 deletions .github/scripts/utils_cuda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,21 @@ __set_cuda_symlinks_envvars () {

echo "[INSTALL] Copying nvtx3 headers ..."
# shellcheck disable=SC2086
print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${conda_prefix}/include/
# shellcheck disable=SC2086
print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${new_cuda_home}/include/
if compgen -G "${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/*" > /dev/null 2>&1; then
# Copy nvtx3 headers from nsight-compute if available
# shellcheck disable=SC2086
print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${conda_prefix}/include/
# shellcheck disable=SC2086
print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${new_cuda_home}/include/
elif [ -d "${conda_prefix}/include/nvtx3" ]; then
# nvtx3 headers already available from cuda-nvtx package
echo "[INSTALL] nvtx3 headers already present in ${conda_prefix}/include/nvtx3 (from cuda-nvtx)"
if [ ! -d "${new_cuda_home}/include/nvtx3" ]; then
print_exec cp -r "${conda_prefix}/include/nvtx3" "${new_cuda_home}/include/"
fi
else
echo "[INSTALL] WARNING: nvtx3 headers not found in nsight-compute or cuda-nvtx"
fi
fi

echo "[INSTALL] Appending libcuda.so path to LD_LIBRARY_PATH ..."
Expand Down Expand Up @@ -220,8 +232,24 @@ install_cuda () {
cuda-nvrtc-dev \
cuda-cupti-dev \
cuda-profiler-api \
cuda-opencl-dev \
nsight-compute) || return 1
cuda-opencl-dev) || return 1

# NOTE: nsight-compute is installed separately as best-effort because for
# newer CUDA versions (e.g. 13.2+), it may have unresolvable dependency
# conflicts on conda-forge (libxkbcommon -> libxml2-16 vs clangxx ->
# libllvm16 -> libxml2 <2.14). The nvtx3 headers it provides are handled
# in __set_cuda_symlinks_envvars with a fallback to cuda-nvtx.
#
# Skip the install entirely for known-broken versions to avoid the conda
# solver OOM (exit code 137) on memory-constrained CI runners.
if [[ "$cuda_version" =~ ^13\.[2-9].*$ ]]; then
echo "[INSTALL] Skipping nsight-compute for CUDA ${cuda_version} (known dependency conflict on conda-forge)"
else
# shellcheck disable=SC2086
(exec_with_retries 3 conda install ${env_prefix} -c conda-forge --override-channels -y \
"cuda-version=${cuda_version%.*}" \
nsight-compute) || echo "[INSTALL] WARNING: nsight-compute could not be installed, skipping (nvtx3 headers will be sourced from cuda-nvtx)"
fi
fi

# Set the symlinks and environment variables not covered by conda install
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_release_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ on:
description: CUDA Version to Use for Building Artifact
type: choice
required: false
options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2" ]
options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0" ]
default: "13.0.2"
publish-to-pypi:
description: Publish Artifact to PyPI
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_release_genai.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ on:
description: CUDA Version to Use for Building Artifact
type: choice
required: false
options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2" ]
options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0" ]
default: "13.0.2"
publish-to-pypi:
description: Publish Artifact to PyPI
Expand Down Expand Up @@ -72,7 +72,7 @@ jobs:
{ arch: x86, instance: "linux.12xlarge.memory" },
]
python-version: [ "3.10", "3.11", "3.12", "3.13", "3.14" ]
cuda-version: [ "12.6.3", "12.8.1", "13.0.2" ]
cuda-version: [ "12.6.3", "12.8.1", "13.0.2", "13.2.0" ]

steps:
- name: Setup Build Container
Expand Down Expand Up @@ -146,7 +146,7 @@ jobs:
{ arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
]
python-version: [ "3.10", "3.11", "3.12", "3.13", "3.14" ]
cuda-version: [ "12.6.3", "12.8.1", "13.0.2" ]
cuda-version: [ "12.6.3", "12.8.1", "13.0.2", "13.2.0" ]
needs: build_artifact

steps:
Expand Down
Loading