diff --git a/mooncake-transfer-engine/fabric_allocator.cmake b/mooncake-transfer-engine/fabric_allocator.cmake index 52537d1b2a..62ee1095bf 100644 --- a/mooncake-transfer-engine/fabric_allocator.cmake +++ b/mooncake-transfer-engine/fabric_allocator.cmake @@ -27,6 +27,7 @@ function(add_fabric_allocator_build_target) if(FAB_ENABLE_BUILD) add_custom_command( TARGET ${FAB_TARGET_NAME} + POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR} COMMAND bash ${FAB_BUILD_SCRIPT} ${FAB_BUILD_ARGS} ${CMAKE_CURRENT_BINARY_DIR} "${_include_dirs_str}" diff --git a/mooncake-transfer-engine/tent/CMakeLists.txt b/mooncake-transfer-engine/tent/CMakeLists.txt index 82bced275c..b5bcc3795e 100644 --- a/mooncake-transfer-engine/tent/CMakeLists.txt +++ b/mooncake-transfer-engine/tent/CMakeLists.txt @@ -8,4 +8,6 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_subdirectory(src) add_subdirectory(plugins) -add_subdirectory(tests) +if (BUILD_UNIT_TESTS) + add_subdirectory(tests) +endif() diff --git a/mooncake-transfer-engine/tent/src/runtime/transfer_engine_impl.cpp b/mooncake-transfer-engine/tent/src/runtime/transfer_engine_impl.cpp index 869ff38e9f..4ef66f4855 100644 --- a/mooncake-transfer-engine/tent/src/runtime/transfer_engine_impl.cpp +++ b/mooncake-transfer-engine/tent/src/runtime/transfer_engine_impl.cpp @@ -26,7 +26,6 @@ #include "tent/common/config.h" #include "tent/common/status.h" -#include "tent/metastore/redis.h" #include "tent/runtime/control_plane.h" #include "tent/runtime/segment.h" #include "tent/runtime/segment_tracker.h" @@ -43,6 +42,11 @@ namespace mooncake { namespace tent { +namespace { +constexpr uint8_t kRedisMaxDbIndex = 255; +constexpr uint8_t kRedisDefaultDbIndex = 0; +} + struct Batch { Batch() : max_size(0) { sub_batch.fill(nullptr); } @@ -314,14 +318,14 @@ Status TransferEngineImpl::construct() { CHECK_STATUS(topology_->discover({loader})); // Validate redis_db_index range (0-255) - uint8_t db_index = REDIS_DEFAULT_DB_INDEX; + uint8_t db_index = kRedisDefaultDbIndex; if (redis_db_index_config >= 0 && - redis_db_index_config <= REDIS_MAX_DB_INDEX) { + redis_db_index_config <= kRedisMaxDbIndex) { db_index = static_cast(redis_db_index_config); } else { LOG(WARNING) << "Invalid Redis DB index: " << redis_db_index_config << ", using default " - << static_cast(REDIS_DEFAULT_DB_INDEX); + << static_cast(kRedisDefaultDbIndex); } metadata_ = std::make_shared( diff --git a/mooncake-transfer-engine/tent/src/transport/nvlink/nvlink_transport.cpp b/mooncake-transfer-engine/tent/src/transport/nvlink/nvlink_transport.cpp index 5a4d7b0937..e018c91f14 100644 --- a/mooncake-transfer-engine/tent/src/transport/nvlink/nvlink_transport.cpp +++ b/mooncake-transfer-engine/tent/src/transport/nvlink/nvlink_transport.cpp @@ -33,6 +33,28 @@ namespace mooncake { namespace tent { +namespace { + +Status setCudaDeviceForLocation(const LocationParser& location, + int& saved_dev) { + saved_dev = -1; + CHECK_CUDA(cudaGetDevice(&saved_dev)); + if (location.index() >= 0 && saved_dev != location.index()) { + CHECK_CUDA(cudaSetDevice(location.index())); + } + return Status::OK(); +} + +Status restoreCudaDeviceForLocation(const LocationParser& location, + int saved_dev) { + if (saved_dev >= 0 && location.index() >= 0 && + saved_dev != location.index()) { + CHECK_CUDA(cudaSetDevice(saved_dev)); + } + return Status::OK(); +} + +} // namespace NVLinkTransport::NVLinkTransport() : installed_(false) {} @@ -228,10 +250,25 @@ Status NVLinkTransport::addMemoryBuffer(BufferDesc& desc, const MemoryOptions& options) { LocationParser location(desc.location); if (location.type() == "cuda") { - // If the memory region is allocated using cuMemAlloc, - // we cannot use cudaIpcGetMemHandle, so skip it + // MNNVL allocations are exported by MnnvlTransport instead of CUDA IPC. if (options.type == MNNVL) return Status::OK(); + int saved_dev = -1; + CHECK_STATUS(setCudaDeviceForLocation(location, saved_dev)); + + // VMM allocations have driver allocation handles, but + // cudaIpcGetMemHandle only supports cudaMalloc-backed pointers. + CUmemGenericAllocationHandle generic_handle; + CUresult retain_result = + cuMemRetainAllocationHandle(&generic_handle, (void*)desc.addr); + if (retain_result == CUDA_SUCCESS) { + cuMemRelease(generic_handle); + CHECK_STATUS(restoreCudaDeviceForLocation(location, saved_dev)); + LOG(INFO) << "NVLinkTransport: memory region " << (void*)desc.addr + << " is not cudaMalloc-backed; skip CUDA IPC export."; + return Status::OK(); + } + // Resolve the true cudaMalloc base address. Caching allocators // (e.g. PyTorch) sub-allocate tensors within larger cudaMalloc // segments. cudaIpcGetMemHandle returns a handle for the whole @@ -244,6 +281,7 @@ Status NVLinkTransport::addMemoryBuffer(BufferDesc& desc, LOG(ERROR) << "NVLinkTransport: cuMemGetAddressRange failed for " << "addr 0x" << std::hex << desc.addr << std::dec << " (error " << cu_err << ")"; + CHECK_STATUS(restoreCudaDeviceForLocation(location, saved_dev)); return Status::InternalError( "cuMemGetAddressRange failed" LOC_MARK); } @@ -255,12 +293,24 @@ Status NVLinkTransport::addMemoryBuffer(BufferDesc& desc, desc.addr = (uint64_t)base_ptr; desc.length = alloc_size; desc.transports.push_back(TransportType::NVLINK); + CHECK_STATUS(restoreCudaDeviceForLocation(location, saved_dev)); return Status::OK(); } } cudaIpcMemHandle_t handle; - CHECK_CUDA(cudaIpcGetMemHandle(&handle, (void*)base_ptr)); + auto cuda_err = cudaIpcGetMemHandle(&handle, (void*)base_ptr); + CHECK_STATUS(restoreCudaDeviceForLocation(location, saved_dev)); + if (cuda_err != cudaSuccess) { + LOG(ERROR) << "NVLinkTransport: cudaIpcGetMemHandle failed for " + << "addr 0x" << std::hex << desc.addr << ", base 0x" + << (uint64_t)base_ptr << std::dec << ", device " + << location.index() << ": " + << cudaGetErrorString(cuda_err); + return Status::InternalError( + std::string("cudaIpcGetMemHandle(&handle, (void*)base_ptr): ") + + cudaGetErrorString(cuda_err) + LOC_MARK); + } desc.addr = (uint64_t)base_ptr; desc.length = alloc_size; desc.shm_path = @@ -287,10 +337,14 @@ Status NVLinkTransport::removeMemoryBuffer(BufferDesc& desc) { if (location.type() == "cuda") { // Resolve base the same way we did in addMemoryBuffer, so we // remove the right entry even for sub-allocated addresses. + int saved_dev = -1; + CHECK_STATUS(setCudaDeviceForLocation(location, saved_dev)); + CUdeviceptr base_ptr = 0; size_t alloc_size = 0; CUresult cu_err = cuMemGetAddressRange(&base_ptr, &alloc_size, (CUdeviceptr)desc.addr); + CHECK_STATUS(restoreCudaDeviceForLocation(location, saved_dev)); uint64_t key = desc.addr; if (cu_err == CUDA_SUCCESS) { diff --git a/scripts/build_local_cuda_tent.sh b/scripts/build_local_cuda_tent.sh new file mode 100755 index 0000000000..9517c5dcc9 --- /dev/null +++ b/scripts/build_local_cuda_tent.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." >/dev/null 2>&1 && pwd)" + +DEPS_PREFIX="${MOONCAKE_DEPS:-/home/inf-daole/.local/mooncake-deps}" +CUDA_ROOT="${CUDA_HOME:-/usr/local/cuda}" +BUILD_DIR="${BUILD_DIR:-${REPO_ROOT}/build}" +INSTALL_PREFIX="${INSTALL_PREFIX:-${REPO_ROOT}/install-cuda-tent}" +JOBS="${JOBS:-$(nproc)}" +BUILD_TYPE="${BUILD_TYPE:-RelWithDebInfo}" +BUILD_EXAMPLES="${BUILD_EXAMPLES:-ON}" +WHEEL_OUTPUT="${WHEEL_OUTPUT:-dist}" +if [[ -z "${PYTHON_EXECUTABLE:-}" && -x "${REPO_ROOT}/.venv/bin/python" ]]; then + PYTHON_EXECUTABLE="${REPO_ROOT}/.venv/bin/python" +else + PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}" +fi +RUN_INSTALL=0 +RUN_WHEEL=0 + +usage() { + cat <&2 + usage >&2 + exit 2 + ;; + esac + shift +done + +if [[ ! -d "${DEPS_PREFIX}" ]]; then + echo "Dependency prefix not found: ${DEPS_PREFIX}" >&2 + exit 1 +fi + +if [[ ! -d "${CUDA_ROOT}" ]]; then + echo "CUDA toolkit root not found: ${CUDA_ROOT}" >&2 + exit 1 +fi + +CUDA_TARGET_LIB_DIR="" +for candidate in \ + "${CUDA_ROOT}/targets/sbsa-linux/lib" \ + "${CUDA_ROOT}/targets/aarch64-linux/lib" \ + "${CUDA_ROOT}/targets/x86_64-linux/lib" \ + "${CUDA_ROOT}/lib64" \ + "${CUDA_ROOT}/lib"; do + if [[ -d "${candidate}" ]]; then + CUDA_TARGET_LIB_DIR="${candidate}" + break + fi +done + +if [[ -z "${CUDA_TARGET_LIB_DIR}" ]]; then + echo "Could not find CUDA library directory under ${CUDA_ROOT}" >&2 + exit 1 +fi + +CUDA_STUB_LIB_DIR="${CUDA_TARGET_LIB_DIR}/stubs" +CUDA_LINK_FLAGS="" +if [[ -d "${CUDA_STUB_LIB_DIR}" ]]; then + CUDA_LINK_FLAGS="-L${CUDA_STUB_LIB_DIR}" +fi + +PYTHON_BIN_DIR="$(cd -- "$(dirname -- "${PYTHON_EXECUTABLE}")" >/dev/null 2>&1 && pwd)" +export PATH="${PYTHON_BIN_DIR}:${DEPS_PREFIX}/bin:${DEPS_PREFIX}/go/bin:${CUDA_ROOT}/bin:${PATH}" +export LD_LIBRARY_PATH="${CUDA_TARGET_LIB_DIR}:${CUDA_STUB_LIB_DIR:-}:${DEPS_PREFIX}/lib:${LD_LIBRARY_PATH:-}" +export LIBRARY_PATH="${CUDA_TARGET_LIB_DIR}:${CUDA_STUB_LIB_DIR:-}:${DEPS_PREFIX}/lib:${LIBRARY_PATH:-}" +export PKG_CONFIG_PATH="${DEPS_PREFIX}/lib/pkgconfig:${PKG_CONFIG_PATH:-}" +export CMAKE_PREFIX_PATH="${DEPS_PREFIX}:${CMAKE_PREFIX_PATH:-}" +export CMAKE_INCLUDE_PATH="${DEPS_PREFIX}/include:${CMAKE_INCLUDE_PATH:-}" +export CMAKE_LIBRARY_PATH="${DEPS_PREFIX}/lib:${CMAKE_LIBRARY_PATH:-}" +export CUDAToolkit_ROOT="${CUDA_ROOT}" +export CUDA_HOME="${CUDA_ROOT}" + +if [[ "${CLEAN}" -eq 1 ]]; then + rm -rf -- "${BUILD_DIR}" +fi + +cmake -S "${REPO_ROOT}" -B "${BUILD_DIR}" -G Ninja \ + -DCMAKE_BUILD_TYPE="${BUILD_TYPE}" \ + -DCMAKE_PREFIX_PATH="${DEPS_PREFIX}" \ + -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" \ + -DCMAKE_INCLUDE_PATH="${DEPS_PREFIX}/include" \ + -DCMAKE_LIBRARY_PATH="${DEPS_PREFIX}/lib" \ + -DCUDAToolkit_ROOT="${CUDA_ROOT}" \ + -DPython3_EXECUTABLE="${PYTHON_EXECUTABLE}" \ + -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \ + -DCMAKE_EXE_LINKER_FLAGS="${CUDA_LINK_FLAGS}" \ + -DCMAKE_SHARED_LINKER_FLAGS="${CUDA_LINK_FLAGS}" \ + -DBUILD_UNIT_TESTS=OFF \ + -DBUILD_BENCHMARK=OFF \ + -DBUILD_EXAMPLES="${BUILD_EXAMPLES}" \ + -DWITH_STORE_RUST=OFF \ + -DWITH_STORE_GO=OFF \ + -DWITH_P2P_STORE=OFF \ + -DWITH_EP=OFF \ + -DUSE_CUDA=ON \ + -DWITH_NVIDIA_PEERMEM=OFF \ + -DUSE_MNNVL=ON \ + -DUSE_TENT=ON \ + -DWITH_STORE=ON + +if [[ "${CONFIGURE_ONLY}" -eq 1 ]]; then + exit 0 +fi + +cmake --build "${BUILD_DIR}" --parallel "${JOBS}" + +if [[ "${RUN_INSTALL}" -eq 1 ]]; then + cmake --install "${BUILD_DIR}" +fi + +if [[ "${RUN_WHEEL}" -eq 1 ]]; then + BUILD_DIR="${BUILD_DIR}" OUTPUT_DIR="${WHEEL_OUTPUT}" PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" "${SCRIPT_DIR}/build_wheel.sh" +fi diff --git a/scripts/build_wheel.sh b/scripts/build_wheel.sh index 3b0456ab0b..848972ed41 100755 --- a/scripts/build_wheel.sh +++ b/scripts/build_wheel.sh @@ -6,17 +6,23 @@ set -e # Exit immediately if a command exits with a non-zero status set -x -# Get Python version from environment variable or argument -PYTHON_VERSION=${PYTHON_VERSION:-${1:-$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")}} +# Get Python interpreter and version from environment variable or argument +if [ -z "${PYTHON_EXECUTABLE:-}" ] && [ -x "$(pwd)/.venv/bin/python" ]; then + PYTHON_EXECUTABLE="$(pwd)/.venv/bin/python" +else + PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python}" +fi +PYTHON_VERSION=${PYTHON_VERSION:-${1:-$(${PYTHON_EXECUTABLE} -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")}} # Get output directory from environment variable or argument OUTPUT_DIR=${OUTPUT_DIR:-${2:-"dist"}} # CMake build directory (default: build). EP/PG extensions are staged under # ${BUILD_DIR}/ep_pg_staging when the project was built with -DWITH_EP=ON. BUILD_DIR="${BUILD_DIR:-build}" -echo "Building wheel for Python ${PYTHON_VERSION} with output directory ${OUTPUT_DIR}" +DEPS_PREFIX="${MOONCAKE_DEPS:-/home/inf-daole/.local/mooncake-deps}" +echo "Building wheel for Python ${PYTHON_VERSION} (${PYTHON_EXECUTABLE}) with output directory ${OUTPUT_DIR}" # Ensure LD_LIBRARY_PATH includes /usr/local/lib -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/$(pwd)/build/mooncake-common:/usr/local/lib +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:$(pwd)/${BUILD_DIR}/mooncake-common:${DEPS_PREFIX}/lib:/usr/local/lib echo "Cleaning wheel-build directory" rm -rf mooncake-wheel/mooncake_transfer_engine* @@ -29,20 +35,20 @@ echo "Creating directory structure..." cp mooncake-integration/fabric_allocator_utils.py mooncake-wheel/mooncake/fabric_allocator_utils.py # Copy engine.so to mooncake directory (will be imported by transfer module) -cp build/mooncake-integration/engine.*.so mooncake-wheel/mooncake/engine.so +cp "${BUILD_DIR}"/mooncake-integration/engine.*.so mooncake-wheel/mooncake/engine.so # Copy libasio.so to mooncake directory (runtime dependency of engine.so) -cp build/mooncake-common/libasio.so mooncake-wheel/mooncake/libasio.so +cp "${BUILD_DIR}"/mooncake-common/libasio.so mooncake-wheel/mooncake/libasio.so # Copy store.so to mooncake directory -if [ -f build/mooncake-integration/store.*.so ]; then +if ls "${BUILD_DIR}"/mooncake-integration/store.*.so >/dev/null 2>&1; then echo "Copying store.so..." - cp build/mooncake-integration/store.*.so mooncake-wheel/mooncake/store.so + cp "${BUILD_DIR}"/mooncake-integration/store.*.so mooncake-wheel/mooncake/store.so echo "Copying master binary..." # Copy master binary - cp build/mooncake-store/src/mooncake_master mooncake-wheel/mooncake/ + cp "${BUILD_DIR}"/mooncake-store/src/mooncake_master mooncake-wheel/mooncake/ # Copy client binary - cp build/mooncake-store/src/mooncake_client mooncake-wheel/mooncake/ + cp "${BUILD_DIR}"/mooncake-store/src/mooncake_client mooncake-wheel/mooncake/ # Copy async_store.py cp mooncake-integration/store/async_store.py mooncake-wheel/mooncake/async_store.py else @@ -50,36 +56,36 @@ else fi # Copy libmooncake_store.so to mooncake directory (only when BUILD_SHARED_LIBS is set) -if [ -f build/mooncake-store/src/libmooncake_store.so ]; then +if [ -f "${BUILD_DIR}/mooncake-store/src/libmooncake_store.so" ]; then echo "Copying libmooncake_store.so..." - cp build/mooncake-store/src/libmooncake_store.so mooncake-wheel/mooncake/libmooncake_store.so + cp "${BUILD_DIR}"/mooncake-store/src/libmooncake_store.so mooncake-wheel/mooncake/libmooncake_store.so fi # Copy libtransfer_engine.so to mooncake directory (only when USE_ETCD is set) -if [ -f build/mooncake-common/etcd/libetcd_wrapper.so ]; then +if [ -f "${BUILD_DIR}/mooncake-common/etcd/libetcd_wrapper.so" ]; then echo "Copying libetcd_wrapper.so..." - cp build/mooncake-common/etcd/libetcd_wrapper.so mooncake-wheel/mooncake/libetcd_wrapper.so + cp "${BUILD_DIR}"/mooncake-common/etcd/libetcd_wrapper.so mooncake-wheel/mooncake/libetcd_wrapper.so fi # Copy libtransfer_engine.so to mooncake directory (only when BUILD_SHARED_LIBS is set) -if [ -f build/mooncake-transfer-engine/src/libtransfer_engine.so ]; then +if [ -f "${BUILD_DIR}/mooncake-transfer-engine/src/libtransfer_engine.so" ]; then echo "Copying libtransfer_engine.so..." - cp build/mooncake-transfer-engine/src/libtransfer_engine.so mooncake-wheel/mooncake/libtransfer_engine.so + cp "${BUILD_DIR}"/mooncake-transfer-engine/src/libtransfer_engine.so mooncake-wheel/mooncake/libtransfer_engine.so fi # Copy ascend_transport.so to mooncake directory (only when USE_ASCEND_DIRECT is set) -if [ -f build/mooncake-transfer-engine/src/transport/ascend_transport/ascend_transport.so ]; then +if [ -f "${BUILD_DIR}/mooncake-transfer-engine/src/transport/ascend_transport/ascend_transport.so" ]; then echo "Copying ascend_transport.so..." - cp build/mooncake-transfer-engine/src/transport/ascend_transport/ascend_transport.so mooncake-wheel/mooncake/ascend_transport.so + cp "${BUILD_DIR}"/mooncake-transfer-engine/src/transport/ascend_transport/ascend_transport.so mooncake-wheel/mooncake/ascend_transport.so fi # Copy nvlink-allocator.so to mooncake directory (only if it exists - CUDA builds only) -if [ -f build/mooncake-transfer-engine/nvlink-allocator/nvlink_allocator.so ] \ +if [ -f "${BUILD_DIR}/mooncake-transfer-engine/nvlink-allocator/nvlink_allocator.so" ] \ || [ -f /usr/lib/libaccl_barex.so ] \ || [ -f /usr/lib64/libaccl_barex.so ]; then - if [ -f build/mooncake-transfer-engine/nvlink-allocator/nvlink_allocator.so ]; then + if [ -f "${BUILD_DIR}/mooncake-transfer-engine/nvlink-allocator/nvlink_allocator.so" ]; then echo "Copying CUDA nvlink_allocator.so..." - cp build/mooncake-transfer-engine/nvlink-allocator/nvlink_allocator.so mooncake-wheel/mooncake/nvlink_allocator.so + cp "${BUILD_DIR}"/mooncake-transfer-engine/nvlink-allocator/nvlink_allocator.so mooncake-wheel/mooncake/nvlink_allocator.so fi echo "Copying allocator libraries..." # Copy allocator.py @@ -89,9 +95,9 @@ else fi # Copy ubshmem_fabric_allocator.so to mooncake directory (only if it exists - NPU builds only) -if [ -f build/mooncake-transfer-engine/ubshmem-allocator/ubshmem_fabric_allocator.so ]; then +if [ -f "${BUILD_DIR}/mooncake-transfer-engine/ubshmem-allocator/ubshmem_fabric_allocator.so" ]; then echo "Copying NPU ubshmem_fabric_allocator.so..." - cp build/mooncake-transfer-engine/ubshmem-allocator/ubshmem_fabric_allocator.so mooncake-wheel/mooncake/ubshmem_fabric_allocator.so + cp "${BUILD_DIR}"/mooncake-transfer-engine/ubshmem-allocator/ubshmem_fabric_allocator.so mooncake-wheel/mooncake/ubshmem_fabric_allocator.so echo "Copying NPU allocator libraries..." # Copy allocator_ascend_npu.py cp mooncake-integration/allocator_ascend_npu.py mooncake-wheel/mooncake/allocator_ascend_npu.py @@ -101,10 +107,14 @@ fi echo "Copying transfer_engine_bench..." # Copy transfer_engine_bench -cp build/mooncake-transfer-engine/example/transfer_engine_bench mooncake-wheel/mooncake/ +if [ -f "${BUILD_DIR}/mooncake-transfer-engine/example/transfer_engine_bench" ]; then + cp "${BUILD_DIR}"/mooncake-transfer-engine/example/transfer_engine_bench mooncake-wheel/mooncake/ +else + echo "Skipping transfer_engine_bench (not built - likely BUILD_EXAMPLES=OFF)" +fi -if [ -f "build/mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so" ]; then - cp build/mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so mooncake-wheel/mooncake/ +if [ -f "${BUILD_DIR}/mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so" ]; then + cp "${BUILD_DIR}"/mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so mooncake-wheel/mooncake/ echo "Copying ascend_transport_mem libraries..." else echo "Skipping libascend_transport_mem.so (not built - Ascend disabled)" @@ -173,13 +183,13 @@ rm -rf ${OUTPUT_DIR}/ mkdir -p ${OUTPUT_DIR} echo "Installing required build packages" -if command -v pip &>/dev/null; then - python${PYTHON_VERSION} -m pip install --upgrade pip build setuptools wheel auditwheel +if "${PYTHON_EXECUTABLE}" -m pip --version &>/dev/null; then + "${PYTHON_EXECUTABLE}" -m pip install --upgrade pip build setuptools wheel auditwheel elif command -v uv &>/dev/null; then - uv pip install --upgrade pip - uv pip install build setuptools wheel auditwheel + uv pip install --python "${PYTHON_EXECUTABLE}" --upgrade pip + uv pip install --python "${PYTHON_EXECUTABLE}" build setuptools wheel auditwheel else - echo "Error: Neither python${PYTHON_VERSION}, pip nor uv found" + echo "Error: ${PYTHON_EXECUTABLE} has no pip, and uv was not found" exit 1 fi @@ -234,16 +244,17 @@ case "$ARCH" in ;; esac -# Set platform tag if not already set -PLATFORM_TAG=${PLATFORM_TAG:-"manylinux_${GLIBC_VERSION}_${ARCH_SUFFIX}"} +# Set platform tag if not already set. "auto" lets auditwheel account for +# bundled libraries from a local dependency prefix. +PLATFORM_TAG=${PLATFORM_TAG:-"auto"} echo "Detected architecture: $ARCH_SUFFIX" echo "Detected glibc version: $GLIBC_VERSION" echo "Using platform tag: $PLATFORM_TAG" echo "Repairing wheel with auditwheel for platform: $PLATFORM_TAG" -python${PYTHON_VERSION} -m build --wheel --outdir ${OUTPUT_DIR} -auditwheel repair ${OUTPUT_DIR}/*.whl \ +"${PYTHON_EXECUTABLE}" -m build --wheel --outdir ${OUTPUT_DIR} +"${PYTHON_EXECUTABLE}" -m auditwheel repair ${OUTPUT_DIR}/*.whl \ --exclude libcurl.so* \ --exclude libibverbs.so* \ --exclude libmlx5.so* \ @@ -338,7 +349,7 @@ if [ -d "$CUDA_EP_STAGING_DIR" ] && ls "$CUDA_EP_STAGING_DIR"/*.so &>/dev/null; if [ -n "$REPAIRED_WHEEL" ]; then echo "Injecting CUDA extension .so files into repaired wheel..." WHEEL_UNPACK_DIR=$(mktemp -d) - python${PYTHON_VERSION} -m wheel unpack "$REPAIRED_WHEEL" -d "$WHEEL_UNPACK_DIR" + "${PYTHON_EXECUTABLE}" -m wheel unpack "$REPAIRED_WHEEL" -d "$WHEEL_UNPACK_DIR" UNPACKED_PKG_DIR=$(find "$WHEEL_UNPACK_DIR" -mindepth 1 -maxdepth 1 -type d | head -1) for so_file in "$CUDA_EP_STAGING_DIR"/*.so; do if [ -f "$so_file" ]; then @@ -347,7 +358,7 @@ if [ -d "$CUDA_EP_STAGING_DIR" ] && ls "$CUDA_EP_STAGING_DIR"/*.so &>/dev/null; fi done rm "$REPAIRED_WHEEL" - python${PYTHON_VERSION} -m wheel pack "$UNPACKED_PKG_DIR" -d "${REPAIRED_DIR}/" + "${PYTHON_EXECUTABLE}" -m wheel pack "$UNPACKED_PKG_DIR" -d "${REPAIRED_DIR}/" rm -rf "$WHEEL_UNPACK_DIR" fi else