Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mooncake-transfer-engine/fabric_allocator.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ function(add_fabric_allocator_build_target)
if(FAB_ENABLE_BUILD)
add_custom_command(
TARGET ${FAB_TARGET_NAME}
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}
COMMAND bash ${FAB_BUILD_SCRIPT} ${FAB_BUILD_ARGS}
${CMAKE_CURRENT_BINARY_DIR} "${_include_dirs_str}"
Expand Down
4 changes: 3 additions & 1 deletion mooncake-transfer-engine/tent/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

add_subdirectory(src)
add_subdirectory(plugins)
add_subdirectory(tests)
if (BUILD_UNIT_TESTS)
add_subdirectory(tests)
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@

#include "tent/common/config.h"
#include "tent/common/status.h"
#include "tent/metastore/redis.h"
#include "tent/runtime/control_plane.h"
#include "tent/runtime/segment.h"
#include "tent/runtime/segment_tracker.h"
Expand All @@ -43,6 +42,11 @@
namespace mooncake {
namespace tent {

namespace {
// Bounds used when validating the configured Redis DB index: values outside
// [kRedisDefaultDbIndex, kRedisMaxDbIndex] fall back to the default.
constexpr uint8_t kRedisDefaultDbIndex{0};
constexpr uint8_t kRedisMaxDbIndex{255};
}  // namespace

struct Batch {
Batch() : max_size(0) { sub_batch.fill(nullptr); }

Expand Down Expand Up @@ -314,14 +318,14 @@ Status TransferEngineImpl::construct() {
CHECK_STATUS(topology_->discover({loader}));

// Validate redis_db_index range (0-255)
uint8_t db_index = REDIS_DEFAULT_DB_INDEX;
uint8_t db_index = kRedisDefaultDbIndex;
if (redis_db_index_config >= 0 &&
redis_db_index_config <= REDIS_MAX_DB_INDEX) {
redis_db_index_config <= kRedisMaxDbIndex) {
db_index = static_cast<uint8_t>(redis_db_index_config);
} else {
LOG(WARNING) << "Invalid Redis DB index: " << redis_db_index_config
<< ", using default "
<< static_cast<int>(REDIS_DEFAULT_DB_INDEX);
<< static_cast<int>(kRedisDefaultDbIndex);
}

metadata_ = std::make_shared<ControlService>(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,28 @@

namespace mooncake {
namespace tent {
namespace {

// Records the calling thread's current CUDA device in `saved_dev` and, when
// `location` names a different non-negative device index, switches to it.
//
// `saved_dev` is reset to -1 before the query, so it stays -1 if the query
// fails. CHECK_CUDA is presumably an early-return-on-error macro (defined
// outside this view); on success the function returns Status::OK().
Status setCudaDeviceForLocation(const LocationParser& location,
                                int& saved_dev) {
    saved_dev = -1;
    CHECK_CUDA(cudaGetDevice(&saved_dev));

    const auto target_dev = location.index();
    const bool needs_switch = target_dev >= 0 && saved_dev != target_dev;
    if (!needs_switch) {
        // Either no device index is given or it is already current.
        return Status::OK();
    }

    CHECK_CUDA(cudaSetDevice(target_dev));
    return Status::OK();
}

// Counterpart of setCudaDeviceForLocation(): switches back to `saved_dev`,
// but only when a device was actually recorded (`saved_dev >= 0`) and the
// location's non-negative index differs from it, i.e. only when the earlier
// call would have changed the current device.
Status restoreCudaDeviceForLocation(const LocationParser& location,
                                    int saved_dev) {
    // Nothing was recorded, so there is nothing to put back.
    if (saved_dev < 0) return Status::OK();

    const auto target_dev = location.index();
    if (target_dev < 0 || saved_dev == target_dev) {
        // The device was never switched away from `saved_dev`.
        return Status::OK();
    }

    CHECK_CUDA(cudaSetDevice(saved_dev));
    return Status::OK();
}

} // namespace

NVLinkTransport::NVLinkTransport() : installed_(false) {}

Expand Down Expand Up @@ -228,10 +250,25 @@ Status NVLinkTransport::addMemoryBuffer(BufferDesc& desc,
const MemoryOptions& options) {
LocationParser location(desc.location);
if (location.type() == "cuda") {
// If the memory region is allocated using cuMemAlloc,
// we cannot use cudaIpcGetMemHandle, so skip it
// MNNVL allocations are exported by MnnvlTransport instead of CUDA IPC.
if (options.type == MNNVL) return Status::OK();

int saved_dev = -1;
CHECK_STATUS(setCudaDeviceForLocation(location, saved_dev));

// VMM allocations have driver allocation handles, but
// cudaIpcGetMemHandle only supports cudaMalloc-backed pointers.
CUmemGenericAllocationHandle generic_handle;
CUresult retain_result =
cuMemRetainAllocationHandle(&generic_handle, (void*)desc.addr);
if (retain_result == CUDA_SUCCESS) {
cuMemRelease(generic_handle);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The return value of cuMemRelease is ignored. While this is a cleanup step, it is good practice to check the CUresult to ensure that the allocation handle is correctly released and to detect potential issues in the CUDA driver state.

CHECK_STATUS(restoreCudaDeviceForLocation(location, saved_dev));
LOG(INFO) << "NVLinkTransport: memory region " << (void*)desc.addr
<< " is not cudaMalloc-backed; skip CUDA IPC export.";
return Status::OK();
}

// Resolve the true cudaMalloc base address. Caching allocators
// (e.g. PyTorch) sub-allocate tensors within larger cudaMalloc
// segments. cudaIpcGetMemHandle returns a handle for the whole
Expand All @@ -244,6 +281,7 @@ Status NVLinkTransport::addMemoryBuffer(BufferDesc& desc,
LOG(ERROR) << "NVLinkTransport: cuMemGetAddressRange failed for "
<< "addr 0x" << std::hex << desc.addr << std::dec
<< " (error " << cu_err << ")";
CHECK_STATUS(restoreCudaDeviceForLocation(location, saved_dev));
return Status::InternalError(
"cuMemGetAddressRange failed" LOC_MARK);
}
Expand All @@ -255,12 +293,24 @@ Status NVLinkTransport::addMemoryBuffer(BufferDesc& desc,
desc.addr = (uint64_t)base_ptr;
desc.length = alloc_size;
desc.transports.push_back(TransportType::NVLINK);
CHECK_STATUS(restoreCudaDeviceForLocation(location, saved_dev));
return Status::OK();
}
}

cudaIpcMemHandle_t handle;
CHECK_CUDA(cudaIpcGetMemHandle(&handle, (void*)base_ptr));
auto cuda_err = cudaIpcGetMemHandle(&handle, (void*)base_ptr);
CHECK_STATUS(restoreCudaDeviceForLocation(location, saved_dev));
if (cuda_err != cudaSuccess) {
LOG(ERROR) << "NVLinkTransport: cudaIpcGetMemHandle failed for "
<< "addr 0x" << std::hex << desc.addr << ", base 0x"
<< (uint64_t)base_ptr << std::dec << ", device "
<< location.index() << ": "
<< cudaGetErrorString(cuda_err);
return Status::InternalError(
std::string("cudaIpcGetMemHandle(&handle, (void*)base_ptr): ") +
cudaGetErrorString(cuda_err) + LOC_MARK);
}
desc.addr = (uint64_t)base_ptr;
desc.length = alloc_size;
desc.shm_path =
Expand All @@ -287,10 +337,14 @@ Status NVLinkTransport::removeMemoryBuffer(BufferDesc& desc) {
if (location.type() == "cuda") {
// Resolve base the same way we did in addMemoryBuffer, so we
// remove the right entry even for sub-allocated addresses.
int saved_dev = -1;
CHECK_STATUS(setCudaDeviceForLocation(location, saved_dev));

CUdeviceptr base_ptr = 0;
size_t alloc_size = 0;
CUresult cu_err = cuMemGetAddressRange(&base_ptr, &alloc_size,
(CUdeviceptr)desc.addr);
CHECK_STATUS(restoreCudaDeviceForLocation(location, saved_dev));

uint64_t key = desc.addr;
if (cu_err == CUDA_SUCCESS) {
Expand Down
154 changes: 154 additions & 0 deletions scripts/build_local_cuda_tent.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env bash
# Local build helper: configures the repository with CMake (Ninja generator)
# with CUDA, MNNVL and TENT enabled, builds it, and optionally installs the
# result and/or builds the Python wheel.
#
# Fail fast: abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail

# Absolute directory of this script, and the repository root one level above it.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." >/dev/null 2>&1 && pwd)"

# Dependency prefix. The fallback lives under the invoking user's home
# directory instead of one specific developer's account, so the script works
# unmodified on other machines; set MOONCAKE_DEPS to use a different prefix.
DEPS_PREFIX="${MOONCAKE_DEPS:-${HOME}/.local/mooncake-deps}"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The default value for DEPS_PREFIX contains a hardcoded path specific to a user's home directory (/home/inf-daole). This makes the script less portable and may cause build failures for other developers. Consider using a more generic default or leaving it empty to require explicit configuration.

# Build knobs. Each keeps a value already present in the environment and
# otherwise falls back to the default shown here.
CUDA_ROOT="${CUDA_HOME:-/usr/local/cuda}"
: "${BUILD_DIR:=${REPO_ROOT}/build}"
: "${INSTALL_PREFIX:=${REPO_ROOT}/install-cuda-tent}"
: "${JOBS:=$(nproc)}"
: "${BUILD_TYPE:=RelWithDebInfo}"
: "${BUILD_EXAMPLES:=ON}"
: "${WHEEL_OUTPUT:=dist}"

# Interpreter precedence: explicit PYTHON_EXECUTABLE, then the repository's
# .venv interpreter if it is executable, then whatever "python3" resolves to.
if [[ -n "${PYTHON_EXECUTABLE:-}" ]]; then
  :
elif [[ -x "${REPO_ROOT}/.venv/bin/python" ]]; then
  PYTHON_EXECUTABLE="${REPO_ROOT}/.venv/bin/python"
else
  PYTHON_EXECUTABLE="python3"
fi

# Optional stages; switched on by --install / --wheel.
RUN_INSTALL=0
RUN_WHEEL=0

# Print the command-line synopsis and the supported environment overrides to
# stdout. The heredoc delimiter is unquoted, so the ${...} references expand to
# the values in effect at the time usage() is called.
usage() {
cat <<USAGE
Usage: $(basename "$0") [--clean] [--install] [--wheel] [--configure-only] [--help]

Environment overrides:
MOONCAKE_DEPS Dependency prefix (default: ${DEPS_PREFIX})
CUDA_HOME CUDA toolkit root (default: ${CUDA_ROOT})
BUILD_DIR CMake build directory (default: ${BUILD_DIR})
INSTALL_PREFIX User-writable install prefix (default: ${INSTALL_PREFIX})
JOBS Parallel build jobs (default: nproc)
BUILD_TYPE CMake build type (default: ${BUILD_TYPE})
BUILD_EXAMPLES Build example binaries for wheel CLIs (default: ${BUILD_EXAMPLES})
WHEEL_OUTPUT Wheel output directory, relative to mooncake-wheel (default: ${WHEEL_OUTPUT})
PYTHON_EXECUTABLE Python interpreter for CMake and wheel builds (default: ${PYTHON_EXECUTABLE})
USAGE
}

CLEAN=0
CONFIGURE_ONLY=0

# Every option is a standalone flag (none takes a value), so a plain walk over
# the arguments is sufficient.
for arg in "$@"; do
  case "${arg}" in
    --clean) CLEAN=1 ;;
    --install) RUN_INSTALL=1 ;;
    --wheel) RUN_WHEEL=1 ;;
    --configure-only) CONFIGURE_ONLY=1 ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      # Anything unrecognised is a usage error (exit status 2).
      echo "Unknown argument: ${arg}" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# Leave the positional parameters empty, as consuming them one by one would.
set --

# Both the dependency prefix and the CUDA toolkit root must already exist;
# stop before configuring if either is missing.
[[ -d "${DEPS_PREFIX}" ]] || {
  echo "Dependency prefix not found: ${DEPS_PREFIX}" >&2
  exit 1
}

[[ -d "${CUDA_ROOT}" ]] || {
  echo "CUDA toolkit root not found: ${CUDA_ROOT}" >&2
  exit 1
}

# Candidate CUDA library directories, in priority order; the first one that
# exists wins.
cuda_lib_candidates=(
  "${CUDA_ROOT}/targets/sbsa-linux/lib"
  "${CUDA_ROOT}/targets/aarch64-linux/lib"
  "${CUDA_ROOT}/targets/x86_64-linux/lib"
  "${CUDA_ROOT}/lib64"
  "${CUDA_ROOT}/lib"
)

CUDA_TARGET_LIB_DIR=""
for lib_dir in "${cuda_lib_candidates[@]}"; do
  [[ -d "${lib_dir}" ]] || continue
  CUDA_TARGET_LIB_DIR="${lib_dir}"
  break
done

[[ -n "${CUDA_TARGET_LIB_DIR}" ]] || {
  echo "Could not find CUDA library directory under ${CUDA_ROOT}" >&2
  exit 1
}

# Add the stub directory to the link line only when the toolkit ships one.
CUDA_STUB_LIB_DIR="${CUDA_TARGET_LIB_DIR}/stubs"
if [[ -d "${CUDA_STUB_LIB_DIR}" ]]; then
  CUDA_LINK_FLAGS="-L${CUDA_STUB_LIB_DIR}"
else
  CUDA_LINK_FLAGS=""
fi

# Resolve the interpreter first so that a bare command name such as "python3"
# maps to its real location. Taking dirname of a bare name yields ".", which
# would put the current working directory at the front of PATH.
if ! PYTHON_RESOLVED="$(command -v -- "${PYTHON_EXECUTABLE}")"; then
  echo "Python interpreter not found: ${PYTHON_EXECUTABLE}" >&2
  exit 1
fi
PYTHON_BIN_DIR="$(cd -- "$(dirname -- "${PYTHON_RESOLVED}")" >/dev/null 2>&1 && pwd)"

# Prepend the toolchain locations to the search paths. A previous value is
# appended via ${VAR:+:...} so that an unset or empty variable does not leave a
# trailing ':' behind; an empty search-path entry is interpreted as the current
# directory by the dynamic loader and by the compiler driver.
export PATH="${PYTHON_BIN_DIR}:${DEPS_PREFIX}/bin:${DEPS_PREFIX}/go/bin:${CUDA_ROOT}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_TARGET_LIB_DIR}:${CUDA_STUB_LIB_DIR}:${DEPS_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export LIBRARY_PATH="${CUDA_TARGET_LIB_DIR}:${CUDA_STUB_LIB_DIR}:${DEPS_PREFIX}/lib${LIBRARY_PATH:+:${LIBRARY_PATH}}"
export PKG_CONFIG_PATH="${DEPS_PREFIX}/lib/pkgconfig${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}}"
export CMAKE_PREFIX_PATH="${DEPS_PREFIX}${CMAKE_PREFIX_PATH:+:${CMAKE_PREFIX_PATH}}"
export CMAKE_INCLUDE_PATH="${DEPS_PREFIX}/include${CMAKE_INCLUDE_PATH:+:${CMAKE_INCLUDE_PATH}}"
export CMAKE_LIBRARY_PATH="${DEPS_PREFIX}/lib${CMAKE_LIBRARY_PATH:+:${CMAKE_LIBRARY_PATH}}"

# Point CMake's CUDA discovery and CUDA-aware tooling at the selected toolkit.
export CUDAToolkit_ROOT="${CUDA_ROOT}"
export CUDA_HOME="${CUDA_ROOT}"

# --clean: start from an empty build tree.
if (( CLEAN == 1 )); then
  rm -rf -- "${BUILD_DIR}"
fi

# Configure step. The options are collected in an array so that each one sits
# on its own line without continuation backslashes.
cmake_configure_args=(
  -S "${REPO_ROOT}"
  -B "${BUILD_DIR}"
  -G Ninja
  -DCMAKE_BUILD_TYPE="${BUILD_TYPE}"
  -DCMAKE_PREFIX_PATH="${DEPS_PREFIX}"
  -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}"
  -DCMAKE_INCLUDE_PATH="${DEPS_PREFIX}/include"
  -DCMAKE_LIBRARY_PATH="${DEPS_PREFIX}/lib"
  -DCUDAToolkit_ROOT="${CUDA_ROOT}"
  -DPython3_EXECUTABLE="${PYTHON_EXECUTABLE}"
  -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}"
  -DCMAKE_EXE_LINKER_FLAGS="${CUDA_LINK_FLAGS}"
  -DCMAKE_SHARED_LINKER_FLAGS="${CUDA_LINK_FLAGS}"
  -DBUILD_UNIT_TESTS=OFF
  -DBUILD_BENCHMARK=OFF
  -DBUILD_EXAMPLES="${BUILD_EXAMPLES}"
  -DWITH_STORE_RUST=OFF
  -DWITH_STORE_GO=OFF
  -DWITH_P2P_STORE=OFF
  -DWITH_EP=OFF
  -DUSE_CUDA=ON
  -DWITH_NVIDIA_PEERMEM=OFF
  -DUSE_MNNVL=ON
  -DUSE_TENT=ON
  -DWITH_STORE=ON
)
cmake "${cmake_configure_args[@]}"

# --configure-only: stop once the build tree has been generated.
if (( CONFIGURE_ONLY == 1 )); then
  exit 0
fi

cmake --build "${BUILD_DIR}" --parallel "${JOBS}"

# --install: install into the prefix chosen at configure time.
if (( RUN_INSTALL == 1 )); then
  cmake --install "${BUILD_DIR}"
fi

# --wheel: hand over to the wheel script, passing the settings it needs through
# its environment.
if (( RUN_WHEEL == 1 )); then
  BUILD_DIR="${BUILD_DIR}" \
  OUTPUT_DIR="${WHEEL_OUTPUT}" \
  PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
    "${SCRIPT_DIR}/build_wheel.sh"
fi
Loading
Loading