Skip to content

Commit ffdce20

Browse files
committed
refactor: simplify CMakeLists changes and temporarily bypass the hardcoded distributed logic in main.cc
1 parent a140494 commit ffdce20

File tree

3 files changed

+22
-50
lines changed

3 files changed

+22
-50
lines changed

CMakeLists.txt

Lines changed: 5 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,19 @@ option(USE_OMP "Use OpenMP as backend for Eigen" ON)
99
option(USE_NCCL "Build project for distributed running on CUDA using NCCL" ON)
1010
option(USE_MCCL "Build project for distributed running on MACA using MCCL" ON)
1111

12-
# ------------------------------------------------------------------------------
13-
# MACA toolchain override (must happen before project())
14-
# ------------------------------------------------------------------------------
15-
# When targeting MetaX MACA, the C/C++ compiler must be mxcc so that .maca
16-
# sources and device code can be compiled by the MACA toolchain.
12+
project(infini_train VERSION 0.5.0 LANGUAGES CXX)
13+
14+
# Switch to mxcc after project() so that third-party libs (glog, gflags) are
15+
# configured with the host compiler and their feature-detection checks pass.
1716
if(USE_MACA)
1817
set(MACA_PATH $ENV{MACA_PATH})
1918
if(NOT MACA_PATH)
20-
message(FATAL_ERROR "USE_MACA=ON but environment variable MACA_PATH is not set. "
21-
"Please export MACA_PATH (e.g. /opt/maca) before configuring.")
19+
message(FATAL_ERROR "USE_MACA=ON but environment variable MACA_PATH is not set.")
2220
endif()
2321
set(CMAKE_C_COMPILER "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
2422
set(CMAKE_CXX_COMPILER "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
2523
endif()
2624

27-
project(infini_train VERSION 0.5.0 LANGUAGES CXX)
28-
2925
set(CMAKE_CXX_STANDARD 20)
3026
set(CMAKE_CXX_STANDARD_REQUIRED ON)
3127
set(CMAKE_CXX_EXTENSIONS OFF)
@@ -45,45 +41,8 @@ include_directories(${gflags_SOURCE_DIR}/include)
4541
set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
4642
set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
4743
set(BUILD_TESTING OFF CACHE BOOL "Disable glog unit tests" FORCE)
48-
# Build glog as a static lib so its symbols are always visible at link time.
49-
# Under mxcc the default symbol visibility is hidden, which causes the shared
50-
# libglog.so to export no symbols and produces "undefined reference" errors.
5144
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build glog as static library" FORCE)
5245

53-
# Under MACA/mxcc, cmake's feature-detection test compilations do not find
54-
# standard POSIX system headers (mxcc has a non-standard sysroot probe path).
55-
# Pre-set glog's HAVE_* cache variables so that glog skips its fallback type /
56-
# symbol definitions, which would otherwise conflict with the real system
57-
# headers during the actual build.
58-
if(USE_MACA)
59-
set(HAVE_SYS_TYPES_H 1 CACHE INTERNAL "")
60-
set(HAVE_UNISTD_H 1 CACHE INTERNAL "")
61-
set(HAVE_DLFCN_H 1 CACHE INTERNAL "")
62-
set(HAVE_GLOB_H 1 CACHE INTERNAL "")
63-
set(HAVE_PWD_H 1 CACHE INTERNAL "")
64-
set(HAVE_SYS_TIME_H 1 CACHE INTERNAL "")
65-
set(HAVE_SYS_UTSNAME_H 1 CACHE INTERNAL "")
66-
set(HAVE_SYS_WAIT_H 1 CACHE INTERNAL "")
67-
set(HAVE_SYS_SYSCALL_H 1 CACHE INTERNAL "")
68-
set(HAVE_SYSLOG_H 1 CACHE INTERNAL "")
69-
set(HAVE_UCONTEXT_H 1 CACHE INTERNAL "")
70-
# check_type_size() uses two internal variables: the size value and a sentinel
71-
# "HAVE_HAVE_<VAR>" that marks the check as done. Pre-setting only the value
72-
# is insufficient — the sentinel must also be set so the check skips entirely.
73-
set(HAVE_MODE_T 4 CACHE INTERNAL "") # 4 bytes on Linux
74-
set(HAVE_HAVE_MODE_T TRUE CACHE INTERNAL "")
75-
set(HAVE_SSIZE_T 8 CACHE INTERNAL "") # 8 bytes on 64-bit Linux
76-
set(HAVE_HAVE_SSIZE_T TRUE CACHE INTERNAL "")
77-
set(HAVE_PREAD 1 CACHE INTERNAL "")
78-
set(HAVE_PWRITE 1 CACHE INTERNAL "")
79-
set(HAVE_POSIX_FADVISE 1 CACHE INTERNAL "")
80-
set(HAVE_SIGACTION 1 CACHE INTERNAL "")
81-
set(HAVE_SIGALTSTACK 1 CACHE INTERNAL "")
82-
set(HAVE_FCNTL 1 CACHE INTERNAL "")
83-
set(HAVE_DLADDR 1 CACHE INTERNAL "")
84-
set(HAVE___CXA_DEMANGLE 1 CACHE INTERNAL "")
85-
endif()
86-
8746
add_subdirectory(third_party/glog)
8847
include_directories(${glog_SOURCE_DIR}/src)
8948

example/gpt2/main.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,9 @@ void Train(const nn::parallel::Rank &rank) {
146146
const ProcessGroup *pp_pg = nullptr;
147147

148148
if (rank.IsParallel()) {
149-
device = Device(Device::DeviceType::kCUDA, rank.thread_rank());
149+
auto parallel_device_type =
150+
(FLAGS_device == kDeviceMACA) ? Device::DeviceType::kMACA : Device::DeviceType::kCUDA;
151+
device = Device(parallel_device_type, rank.thread_rank());
150152
auto *pg_factory = ProcessGroupFactory::Instance(device.type());
151153

152154
if (ddp_world_size > 1) {

example/llama3/main.cc

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,16 @@ namespace {
9393
const std::unordered_set<std::string> kSupportedModels = {"llama3"};
9494
constexpr char kDeviceCPU[] = "cpu";
9595
constexpr char kDeviceCUDA[] = "cuda";
96+
constexpr char kDeviceMACA[] = "maca";
9697
constexpr char kDtypeFP32[] = "float32";
9798
constexpr char kDtypeBF16[] = "bfloat16";
9899
} // namespace
99100

100101
DEFINE_validator(model, [](const char *, const std::string &value) { return kSupportedModels.contains(value); });
101102
DEFINE_validator(device,
102-
[](const char *, const std::string &value) { return value == kDeviceCPU || value == kDeviceCUDA; });
103+
[](const char *, const std::string &value) {
104+
return value == kDeviceCPU || value == kDeviceCUDA || value == kDeviceMACA;
105+
});
103106

104107
void Train(const nn::parallel::Rank &rank) {
105108
using namespace nn::parallel;
@@ -129,7 +132,9 @@ void Train(const nn::parallel::Rank &rank) {
129132
const ProcessGroup *pp_pg = nullptr;
130133

131134
if (rank.IsParallel()) {
132-
device = Device(Device::DeviceType::kCUDA, rank.thread_rank());
135+
auto parallel_device_type =
136+
(FLAGS_device == kDeviceMACA) ? Device::DeviceType::kMACA : Device::DeviceType::kCUDA;
137+
device = Device(parallel_device_type, rank.thread_rank());
133138
auto *pg_factory = ProcessGroupFactory::Instance(device.type());
134139

135140
if (ddp_world_size > 1) {
@@ -154,7 +159,13 @@ void Train(const nn::parallel::Rank &rank) {
154159
nn::parallel::pp_rank = pp_rank;
155160
}
156161
} else {
157-
device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);
162+
if (FLAGS_device == kDeviceCPU) {
163+
device = Device();
164+
} else if (FLAGS_device == kDeviceMACA) {
165+
device = Device(Device::DeviceType::kMACA, 0);
166+
} else {
167+
device = Device(Device::DeviceType::kCUDA, 0);
168+
}
158169
}
159170

160171
// calculate gradient accumulation from the desired total batch size and the current run configuration

0 commit comments

Comments (0)