Skip to content

Commit a140494

Browse files
committed
fix: resolve MACA build and runtime issues to enable GPT-2 training
CMakeLists.txt:
- Pre-set HAVE_MODE_T/HAVE_SSIZE_T and their sentinel variables (HAVE_HAVE_MODE_T/HAVE_HAVE_SSIZE_T) before add_subdirectory(glog), since mxcc cmake feature-detection probes cannot find standard POSIX headers; without the sentinels check_type_size re-runs and overwrites the pre-set values, causing glog to emit conflicting fallback typedefs
- Add BUILD_TESTING=OFF to skip glog unit tests (-fPIE unsupported by mxcc)
- Add BUILD_SHARED_LIBS=OFF to build glog as a static library; mxcc defaults to hidden symbol visibility, making libglog.so export nothing

datatype.h:
- Add is_bfloat16<T> and is_fp16<T> type traits with USE_CUDA/USE_MACA specializations, needed by common_cpu.h Cast and init.cc ARANGE_CASE

common/cpu/common_cpu.h:
- Route fp16/bf16 destinations through float in Cast<T>(), avoiding ambiguous integer→__half/__maca_bfloat16 conversion on MACA

kernels/maca/{stack,concat,slice,transform,elementwise,split,gather}.maca:
- Add reinterpret_cast<void **> to all mcMallocAsync(&ptr, ...) calls; MACA's mcMallocAsync requires void** but typed pointers were passed
- Fix mcDevAttrMultiProcessorCount → mcDeviceAttributeMultiProcessorCount in elementwise.maca (correct MACA enum name)

optimizer.cc:
- Change Fill<T>(0) → Fill<T>(0.f) for Adam m/v initialization; __half(0) is ambiguous on MACA (only float/double ctors available)

nn/init.cc:
- Replace std::iota + static_cast<TYPE>(start) in ARANGE_CASE with an explicit loop via static_cast<float> to avoid ambiguous integer→fp16/bf16 conversion for kBFLOAT16/kFLOAT16 cases

example/gpt2/main.cc:
- Add kDeviceMACA constant, update --device validator to accept "maca", and add Device::DeviceType::kMACA branch in device selection
1 parent eacbaba commit a140494

File tree

13 files changed

+92
-23
lines changed

13 files changed

+92
-23
lines changed

CMakeLists.txt

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,46 @@ include_directories(${gflags_SOURCE_DIR}/include)
4444
# glog
4545
set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
4646
set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
47+
set(BUILD_TESTING OFF CACHE BOOL "Disable glog unit tests" FORCE)
48+
# Build glog as a static lib so its symbols are always visible at link time.
49+
# Under mxcc the default symbol visibility is hidden, which causes the shared
50+
# libglog.so to export no symbols and produces "undefined reference" errors.
51+
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build glog as static library" FORCE)
52+
53+
# Under MACA/mxcc, cmake's feature-detection test compilations do not find
54+
# standard POSIX system headers (mxcc has a non-standard sysroot probe path).
55+
# Pre-set glog's HAVE_* cache variables so that glog skips its fallback type /
56+
# symbol definitions, which would otherwise conflict with the real system
57+
# headers during the actual build.
58+
if(USE_MACA)
59+
set(HAVE_SYS_TYPES_H 1 CACHE INTERNAL "")
60+
set(HAVE_UNISTD_H 1 CACHE INTERNAL "")
61+
set(HAVE_DLFCN_H 1 CACHE INTERNAL "")
62+
set(HAVE_GLOB_H 1 CACHE INTERNAL "")
63+
set(HAVE_PWD_H 1 CACHE INTERNAL "")
64+
set(HAVE_SYS_TIME_H 1 CACHE INTERNAL "")
65+
set(HAVE_SYS_UTSNAME_H 1 CACHE INTERNAL "")
66+
set(HAVE_SYS_WAIT_H 1 CACHE INTERNAL "")
67+
set(HAVE_SYS_SYSCALL_H 1 CACHE INTERNAL "")
68+
set(HAVE_SYSLOG_H 1 CACHE INTERNAL "")
69+
set(HAVE_UCONTEXT_H 1 CACHE INTERNAL "")
70+
# check_type_size() uses two internal variables: the size value and a sentinel
71+
# "HAVE_HAVE_<VAR>" that marks the check as done. Pre-setting only the value
72+
# is insufficient — the sentinel must also be set so the check skips entirely.
73+
set(HAVE_MODE_T 4 CACHE INTERNAL "") # 4 bytes on Linux
74+
set(HAVE_HAVE_MODE_T TRUE CACHE INTERNAL "")
75+
set(HAVE_SSIZE_T 8 CACHE INTERNAL "") # 8 bytes on 64-bit Linux
76+
set(HAVE_HAVE_SSIZE_T TRUE CACHE INTERNAL "")
77+
set(HAVE_PREAD 1 CACHE INTERNAL "")
78+
set(HAVE_PWRITE 1 CACHE INTERNAL "")
79+
set(HAVE_POSIX_FADVISE 1 CACHE INTERNAL "")
80+
set(HAVE_SIGACTION 1 CACHE INTERNAL "")
81+
set(HAVE_SIGALTSTACK 1 CACHE INTERNAL "")
82+
set(HAVE_FCNTL 1 CACHE INTERNAL "")
83+
set(HAVE_DLADDR 1 CACHE INTERNAL "")
84+
set(HAVE___CXA_DEMANGLE 1 CACHE INTERNAL "")
85+
endif()
86+
4787
add_subdirectory(third_party/glog)
4888
include_directories(${glog_SOURCE_DIR}/src)
4989

example/gpt2/main.cc

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ const std::unordered_set<std::string> kSupportedModels
9898
= {"gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "d12", "d24", "d36", "d48"};
9999
constexpr char kDeviceCPU[] = "cpu";
100100
constexpr char kDeviceCUDA[] = "cuda";
101+
constexpr char kDeviceMACA[] = "maca";
101102
constexpr char kDtypeFP32[] = "float32";
102103
constexpr char kDtypeBF16[] = "bfloat16";
103104

@@ -112,8 +113,9 @@ const std::unordered_map<std::string, nn::TransformerConfig> kModelToConfigs = {
112113
} // namespace
113114

114115
DEFINE_validator(model, [](const char *, const std::string &value) { return kSupportedModels.contains(value); });
115-
DEFINE_validator(device,
116-
[](const char *, const std::string &value) { return value == kDeviceCPU || value == kDeviceCUDA; });
116+
DEFINE_validator(device, [](const char *, const std::string &value) {
117+
return value == kDeviceCPU || value == kDeviceCUDA || value == kDeviceMACA;
118+
});
117119

118120
void Train(const nn::parallel::Rank &rank) {
119121
using namespace nn::parallel;
@@ -169,7 +171,13 @@ void Train(const nn::parallel::Rank &rank) {
169171
nn::parallel::pp_rank = pp_rank;
170172
}
171173
} else {
172-
device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);
174+
if (FLAGS_device == kDeviceCPU) {
175+
device = Device();
176+
} else if (FLAGS_device == kDeviceMACA) {
177+
device = Device(Device::DeviceType::kMACA, 0);
178+
} else {
179+
device = Device(Device::DeviceType::kCUDA, 0);
180+
}
173181
}
174182

175183
// calculate gradient accumulation from the desired total batch size and the current run configuration

infini_train/include/common/cpu/common_cpu.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#include <type_traits>
44
#include <utility>
55

6+
#include "infini_train/include/datatype.h"
7+
68
namespace infini_train::common::cpu {
79
/**
810
* Converts a value between arbitrary types. This offers perfect
@@ -16,7 +18,12 @@ namespace infini_train::common::cpu {
1618
template <typename DST, typename SRC> DST Cast(SRC &&x) {
1719
static_assert(!std::is_reference_v<DST>, "Cast cannot return reference types");
1820

19-
// TODO(lzm): add cpu-version fp16 and bf16
20-
return (DST)(std::forward<SRC>(x));
21+
using Dst = std::remove_cv_t<std::remove_reference_t<DST>>;
22+
if constexpr (is_bfloat16<Dst>::value || is_fp16<Dst>::value) {
23+
// TODO(lzm): add cpu-version fp16 and bf16
24+
return Dst(static_cast<float>(std::forward<SRC>(x)));
25+
} else {
26+
return static_cast<DST>(std::forward<SRC>(x));
27+
}
2128
}
2229
} // namespace infini_train::common::cpu

infini_train/include/datatype.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,20 @@ template <> struct TypeMap<DataType::kFLOAT16> {
103103
#endif
104104
#undef DEFINE_DATA_TYPE_MAPPING
105105

106+
template <typename T> struct is_bfloat16 : std::false_type {};
107+
#if defined(USE_CUDA)
108+
template <> struct is_bfloat16<nv_bfloat16> : std::true_type {};
109+
#elif defined(USE_MACA)
110+
template <> struct is_bfloat16<__maca_bfloat16> : std::true_type {};
111+
#endif
112+
113+
template <typename T> struct is_fp16 : std::false_type {};
114+
#if defined(USE_CUDA)
115+
template <> struct is_fp16<half> : std::true_type {};
116+
#elif defined(USE_MACA)
117+
template <> struct is_fp16<__half> : std::true_type {};
118+
#endif
119+
106120
// Extends std::is_floating_point to support CUDA floating-point types.
107121
template <typename T> struct is_floating_point_ext : std::is_floating_point<T> {};
108122

infini_train/src/kernels/maca/concat.maca

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,11 @@ std::shared_ptr<Tensor> ConcatForward(const std::vector<std::shared_ptr<Tensor>>
112112
const T **device_input_ptrs = nullptr;
113113
int64_t *device_offsets = nullptr;
114114

115-
MACA_CHECK(mcMallocAsync(&device_input_ptrs, sizeof(T *) * num_inputs, stream));
115+
MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&device_input_ptrs), sizeof(T *) * num_inputs, stream));
116116
MACA_CHECK(mcMemcpyAsync(device_input_ptrs, host_input_ptrs.data(), sizeof(T *) * num_inputs,
117117
mcMemcpyHostToDevice, stream));
118118

119-
MACA_CHECK(mcMallocAsync(&device_offsets, sizeof(int64_t) * (num_inputs + 1), stream));
119+
MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&device_offsets), sizeof(int64_t) * (num_inputs + 1), stream));
120120
MACA_CHECK(mcMemcpyAsync(device_offsets, host_offsets.data(), sizeof(int64_t) * (num_inputs + 1),
121121
mcMemcpyHostToDevice, stream));
122122

@@ -218,11 +218,11 @@ std::vector<std::shared_ptr<Tensor>> ConcatBackward(const std::shared_ptr<Tensor
218218
T **device_ptrs = nullptr;
219219
int64_t *device_offsets = nullptr;
220220

221-
MACA_CHECK(mcMallocAsync(&device_ptrs, sizeof(T *) * num_inputs, stream));
221+
MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&device_ptrs), sizeof(T *) * num_inputs, stream));
222222
MACA_CHECK(mcMemcpyAsync(device_ptrs, host_ptrs.data(), sizeof(T *) * num_inputs, mcMemcpyHostToDevice,
223223
stream));
224224

225-
MACA_CHECK(mcMallocAsync(&device_offsets, sizeof(int64_t) * (num_inputs + 1), stream));
225+
MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&device_offsets), sizeof(int64_t) * (num_inputs + 1), stream));
226226
MACA_CHECK(mcMemcpyAsync(device_offsets, host_offsets.data(), sizeof(int64_t) * (num_inputs + 1),
227227
mcMemcpyHostToDevice, stream));
228228

infini_train/src/kernels/maca/elementwise.maca

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ void BinaryBackwardBhistLaunch(FuncA fn_a, FuncB fn_b, T *outA, T *outB, const T
427427

428428
// Workspace layout: [grid, K] floats.
429429
float *work = nullptr;
430-
MACA_CHECK(mcMallocAsync(&work, static_cast<size_t>(grid) * static_cast<size_t>(K) * sizeof(float), stream));
430+
MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&work), static_cast<size_t>(grid) * static_cast<size_t>(K) * sizeof(float), stream));
431431

432432
// Pass 1: per-block histogram accumulation.
433433
const size_t smem_bytes = static_cast<size_t>(K + (K >> 5)) * sizeof(float);
@@ -439,7 +439,7 @@ void BinaryBackwardBhistLaunch(FuncA fn_a, FuncB fn_b, T *outA, T *outB, const T
439439
int dev = 0;
440440
int sm_count = 0;
441441
MACA_CHECK(mcGetDevice(&dev));
442-
MACA_CHECK(mcDeviceGetAttribute(&sm_count, mcDevAttrMultiProcessorCount, dev));
442+
MACA_CHECK(mcDeviceGetAttribute(&sm_count, mcDeviceAttributeMultiProcessorCount, dev));
443443

444444
const int RED_THREADS = 256;
445445
const int oneD_blocks = (K + RED_THREADS - 1) / RED_THREADS;
@@ -457,7 +457,7 @@ void BinaryBackwardBhistLaunch(FuncA fn_a, FuncB fn_b, T *outA, T *outB, const T
457457
// 2D tiling path: slice the workspace and accumulate using float atomics.
458458
constexpr int kTileHeight = 128; // rows per CTA; tune between 128 and 256 if needed
459459
float *outB_accum = nullptr;
460-
MACA_CHECK(mcMallocAsync(&outB_accum, static_cast<size_t>(K) * sizeof(float), stream));
460+
MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&outB_accum), static_cast<size_t>(K) * sizeof(float), stream));
461461
MACA_CHECK(mcMemsetAsync(outB_accum, 0, static_cast<size_t>(K) * sizeof(float), stream));
462462

463463
const dim3 rblock(RED_THREADS, 1, 1);

infini_train/src/kernels/maca/gather.maca

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ std::shared_ptr<Tensor> IndexGatherForward(const std::shared_ptr<Tensor> &input,
8484
const int64_t gather_dim_size = in_dims[dim];
8585

8686
int64_t *dev_buf = nullptr;
87-
MACA_CHECK(mcMallocAsync(&dev_buf, (3 * num_dims) * sizeof(int64_t), stream));
87+
MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&dev_buf), (3 * num_dims) * sizeof(int64_t), stream));
8888
int64_t *out_dims_dev = dev_buf + 0 * num_dims;
8989
int64_t *in_strides_dev = dev_buf + 1 * num_dims;
9090
int64_t *out_strides_dev = dev_buf + 2 * num_dims;
@@ -193,7 +193,7 @@ std::shared_ptr<Tensor> IndexGatherBackward(const std::shared_ptr<Tensor> &grad_
193193
infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
194194
->maca_stream();
195195

196-
MACA_CHECK(mcMallocAsync(&dev_buf, total_i64 * sizeof(int64_t), stream));
196+
MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&dev_buf), total_i64 * sizeof(int64_t), stream));
197197
int64_t *out_dims_dev = dev_buf;
198198
int64_t *in_strides_dev = out_dims_dev + n_out;
199199
int64_t *out_strides_dev = in_strides_dev + n_in_strides;

infini_train/src/kernels/maca/slice.maca

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ std::shared_ptr<Tensor> SliceForward(const std::shared_ptr<Tensor> &input, const
7373
infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
7474
->maca_stream();
7575

76-
mcMallocAsync(&new_dims_dev,
76+
mcMallocAsync(reinterpret_cast<void **>(&new_dims_dev),
7777
(ends.size() + starts.size() + steps.size() + dims.size() + new_dims.size()) * sizeof(int64_t),
7878
stream);
7979
starts_dev = new_dims_dev + ends.size();
@@ -167,7 +167,7 @@ std::shared_ptr<Tensor> SliceBackward(const std::shared_ptr<Tensor> &grad_output
167167
const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
168168
infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
169169
->maca_stream();
170-
mcMallocAsync(&new_dims_dev,
170+
mcMallocAsync(reinterpret_cast<void **>(&new_dims_dev),
171171
(ends.size() + starts.size() + steps.size() + dims.size() + new_dims.size()) * sizeof(int64_t),
172172
stream);
173173
starts_dev = new_dims_dev + ends.size();

infini_train/src/kernels/maca/split.maca

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ std::shared_ptr<Tensor> LaunchSplitBackward(const std::vector<int64_t> &input_di
133133
void *device_ptr;
134134
const T **device_grad_output_ptrs;
135135
int64_t *device_H_outs;
136-
mcMallocAsync(&device_ptr, (sizeof(T *) + sizeof(int64_t)) * num_splits, stream);
136+
mcMallocAsync(reinterpret_cast<void **>(&device_ptr), (sizeof(T *) + sizeof(int64_t)) * num_splits, stream);
137137
device_grad_output_ptrs = (const T **)(device_ptr);
138138
device_H_outs = reinterpret_cast<int64_t *>(device_grad_output_ptrs + num_splits);
139139

infini_train/src/kernels/maca/stack.maca

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ std::shared_ptr<Tensor> StackForward(const std::vector<std::shared_ptr<Tensor>>
6767
for (const auto &t : inputs) { host_input_ptrs.push_back(static_cast<const T *>(t->DataPtr())); }
6868

6969
const T **device_input_ptrs;
70-
mcMallocAsync(&device_input_ptrs, sizeof(T *) * num_inputs, stream);
70+
mcMallocAsync(reinterpret_cast<void **>(&device_input_ptrs), sizeof(T *) * num_inputs, stream);
7171
mcMemcpyAsync(device_input_ptrs, host_input_ptrs.data(), sizeof(T *) * num_inputs, mcMemcpyHostToDevice,
7272
stream);
7373

@@ -136,7 +136,7 @@ std::vector<std::shared_ptr<Tensor>> StackBackward(const std::vector<int64_t> &i
136136
for (auto &t : grads) { host_ptrs.push_back(static_cast<T *>(t->DataPtr())); }
137137

138138
T **device_ptrs;
139-
mcMallocAsync(&device_ptrs, sizeof(T *) * num_inputs, stream);
139+
mcMallocAsync(reinterpret_cast<void **>(&device_ptrs), sizeof(T *) * num_inputs, stream);
140140
mcMemcpyAsync(device_ptrs, host_ptrs.data(), sizeof(T *) * num_inputs, mcMemcpyHostToDevice, stream);
141141

142142
StackBackwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(

0 commit comments

Comments
 (0)