Skip to content

Commit 53042ef

Browse files
committed
fix: remove unnecessary changes
1 parent d4b8fc1 commit 53042ef

File tree

11 files changed

+30
-137
lines changed

11 files changed

+30
-137
lines changed

infini_train/include/datatype.h

Lines changed: 10 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -98,62 +98,17 @@ enum class DataType : int8_t {
9898
kFLOAT64,
9999
};
100100

101-
size_t DTypeSize(DataType data_type);
102-
103-
extern const std::unordered_map<DataType, std::string> kDataTypeToDesc;
104-
105-
// -----------------------------------------------------------------------------
106-
// Compile-time type mapping infrastructure
107-
// -----------------------------------------------------------------------------
108-
// Baseline framework scalar/storage mapping.
109-
// This is the single source of truth for:
110-
// - framework DataType -> C++ type mapping
111-
// - CPU default type mapping
112-
// - backend type-map fallback for dtypes without backend-native overrides
113-
template <DataType DType> struct TypeMap;
114-
115-
template <DataType DType> using TypeMap_t = typename TypeMap<DType>::type;
116-
117-
// -----------------------------------------------------------------------------
118-
// Compile-time reverse mapping: framework C++ type -> DataType
119-
// -----------------------------------------------------------------------------
120-
template <typename T> struct DataTypeMap;
121-
122-
template <typename T> inline constexpr DataType DataTypeMap_v = DataTypeMap<T>::value;
123-
124-
// Macro to define baseline mapping + reverse mapping
125-
#define DEFINE_DEFAULT_DATA_TYPE_MAPPING(ENUM_VALUE, CPP_TYPE) \
126-
template <> struct TypeMap<DataType::ENUM_VALUE> { \
127-
using type = CPP_TYPE; \
128-
}; \
129-
template <> struct DataTypeMap<CPP_TYPE> { \
130-
static constexpr DataType value = DataType::ENUM_VALUE; \
131-
};
132-
133-
DEFINE_DEFAULT_DATA_TYPE_MAPPING(kUINT8, uint8_t)
134-
DEFINE_DEFAULT_DATA_TYPE_MAPPING(kINT8, int8_t)
135-
DEFINE_DEFAULT_DATA_TYPE_MAPPING(kUINT16, uint16_t)
136-
DEFINE_DEFAULT_DATA_TYPE_MAPPING(kINT16, int16_t)
137-
DEFINE_DEFAULT_DATA_TYPE_MAPPING(kUINT32, uint32_t)
138-
DEFINE_DEFAULT_DATA_TYPE_MAPPING(kINT32, int32_t)
139-
DEFINE_DEFAULT_DATA_TYPE_MAPPING(kUINT64, uint64_t)
140-
DEFINE_DEFAULT_DATA_TYPE_MAPPING(kINT64, int64_t)
141-
DEFINE_DEFAULT_DATA_TYPE_MAPPING(kFLOAT32, float)
142-
DEFINE_DEFAULT_DATA_TYPE_MAPPING(kFLOAT64, double)
143-
144-
#undef DEFINE_DEFAULT_DATA_TYPE_MAPPING
145-
146-
// ---------------------------------------------------------------------------
147-
// Low-precision types: reverse mapping ONLY (DataTypeMap).
148-
// TypeMap<kFLOAT16> / TypeMap<kBFLOAT16> are intentionally NOT defined here.
149-
// Backend TypeMaps must explicitly provide these mappings; the default TypeMap
150-
// will static_assert at compile time if dispatch reaches an unmapped dtype.
151-
// ---------------------------------------------------------------------------
152-
template <> struct DataTypeMap<FP16> {
153-
static constexpr DataType value = DataType::kFLOAT16;
101+
inline const std::unordered_map<DataType, size_t> kDataTypeToSize = {
102+
{DataType::kUINT8, 1}, {DataType::kINT8, 1}, {DataType::kUINT16, 2}, {DataType::kINT16, 2},
103+
{DataType::kUINT32, 4}, {DataType::kINT32, 4}, {DataType::kUINT64, 8}, {DataType::kINT64, 8},
104+
{DataType::kBFLOAT16, 2}, {DataType::kFLOAT16, 2}, {DataType::kFLOAT32, 4}, {DataType::kFLOAT64, 8},
154105
};
155-
template <> struct DataTypeMap<BF16> {
156-
static constexpr DataType value = DataType::kBFLOAT16;
106+
107+
inline const std::unordered_map<DataType, std::string> kDataTypeToDesc = {
108+
{DataType::kUINT8, "uint8"}, {DataType::kINT8, "int8"}, {DataType::kUINT16, "uint16"},
109+
{DataType::kINT16, "int16"}, {DataType::kUINT32, "uint32"}, {DataType::kINT32, "int32"},
110+
{DataType::kUINT64, "uint64"}, {DataType::kINT64, "int64"}, {DataType::kBFLOAT16, "bf16"},
111+
{DataType::kFLOAT16, "fp16"}, {DataType::kFLOAT32, "fp32"}, {DataType::kFLOAT64, "fp64"},
157112
};
158113

159114
// =============================================================================

infini_train/include/dispatcher.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88

99
#include "infini_train/include/autocast.h"
1010
#include "infini_train/include/device.h"
11-
// FIXME(dcj): should not include this
12-
#include "infini_train/include/dtype_dispatch.h"
1311
#ifdef PROFILE_MODE
1412
#include "infini_train/include/profiler.h"
1513
#endif

infini_train/include/dtype_dispatch.h

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -234,8 +234,7 @@ auto DispatchByTypeMap(DataType dtype, Functor &&func, std::string_view context_
234234
if constexpr (IsDataTypeInList_v<DType, DataTypeList<AllowedDTypes...>>) { \
235235
static_assert(HasMappedType_v<TypeMap, DType>, \
236236
"TypeMap does not provide explicit mapping for this dtype. " \
237-
"If this is a backend dispatch, register the dtype in the backend TypeMap; " \
238-
"if this is DispatchFunc, the dtype is not supported by the default TypeMap."); \
237+
"Register the dtype in the backend TypeMap (e.g., CpuTypeMap / CudaTypeMap)."); \
239238
return std::forward<Functor>(func).template operator()<MappedType_t<TypeMap, DType>>( \
240239
std::forward<Args>(args)...); \
241240
} else { \
@@ -283,8 +282,7 @@ struct TypeMapDispatcher {
283282
if constexpr (IsDataTypeInList_v<DType, CurrentList>) { \
284283
static_assert(HasMappedType_v<TypeMap, DType>, \
285284
"TypeMap does not provide explicit mapping for this dtype. " \
286-
"If this is a backend dispatch, register the dtype in the backend TypeMap; " \
287-
"if this is DispatchFunc, the dtype is not supported by the default TypeMap."); \
285+
"Register the dtype in the backend TypeMap (e.g., CpuTypeMap / CudaTypeMap)."); \
288286
using T = MappedType_t<TypeMap, DType>; \
289287
return TypeMapDispatcher<TypeMap, Index + 1, AllowedListTuple, ResolvedTypes..., T>::call( \
290288
dtypes, std::forward<Functor>(func), context_identifier, std::forward<Args>(args)...); \
@@ -334,24 +332,4 @@ auto DispatchByTypeMap(const std::vector<DataType> &dtypes, Functor &&func, std:
334332
dtypes, std::forward<Functor>(func), context_identifier, std::forward<Args>(args)...);
335333
}
336334

337-
// -----------------------------------------------------------------------------
338-
// Default framework dispatch using TypeMap
339-
// -----------------------------------------------------------------------------
340-
// TypeMap only covers standard types (int/uint/float32/float64).
341-
// Low-precision types (FP16/BF16) are intentionally unmapped — use a
342-
// backend-specific dispatch (DispatchCudaFunc, DispatchCpuFunc, …) instead.
343-
// -----------------------------------------------------------------------------
344-
template <DataType... AllowedDTypes, typename Functor, typename... Args>
345-
auto DispatchFunc(DataType dtype, Functor &&func, std::string_view context_identifier = "", Args &&...args) {
346-
return DispatchByTypeMap<TypeMap, AllowedDTypes...>(dtype, std::forward<Functor>(func), context_identifier,
347-
std::forward<Args>(args)...);
348-
}
349-
350-
template <typename... AllowedTypeLists, typename Functor, typename... Args>
351-
auto DispatchFunc(const std::vector<DataType> &dtypes, Functor &&func, std::string_view context_identifier = "",
352-
Args &&...args) {
353-
return DispatchByTypeMap<TypeMap, AllowedTypeLists...>(dtypes, std::forward<Functor>(func), context_identifier,
354-
std::forward<Args>(args)...);
355-
}
356-
357335
} // namespace infini_train

infini_train/src/core/runtime/cuda/cuda_dispatch.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
#pragma once
22

3-
#include <cuda_bf16.h>
4-
#include <cuda_fp16.h>
5-
63
#include <utility>
74
#include <vector>
85

6+
#include <cuda_bf16.h>
7+
#include <cuda_fp16.h>
8+
99
#include "infini_train/include/core/backend_type_map.h"
1010
#include "infini_train/include/dtype_dispatch.h"
1111

infini_train/src/datatype.cc

Lines changed: 1 addition & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -163,46 +163,6 @@ BF16 &BF16::operator++() {
163163
return *this;
164164
}
165165

166-
// -----------------------------------------------------------------------------
167-
// DataType metadata
168-
// -----------------------------------------------------------------------------
169-
size_t DTypeSize(DataType data_type) {
170-
switch (data_type) {
171-
case DataType::kUINT8:
172-
return 1;
173-
case DataType::kINT8:
174-
return 1;
175-
case DataType::kUINT16:
176-
return 2;
177-
case DataType::kINT16:
178-
return 2;
179-
case DataType::kUINT32:
180-
return 4;
181-
case DataType::kINT32:
182-
return 4;
183-
case DataType::kUINT64:
184-
return 8;
185-
case DataType::kINT64:
186-
return 8;
187-
case DataType::kBFLOAT16:
188-
return 2;
189-
case DataType::kFLOAT16:
190-
return 2;
191-
case DataType::kFLOAT32:
192-
return 4;
193-
case DataType::kFLOAT64:
194-
return 8;
195-
}
196-
return 0; // unreachable
197-
}
198-
199-
const std::unordered_map<DataType, std::string> kDataTypeToDesc = {
200-
{DataType::kUINT8, "uint8"}, {DataType::kINT8, "int8"}, {DataType::kUINT16, "uint16"},
201-
{DataType::kINT16, "int16"}, {DataType::kUINT32, "uint32"}, {DataType::kINT32, "int32"},
202-
{DataType::kUINT64, "uint64"}, {DataType::kINT64, "int64"}, {DataType::kBFLOAT16, "bf16"},
203-
{DataType::kFLOAT16, "fp16"}, {DataType::kFLOAT32, "fp32"}, {DataType::kFLOAT64, "fp64"},
204-
};
205-
206166
// -----------------------------------------------------------------------------
207167
// DataType-level promotion
208168
// -----------------------------------------------------------------------------
@@ -234,7 +194,7 @@ DataType PromoteDataTypes(DataType a, DataType b) {
234194
}
235195

236196
// Rule 3: same category — wider wins
237-
return DTypeSize(a) >= DTypeSize(b) ? a : b;
197+
return kDataTypeToSize.at(a) >= kDataTypeToSize.at(b) ? a : b;
238198
}
239199

240200
} // namespace infini_train

infini_train/src/kernels/cuda/concat.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ std::shared_ptr<Tensor> ConcatForward(const std::vector<std::shared_ptr<Tensor>>
9090
const int64_t num_inputs = static_cast<int64_t>(inputs.size());
9191
const int64_t K_total = out_dims[dim];
9292

93+
// offsets records the sum of Ks
9394
// offsets[i] = sum_{j < i} K_j
9495
std::vector<int64_t> host_offsets(num_inputs + 1, 0);
9596
for (int64_t i = 0; i < num_inputs; ++i) { host_offsets[i + 1] = host_offsets[i] + Ks[i]; }

infini_train/src/kernels/cuda/elementwise.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "infini_train/include/common/cuda/kernel_helper.cuh"
77
#include "infini_train/include/core/runtime/device_guard.h"
88
#include "infini_train/include/dispatcher.h"
9+
#include "infini_train/include/dtype_dispatch.h"
910
#include "infini_train/include/tensor.h"
1011

1112
#include "infini_train/src/core/runtime/cuda/cuda_runtime_common.h"

infini_train/src/nn/parallel/ddp/distributed_optimizer.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ void DistributedOptimizer::BuildShardParamsAndBindGrads() {
6464
const size_t piece_numel = local_end - local_start;
6565
CHECK_GT(piece_numel, 0);
6666

67-
const size_t param_piece_offset_bytes = local_start * DTypeSize(bucket_param->Dtype());
68-
const size_t grad_piece_offset_bytes = local_start * DTypeSize(bucket_grad->Dtype());
67+
const size_t param_piece_offset_bytes = local_start * kDataTypeToSize.at(bucket_param->Dtype());
68+
const size_t grad_piece_offset_bytes = local_start * kDataTypeToSize.at(bucket_grad->Dtype());
6969

7070
auto param_piece = std::make_shared<Tensor>(*bucket_param, param_piece_offset_bytes,
7171
std::vector<int64_t>{static_cast<int64_t>(piece_numel)});

infini_train/src/nn/parallel/ddp/param_and_grad_buffer.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ std::shared_ptr<Tensor> AllocateFlatBuffer(size_t num_elements, DataType data_ty
3636

3737
std::shared_ptr<Tensor> GetBufferView(const std::shared_ptr<Tensor> buffer, size_t start_in_elements,
3838
const std::vector<int64_t> &dims) {
39-
return std::make_shared<Tensor>(*buffer, start_in_elements * DTypeSize(buffer->Dtype()), dims);
39+
return std::make_shared<Tensor>(*buffer, start_in_elements * kDataTypeToSize.at(buffer->Dtype()), dims);
4040
};
4141

4242
std::vector<std::shared_ptr<Tensor>> ShardBuffer(const std::shared_ptr<Tensor> buffer, size_t ddp_world_size) {
@@ -451,7 +451,7 @@ void ParamAndGradBuffer::BuildBuckets(DataType param_dtype, DataType grad_dtype)
451451
// Remap param/grad pointers
452452
if (param_buffer_) {
453453
// FIXME(zbl): change tensor buffer
454-
param->SetData(*param_buffer_, param_start_index * DTypeSize(param_buffer_->Dtype()), true);
454+
param->SetData(*param_buffer_, param_start_index * kDataTypeToSize.at(param_buffer_->Dtype()), true);
455455
}
456456

457457
auto grad_view = GetBufferView(grad_buffer_, param_start_index, param->Dims());

infini_train/src/nn/parallel/ddp/reducer.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ namespace {
1818
void CopyGradToBucket(const std::shared_ptr<Tensor> &grad, const std::shared_ptr<Tensor> &flat,
1919
size_t dst_elem_offset) {
2020
CHECK(grad && flat);
21-
const size_t element_size_in_bytes = DTypeSize(grad->Dtype());
21+
const size_t element_size_in_bytes = kDataTypeToSize.at(grad->Dtype());
2222
const size_t bytes = grad->NumElements() * element_size_in_bytes;
2323
char *dst = static_cast<char *>(flat->DataPtr()) + dst_elem_offset * element_size_in_bytes;
2424
const void *src = grad->DataPtr();
@@ -33,7 +33,7 @@ void CopyGradToBucket(const std::shared_ptr<Tensor> &grad, const std::shared_ptr
3333
void CopyBucketToGrad(const std::shared_ptr<Tensor> &flat, const std::shared_ptr<Tensor> &grad,
3434
size_t src_elem_offset) {
3535
CHECK(grad && flat);
36-
const size_t element_size_in_bytes = DTypeSize(grad->Dtype());
36+
const size_t element_size_in_bytes = kDataTypeToSize.at(grad->Dtype());
3737
const size_t bytes = grad->NumElements() * element_size_in_bytes;
3838
const char *src = static_cast<const char *>(flat->DataPtr()) + src_elem_offset * element_size_in_bytes;
3939
void *dst = grad->DataPtr();
@@ -48,7 +48,7 @@ void CopyBucketToGrad(const std::shared_ptr<Tensor> &flat, const std::shared_ptr
4848
std::shared_ptr<Tensor> MakeGradView(const std::shared_ptr<Tensor> &contents, size_t offset_elems,
4949
const std::vector<int64_t> &dims) {
5050
// Return a view of contents (same chunk of memory)
51-
auto view = std::make_shared<Tensor>(*contents, offset_elems * DTypeSize(contents->Dtype()), dims);
51+
auto view = std::make_shared<Tensor>(*contents, offset_elems * kDataTypeToSize.at(contents->Dtype()), dims);
5252
return view;
5353
}
5454
} // namespace
@@ -118,7 +118,7 @@ std::vector<std::vector<size_t>> ComputeBucketAssignmentBySize(const std::vector
118118
}
119119
auto &state = it->second;
120120

121-
const size_t element_size_in_bytes = DTypeSize(tensor->Dtype());
121+
const size_t element_size_in_bytes = kDataTypeToSize.at(tensor->Dtype());
122122
const size_t bytes = tensor->NumElements() * element_size_in_bytes;
123123
const size_t cap = bucket_size_limits[state.limit_idx];
124124

0 commit comments

Comments
 (0)