#include <cstdint>
#include <memory>

#include "glog/logging.h"

#include "infini_train/include/common/cuda/common_cuda.h"
#include "infini_train/include/core/runtime/device_guard.h"
#include "infini_train/include/dispatcher.h"
#include "infini_train/include/tensor.h"

#include "infini_train/src/core/runtime/cuda/cuda_dispatch.h"
#include "infini_train/src/core/runtime/cuda/cuda_runtime_common.h"
namespace infini_train::kernels::cuda {

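// One thread per row of the [rows, num_experts] logits matrix: scan the row for
// the argmax expert, then copy only that expert's value to the output and zero
// the rest. Comparisons are done in float so that reduced-precision types
// (half/bfloat16) order correctly; ties resolve to the lowest expert index.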
template <typename T>
__global__ void Top1MaskForwardKernel(const T *__restrict__ input, T *__restrict__ output, int64_t rows,
                                      int64_t num_experts) {
    const int64_t row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= rows) {
        return;
    }

    // Serial scan for the row maximum.
    const int64_t offset = row * num_experts;
    int64_t best_idx = 0;
    float best_value = static_cast<float>(input[offset]);
    for (int64_t expert_idx = 1; expert_idx < num_experts; ++expert_idx) {
        const float value = static_cast<float>(input[offset + expert_idx]);
        if (value > best_value) {
            best_value = value;
            best_idx = expert_idx;
        }
    }
    // Keep the winning expert's value; zero out every other expert.
    for (int64_t expert_idx = 0; expert_idx < num_experts; ++expert_idx) {
        output[offset + expert_idx] = expert_idx == best_idx ? input[offset + expert_idx] : T(0.0f);
    }
}

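// Host entry point: flattens all leading dimensions into rows and masks along
// the last (expert) dimension, launching one thread per row.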
std::shared_ptr<Tensor> Top1MaskForward(const std::shared_ptr<Tensor> &input) {
    CHECK_GE(input->Dims().size(), 1);
    const auto &dims = input->Dims();
    const int64_t num_experts = dims.back();
    CHECK_GT(num_experts, 0);
    const int64_t rows = input->NumElements() / num_experts;

    auto output = std::make_shared<Tensor>(dims, input->Dtype(), input->GetDevice());

    auto device = input->GetDevice();
    const auto &stream = dynamic_cast<infini_train::core::cuda::CudaStream *>(
                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                             ->cuda_stream();
    const int threads = 256;
    const int blocks = static_cast<int>((rows + threads - 1) / threads);

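    // Dispatch on the tensor's runtime dtype (floating-point types only) and
    // launch the templated kernel instantiated for that type.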
    core::cuda::DispatchCudaFunc<INFINI_ALL_FLOATING_TYPES>(
        input->Dtype(),
        [=]<typename T>() {
            Top1MaskForwardKernel<T><<<blocks, threads, 0, stream>>>(
                static_cast<const T *>(input->DataPtr()), static_cast<T *>(output->DataPtr()), rows, num_experts);
        },
        "CUDA Top1MaskForward");

    return output;
}

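// Element-wise backward: the gradient passes through only at positions where
// the forward mask kept a nonzero value. Note that a selected expert whose
// logit was exactly zero is indistinguishable from a masked-out one here, so
// its gradient is dropped as well.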
template <typename T>
__global__ void Top1MaskBackwardKernel(const T *__restrict__ grad_output, const T *__restrict__ mask_values,
                                       T *__restrict__ grad_input, int64_t total_elements) {
    const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= total_elements) {
        return;
    }
    grad_input[idx] = static_cast<float>(mask_values[idx]) != 0.0f ? grad_output[idx] : T(0.0f);
}

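// Host entry point: routes grad_output through the saved forward mask
// element-wise; the two inputs must agree in shape and dtype.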
std::shared_ptr<Tensor> Top1MaskBackward(const std::shared_ptr<Tensor> &grad_output,
                                         const std::shared_ptr<Tensor> &mask_values) {
    CHECK(grad_output->Dims() == mask_values->Dims());
    CHECK(grad_output->Dtype() == mask_values->Dtype());
    auto grad_input = std::make_shared<Tensor>(grad_output->Dims(), grad_output->Dtype(), grad_output->GetDevice());

    auto device = grad_output->GetDevice();
    const auto &stream = dynamic_cast<infini_train::core::cuda::CudaStream *>(
                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                             ->cuda_stream();
    const int64_t total_elements = grad_output->NumElements();
    const int threads = 256;
    const int blocks = static_cast<int>((total_elements + threads - 1) / threads);

    // Dispatch on the runtime dtype and launch the element-wise kernel.
    core::cuda::DispatchCudaFunc<INFINI_ALL_FLOATING_TYPES>(
        grad_output->Dtype(),
        [=]<typename T>() {
            Top1MaskBackwardKernel<T><<<blocks, threads, 0, stream>>>(
                static_cast<const T *>(grad_output->DataPtr()), static_cast<const T *>(mask_values->DataPtr()),
                static_cast<T *>(grad_input->DataPtr()), total_elements);
        },
        "CUDA Top1MaskBackward");

    return grad_input;
}

} // namespace infini_train::kernels::cuda

// Register both kernels with the dispatcher under the CUDA device type.
#define REGISTER_CUDA_TOP1_MASK_KERNEL(kernel_name)                                                                    \
    REGISTER_KERNEL(infini_train::Device::DeviceType::kCUDA, kernel_name, infini_train::kernels::cuda::kernel_name)

REGISTER_CUDA_TOP1_MASK_KERNEL(Top1MaskForward)
REGISTER_CUDA_TOP1_MASK_KERNEL(Top1MaskBackward)

#undef REGISTER_CUDA_TOP1_MASK_KERNEL