Skip to content

Commit 0f7d258

Browse files
feat: support flux model on mlu device. (#1138)
Co-authored-by: a120092009 <zhaoty0121@gmail.com>
1 parent 535b3bc commit 0f7d258

14 files changed

Lines changed: 85 additions & 53 deletions

xllm/core/framework/dit_model_context.cpp

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -74,14 +74,21 @@ const QuantArgs& DiTModelContext::get_quant_args(
7474
}
7575
}
7676

77-
#if defined(USE_NPU)
77+
#if defined(USE_NPU) || defined(USE_CUDA) || defined(USE_MLU)
7878
ModelContext DiTModelContext::get_model_context(
7979
const std::string& component) const {
80+
#if defined(USE_NPU)
8081
return ModelContext(parallel_args_,
8182
get_model_args(component),
8283
get_quant_args(component),
8384
tensor_options_,
8485
context_);
86+
#else
87+
return ModelContext(parallel_args_,
88+
get_model_args(component),
89+
get_quant_args(component),
90+
tensor_options_);
91+
#endif
8592
}
8693
#endif
8794

xllm/core/framework/dit_model_context.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ class DiTModelContext {
4444

4545
const QuantArgs& get_quant_args(const std::string& component) const;
4646

47-
#if defined(USE_NPU)
47+
#if defined(USE_NPU) || defined(USE_CUDA) || defined(USE_MLU)
4848
ModelContext get_model_context(const std::string& component) const;
4949
#endif
5050

xllm/core/framework/parallel_state/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ cc_library(
66
parallel_state
77
HDRS
88
mapping_npu.h
9-
dit_mapping_npu.h
9+
dit_mapping.h
1010
rank_generator.h
1111
parallel_args.h
1212
parallel_state.h
@@ -21,7 +21,7 @@ cc_library(
2121
dit_collective_communicator.h
2222
SRCS
2323
mapping_npu.cpp
24-
dit_mapping_npu.cpp
24+
dit_mapping.cpp
2525
parallel_state.cpp
2626
parallel_state_async.cpp
2727
process_group.cpp

xllm/core/framework/parallel_state/dit_collective_communicator.cpp

Lines changed: 16 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -42,22 +42,20 @@ DiTCollectiveCommunicator::DiTCollectiveCommunicator(int32_t global_rank,
4242
int32_t dit_sp_size,
4343
int32_t dit_cfg_size)
4444
: CollectiveCommunicatorBase(global_rank, world_size) {
45-
#if defined(USE_NPU)
46-
DiTMappingNPU::Options dit_mapping_options;
47-
dit_mapping_options.dit_tp_size(dit_tp_size)
48-
.dit_sp_size(dit_sp_size)
49-
.dit_cfg_size(dit_cfg_size)
50-
.dit_dp_size(dit_dp_size);
51-
dit_mapping_npu_ = std::make_unique<DiTMappingNPU>(
52-
world_size, global_rank, dit_mapping_options);
5345
parallel_args_ = std::make_unique<ParallelArgs>(global_rank,
5446
world_size,
5547
dit_dp_size,
5648
dit_tp_size,
5749
dit_sp_size,
5850
dit_cfg_size,
59-
nullptr);
60-
#endif
51+
/*process_group=*/nullptr);
52+
DiTMapping::Options dit_mapping_options;
53+
dit_mapping_options.dit_tp_size(dit_tp_size)
54+
.dit_sp_size(dit_sp_size)
55+
.dit_cfg_size(dit_cfg_size)
56+
.dit_dp_size(dit_dp_size);
57+
dit_mapping_ = std::make_unique<DiTMapping>(
58+
world_size, global_rank, dit_mapping_options);
6159
}
6260

6361
void DiTCollectiveCommunicator::create_process_groups(
@@ -87,14 +85,13 @@ void DiTCollectiveCommunicator::create_process_groups(
8785

8886
parallel_args_->process_group_ = process_group_.get();
8987

90-
if (tp_size > 1) {
91-
auto tp_parallel_info = dit_mapping_npu_->get_parallel_info("tp");
88+
if (tp_size > 1 && dit_mapping_) {
89+
auto tp_parallel_info = dit_mapping_->get_parallel_info("tp");
9290
auto group_id = tp_parallel_info.current_group_id();
9391
auto num_group = tp_parallel_info.num_group();
9492
auto local_rank = tp_parallel_info.rank();
9593
auto& rank_per_group = tp_parallel_info.rank_per_group()[group_id];
9694
int port_offset = group_id + 1;
97-
#if defined(USE_NPU)
9895
dit_tp_group_ = create_process_group(global_rank,
9996
local_rank,
10097
rank_per_group,
@@ -105,18 +102,16 @@ void DiTCollectiveCommunicator::create_process_groups(
105102
"tp_group",
106103
device);
107104
parallel_args_->dit_tp_group_ = dit_tp_group_.get();
108-
#endif
109105
port += num_group;
110106
}
111107

112-
if (sp_size > 1) {
113-
auto sp_parallel_info = dit_mapping_npu_->get_parallel_info("sp");
108+
if (sp_size > 1 && dit_mapping_) {
109+
auto sp_parallel_info = dit_mapping_->get_parallel_info("sp");
114110
auto group_id = sp_parallel_info.current_group_id();
115111
auto num_group = sp_parallel_info.num_group();
116112
auto local_rank = sp_parallel_info.rank();
117113
auto& rank_per_group = sp_parallel_info.rank_per_group()[group_id];
118114
int port_offset = group_id + 1;
119-
#if defined(USE_NPU)
120115
dit_sp_group_ = create_process_group(global_rank,
121116
local_rank,
122117
rank_per_group,
@@ -127,18 +122,16 @@ void DiTCollectiveCommunicator::create_process_groups(
127122
"sp_group",
128123
device);
129124
parallel_args_->dit_sp_group_ = dit_sp_group_.get();
130-
#endif
131125
port += num_group;
132126
}
133127

134-
if (cfg_size > 1) {
135-
auto cfg_parallel_info = dit_mapping_npu_->get_parallel_info("cfg");
128+
if (cfg_size > 1 && dit_mapping_) {
129+
auto cfg_parallel_info = dit_mapping_->get_parallel_info("cfg");
136130
auto group_id = cfg_parallel_info.current_group_id();
137131
auto num_group = cfg_parallel_info.num_group();
138132
auto local_rank = cfg_parallel_info.rank();
139133
auto& rank_per_group = cfg_parallel_info.rank_per_group()[group_id];
140134
int port_offset = group_id + 1;
141-
#if defined(USE_NPU)
142135
dit_cfg_group_ = create_process_group(global_rank,
143136
local_rank,
144137
rank_per_group,
@@ -149,18 +142,16 @@ void DiTCollectiveCommunicator::create_process_groups(
149142
"cfg_group",
150143
device);
151144
parallel_args_->dit_cfg_group_ = dit_cfg_group_.get();
152-
#endif
153145
port += num_group;
154146
}
155147

156-
if (dp_size > 1) {
157-
auto dp_parallel_info = dit_mapping_npu_->get_parallel_info("dp");
148+
if (dp_size > 1 && dit_mapping_) {
149+
auto dp_parallel_info = dit_mapping_->get_parallel_info("dp");
158150
auto group_id = dp_parallel_info.current_group_id();
159151
auto num_group = dp_parallel_info.num_group();
160152
auto local_rank = dp_parallel_info.rank();
161153
auto& rank_per_group = dp_parallel_info.rank_per_group()[group_id];
162154
int port_offset = group_id + 1;
163-
#if defined(USE_NPU)
164155
dit_dp_group_ = create_process_group(global_rank,
165156
local_rank,
166157
rank_per_group,
@@ -171,7 +162,6 @@ void DiTCollectiveCommunicator::create_process_groups(
171162
"dp_group",
172163
device);
173164
parallel_args_->dit_dp_group_ = dit_dp_group_.get();
174-
#endif
175165
port += num_group;
176166
}
177167
}

xllm/core/framework/parallel_state/dit_collective_communicator.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ limitations under the License.
1616
#pragma once
1717

1818
#include "collective_communicator_base.h"
19-
#include "dit_mapping_npu.h"
19+
#include "dit_mapping.h"
2020

2121
namespace xllm {
2222

@@ -38,7 +38,7 @@ class DiTCollectiveCommunicator : public CollectiveCommunicatorBase {
3838
const ParallelArgs* parallel_args() override;
3939

4040
private:
41-
std::unique_ptr<DiTMappingNPU> dit_mapping_npu_{nullptr};
41+
std::unique_ptr<DiTMapping> dit_mapping_{nullptr};
4242
std::unique_ptr<ParallelArgs> parallel_args_;
4343
std::unique_ptr<ProcessGroup> process_group_;
4444
std::unique_ptr<ProcessGroup> dit_tp_group_;

xllm/core/framework/parallel_state/dit_mapping_npu.cpp renamed to xllm/core/framework/parallel_state/dit_mapping.cpp

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
1313
limitations under the License.
1414
==============================================================================*/
1515

16-
#include "dit_mapping_npu.h"
16+
#include "dit_mapping.h"
1717

1818
#include <glog/logging.h>
1919

2020
namespace xllm {
2121

22-
DiTMappingNPU::DiTMappingNPU(const int32_t world_size,
23-
const int32_t rank,
24-
const Options& options)
22+
DiTMapping::DiTMapping(const int32_t world_size,
23+
const int32_t rank,
24+
const Options& options)
2525
: rank_(rank), options_(options), world_size_(world_size) {
2626
tp_.backend("hccl");
2727
sp_.backend("hccl");
@@ -41,7 +41,7 @@ DiTMappingNPU::DiTMappingNPU(const int32_t world_size,
4141
set_group_by_type(dp_, "dp");
4242
}
4343

44-
void DiTMappingNPU::parse_parallel_info() {
44+
void DiTMapping::parse_parallel_info() {
4545
if (options_.dit_tp_size() != -1) {
4646
tp_.group_size(options_.dit_tp_size());
4747
}
@@ -56,7 +56,7 @@ void DiTMappingNPU::parse_parallel_info() {
5656
}
5757
}
5858

59-
void DiTMappingNPU::validate() {
59+
void DiTMapping::validate() {
6060
CHECK(cfg_.group_size() * tp_.group_size() * sp_.group_size() *
6161
dp_.group_size() ==
6262
world_size_)
@@ -84,8 +84,8 @@ void DiTMappingNPU::validate() {
8484
". Please check `cfg` .";
8585
}
8686

87-
void DiTMappingNPU::set_group_by_type(ParallelInfo& parallel_info,
88-
const std::string& group_type) {
87+
void DiTMapping::set_group_by_type(ParallelInfo& parallel_info,
88+
const std::string& group_type) {
8989
auto rank_per_group = rank_generator_->get_ranks(group_type);
9090
parallel_info.rank_per_group(rank_per_group);
9191
auto group_size = rank_per_group[0].size();
@@ -99,7 +99,7 @@ void DiTMappingNPU::set_group_by_type(ParallelInfo& parallel_info,
9999
parallel_info.rank(local_rank);
100100
}
101101

102-
std::tuple<int32_t, int32_t> DiTMappingNPU::get_current_group_id(
102+
std::tuple<int32_t, int32_t> DiTMapping::get_current_group_id(
103103
const std::vector<std::vector<int32_t>>& rank_per_group,
104104
int32_t target_rank_id) {
105105
for (int32_t idx = 0; idx < rank_per_group.size(); ++idx) {
@@ -112,7 +112,7 @@ std::tuple<int32_t, int32_t> DiTMappingNPU::get_current_group_id(
112112
return std::make_tuple(-1, -1);
113113
}
114114

115-
const ParallelInfo& DiTMappingNPU::get_parallel_info(
115+
const ParallelInfo& DiTMapping::get_parallel_info(
116116
const std::string& group_type) const {
117117
if (group_type == "tp") {
118118
return tp_;
@@ -127,7 +127,7 @@ const ParallelInfo& DiTMappingNPU::get_parallel_info(
127127
}
128128
}
129129

130-
nlohmann::json DiTMappingNPU::to_json() {
130+
nlohmann::json DiTMapping::to_json() {
131131
nlohmann::json data;
132132

133133
data["SpSize"] = options_.dit_sp_size();

xllm/core/framework/parallel_state/dit_mapping_npu.h renamed to xllm/core/framework/parallel_state/dit_mapping.h

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ limitations under the License.
2323
#include "rank_generator.h"
2424
namespace xllm {
2525

26-
class DiTMappingNPU final {
26+
class DiTMapping final {
2727
public:
2828
struct Options {
2929
// cfg size
@@ -36,9 +36,9 @@ class DiTMappingNPU final {
3636
PROPERTY(int32_t, dit_dp_size) = -1;
3737
};
3838

39-
DiTMappingNPU(const int32_t world_size,
40-
const int32_t rank,
41-
const Options& options);
39+
DiTMapping(const int32_t world_size,
40+
const int32_t rank,
41+
const Options& options);
4242

4343
int32_t get_num_nodes();
4444

xllm/core/framework/parallel_state/mlu_process_group.h

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,30 @@ class ProcessGroupImpl : public ProcessGroup {
5555
pg_ = std::make_unique<torch_mlu::ProcessGroupCNCL>(
5656
store, rank, rank_size, pg_options);
5757
}
58+
59+
ProcessGroupImpl(int32_t global_rank,
60+
int32_t local_rank,
61+
const std::vector<int32_t>& group_ranks,
62+
int32_t world_size,
63+
int32_t rank_size,
64+
int32_t port,
65+
const std::string& host,
66+
const std::string& group_name,
67+
const torch::Device& device)
68+
: ProcessGroup(global_rank, world_size, device) {
69+
c10::intrusive_ptr<torch_mlu::ProcessGroupCNCL::Options> pg_options =
70+
torch_mlu::ProcessGroupCNCL::Options::create();
71+
pg_options->group_name = group_name;
72+
std::vector<size_t> ranks_unsigned;
73+
ranks_unsigned.reserve(group_ranks.size());
74+
for (int32_t rank : group_ranks) {
75+
ranks_unsigned.push_back(static_cast<size_t>(rank));
76+
}
77+
pg_options->global_ranks_in_group = ranks_unsigned;
78+
auto store = create_tcp_store(host, port, local_rank);
79+
pg_ = std::make_unique<torch_mlu::ProcessGroupCNCL>(
80+
store, local_rank, rank_size, pg_options);
81+
}
5882
};
5983

6084
} // namespace xllm

xllm/core/framework/parallel_state/process_group.cpp

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -198,7 +198,6 @@ std::unique_ptr<ProcessGroup> create_process_group(
198198
rank, world_size, rank_size, port, trans, host, group_name, device);
199199
}
200200

201-
#if defined(USE_NPU)
202201
// TODO: This function is used by DiT models, since the DiT communication group
203202
// info have already been calculated by rank_generator, we only need to pass the
204203
// info to create the process groups. For any device that want to reuse the
@@ -224,5 +223,4 @@ std::unique_ptr<ProcessGroup> create_process_group(
224223
group_name,
225224
device);
226225
}
227-
#endif
228226
} // namespace xllm

xllm/models/dit/clip_text_model.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,9 @@ limitations under the License.
1616

1717
#pragma once
1818

19+
#if defined(USE_NPU)
1920
#include <atb/atb_infer.h>
21+
#endif
2022
#include <c10/core/ScalarType.h>
2123
#include <torch/torch.h>
2224

@@ -27,13 +29,17 @@ limitations under the License.
2729
#include "core/framework/kv_cache/kv_cache.h"
2830
#include "core/framework/model/model_input_params.h"
2931
#include "core/framework/model_context.h"
32+
#if defined(USE_NPU)
3033
#include "core/layers/npu/npu_siglip_encoder_layer_impl.h"
34+
#endif
3135
#include "models/model_registry.h"
3236
#include "processors/clip_image_processor.h"
3337
#include "processors/clip_input_processor.h"
3438
#include "processors/pywarpper_image_processor.h"
3539
#include "xllm/core/layers/common/add_matmul.h"
40+
#if defined(USE_NPU)
3641
#include "xllm_atb_layers/core/include/atb_speed/log.h"
42+
#endif
3743

3844
namespace xllm {
3945
// clip_text_model compatible with huggingface weights

0 commit comments

Comments (0)