alibaba
diff --git a/‎rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.cc‎
Lines changed: 54 additions & 631 deletions b/‎rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.cc‎
Lines changed: 54 additions & 631 deletions
diff --git a/‎rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.h‎
Lines changed: 22 additions & 73 deletions b/‎rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.h‎
Lines changed: 22 additions & 73 deletions
diff --git a/‎rtp_llm/cpp/normal_engine/NormalExecutor.cc‎
Lines changed: 1 addition & 2 deletions b/‎rtp_llm/cpp/normal_engine/NormalExecutor.cc‎
Lines changed: 1 addition & 2 deletions
@@ -1,13 +1,17 @@
 #pragma once
 
 #include <memory>
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "rtp_llm/cpp/cache/CacheConfig.h"
+#include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/devices/DeviceBase.h"
 #include "rtp_llm/cpp/cache/CacheGroupType.h"
-#include "rtp_llm/cpp/config/ConfigModules.h"
-#include "rtp_llm/cpp/models/SampleInfos.h"
 #include "rtp_llm/cpp/engine_base/stream/StreamGroups.h"
-#include "absl/status/statusor.h"
-#include "absl/status/status.h"
+#include "rtp_llm/cpp/models/SampleInfos.h"
+#include "rtp_llm/cpp/normal_engine/NormalModelInputGatherer.h"
+#include "rtp_llm/cpp/normal_engine/NormalOutputDispatcher.h"
+#include "rtp_llm/cpp/normal_engine/NormalSamplerInputGatherer.h"
 
 namespace rtp_llm {
 
@@ -17,84 +21,29 @@ class NormalBatchStreamProcessor {
                                const PDSepConfig&                 pd_sep_config,
                                const ProfilingDebugLoggingConfig& profiling_debug_logging_config,
                                const CacheConfig&                 cache_config,
-                               bool                               warm_up):
-        num_layers_(model_config.num_layers),
-        vocab_size_(model_config.vocab_size),
-        input_vocab_size_(model_config.input_vocab_size),
-        use_int8_kv_cache_(model_config.attn_config.kv_cache_dtype == rtp_llm::KvCacheDataType::INT8),
-        has_positional_encoding_(model_config.has_positional_encoding),
-        is_multimodal_(model_config.mm_model_config.is_multimodal),
-        mm_position_ids_style_((PositionIdsStyle)model_config.mm_model_config.mm_position_ids_style),
-        position_id_len_factor_(model_config.attn_config.rope_config.index_factor),
-        role_type_(pd_sep_config.role_type),
-        decode_entrance_(pd_sep_config.decode_entrance),
-        block_stride_bytes_(cache_config.kv_block_stride_bytes),
-        scale_stride_bytes_(cache_config.kv_scale_stride_bytes),
-        seq_size_per_block_(cache_config.seq_size_per_block),
-        kernel_seq_size_per_block_(cache_config.kernel_seq_size_per_block),
-        kernel_blocks_per_kv_block_(cache_config.kernelBlocksPerKvBlock()),
-        kv_cache_group_nums_(cache_config.groupNums()),
-        layer_to_kv_cache_group_id_(cache_config.layer_to_group_id),
-        kv_cache_group_types_(cache_config.group_types),
-        warm_up_(warm_up),
-        enable_detail_log_(profiling_debug_logging_config.enable_detail_log),
-        device_(rtp_llm::DeviceFactory::getDefaultDevice()) {}
+                               bool                               warm_up);
 
     virtual absl::Status dispatch(const StreamGroups& stream_groups, const MergedOutput& merge_outputs) const;
     virtual absl::StatusOr<GptModelInputs> gatherModelInput(const StreamGroups& stream_groups) const;
     virtual absl::StatusOr<SamplerInputs>  gatherSamplerInput(const StreamGroups&    stream_groups,
-                                                              const GptModelInputs&  model_inputs,
                                                               const GptModelOutputs& model_output) const;
 
 protected:
-    SamplerInputs allocateSamplerInputs(const StreamGroups&       stream_groups,
-                                        size_t                    total_batch_size_in,
-                                        size_t                    total_batch_size_out,
-                                        const rtp_llm::BufferPtr& sequence_length,
-                                        size_t                    propose_step = 0) const;
-    void          setCommonSamplerInputs(SamplerInputs&                sampler_inputs,
-                                         std::list<GenerateStreamPtr>& all_streams,
-                                         bool                          score_batch  = false,
-                                         size_t                        propose_step = 0) const;
-    void          setLogitsProcessorInputs(SamplerInputs&                sampler_inputs,
-                                           std::list<GenerateStreamPtr>& all_streams,
-                                           bool                          score_batch = false) const;
-
-    void dispatchSingleStream(GenerateStreamPtr   stream,
-                              const MergedOutput& merge_outputs,
-                              int                 batch_idx_in,
-                              int                 batch_idx_out,
-                              int                 token_offset,
-                              bool                return_all_probs,
-                              const BufferPtr&    new_tokens_all) const;
-
-    void setKVCacheGroupTypes(std::vector<CacheGroupType> kv_cache_group_types) {
-        kv_cache_group_types_ = kv_cache_group_types;
-    }
-
-protected:
-    size_t                       num_layers_;
-    size_t                       vocab_size_;
-    size_t                       input_vocab_size_;
-    bool                         use_int8_kv_cache_;
-    bool                         has_positional_encoding_;
-    bool                         is_multimodal_;
-    PositionIdsStyle             mm_position_ids_style_;
-    size_t                       position_id_len_factor_;
-    RoleType                     role_type_;
-    bool                         decode_entrance_;
-    size_t                       block_stride_bytes_;
-    size_t                       scale_stride_bytes_;
-    size_t                       seq_size_per_block_;
-    size_t                       kernel_seq_size_per_block_;
-    size_t                       kernel_blocks_per_kv_block_ = 1;
-    size_t                       kv_cache_group_nums_        = 1;
-    mutable std::vector<int32_t> layer_to_kv_cache_group_id_;
-    std::vector<CacheGroupType>  kv_cache_group_types_;
-    bool                         warm_up_;
-    bool                         enable_detail_log_;
+    SamplerInputs allocateSamplerInputs(const StreamGroups& stream_groups,
+                                        size_t              total_batch_size_in,
+                                        size_t              total_batch_size_out,
+                                        size_t              propose_step) const;
+    void          fillSamplerCommonInputs(SamplerInputs&                sampler_inputs,
+                                          std::list<GenerateStreamPtr>& all_streams,
+                                          bool                          score_batch  = false,
+                                          size_t                        propose_step = 0) const;
 
     rtp_llm::DeviceBase* device_;
+    size_t               vocab_size_;
+
+    std::unique_ptr<NormalModelInputGatherer>   model_input_gatherer_;
+    std::unique_ptr<NormalSamplerInputGatherer> sampler_input_gatherer_;
+    std::unique_ptr<NormalOutputDispatcher>     output_dispatcher_;
 };
 
 }  // namespace rtp_llm
@@ -170,8 +170,7 @@ absl::Status NormalExecutor::process(const std::list<GenerateStreamPtr>& streams
     {
         RTP_LLM_PROFILE_SCOPE("executor.sampler_forward");
         int64_t start_time_us = autil::TimeUtility::currentTimeInMicroSeconds();
-        CHECK_AND_RETURN_REF(sampler_input,
-                             batch_stream_processor_->gatherSamplerInput(stream_groups, model_input, model_output));
+        CHECK_AND_RETURN_REF(sampler_input, batch_stream_processor_->gatherSamplerInput(stream_groups, model_output));
         sampler_output = std::move(sampler_->forward(sampler_input));
         RTP_LLM_LOG_DEBUG("sampler forward done");
         executor_collector.sample_input_us = autil::TimeUtility::currentTimeInMicroSeconds() - start_time_us;