Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 37 additions & 5 deletions csrc/engine/compiler/paged_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "../../utils.hpp"

namespace infinilm::engine {

PagedCompiler::PagedCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier)
: GraphCompiler(model, barrier) {
for (size_t b = 1; b < 64; ++b) {
Expand All @@ -27,7 +28,8 @@ void PagedCompiler::compile() {
block_tables_holder_ = infinicore::Tensor::empty(
{nblocks * max_batch_size}, infinicore::DataType::I32, infinicore::context::getDevice());
set_zeros(block_tables_holder_);
for (size_t b : decode_batch_sizes_) {

auto make_decode_input = [&](size_t b) {
InfinilmModel::Input input;
input.input_ids = infinicore::Tensor::empty({1, b}, infinicore::DataType::I64, infinicore::context::getDevice());
input.position_ids = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
Expand Down Expand Up @@ -59,8 +61,31 @@ void PagedCompiler::compile() {
input.block_tables,
input.slot_mapping,
};
return input;
};

{
const size_t warmup_batch_size = std::min(max_batch_size, static_cast<size_t>(64));
auto input = make_decode_input(warmup_batch_size);
model_->forward(input);
infinicore::context::syncStream();
// Warmup runs the eager Marlin path and may leave per-layer lock
// workspaces dirty. Reset before CUDA graph capture so capture
// starts from the same all-zero lock state as normal execution.
model_->reset_runtime_state();
infinicore::context::syncStream();
}

for (size_t b : decode_batch_sizes_) {
auto input = make_decode_input(b);

barrier_->wait();
// Capture must not start with stale Marlin locks from previous
// warmup/capture attempts. This reset is intentionally outside
// graph capture; the current implementation still pays a memset
// before every graph replay in get_compiled().
model_->reset_runtime_state();
infinicore::context::syncStream();
infinicore::context::startGraphRecording();
auto output = model_->forward(input);
auto graph = infinicore::context::stopGraphRecording();
Expand Down Expand Up @@ -101,12 +126,19 @@ PagedCompiler::Compiled PagedCompiler::get_compiled(const InfinilmModel::Input &
return {nullptr, nullptr};
}

// Initialize full padding to -1, then overwrite the narrowed logical region.
// This matches scheduler padding semantics without risking -1 access during graph recording.
// Initialize only the active graph rows to -1, then overwrite the
// runtime logical region. Avoid clearing the full preallocated
// holder on every decode token.
auto &graph_block_tables = graph_input.block_tables.value();
set_minus_one(graph_block_tables);
graph_input.block_tables.value()->narrow({{1, 0, block_per_req}})->copy_from(input.block_tables.value());
set_minus_one_device_async(graph_block_tables);
graph_block_tables->narrow({{1, 0, block_per_req}})->copy_from(input.block_tables.value());
graph_input.slot_mapping.value()->copy_from(input.slot_mapping.value());
// CUDA graph replay reuses the same per-layer Marlin workspaces.
// The graph itself does not contain a workspace reset, so enqueue
// one on the same stream before launch. This is correct but costs
// decode latency; the intended follow-up is a reusable global
// zero workspace/lock buffer shared by all Marlin layers.
model_->reset_runtime_state();

auto graph = std::get<0>(result->second.compiled);
auto shared_output = std::shared_ptr<InfinilmModel::Output>(new InfinilmModel::Output{std::get<1>(result->second.compiled)->logits->resume_from_blob_()});
Expand Down
18 changes: 16 additions & 2 deletions csrc/engine/infer_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,10 @@ InferEngine::InferEngine(
enable_graph_compiling,
attention_backend_));
}
// Compile the model on all workers
this->compile();
// Graphs must be compiled after weights are loaded and post-processed.
// Quantized models may replace their linear implementations during
// process_weights_after_loading(), so compiling here would capture stale
// fallback operators.
}

//------------------------------------------------------
Expand Down Expand Up @@ -77,6 +79,8 @@ void InferEngine::process_weights_after_loading() {
for (auto &worker : workers_) {
worker->process_weights_after_loading();
}
weights_finalized_ = true;
this->compile();
}

//------------------------------------------------------
Expand All @@ -94,6 +98,13 @@ std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> InferEng
return results;
}

std::vector<std::string> InferEngine::state_dict_keys() {
if (0 == workers_.size()) {
throw std::runtime_error(" Model object not found. ");
}
return workers_.front()->state_dict_keys();
}

//------------------------------------------------------
// forward
//------------------------------------------------------
Expand Down Expand Up @@ -159,6 +170,9 @@ InferEngine::Output InferEngine::forward(const InferEngine::Input &input) {
}

void InferEngine::compile() {
if (!weights_finalized_) {
return;
}
for (auto &worker : workers_) {
worker->compile();
}
Expand Down
3 changes: 3 additions & 0 deletions csrc/engine/infer_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ class InferEngine {
// return the parameters (i.e. weights and biases).
std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> state_dict();

std::vector<std::string> state_dict_keys();

// Run a single forward pass on all workers and return the outputs from all ranks
Output forward(const Input &input);

Expand All @@ -63,6 +65,7 @@ class InferEngine {
std::unique_ptr<cache::CacheConfig> cache_config_;
std::shared_ptr<infinilm::config::ModelConfig> model_config_;
backends::AttentionBackend attention_backend_ = backends::AttentionBackend::Default;
bool weights_finalized_ = false;
};

} // namespace infinilm::engine
13 changes: 13 additions & 0 deletions csrc/engine/rank_worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,17 @@ std::unordered_map<std::string, infinicore::nn::Parameter> RankWorker::state_dic
return model_->state_dict();
}

std::vector<std::string> RankWorker::state_dict_keys() {
std::unique_lock<std::mutex> lk(mutex_);
cv_.wait(lk, [&] { return init_done_ || should_exit_; });

if (!model_) {
throw std::runtime_error("state_dict_keys called before model initialization");
}

return model_->state_dict_keys();
}

//------------------------------------------------------
// run -- asynchronous
//------------------------------------------------------
Expand Down Expand Up @@ -349,6 +360,8 @@ void RankWorker::thread_loop() {
// Handle preprocess command
try {
model_->process_weights_after_loading();
infinicore::context::syncStream();
infinicore::context::trimMemory();
} catch (const std::exception &e) {
{
std::lock_guard<std::mutex> lk(mutex_);
Expand Down
2 changes: 2 additions & 0 deletions csrc/engine/rank_worker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ class RankWorker {
// return the parameters (i.e. weights and biases).
std::unordered_map<std::string, infinicore::nn::Parameter> state_dict();

std::vector<std::string> state_dict_keys();

// Submit a run (forward) job.
void run(const Input &args);

Expand Down
12 changes: 8 additions & 4 deletions csrc/layers/attention/attention.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,14 @@ class Attention : public infinicore::nn::Module {
infinicore::Tensor forward(const infinicore::Tensor &positions,
const infinicore::Tensor &hidden_states) const;

void process_fused_weights_after_loading() {
void process_weights_after_loading() override {
qkv_proj_->process_weights_after_loading();
}

void reset_runtime_state() const override {
qkv_proj_->reset_runtime_state();
}

size_t layer_idx() const { return layer_idx_; }
size_t num_heads() const { return num_attention_heads_; }
size_t num_kv_heads() const { return num_key_value_heads_; }
Expand Down Expand Up @@ -55,7 +59,7 @@ class Attention : public infinicore::nn::Module {
INFINICORE_NN_PARAMETER(kv_cache_v_scale);
};
void init_kv_cache_quant_params(std::function<void(const std::string &, infinicore::nn::Parameter)> register_fn,
const infinicore::Device &device,
infinicore::nn::Parameter &kv_cache_k_scale,
infinicore::nn::Parameter &kv_cache_v_scale);
const infinicore::Device &device,
infinicore::nn::Parameter &kv_cache_k_scale,
infinicore::nn::Parameter &kv_cache_v_scale);
} // namespace infinilm::layers::attention
16 changes: 8 additions & 8 deletions csrc/layers/linear/base_linear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,22 +59,22 @@ void BaseLinear::process_weights_after_loading() {
params[name] = static_cast<const infinicore::Tensor &>(param);
}

auto new_quant = quantization_->process_weights_after_loading(params, device_);
auto new_quant = quantization_->process_weights_after_loading(params, device_, split_dim_);
if (!new_quant) return;

for (auto &[name, param] : parameters_) {
param = infinicore::nn::Parameter();
}

parameters_.clear();
for (const auto &[name, tensor] : params) {
auto it = parameters_.find(name);
if (it == parameters_.end()) continue;
it->second = infinicore::nn::Parameter(tensor);
parameters_.emplace(name, infinicore::nn::Parameter(tensor));
}
params.clear();

quantization_ = std::move(new_quant);
}

void BaseLinear::reset_runtime_state() const {
quantization_->reset_runtime_state();
}

// Backward compatible accessors

infinicore::Tensor BaseLinear::weight() const {
Expand Down
5 changes: 3 additions & 2 deletions csrc/layers/linear/base_linear.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#pragma once

#include "infinicore/ops.hpp"
#include "../quantization/quantization.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/ops.hpp"
#include <infiniccl.h>
#include <optional>

Expand Down Expand Up @@ -45,7 +45,8 @@ class BaseLinear : public infinicore::nn::Module {
infinicore::Tensor get_param(const std::string &name) const;

std::shared_ptr<infinilm::quantization::BaseQuantization> get_quantization() const { return quantization_; }
virtual void process_weights_after_loading();
void process_weights_after_loading() override;
void reset_runtime_state() const override;

// Split fused linear parameters into named sub-parameters
std::vector<infinilm::quantization::SplitParam> split_params(
Expand Down
6 changes: 5 additions & 1 deletion csrc/layers/mlp/mlp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,14 @@ class MLP : public infinicore::nn::Module {
*/
infinicore::Tensor forward(const infinicore::Tensor &hidden_states) const;

void process_fused_weights_after_loading() {
void process_weights_after_loading() override {
gate_up_proj_->process_weights_after_loading();
}

void reset_runtime_state() const override {
gate_up_proj_->reset_runtime_state();
}

// Module information
size_t hidden_size() const { return hidden_size_; }
size_t intermediate_size() const { return intermediate_size_; }
Expand Down
61 changes: 55 additions & 6 deletions csrc/layers/quantization/awq.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include "awq.hpp"
#include "awq_marlin.hpp"
#include "infinicore/ops/linear_w4a16_awq.hpp"
#include "marlin_support.hpp"
#include "marlin_utils.hpp"
#include <optional>

namespace infinilm::quantization {
Expand All @@ -21,12 +24,9 @@ std::vector<ParamDescriptor> AWQ::get_param_layout(
int packing_num = get_packing_num();

std::vector<ParamDescriptor> descs;
descs.push_back({"qweight", {in_features, out_features / packing_num},
infinicore::DataType::I32, awq_tp_dim, tp_rank, tp_size});
descs.push_back({"scales", {in_features / group_size, out_features},
dtype, awq_tp_dim, tp_rank, tp_size});
descs.push_back({"qzeros", {in_features / group_size, out_features / packing_num},
infinicore::DataType::I32, awq_tp_dim, tp_rank, tp_size});
descs.push_back({"qweight", {in_features, out_features / packing_num}, infinicore::DataType::I32, awq_tp_dim, tp_rank, tp_size});
descs.push_back({"scales", {in_features / group_size, out_features}, dtype, awq_tp_dim, tp_rank, tp_size});
descs.push_back({"qzeros", {in_features / group_size, out_features / packing_num}, infinicore::DataType::I32, awq_tp_dim, tp_rank, tp_size});
if (bias) {
descs.push_back({"bias", {out_features}, dtype, -1, 0, 1});
}
Expand All @@ -52,6 +52,55 @@ infinicore::Tensor AWQ::forward(
return infinicore::op::linear_w4a16_awq(input_contiguous->contiguous(), qweight, scales, qzeros, bias_opt);
}

std::shared_ptr<BaseQuantization> AWQ::process_weights_after_loading(
ParamsMap &params,
const infinicore::Device &device,
int /*split_dim*/) const {
if (device.getType() != infinicore::Device::Type::NVIDIA) {
return nullptr;
}

#if INFINILM_ENABLE_MARLIN
const int bits = get_or<int>("bits", get_or<int>("w_bit", 4));
if (bits != 4) {
return nullptr;
}

auto qweight = params.at("qweight");
const size_t input_size_per_partition = qweight->size(0);
const size_t output_size_per_partition = qweight->size(1) * get_packing_num();
const int group_size = get_group_size();
if (!marlin::supports_shape(input_size_per_partition, output_size_per_partition, group_size)) {
return nullptr;
}

params["qweight"] = marlin::awq_marlin_repack(
qweight,
input_size_per_partition,
output_size_per_partition,
bits);
params["scales"] = marlin::permute_scales(
params.at("scales"),
input_size_per_partition,
output_size_per_partition,
group_size);
params["qzeros"] = marlin::awq_to_marlin_zero_points(
params.at("qzeros"),
input_size_per_partition / static_cast<size_t>(group_size == -1 ? input_size_per_partition : group_size),
output_size_per_partition,
bits);
params["g_idx"] = marlin::make_empty_i32(device);
params["perm"] = marlin::make_empty_i32(device);
params["a_scales"] = marlin::make_empty_i32(device);
params["global_scales"] = marlin::make_empty_i32(device);

return std::make_shared<AWQMarlin>(get_config(), input_size_per_partition, output_size_per_partition);
#else
(void)params;
return nullptr;
#endif
}

std::vector<SplitParam> AWQ::split_params(
const std::unordered_map<std::string, infinicore::nn::Parameter> &params,
const std::vector<SplitInfo> &splits,
Expand Down
9 changes: 7 additions & 2 deletions csrc/layers/quantization/awq.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ class AWQ : public BaseQuantization {
};

int get_packing_num() const {
return 32 / get_or<int>("bits", 4);
return 32 / get_or<int>("bits", get_or<int>("w_bit", 4));
}

int get_group_size() const {
return get_or<int>("group_size", 128);
return get_or<int>("group_size", get_or<int>("q_group_size", 128));
}

std::vector<ParamDescriptor> get_param_layout(
Expand All @@ -43,6 +43,11 @@ class AWQ : public BaseQuantization {
const std::vector<SplitInfo> &splits,
int narrow_dim,
int tp_rank, int tp_size, int tp_num_heads) const override;

std::shared_ptr<BaseQuantization> process_weights_after_loading(
ParamsMap &params,
const infinicore::Device &device,
int split_dim = -1) const override;
};

} // namespace infinilm::quantization
Loading