Skip to content

Commit 1b05f5f

Browse files
Merge remote-tracking branch 'origin/main' into issue/343
2 parents 14c78c3 + b2eccc2 commit 1b05f5f

97 files changed

Lines changed: 3783 additions & 1347 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

csrc/cache/kv_cache.cpp

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,7 @@ StaticKVCache::StaticKVCache(
6767
k_dim_},
6868
dtype_,
6969
rank_info.device);
70+
set_zeros(k_caches_);
7071

7172
// Allocate V cache
7273
v_caches_ = infinicore::Tensor::empty(
@@ -77,6 +78,9 @@ StaticKVCache::StaticKVCache(
7778
v_dim_},
7879
dtype_,
7980
rank_info.device);
81+
set_zeros(v_caches_);
82+
83+
infinicore::context::syncStream();
8084
}
8185

8286
infinicore::Tensor StaticKVCache::create_layer_kv_cache(
@@ -110,6 +114,9 @@ infinicore::Tensor StaticKVCache::create_layer_kv_cache(
110114
kv_dim},
111115
dtype,
112116
rank_info.device);
117+
set_zeros(kv_cache);
118+
119+
infinicore::context::syncStream();
113120

114121
return kv_cache;
115122
}
@@ -211,6 +218,7 @@ PagedKVCache::PagedKVCache(
211218
k_dim_},
212219
dtype_,
213220
rank_info.device);
221+
set_zeros(k_caches_);
214222

215223
// [num_layers, num_blocks, num_rank_v_heads, block_size, v_dim]
216224
v_caches_ = infinicore::Tensor::empty(
@@ -221,6 +229,9 @@ PagedKVCache::PagedKVCache(
221229
v_dim_},
222230
dtype_,
223231
rank_info.device);
232+
set_zeros(v_caches_);
233+
234+
infinicore::context::syncStream();
224235
}
225236

226237
infinicore::Tensor PagedKVCache::create_layer_kv_cache(
@@ -256,6 +267,9 @@ infinicore::Tensor PagedKVCache::create_layer_kv_cache(
256267
kv_shape,
257268
dtype,
258269
rank_info.device);
270+
set_zeros(kv_cache);
271+
272+
infinicore::context::syncStream();
259273

260274
return kv_cache;
261275
}

csrc/config/model_config.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,12 @@ ModelConfig::ModelConfig(const std::string &path) {
1616
this->quant_config = QuantConfig(config_json["quantization_config"]);
1717
}
1818

19-
infinicore::quantization::QuantScheme
19+
infinilm::quantization::QuantScheme
2020
ModelConfig::get_quant_scheme() const {
21-
if (quant_config.get_quant_scheme() != infinicore::quantization::QuantScheme::NONE) {
21+
if (quant_config.get_quant_scheme() != infinilm::quantization::QuantScheme::NONE) {
2222
return quant_config.get_quant_scheme();
2323
} else {
24-
return infinicore::quantization::QuantScheme::NONE;
24+
return infinilm::quantization::QuantScheme::NONE;
2525
}
2626
}
2727

csrc/config/model_config.hpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -62,17 +62,17 @@ class ModelConfig {
6262
return quant_config;
6363
}
6464

65-
std::shared_ptr<infinicore::quantization::BaseQuantization> get_quantization_method() const {
65+
std::shared_ptr<infinilm::quantization::BaseQuantization> get_quantization_method() const {
6666
return quant_config.get_quantization_method();
6767
}
6868

6969
infinicore::DataType get_dtype() const;
70-
infinicore::quantization::QuantScheme get_quant_scheme() const;
70+
infinilm::quantization::QuantScheme get_quant_scheme() const;
7171
std::shared_ptr<infinicore::nn::RoPE::ScalingConfig> get_rope_scaling() const;
7272
void set_kv_quant_scheme(infinicore::DataType kv_cache_dtype) {
7373
this->quant_config.set_kv_quant_scheme(kv_cache_dtype);
7474
}
75-
infinicore::quantization::KVQuantAlgo get_kv_quant_scheme() const {
75+
infinilm::quantization::KVQuantAlgo get_kv_quant_scheme() const {
7676
return quant_config.get_kv_quant_scheme();
7777
}
7878
infinicore::DataType get_kv_cache_dtype() const {

csrc/config/quant_config.cpp

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,25 +5,24 @@ QuantConfig::QuantConfig(const nlohmann::json &json) : quantization_config(json)
55
this->quantization_method = get_quantization_method();
66
}
77

8-
std::shared_ptr<infinicore::quantization::BaseQuantization>
8+
std::shared_ptr<infinilm::quantization::BaseQuantization>
99
QuantConfig::get_quantization_method() const {
1010
if (quantization_config.is_null()) {
11-
return std::make_shared<infinicore::quantization::NoneQuantization>(quantization_config); // Default case if no matching scheme
11+
return std::make_shared<infinilm::quantization::NoneQuantization>(quantization_config); // Default case if no matching scheme
1212
}
1313

1414
// Determine the quantization scheme from the JSON config
1515
if (quantization_config["quant_method"] == "compressed-tensors") {
16-
return std::make_shared<infinicore::quantization::CompressedTensors>(quantization_config);
16+
return std::make_shared<infinilm::quantization::CompressedTensors>(quantization_config);
1717
} else if (quantization_config["quant_method"] == "awq") {
18-
return std::make_shared<infinicore::quantization::AWQ>(quantization_config);
18+
return std::make_shared<infinilm::quantization::AWQ>(quantization_config);
1919
} else if (quantization_config["quant_method"] == "gptq") {
20-
// return std::make_shared<infinicore::quantization::GPTQ_QY>(quantization_config);
21-
return std::make_shared<infinicore::quantization::GPTQ>(quantization_config);
20+
return std::make_shared<infinilm::quantization::GPTQ>(quantization_config);
2221
} else {
23-
return std::make_shared<infinicore::quantization::NoneQuantization>(quantization_config);
22+
return std::make_shared<infinilm::quantization::NoneQuantization>(quantization_config);
2423
}
2524
// Add other schemes as needed
2625

27-
return std::make_shared<infinicore::quantization::NoneQuantization>(quantization_config); // Default case if no matching scheme
26+
return std::make_shared<infinilm::quantization::NoneQuantization>(quantization_config); // Default case if no matching scheme
2827
}
2928
} // namespace infinilm::config

csrc/config/quant_config.hpp

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
#pragma once
22
#include "../utils.hpp"
3-
#include "infinicore/quantization.hpp"
3+
#include "../layers/quantization/quantization.hpp"
44
#include "nlohmann/json.hpp"
55
#include <optional>
66
#include <spdlog/spdlog.h>
@@ -14,13 +14,13 @@ class QuantConfig {
1414
QuantConfig() = default;
1515
QuantConfig(const nlohmann::json &json);
1616

17-
std::shared_ptr<infinicore::quantization::BaseQuantization> get_quantization_method() const;
17+
std::shared_ptr<infinilm::quantization::BaseQuantization> get_quantization_method() const;
1818

19-
infinicore::quantization::QuantScheme get_quant_scheme() const {
19+
infinilm::quantization::QuantScheme get_quant_scheme() const {
2020
if (quantization_method != nullptr) {
2121
return quantization_method->get_quant_scheme();
2222
} else {
23-
return infinicore::quantization::QuantScheme::NONE;
23+
return infinilm::quantization::QuantScheme::NONE;
2424
}
2525
}
2626

@@ -29,22 +29,22 @@ class QuantConfig {
2929
this->kv_cache_dtype_ = std::make_optional(kv_cache_dtype);
3030
switch (kv_cache_dtype) {
3131
case infinicore::DataType::I8: {
32-
this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::INT8;
32+
this->kv_quant_scheme = infinilm::quantization::KVQuantAlgo::INT8;
3333
break;
3434
}
3535
default: {
3636
spdlog::warn("Unsupported kv_cache_dtype: '{}', fallback to NONE", infinicore::toString(kv_cache_dtype));
37-
this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;
37+
this->kv_quant_scheme = infinilm::quantization::KVQuantAlgo::NONE;
3838
break;
3939
}
4040
}
4141
} catch (const std::exception &e) {
4242
spdlog::error("Failed to parse kv_cache_dtype '{}': {}", infinicore::toString(kv_cache_dtype), e.what());
43-
this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;
43+
this->kv_quant_scheme = infinilm::quantization::KVQuantAlgo::NONE;
4444
}
4545
}
4646

47-
infinicore::quantization::KVQuantAlgo get_kv_quant_scheme() const {
47+
infinilm::quantization::KVQuantAlgo get_kv_quant_scheme() const {
4848
return kv_quant_scheme;
4949
}
5050

@@ -57,9 +57,9 @@ class QuantConfig {
5757

5858
private:
5959
nlohmann::json quantization_config;
60-
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization_method;
60+
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization_method;
6161

62-
infinicore::quantization::KVQuantAlgo kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;
62+
infinilm::quantization::KVQuantAlgo kv_quant_scheme = infinilm::quantization::KVQuantAlgo::NONE;
6363
std::optional<infinicore::DataType> kv_cache_dtype_ = std::nullopt;
6464
};
6565

csrc/engine/compiler/paged_compiler.cpp

Lines changed: 1 addition & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,7 @@
11
#include "paged_compiler.hpp"
22
#include "../../global_state/global_state.hpp"
3+
#include "../../utils.hpp"
34

4-
namespace {
5-
// Todo: replace with Tensor::zeros when it is available
6-
inline void set_zeros(infinicore::Tensor &tensor) {
7-
std::vector<uint8_t> zeros(tensor->nbytes(), 0);
8-
infinicore::context::memcpyH2D(tensor->data(), zeros.data(), tensor->nbytes(), false);
9-
}
10-
11-
inline void set_minus_one(infinicore::Tensor &tensor) {
12-
// For int32 tensors, 0xFF bytes correspond to -1 in two's complement.
13-
std::vector<uint8_t> minus_one(tensor->nbytes(), 0xFF);
14-
infinicore::context::memcpyH2D(tensor->data(), minus_one.data(), tensor->nbytes(), false);
15-
}
16-
17-
} // namespace
185
namespace infinilm::engine {
196
PagedCompiler::PagedCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier)
207
: GraphCompiler(model, barrier) {

csrc/engine/infer_engine.cpp

Lines changed: 0 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -7,49 +7,6 @@ namespace infinilm::engine {
77
//------------------------------------------------------
88
// Constructor
99
//------------------------------------------------------
10-
/**
11-
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
12-
*
13-
* ⚠️ DEVELOPMENT POLICY:
14-
* - NO new development or feature additions permitted on this interface
15-
* - Only critical bug fixes (security/stability) allowed until removal
16-
* - All new code MUST migrate to the polymorphic overload below
17-
*
18-
* Replacement: Use the polymorphic overload of this same function name with updated signature
19-
* Reason: Legacy signature lacks support for dynamic quantization modes.
20-
* Removal target: v0.2.0 (Q2 2026)
21-
*/
22-
InferEngine::InferEngine(
23-
const InfinilmModel::Config &config,
24-
const distributed::DistConfig &distributed_config,
25-
infinicore::Device::Type device_type,
26-
const cache::CacheConfig *cache_config,
27-
bool enable_graph_compiling,
28-
backends::AttentionBackend attention_backend) // Changed parameter
29-
: communication_group_(distributed_config, device_type),
30-
legacy_model_config_(config),
31-
attention_backend_(attention_backend) {
32-
if (cache_config != nullptr) {
33-
cache_config_ = cache_config->unique_copy();
34-
}
35-
// Create one RankWorker per rank
36-
int world_size = communication_group_.get_world_size();
37-
barrier_ = std::make_unique<RankBarrier>((size_t)world_size);
38-
workers_.reserve(world_size);
39-
for (int r = 0; r < world_size; ++r) {
40-
workers_.emplace_back(std::make_unique<RankWorker>(
41-
legacy_model_config_,
42-
communication_group_.get_rank_info(r),
43-
cache_config_ != nullptr ? cache_config_.get() : nullptr,
44-
barrier_.get(),
45-
enable_graph_compiling,
46-
attention_backend_));
47-
}
48-
49-
// Compile the model on all workers
50-
this->compile();
51-
}
52-
5310
InferEngine::InferEngine(
5411
const std::string &config_str,
5512
const distributed::DistConfig &distributed_config,

csrc/engine/infer_engine.hpp

Lines changed: 0 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
#include "../config/model_config.hpp"
44
#include "../global_state/global_state.hpp"
55
#include "../models/infinilm_model.hpp"
6-
#include "../models/llama_legacy/llama_config.hpp"
76
#include "distributed/distributed.hpp"
87
#include "infinicore/tensor.hpp"
98
#include "rank_barrier.hpp"
@@ -21,26 +20,6 @@ class InferEngine {
2120
using Output = RankWorker::Output;
2221

2322
// Updated constructor: accept CacheConfig instead of CacheType
24-
/**
25-
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
26-
*
27-
* ⚠️ DEVELOPMENT POLICY:
28-
* - NO new development or feature additions permitted on this interface
29-
* - Only critical bug fixes (security/stability) allowed until removal
30-
* - All new code MUST migrate to the polymorphic overload below
31-
*
32-
* Replacement: Use the polymorphic overload of this same function name with updated signature
33-
* Reason: Legacy signature lacks support for dynamic quantization modes.
34-
* Removal target: v0.2.0 (Q2 2026)
35-
*/
36-
InferEngine(
37-
const InfinilmModel::Config &config,
38-
const distributed::DistConfig &distributed_config = distributed::DistConfig(),
39-
infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
40-
const cache::CacheConfig *cache_config = nullptr,
41-
bool enable_graph_compiling = false,
42-
backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
43-
4423
InferEngine(
4524
const std::string &config_str,
4625
const distributed::DistConfig &distributed_config = distributed::DistConfig(),
@@ -78,7 +57,6 @@ class InferEngine {
7857
std::unique_ptr<RankBarrier> barrier_;
7958
distributed::CommunicationGroup communication_group_;
8059
std::unique_ptr<cache::CacheConfig> cache_config_;
81-
const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config();
8260
std::shared_ptr<infinilm::config::ModelConfig> model_config_;
8361
backends::AttentionBackend attention_backend_ = backends::AttentionBackend::Default;
8462
};

csrc/engine/rank_worker.cpp

Lines changed: 0 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -10,46 +10,6 @@
1010

1111
namespace infinilm::engine {
1212

13-
/**
14-
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
15-
*
16-
* ⚠️ DEVELOPMENT POLICY:
17-
* - NO new development or feature additions permitted on this interface
18-
* - Only critical bug fixes (security/stability) allowed until removal
19-
* - All new code MUST migrate to the polymorphic overload below
20-
*
21-
* Replacement: Use the polymorphic overload of this same function name with updated signature
22-
* Reason: Legacy signature lacks support for dynamic quantization modes.
23-
* Removal target: v0.2.0 (Q2 2026)
24-
*/
25-
RankWorker::RankWorker(const InfinilmModel::Config &model_config,
26-
const distributed::RankInfo &rank_info,
27-
const cache::CacheConfig *cache_config,
28-
RankBarrier *barrier,
29-
bool enable_graph_compiling,
30-
backends::AttentionBackend attention_backend)
31-
: legacy_model_config_(model_config),
32-
rank_info_(rank_info),
33-
attention_backend_(attention_backend),
34-
enable_graph_compiling_(enable_graph_compiling),
35-
job_cmd_(Command::INIT),
36-
has_job_(false),
37-
job_done_(false),
38-
should_exit_(false),
39-
init_done_(false),
40-
rng_(std::random_device{}()),
41-
barrier_(barrier) {
42-
if (cache_config != nullptr) {
43-
pending_cache_config_ = cache_config->unique_copy();
44-
}
45-
// start the thread
46-
thread_ = std::thread(&RankWorker::thread_loop, this);
47-
48-
// Wait until the worker thread finishes initialization (model created)
49-
std::unique_lock<std::mutex> lk(mutex_);
50-
cv_.wait(lk, [&] { return init_done_; });
51-
}
52-
5313
RankWorker::RankWorker(
5414
std::shared_ptr<infinilm::global_state::InfinilmConfig> infinilm_config,
5515
const distributed::RankInfo &rank_info,
@@ -269,15 +229,6 @@ void RankWorker::thread_loop() {
269229
infinilm::global_state::initialize_infinilm_config(infinilm_config_);
270230

271231
// Create model using factory (may be expensive)
272-
if (model_config_ == nullptr) {
273-
// model_ = InfinilmModelFactory::createModel(
274-
// legacy_model_config_,
275-
// rank_info_,
276-
// pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr,
277-
// attention_backend_);
278-
throw std::runtime_error("RankWorker::thread_loop(): the way of creating models using LlamaConfig is no longer supported !!!");
279-
}
280-
281232
const std::string &model_type = model_config_->get<std::string>("model_type");
282233
const auto &model_map = models::get_causal_lm_model_map();
283234
auto it = model_map.find(model_type);

csrc/engine/rank_worker.hpp

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -72,13 +72,6 @@ class RankWorker {
7272
infinicore::Tensor output_ids;
7373
};
7474

75-
RankWorker(const InfinilmModel::Config &model_config,
76-
const distributed::RankInfo &rank_info,
77-
const cache::CacheConfig *cache_config,
78-
RankBarrier *barrier,
79-
bool enable_graph_compiling,
80-
backends::AttentionBackend attention_backend);
81-
8275
RankWorker(std::shared_ptr<infinilm::global_state::InfinilmConfig> infinilm_config,
8376
const distributed::RankInfo &rank_info,
8477
const cache::CacheConfig *cache_config,
@@ -120,7 +113,6 @@ class RankWorker {
120113

121114
private:
122115
// Worker properties
123-
const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config();
124116
std::shared_ptr<infinilm::global_state::InfinilmConfig> infinilm_config_;
125117
std::shared_ptr<infinilm::config::ModelConfig> model_config_;
126118
engine::distributed::RankInfo rank_info_;

0 commit comments

Comments
 (0)