Skip to content

Commit f4c8b29

Browse files
authored
Merge pull request #416 from InfiniTensor/issue/406
issue/406 - feat: support gpt2
2 parents 65817cf + 9504053 commit f4c8b29

5 files changed

Lines changed: 467 additions & 0 deletions

File tree

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
#include "gpt2_for_causal_lm.hpp"
2+
#include "../../global_state/global_state.hpp"
3+
#include "../../layers/attention/attention.hpp"
4+
#include "../models_registry.hpp"
5+
#include "infinicore/ops.hpp"
6+
7+
namespace infinilm::models::gpt2 {
8+
9+
std::shared_ptr<infinilm::config::ModelConfig>
10+
create_gpt2_model_config(std::shared_ptr<infinilm::config::ModelConfig> config) {
11+
const std::string &model_type = config->get<std::string>("model_type");
12+
if ("gpt2" != model_type) {
13+
throw std::runtime_error(
14+
"infinilm::models::gpt2::create_gpt2_model_config: model_type is not gpt2");
15+
}
16+
17+
auto &j = config->get_config_json();
18+
19+
j["hidden_size"] = j.value("hidden_size", j.value("n_embd", 768));
20+
j["num_hidden_layers"] = j.value("num_hidden_layers", j.value("n_layer", 12));
21+
j["num_attention_heads"] = j.value("num_attention_heads", j.value("n_head", 12));
22+
j["num_key_value_heads"] = j["num_attention_heads"];
23+
j["head_dim"] = j["hidden_size"].get<size_t>() / j["num_attention_heads"].get<size_t>();
24+
j["max_position_embeddings"] = j.value("max_position_embeddings", j.value("n_positions", 1024));
25+
j["intermediate_size"] = j.value("n_inner", 4 * j["hidden_size"].get<size_t>());
26+
j["layer_norm_eps"] = j.value("layer_norm_epsilon", 1e-5);
27+
j["attention_bias"] = true;
28+
j["attention_output_bias"] = false;
29+
j["mlp_bias"] = true;
30+
31+
return config;
32+
}
33+
34+
/// Builds the attention sub-module for one GPT-2 layer.
///
/// Reads the head geometry from `config`, derives the per-rank head counts
/// from the tensor-parallel world size, creates the Q/K/V and output
/// projections, registers the separate output bias and the KV-cache
/// quantization scales, and finally constructs the attention layer.
///
/// @param config     Model configuration (expects `hidden_size`,
///                   `num_attention_heads`, `num_key_value_heads`, `head_dim`).
/// @param layer_idx  Index of the layer this module belongs to.
/// @param device     Device on which parameters are created.
GPT2Attention::GPT2Attention(std::shared_ptr<infinilm::config::ModelConfig> config,
                             size_t layer_idx,
                             const infinicore::Device &device)
    : layer_idx_(layer_idx) {
    const auto &dtype = config->get_dtype();
    hidden_size_ = config->get<size_t>("hidden_size");
    const size_t global_heads = config->get<size_t>("num_attention_heads");
    const size_t global_kv_heads = config->get<size_t>("num_key_value_heads");
    head_dim_ = config->get<size_t>("head_dim");

    const bool qkv_has_bias = config->get_or<bool>("attention_bias", true);
    auto quant = config->get_quantization_method();
    const auto &rank_info = infinilm::global_state::get_tensor_model_parallel_rank_info();
    const int tp_rank = infinilm::global_state::get_tensor_model_parallel_rank();
    const int tp_size = infinilm::global_state::get_tensor_model_parallel_world_size();

    // Per-rank head counts; a rank always keeps at least one KV head.
    num_heads_ = global_heads / tp_size;
    if (global_kv_heads < static_cast<size_t>(tp_size)) {
        num_kv_heads_ = 1;
    } else {
        num_kv_heads_ = global_kv_heads / tp_size;
    }

    // Forwards parameters created by helper layers into this module's registry.
    auto add_param = [this](const std::string &name, infinicore::nn::Parameter param) {
        this->register_parameter(name, std::move(param));
    };

    // The fused projection receives the global (unsharded) head counts.
    qkv_proj_ = std::make_shared<infinilm::layers::linear::QKVParallelLinear>(
        hidden_size_, head_dim_, global_heads, global_kv_heads,
        "q_proj", "k_proj", "v_proj",
        add_param, quant, qkv_has_bias, dtype, device, rank_info);

    // Output projection is built without a bias; the bias lives in the
    // separate `o_proj_bias` parameter registered right after it.
    INFINICORE_NN_MODULE_INIT(
        o_proj,
        global_heads * head_dim_,
        hidden_size_,
        quant,
        false,
        dtype,
        device,
        tp_rank,
        tp_size,
        rank_info.comm);
    INFINICORE_NN_PARAMETER_INIT(o_proj_bias, ({hidden_size_}, dtype, device));

    infinilm::layers::attention::init_kv_cache_quant_params(
        add_param, device, kv_cache_k_scale_, kv_cache_v_scale_);

    attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend;
    // Softmax scaling factor: 1 / sqrt(head_dim).
    const float softmax_scale = 1.0f / std::sqrt(static_cast<float>(head_dim_));
    attn_ = std::make_shared<infinilm::layers::attention::AttentionLayer>(
        num_heads_, head_dim_, softmax_scale, num_kv_heads_, layer_idx_,
        kv_cache_k_scale_, kv_cache_v_scale_, attention_backend_);
}
102+
103+
/// Runs self-attention followed by the output projection and its bias.
///
/// @param positions      Accepted for interface compatibility; not used here.
/// @param hidden_states  Input activations; dimensions 0 and 1 are read as
///                       batch size and sequence length.
/// @return The projected attention output with `o_proj_bias` added in place.
infinicore::Tensor GPT2Attention::forward(const infinicore::Tensor &positions,
                                          const infinicore::Tensor &hidden_states) const {
    (void)positions;

    const auto input_shape = hidden_states->shape();
    const size_t batch_size = input_shape[0];
    const size_t seq_len = input_shape[1];

    // Local copy so a non-const tensor can be handed to the projection.
    auto qkv_input = hidden_states;
    auto [q, k, v] = qkv_proj_->forward_split(qkv_input);

    // The paged and flash backends get 3-D views without the batch dimension;
    // every other backend gets 4-D views that include it.
    const bool without_batch_dim =
        attention_backend_ == infinilm::backends::AttentionBackend::PAGED_ATTN
        || attention_backend_ == infinilm::backends::AttentionBackend::FLASH_ATTN;

    auto q_view = without_batch_dim
                      ? q->view({seq_len, num_heads_, head_dim_})
                      : q->view({batch_size, seq_len, num_heads_, head_dim_});
    auto k_view = without_batch_dim
                      ? k->view({seq_len, num_kv_heads_, head_dim_})
                      : k->view({batch_size, seq_len, num_kv_heads_, head_dim_});
    auto v_view = without_batch_dim
                      ? v->view({seq_len, num_kv_heads_, head_dim_})
                      : v->view({batch_size, seq_len, num_kv_heads_, head_dim_});

    auto attn_output = attn_->forward(q_view, k_view, v_view);
    auto output = o_proj_->forward(attn_output);
    // Broadcast the 1-D bias over the leading dimensions via zero strides.
    infinicore::op::add_(output, output, o_proj_bias_->as_strided(output->shape(), {0, 0, 1}));
    return output;
}
132+
133+
/// Builds the GPT-2 feed-forward sub-module.
///
/// Creates the up-projection `c_fc` (hidden -> intermediate, bias controlled
/// by `mlp_bias`), the down-projection `c_proj` (intermediate -> hidden,
/// built without a bias) and a separate `c_proj_bias` parameter of size
/// `hidden_size`. The activation name is taken from `activation_function`
/// and defaults to "gelu_new".
///
/// @param config  Model configuration (expects `hidden_size`, `intermediate_size`).
/// @param device  Device on which parameters are created.
GPT2MLP::GPT2MLP(std::shared_ptr<infinilm::config::ModelConfig> config,
                 const infinicore::Device &device) {
    const auto &dtype = config->get_dtype();
    const size_t model_width = config->get<size_t>("hidden_size");
    const size_t inner_width = config->get<size_t>("intermediate_size");
    const bool fc_has_bias = config->get_or<bool>("mlp_bias", true);
    activation_ = config->get_or<std::string>("activation_function", "gelu_new");
    auto quant = config->get_quantization_method();
    const auto &rank_info = infinilm::global_state::get_tensor_model_parallel_rank_info();

    INFINICORE_NN_MODULE_INIT(
        c_fc,
        model_width,
        inner_width,
        quant,
        fc_has_bias,
        dtype,
        device,
        rank_info.tp_rank,
        rank_info.tp_size);

    // No bias inside the down-projection; it is kept in `c_proj_bias`.
    INFINICORE_NN_MODULE_INIT(
        c_proj,
        inner_width,
        model_width,
        quant,
        false,
        dtype,
        device,
        rank_info.tp_rank,
        rank_info.tp_size,
        rank_info.comm);
    INFINICORE_NN_PARAMETER_INIT(c_proj_bias, ({model_width}, dtype, device));
}
166+
167+
/// Applies the feed-forward network: `c_fc`, activation, `c_proj`, then bias.
///
/// @param hidden_states  Input activations.
/// @return The down-projected activations with `c_proj_bias` added in place.
/// @throws std::runtime_error if the configured activation is not one of
///         "gelu_new", "gelu_tanh" or "gelu".
infinicore::Tensor GPT2MLP::forward(const infinicore::Tensor &hidden_states) const {
    // A plain copy already yields a mutable local tensor (GPT2Attention::forward
    // copies its const input the same way), so no const_cast is required.
    auto x = hidden_states;
    x = c_fc_->forward(x);

    if (activation_ == "gelu_new" || activation_ == "gelu_tanh") {
        // Both names map to the tanh-based GELU op.
        x = infinicore::op::gelu_tanh(x);
    } else if (activation_ == "gelu") {
        x = infinicore::op::gelu(x);
    } else {
        throw std::runtime_error("infinilm::models::gpt2::GPT2MLP: unsupported activation " + activation_);
    }

    x = c_proj_->forward(x);
    // Broadcast the 1-D bias over the leading dimensions via zero strides.
    infinicore::op::add_(x, x, c_proj_bias_->as_strided(x->shape(), {0, 0, 1}));
    return x;
}
181+
182+
/// Builds one GPT-2 layer: `ln_1`, `attn`, `ln_2` and `mlp`, registered in
/// that order.
///
/// @param config     Model configuration (expects `hidden_size`, `layer_norm_eps`).
/// @param layer_idx  Index of this layer, forwarded to the attention module.
/// @param device     Device on which parameters are created.
GPT2Block::GPT2Block(std::shared_ptr<infinilm::config::ModelConfig> config,
                     size_t layer_idx,
                     const infinicore::Device &device) {
    const auto &dtype = config->get_dtype();
    const size_t width = config->get<size_t>("hidden_size");
    const double eps = config->get<double>("layer_norm_eps");

    // Attention half.
    INFINICORE_NN_MODULE_INIT(ln_1, width, eps, dtype, device);
    INFINICORE_NN_MODULE_INIT(attn, config, layer_idx, device);

    // Feed-forward half.
    INFINICORE_NN_MODULE_INIT(ln_2, width, eps, dtype, device);
    INFINICORE_NN_MODULE_INIT(mlp, config, device);
}
194+
195+
/// Runs one layer: normalize, attend, add residual; then normalize, apply the
/// MLP, add residual.
///
/// @param positions      Position tensor forwarded to the attention module.
/// @param hidden_states  Input activations for this layer.
/// @return The layer output after both residual additions.
infinicore::Tensor GPT2Block::forward(const infinicore::Tensor &positions,
                                      const infinicore::Tensor &hidden_states) const {
    // Attention half: x + attn(ln_1(x)).
    auto normed_input = ln_1_->forward(hidden_states);
    auto attn_out = attn_->forward(positions, normed_input);
    auto after_attn = infinicore::op::add(attn_out, hidden_states);

    // Feed-forward half: y + mlp(ln_2(y)).
    auto normed_attn = ln_2_->forward(after_attn);
    auto mlp_out = mlp_->forward(normed_attn);
    return infinicore::op::add(mlp_out, after_attn);
}
207+
208+
/// Builds the GPT-2 backbone: token and position embedding tables, the stack
/// of layers registered as `layers.<i>`, and the final layer norm.
///
/// @param config  Model configuration (expects `vocab_size`, `hidden_size`,
///                `max_position_embeddings`, `num_hidden_layers`,
///                `layer_norm_eps`).
/// @param device  Device on which parameters are created.
GPT2Model::GPT2Model(std::shared_ptr<infinilm::config::ModelConfig> config,
                     const infinicore::Device &device) {
    const auto &dtype = config->get_dtype();
    const size_t vocab = config->get<size_t>("vocab_size");
    const size_t width = config->get<size_t>("hidden_size");
    const size_t max_positions = config->get<size_t>("max_position_embeddings");
    const size_t depth = config->get<size_t>("num_hidden_layers");
    const double eps = config->get<double>("layer_norm_eps");

    INFINICORE_NN_MODULE_INIT(embed_tokens, vocab, width, std::nullopt, dtype, device);
    INFINICORE_NN_MODULE_INIT(embed_positions, max_positions, width, std::nullopt, dtype, device);

    layers_.reserve(depth);
    for (size_t idx = 0; idx < depth; ++idx) {
        const std::string layer_name = "layers." + std::to_string(idx);
        layers_.push_back(this->register_module<GPT2Block>(layer_name, config, idx, device));
    }

    INFINICORE_NN_MODULE_INIT(norm, width, eps, dtype, device);
}
225+
226+
/// Embeds the input, runs every layer in order and applies the final norm.
///
/// `input.input_ids` and `input.position_ids` must both hold a value;
/// `.value()` is called on each. A 1-D `position_ids` tensor is viewed as
/// `{1, N}` before the position-embedding lookup.
///
/// @param input  Model input bundle.
/// @return Final normalized hidden states.
infinicore::Tensor GPT2Model::forward(const infinilm::InfinilmModel::Input &input) const {
    auto input_ids = input.input_ids.value();
    auto position_ids = input.position_ids.value();

    const bool positions_are_flat = position_ids->shape().size() == 1;
    if (positions_are_flat) {
        const auto num_positions = position_ids->shape()[0];
        position_ids = position_ids->view({1, num_positions});
    }

    // Token embedding plus position embedding.
    auto token_embeds = embed_tokens_->forward(input_ids);
    auto position_embeds = embed_positions_->forward(position_ids);
    auto hidden_states = infinicore::op::add(token_embeds, position_embeds);

    for (const auto &block : layers_) {
        hidden_states = block->forward(position_ids, hidden_states);
    }

    return norm_->forward(hidden_states);
}
243+
244+
/// Builds the causal-LM wrapper: the GPT-2 backbone plus a bias-free
/// `lm_head` projection from `hidden_size` to `vocab_size`.
///
/// @param config  Model configuration; also stored in `model_config_`.
/// @param device  Device on which parameters are created.
GPT2ForCausalLM::GPT2ForCausalLM(std::shared_ptr<infinilm::config::ModelConfig> config,
                                 const infinicore::Device &device) {
    model_config_ = config;

    const auto &dtype = config->get_dtype();
    const size_t width = config->get<size_t>("hidden_size");
    const size_t vocab = config->get<size_t>("vocab_size");

    INFINICORE_NN_MODULE_INIT(model, config, device);
    // `false` here disables the bias of the output head.
    INFINICORE_NN_MODULE_INIT(lm_head, width, vocab, false, dtype, device);
}
254+
255+
/// Runs the backbone and projects its output through `lm_head`.
///
/// @param input  Model input bundle, passed unchanged to the backbone.
/// @return An `Output` initialized from the logits tensor.
InfinilmModel::Output GPT2ForCausalLM::forward(const InfinilmModel::Input &input) const {
    auto final_hidden = model_->forward(input);
    return {lm_head_->forward(final_hidden)};
}
260+
261+
} // namespace infinilm::models::gpt2
262+
263+
namespace {
264+
265+
// Invokes the registration macro with the key `gpt2`, the model class and the
// config-normalization function defined above.
// NOTE(review): presumably this makes the model discoverable by `model_type`
// at load time - confirm against models_registry.hpp.
INFINILM_REGISTER_CAUSAL_LM_MODEL(
266+
gpt2,
267+
infinilm::models::gpt2::GPT2ForCausalLM,
268+
infinilm::models::gpt2::create_gpt2_model_config);
269+
270+
} // namespace
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
#pragma once
2+
3+
#include "../../backends/attention_backends.hpp"
4+
#include "../../layers/linear/fused_linear.hpp"
5+
#include "../../layers/attention/backends/attention_layer.hpp"
6+
#include "../../config/model_config.hpp"
7+
#include "infinicore/nn/embedding.hpp"
8+
#include "infinicore/nn/layer_norm.hpp"
9+
#include "infinicore/nn/parameter.hpp"
10+
11+
namespace infinilm::models::gpt2 {
12+
13+
/// Attention sub-module of a GPT-2 layer (used as the `attn` member of
/// GPT2Block).
class GPT2Attention : public infinicore::nn::Module {
14+
public:
15+
/// Constructs the module for layer `layer_idx` from `config` on `device`.
GPT2Attention(std::shared_ptr<infinilm::config::ModelConfig> config,
16+
size_t layer_idx,
17+
const infinicore::Device &device);
18+
19+
/// Computes attention over `hidden_states` and returns the projected result.
/// `positions` is part of the signature but is not used by the implementation.
infinicore::Tensor forward(const infinicore::Tensor &positions,
20+
const infinicore::Tensor &hidden_states) const;
21+
22+
private:
23+
// Fused projection producing the query, key and value tensors.
std::shared_ptr<infinilm::layers::linear::QKVParallelLinear> qkv_proj_;
24+
// Attention layer that consumes the reshaped q/k/v tensors.
std::shared_ptr<infinilm::layers::attention::AttentionLayer> attn_;
25+
// Output projection (constructed without a bias).
INFINICORE_NN_MODULE(infinilm::layers::linear::RowParallelLinear, o_proj);
26+
// Bias added to the output projection result in forward().
INFINICORE_NN_PARAMETER(o_proj_bias);
27+
// KV-cache quantization scales, set up by init_kv_cache_quant_params.
INFINICORE_NN_PARAMETER(kv_cache_k_scale);
28+
INFINICORE_NN_PARAMETER(kv_cache_v_scale);
29+
30+
size_t layer_idx_;
31+
size_t hidden_size_;
32+
// Head counts after division by the tensor-parallel world size.
size_t num_heads_;
33+
size_t num_kv_heads_;
34+
size_t head_dim_;
35+
// Backend selected from the global configuration; decides the q/k/v view shapes.
infinilm::backends::AttentionBackend attention_backend_;
36+
};
37+
38+
/// Feed-forward sub-module of a GPT-2 layer (used as the `mlp` member of
/// GPT2Block).
class GPT2MLP : public infinicore::nn::Module {
39+
public:
40+
/// Constructs the two projections and the separate output bias from `config`.
GPT2MLP(std::shared_ptr<infinilm::config::ModelConfig> config,
41+
const infinicore::Device &device);
42+
43+
/// Applies `c_fc`, the configured activation, `c_proj` and `c_proj_bias`.
infinicore::Tensor forward(const infinicore::Tensor &hidden_states) const;
44+
45+
private:
46+
// Up-projection: hidden_size -> intermediate_size.
INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, c_fc);
47+
// Down-projection: intermediate_size -> hidden_size (constructed without a bias).
INFINICORE_NN_MODULE(infinilm::layers::linear::RowParallelLinear, c_proj);
48+
// Bias added to the down-projection result in forward().
INFINICORE_NN_PARAMETER(c_proj_bias);
49+
// Activation name read from `activation_function` (default "gelu_new").
std::string activation_;
50+
};
51+
52+
/// One GPT-2 layer: layer norm + attention and layer norm + MLP, each followed
/// by a residual addition in forward().
class GPT2Block : public infinicore::nn::Module {
53+
public:
54+
/// Constructs the layer with index `layer_idx` from `config` on `device`.
GPT2Block(std::shared_ptr<infinilm::config::ModelConfig> config,
55+
size_t layer_idx,
56+
const infinicore::Device &device);
57+
58+
/// Runs the layer on `hidden_states`; `positions` is forwarded to the
/// attention module.
infinicore::Tensor forward(const infinicore::Tensor &positions,
59+
const infinicore::Tensor &hidden_states) const;
60+
61+
private:
62+
// Layer norm applied before attention.
INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, ln_1);
63+
INFINICORE_NN_MODULE(GPT2Attention, attn);
64+
// Layer norm applied before the MLP.
INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, ln_2);
65+
INFINICORE_NN_MODULE(GPT2MLP, mlp);
66+
};
67+
68+
/// GPT-2 backbone: embeddings, the stack of GPT2Block layers and a final
/// layer norm.
class GPT2Model : public infinicore::nn::Module {
69+
public:
70+
/// Constructs the embeddings, `num_hidden_layers` blocks and the final norm.
GPT2Model(std::shared_ptr<infinilm::config::ModelConfig> config,
71+
const infinicore::Device &device);
72+
73+
/// Returns the final normalized hidden states for `input`.
infinicore::Tensor forward(const infinilm::InfinilmModel::Input &input) const;
74+
75+
private:
76+
// Token embedding table (vocab_size x hidden_size).
INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens);
77+
// Position embedding table (max_position_embeddings x hidden_size).
INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_positions);
78+
// Layers, registered under the names "layers.<i>".
INFINICORE_NN_MODULE_VEC(GPT2Block, layers);
79+
// Final layer norm applied after the last block.
INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, norm);
80+
};
81+
82+
/// GPT-2 with a language-modeling head on top of the backbone.
class GPT2ForCausalLM : public infinilm::InfinilmModel {
83+
public:
84+
/// Constructs the backbone and the output head from `config` on `device`.
GPT2ForCausalLM(std::shared_ptr<infinilm::config::ModelConfig> config,
85+
const infinicore::Device &device);
86+
87+
/// Runs the backbone and returns the logits produced by `lm_head`.
Output forward(const Input &input) const override;
88+
89+
private:
90+
INFINICORE_NN_MODULE(GPT2Model, model);
91+
// Projection from hidden_size to vocab_size (constructed without a bias).
INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head);
92+
};
93+
94+
/// Checks that `config` describes a GPT-2 model and fills in the generic
/// configuration keys from their GPT-2 names; returns the same pointer.
/// Throws std::runtime_error when `model_type` is not "gpt2".
std::shared_ptr<infinilm::config::ModelConfig>
95+
create_gpt2_model_config(std::shared_ptr<infinilm::config::ModelConfig> config);
96+
97+
} // namespace infinilm::models::gpt2

python/infinilm/infer_engine.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@ def read_hf_config(model_path):
1818
with open(config_path, "r") as f:
1919
config_dict = json.load(f)
2020

21+
if (
22+
config_dict.get("model_type") == "gpt2"
23+
and config_dict.get("torch_dtype") is None
24+
and config_dict.get("dtype") is None
25+
):
26+
config_dict["torch_dtype"] = "float32"
2127
if "model_type" not in config_dict:
2228
raise ValueError(
2329
f"`model_type` is not specified in the config file `{config_path}`."

0 commit comments

Comments
 (0)