
Commit b4688d0

bringup qwen2.5-1.5B

1 parent 7ac9fb4 commit b4688d0

7 files changed

Lines changed: 67 additions & 10 deletions


src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 27 additions & 10 deletions
@@ -1,16 +1,16 @@
-# Copyright 2023–2025 Google LLC
+# Copyright 2023–2026 Google LLC
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-# https://www.apache.org/licenses/LICENSE-2.0
+# https://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 """
 This config defines the architectural configurations of the Hugging Face version of a model.

@@ -210,6 +210,22 @@
     query_pre_attn_scalar=144,
 )

+qwen25_1_5b_config = transformers.Qwen2Config(
+    vocab_size=152064,
+    hidden_size=1536,
+    intermediate_size=8960,
+    num_hidden_layers=28,
+    num_attention_heads=12,
+    num_key_value_heads=2,
+    hidden_act="silu",
+    max_position_embeddings=32768,
+    rms_norm_eps=1e-06,
+    rope_theta=1000000.0,
+    tie_word_embeddings=True,
+    torch_dtype="bfloat16",
+    attention_bias=True,
+)
+
 qwen25_7b_config = transformers.Qwen2Config(
     vocab_size=152064,
     hidden_size=3584,

@@ -868,6 +884,7 @@
     "gemma3-27b": gemma3_27b_config,
     "qwen2.5-7b": qwen25_7b_config,
     "qwen2.5-14b": qwen25_14b_config,
+    "qwen2.5-1.5b": qwen25_1_5b_config,
     "qwen3-0.6b": qwen3_0_6b_config,
     "qwen3-1.7b": qwen3_1_7b_config,
     "qwen3-1.7b-base": qwen3_1_7b_config,

src/maxtext/checkpoint_conversion/utils/hf_shape.py

Lines changed: 1 addition & 0 deletions
@@ -768,6 +768,7 @@ def MIXTRAL_HF_WEIGHTS_TO_SHAPE(config):
     "gemma3-27b": GEMMA3_HF_WEIGHTS_TO_SHAPE,
     "qwen2.5-7b": QWEN_HF_WEIGHTS_TO_SHAPE,
     "qwen2.5-14b": QWEN_HF_WEIGHTS_TO_SHAPE,
+    "qwen2.5-1.5b": QWEN_HF_WEIGHTS_TO_SHAPE,
     "qwen3-0.6b": QWEN_HF_WEIGHTS_TO_SHAPE,
     "qwen3-4b": QWEN_HF_WEIGHTS_TO_SHAPE,
     "qwen3-4b-thinking-2507": QWEN_HF_WEIGHTS_TO_SHAPE,

src/maxtext/checkpoint_conversion/utils/param_mapping.py

Lines changed: 2 additions & 0 deletions
@@ -2359,6 +2359,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
     "gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
     "gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
     "gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
     "qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
     "qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
     "qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,

@@ -2399,6 +2400,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
     "gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+# Copyright 2023–2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Qwen 2.5 1.5B Instruct Configuration
+# https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct
+
+base_emb_dim: 1536
+base_num_query_heads: 12
+base_num_kv_heads: 2
+base_mlp_dim: 8960
+base_num_decoder_layers: 28
+head_dim: 128
+mlp_activations: ["silu", "linear"]
+vocab_size: 151936
+decoder_block: "qwen2"
+normalization_layer_epsilon: 1e-06
+rope_max_timescale: 1000000.0
+use_qk_norm: False
+# Bias for q, k, v proj.
+attention_bias: True
+logits_via_embedding: True
+normalize_embedding_logits: False
+tokenizer_type: "huggingface"
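
These YAML fields mirror the HF-side qwen25_1_5b_config one-for-one: base_emb_dim maps to hidden_size, base_mlp_dim to intermediate_size, logits_via_embedding to tie_word_embeddings, and so on. A hedged consistency check; the YAML path below is a placeholder, since the new file's name is not shown here:

import transformers
import yaml

hf = transformers.Qwen2Config(
    hidden_size=1536,
    intermediate_size=8960,
    num_hidden_layers=28,
    num_attention_heads=12,
    num_key_value_heads=2,
    tie_word_embeddings=True,
)

with open("qwen2.5-1.5b.yml") as f:  # placeholder path
    mt = yaml.safe_load(f)

assert mt["base_emb_dim"] == hf.hidden_size
assert mt["base_mlp_dim"] == hf.intermediate_size
assert mt["base_num_decoder_layers"] == hf.num_hidden_layers
assert mt["base_num_query_heads"] == hf.num_attention_heads
assert mt["base_num_kv_heads"] == hf.num_key_value_heads
# Tied input/output embeddings on the HF side map to logits_via_embedding here.
assert mt["logits_via_embedding"] == hf.tie_word_embeddings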

src/maxtext/configs/pyconfig_deprecated.py

Lines changed: 1 addition & 0 deletions
@@ -462,6 +462,7 @@ def validate_model_name(s: str) -> bool:
     "gemma3-27b",
     "qwen2.5-7b",
     "qwen2.5-14b",
+    "qwen2.5-1.5b",
     "qwen3-0.6b",
     "qwen3-4b",
     "qwen3-4b-thinking-2507",

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
@@ -235,6 +235,7 @@ class ProfilerType(str, Enum):
     "gemma3-27b",
     "qwen2.5-7b",
     "qwen2.5-14b",
+    "qwen2.5-1.5b",
     "qwen3-0.6b",
     "qwen3-1.7b",
     "qwen3-1.7b-base",

src/maxtext/utils/globals.py

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@
     "gemma3-27b": "google/gemma-3-27b-it",
     "qwen2.5-7b": "Qwen/Qwen2.5-7B-Instruct",
     "qwen2.5-14b": "Qwen/Qwen2.5-14B-Instruct",
+    "qwen2.5-1.5b": "Qwen/Qwen2.5-1.5B-Instruct",
     "qwen3-0.6b": "Qwen/Qwen3-0.6B",
     "qwen3-4b": "Qwen/Qwen3-4B",
     "qwen3-4b-thinking-2507": "Qwen/Qwen3-4B-Thinking-2507",
