PaddlePaddle · BossPi · Jan 16, 2026 · Jan 16, 2026
diff --git a/examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml b/examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml
@@ -0,0 +1,56 @@
+### data (this file: full-parameter SFT of ERNIE-4.5-0.3B on XPU, 8k sequence length)
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./tests/fixtures/dummy/sft/train.jsonl  # dummy test fixture, relative path; point at real data for an actual run
+train_dataset_prob: "1.0"  # quoted, so YAML loads it as the string "1.0", not a float
+eval_dataset_path: ./tests/fixtures/dummy/sft/eval.jsonl  # dummy test fixture, relative path
+eval_dataset_prob: "1.0"  # quoted string, same as train_dataset_prob
+max_seq_len: 8192  # the "8k" in the file name
+packing: false
+mix_strategy: concat
+template_backend: custom
+template: ernie_nothink
+
+### model
+model_name_or_path: baidu/ERNIE-4.5-0.3B-PT
+attn_impl: flashmask
+
+### finetuning
+# base
+stage: SFT
+fine_tuning: full  # the sibling lora_8k.yaml config uses "lora" here
+seed: 23
+do_train: true
+do_eval: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1  # NOTE(review): presumably -1 disables the step cap so num_train_epochs governs — confirm
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100  # same interval as eval_steps
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/ernie-0.3b-sft-8k
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-5  # the sibling lora_8k.yaml config uses 1.0e-4
+
+# performance
+tensor_model_parallel_size: 1
+pipeline_model_parallel_size: 1
+sharding: stage1
+recompute_granularity: full
+recompute_method: uniform
+recompute_num_layers: 1
+bf16: true
+fp16_opt_level: O2  # NOTE(review): fp16-named option next to bf16: true — presumably the AMP level also applies to bf16; confirm
+load_checkpoint_format: flex_checkpoint
+save_checkpoint_format: flex_checkpoint  # same format as load_checkpoint_format
+
+device: xpu  # matches this file's location under examples/config/xpu/
diff --git a/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml b/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml
@@ -0,0 +1,58 @@
+### data (this file: LoRA SFT of ERNIE-4.5-0.3B on XPU, 8k sequence length)
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./tests/fixtures/dummy/sft/train.jsonl  # dummy test fixture, relative path; point at real data for an actual run
+train_dataset_prob: "1.0"  # quoted, so YAML loads it as the string "1.0", not a float
+eval_dataset_path: ./tests/fixtures/dummy/sft/eval.jsonl  # dummy test fixture, relative path
+eval_dataset_prob: "1.0"  # quoted string, same as train_dataset_prob
+max_seq_len: 8192  # the "8k" in the file name
+packing: false
+mix_strategy: concat
+template_backend: custom
+template: ernie_nothink
+
+### model
+model_name_or_path: baidu/ERNIE-4.5-0.3B-PT
+attn_impl: flashmask
+lora: true
+lora_rank: 8
+
+### finetuning
+# base
+stage: SFT
+fine_tuning: lora  # lowercase here; lora_8k_export.yaml spells it "LoRA" — NOTE(review): confirm both spellings are accepted
+seed: 23
+do_train: true
+do_eval: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1  # NOTE(review): presumably -1 disables the step cap so num_train_epochs governs — confirm
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100  # same interval as eval_steps
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/ernie-0.3b-sft-lora-8k  # lora_8k_export.yaml sets output_dir to this same directory (without the leading ./)
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-4  # the sibling full_8k.yaml config uses 1.0e-5
+
+# performance
+tensor_model_parallel_size: 1
+pipeline_model_parallel_size: 1
+sharding: stage1
+recompute_granularity: full
+recompute_method: uniform
+recompute_num_layers: 1
+bf16: true
+fp16_opt_level: O2  # NOTE(review): fp16-named option next to bf16: true — presumably the AMP level also applies to bf16; confirm
+load_checkpoint_format: flex_checkpoint
+save_checkpoint_format: flex_checkpoint  # same format as load_checkpoint_format
+
+device: xpu  # matches this file's location under examples/config/xpu/
diff --git a/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k_export.yaml b/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k_export.yaml
@@ -0,0 +1,6 @@
+### model (companion to lora_8k.yaml; NOTE(review): presumably consumed by a LoRA export step, per the file name — confirm)
+fine_tuning: LoRA  # NOTE(review): lora_8k.yaml spells this "lora" — confirm both spellings are accepted
+model_name_or_path: baidu/ERNIE-4.5-0.3B-PT  # same base model as lora_8k.yaml
+output_dir: checkpoints/ernie-0.3b-sft-lora-8k  # same directory as output_dir in lora_8k.yaml (written there with a leading ./)
+
+device: xpu  # matches this file's location under examples/config/xpu/
diff --git a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/32k.yaml b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/full_32k.yaml
@@ -8,6 +8,8 @@ eval_dataset_prob: "1.0"
 max_seq_len: 32768
 packing: true
 mix_strategy: concat
+template_backend: custom
+template: ernie_nothink
 
 ### model
 model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT
@@ -31,7 +33,7 @@ save_strategy: steps
 logging_steps: 1
 gradient_accumulation_steps: 4
 logging_dir: ./vdl_log
-output_dir: ./checkpoints/ernie-sft-full-tp-pp
+output_dir: ./checkpoints/ernie-21b-sft-32k
 disable_tqdm: true
 eval_accumulation_steps: 16
 
@@ -45,12 +47,13 @@ pipeline_model_parallel_size: 2
 sequence_parallel: true
 sharding: stage1
 offload_optim: false
-tensorwise_offload_optimizer: false
+tensorwise_offload_optimizer: true
 recompute_granularity: full
 recompute_method: uniform
 recompute_num_layers: 1
 bf16: true
 fp16_opt_level: O2
-unified_checkpoint: true
+load_checkpoint_format: flex_checkpoint
+save_checkpoint_format: flex_checkpoint
 
 device: xpu
diff --git a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml
@@ -0,0 +1,61 @@
+### data (this file: LoRA SFT of ERNIE-4.5-21B-A3B on XPU, 32k sequence length)
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./tests/fixtures/dummy/sft/train.jsonl  # dummy test fixture, relative path; point at real data for an actual run
+train_dataset_prob: "1.0"  # quoted, so YAML loads it as the string "1.0", not a float
+eval_dataset_path: ./tests/fixtures/dummy/sft/eval.jsonl  # dummy test fixture, relative path
+eval_dataset_prob: "1.0"  # quoted string, same as train_dataset_prob
+max_seq_len: 32768  # the "32k" in the file name
+packing: true
+mix_strategy: concat
+template_backend: custom
+template: ernie_nothink
+
+### model
+model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT
+attn_impl: flashmask
+lora: true
+lora_rank: 8
+
+### finetuning
+# base
+stage: SFT
+fine_tuning: lora  # lowercase here; lora_32k_export.yaml spells it "LoRA" — NOTE(review): confirm both spellings are accepted
+seed: 23
+do_train: true
+do_eval: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1  # NOTE(review): presumably -1 disables the step cap so num_train_epochs governs — confirm
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100  # same interval as eval_steps
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/ernie-21b-sft-lora-32k  # lora_32k_export.yaml sets output_dir to this same directory (without the leading ./)
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-4
+
+# performance
+tensor_model_parallel_size: 4  # run_lora_32k.sh exposes four devices via XPU_VISIBLE_DEVICES="0,1,2,3"
+pipeline_model_parallel_size: 1
+sequence_parallel: true
+sharding: stage1
+offload_optim: false
+tensorwise_offload_optimizer: false  # the full_32k.yaml hunk in this same change sets this to true
+recompute_granularity: full
+recompute_method: uniform
+recompute_num_layers: 1
+bf16: true
+fp16_opt_level: O2  # NOTE(review): fp16-named option next to bf16: true — presumably the AMP level also applies to bf16; confirm
+load_checkpoint_format: flex_checkpoint
+save_checkpoint_format: flex_checkpoint  # same format as load_checkpoint_format
+
+device: xpu  # matches this file's location under examples/config/xpu/
diff --git a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k_export.yaml b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k_export.yaml
@@ -0,0 +1,6 @@
+### model (companion to lora_32k.yaml; NOTE(review): presumably consumed by a LoRA export step, per the file name — confirm)
+fine_tuning: LoRA  # NOTE(review): lora_32k.yaml spells this "lora" — confirm both spellings are accepted
+model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT  # same base model as lora_32k.yaml
+output_dir: checkpoints/ernie-21b-sft-lora-32k  # same directory as output_dir in lora_32k.yaml (written there with a leading ./)
+
+device: xpu  # matches this file's location under examples/config/xpu/
diff --git a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/run_lora_32k.sh b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/run_lora_32k.sh
@@ -0,0 +1,16 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export XPU_VISIBLE_DEVICES="0,1,2,3"  # four device ids; lora_32k.yaml sets tensor_model_parallel_size: 4
+paddleformers-cli train examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml  # relative config path — NOTE(review): presumably meant to be run from the repository root; confirm