diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ee001cb82..2a9149add 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -13,7 +13,7 @@ env: jobs: Lint: name: Lint - runs-on: [self-hosted, ernie-cpu] + runs-on: [self-hosted, ernie-cpu-01] permissions: pull-requests: write contents: read diff --git a/erniekit/cli.py b/erniekit/cli.py index 69e80528a..f572657c4 100644 --- a/erniekit/cli.py +++ b/erniekit/cli.py @@ -25,6 +25,7 @@ from .version.env import VERSION from .version import commit from .utils.process import terminate_process_tree, detect_device, set_ascend_environment +from .hparams import get_env_args script_dir = Path(__file__).parent.resolve() parent_dir = script_dir.parent @@ -139,7 +140,8 @@ def main(): os.environ["FLAGS_dataloader_use_file_descriptor"] = "False" if current_device == "xpu": - os.environ["FLAGS_use_stride_kernel"] = "1" + args = get_env_args() + os.environ["FLAGS_use_stride_kernel"] = str(args.FLAGS_use_stride_kernel) os.environ["XPU_PADDLE_L3_SIZE"] = "0" os.environ["XPUAPI_DEFAULT_SIZE"] = "2205258752" diff --git a/erniekit/hparams/__init__.py b/erniekit/hparams/__init__.py index bab867187..e6883f0e7 100644 --- a/erniekit/hparams/__init__.py +++ b/erniekit/hparams/__init__.py @@ -17,7 +17,14 @@ from .finetuning_args import FinetuningArguments from .generating_args import GeneratingArguments from .model_args import ModelArguments -from .parser import get_eval_args, get_export_args, get_server_args, get_train_args, read_args +from .parser import ( + get_eval_args, + get_export_args, + get_server_args, + get_train_args, + get_env_args, + read_args, +) from .server_args import ServerArguments __all__ = [ @@ -31,5 +38,6 @@ "get_eval_args", "get_server_args", "get_export_args", + "get_env_args", "read_args", ] diff --git a/erniekit/hparams/env_args.py b/erniekit/hparams/env_args.py new file mode 100644 index 000000000..ffb5357d7 --- /dev/null +++ b/erniekit/hparams/env_args.py @@ -0,0 +1,27 
@@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EnvConfigArguments:
+    """Environment-flag arguments, parsed on their own by get_env_args() in hparams/parser.py."""
+
+    FLAGS_use_stride_kernel: bool = field(  # cli.main() exports str(value) to os.environ when the device is "xpu"
+        default=False,  # off unless explicitly enabled (see the help text for the XPU performance rationale)
+        metadata={
+            "help": "Controls whether the Stride mechanism is enabled. Currently, enabling this mechanism on XPU may cause performance degradation, so it is disabled by default."
+        },
+    )
diff --git a/erniekit/hparams/parser.py b/erniekit/hparams/parser.py
index 0873d3ec5..c7239f1cb 100644
--- a/erniekit/hparams/parser.py
+++ b/erniekit/hparams/parser.py
@@ -38,6 +38,7 @@
 from .model_args import ModelArguments
 from .server_args import ServerArguments
 from .preprocess_args import End2EndProcessorArguments
+from .env_args import EnvConfigArguments
 
 _TRAIN_ARGS = [
     ModelArguments,
@@ -291,3 +292,12 @@ def get_export_args(
         _parse_export_args(args)
     )
     return model_args, data_args, generating_args, finetuning_args, export_args
+
+
+def get_env_args(
+    args: Optional[Union[dict[str, Any], list[str]]] = None  # forwarded unchanged to _parse_args
+) -> EnvConfigArguments:  # Parse only the EnvConfigArguments dataclass and return that single instance.
+    parser = PdArgumentParser(EnvConfigArguments)  # parser that knows only the environment-flag fields
+    allow_extra_keys = is_env_enabled("ALLOW_EXTRA_ARGS")  # NOTE(review): presumably unknown keys are rejected when this is off; verify against shared training configs
+    (env_args,) = _parse_args(parser, args=args, allow_extra_keys=allow_extra_keys)  # 1-tuple unpack: fails unless exactly one item is returned
+    return env_args
diff --git a/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft/run_sft_32k.yaml
b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft/run_sft_32k.yaml new file mode 100644 index 000000000..fca14c59a --- /dev/null +++ b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft/run_sft_32k.yaml @@ -0,0 +1,104 @@ +# +stage: VL-SFT + +# model +model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking/ +multimodal: true +fuse_linear: true +fuse_rms_norm: false +use_flash_attention: 1 +use_moe: true +fine_tuning: Full +use_sparse_head_and_loss_fn: true +use_recompute_loss_fn: true +moe_group: "mp" +moe_use_aux_free_update_coef: 0.0 +moe_aux_loss_lambda: 0.0 +moe_use_aux_free: true +moe_use_hard_gate: true +moe_multimodal_dispatch_use_allgather: v2-alltoall-unpad-text +pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer + +# data +train_dataset_path: "examples/data/sft_vl-train_demo1.jsonl" +train_dataset_prob: "1.0" +text_dataset_path: "" +text_dataset_prob: "" +max_seq_len: 32768 +num_samples_each_epoch: 10000000 +modality_ratio: "[1,1]" + +# preprocess +variable_resolution: 1 +pad_to_max_seqlen: 32768 +render_timestamp: true +serialize_output: false +one_sample_in_one_seq: true +chat_template: "ernie_vl_thinking" + +# dataloader +dataloader_num_workers: 1 + +# train +do_train: true +batch_size: 1 +prefetch_factor: 10 +seed: 42 +gradient_accumulation_steps: 4 +max_steps: 8000 +save_steps: 10000 +logging_steps: 1 +weight_decay: 0.1 +warmup_steps: 100 +output_dir: ./output +add_sys_token: true +same_data: true +freeze_config: "freeze_vision" +trigger_data_prob: 1.0 +from_scratch: 0 +gc_interval: 100000 +drop_history_with_k: true +overwrite_output_dir: true + +# optim +lr_scheduler_type: "cosine" +learning_rate: 1.0e-05 +min_lr: 1.0e-06 +moe_gate_lr_ratio: 0.01 +visual_ld: 0.9 +vit_lr_ratio: 0.9 +adam_beta2: 0.95 +adam_beta1: 0.9 +adam_epsilon: 1.0e-08 +scale_loss: 4096 + +# performance +sequence_parallel: 1 +use_sp_callback: true +tensor_parallel_degree: 4 +pipeline_parallel_degree: 2 +pp_need_data: true +pp_need_data_degree: 2 
+virtual_pp_degree: 1 +tensor_parallel_config: "sync_param sync_grad sync_moment" +pipeline_parallel_config: "enable_offload_queue enable_delay_scale_loss enable_overlap_p2p_comm best_unbalanced_scheduler" +disable_pipeline_warmup: false +sharding: "stage1" +sharding_parallel_config: "split_param enable_fuse_optimizer_states" +sharding_comm_buffer_size_MB: 2048 +save_sharding_stage1_model_include_freeze_params: true +offload_optim: false +tensorwise_offload_optimizer: false +unified_checkpoint_config: ignore_merge_optimizer +recompute: true +recompute_granularity: full +refined_recompute: "global:2" +pre_alloc_memory: 60 + +# amp +bf16: true +fp16_opt_level: "O2" +amp_master_grad: 1 + +# checkpoint +unified_checkpoint: true diff --git a/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft/run_sft_lora_32k.yaml b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft/run_sft_lora_32k.yaml new file mode 100644 index 000000000..8df6178de --- /dev/null +++ b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft/run_sft_lora_32k.yaml @@ -0,0 +1,105 @@ +# +stage: VL-SFT + +# model +model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking/ +multimodal: true +fuse_linear: true +fuse_rms_norm: false +use_flash_attention: 1 +use_moe: true +fine_tuning: LoRA +lora_rank: 32 +use_sparse_head_and_loss_fn: true +use_recompute_loss_fn: true +moe_group: "mp" +moe_use_aux_free_update_coef: 0.0 +moe_aux_loss_lambda: 0.0 +moe_use_aux_free: true +moe_use_hard_gate: true +moe_multimodal_dispatch_use_allgather: v2-alltoall-unpad-text +pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer + +# data +train_dataset_path: "examples/data/sft_vl-train_demo1.jsonl" +train_dataset_prob: "1.0" +text_dataset_path: "" +text_dataset_prob: "" +max_seq_len: 32768 +num_samples_each_epoch: 10000000 +modality_ratio: "[1,1]" + +# preprocess +variable_resolution: 1 +pad_to_max_seqlen: 32768 +render_timestamp: true +serialize_output: false +one_sample_in_one_seq: true +chat_template: 
"ernie_vl_thinking" + +# dataloader +dataloader_num_workers: 1 + +# train +do_train: true +batch_size: 1 +prefetch_factor: 10 +seed: 42 +gradient_accumulation_steps: 4 +max_steps: 8000 +save_steps: 10000 +logging_steps: 1 +weight_decay: 0.1 +warmup_steps: 100 +output_dir: ./output +add_sys_token: true +same_data: true +freeze_config: "freeze_vision" +trigger_data_prob: 1.0 +from_scratch: 0 +gc_interval: 100000 +drop_history_with_k: true +overwrite_output_dir: true + +# optim +lr_scheduler_type: "cosine" +learning_rate: 3.0e-04 +min_lr: 1.0e-06 +moe_gate_lr_ratio: 0.01 +visual_ld: 0.9 +vit_lr_ratio: 0.9 +adam_beta2: 0.95 +adam_beta1: 0.9 +adam_epsilon: 1.0e-08 +scale_loss: 4096 + +# performance +sequence_parallel: 1 +use_sp_callback: true +tensor_parallel_degree: 2 +pipeline_parallel_degree: 2 +pp_need_data: true +pp_need_data_degree: 2 +virtual_pp_degree: 1 +tensor_parallel_config: "sync_param sync_grad sync_moment" +pipeline_parallel_config: "enable_offload_queue enable_delay_scale_loss enable_overlap_p2p_comm best_unbalanced_scheduler" +disable_pipeline_warmup: false +sharding: "stage1" +sharding_parallel_config: "split_param enable_fuse_optimizer_states" +sharding_comm_buffer_size_MB: 2048 +save_sharding_stage1_model_include_freeze_params: true +offload_optim: false +tensorwise_offload_optimizer: false +unified_checkpoint_config: ignore_merge_optimizer +recompute: true +recompute_granularity: full +refined_recompute: "global:2" +pre_alloc_memory: 60 + +# amp +bf16: true +fp16_opt_level: "O2" +amp_master_grad: 1 + +# checkpoint +unified_checkpoint: true diff --git a/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft_function_call/run_sft_32k.yaml b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft_function_call/run_sft_32k.yaml new file mode 100644 index 000000000..6885a9ace --- /dev/null +++ b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft_function_call/run_sft_32k.yaml @@ -0,0 +1,108 @@ +# +stage: VL-SFT + +# model +model_name_or_path: 
baidu/ERNIE-4.5-VL-28B-A3B-Thinking/ +multimodal: true +fuse_linear: true +fuse_rms_norm: false +use_flash_attention: 1 +use_moe: true +fine_tuning: Full +use_sparse_head_and_loss_fn: true +use_recompute_loss_fn: true +moe_group: "mp" +moe_use_aux_free_update_coef: 0.0 +moe_aux_loss_lambda: 0.0 +moe_use_aux_free: true +moe_use_hard_gate: true +moe_multimodal_dispatch_use_allgather: v2-alltoall-unpad-text +pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer + +# data +dataset_name: "FunctionCallSFTReader" +train_dataset_path: "" +train_dataset_prob: "" +text_dataset_path: "examples/data/function-call-train.jsonl" +text_dataset_prob: "1.0" +max_seq_len: 32768 +num_samples_each_epoch: 10000000 +modality_ratio: "[1,1]" + +# preprocess +variable_resolution: 1 +pad_to_max_seqlen: 32768 +render_timestamp: true +serialize_output: false +one_sample_in_one_seq: true +chat_template: "ernie_vl_thinking" + +# dataloader +dataloader_num_workers: 1 + +# train +do_train: true +batch_size: 1 +prefetch_factor: 10 +seed: 42 +gradient_accumulation_steps: 4 +max_steps: 8000 +save_steps: 10000 +logging_steps: 1 +weight_decay: 0.1 +warmup_steps: 100 +output_dir: ./output +add_sys_token: true +same_data: true +freeze_config: "freeze_vision" +trigger_data_prob: 1.0 +from_scratch: 0 +gc_interval: 100000 +drop_history_with_k: true +overwrite_output_dir: true + +# optim +lr_scheduler_type: "cosine" +learning_rate: 1.0e-05 +min_lr: 1.0e-06 +moe_gate_lr_ratio: 0.01 +visual_ld: 0.9 +vit_lr_ratio: 0.9 +adam_beta2: 0.95 +adam_beta1: 0.9 +adam_epsilon: 1.0e-08 +scale_loss: 4096 + +# performance +sequence_parallel: 1 +use_sp_callback: true +tensor_parallel_degree: 4 +pipeline_parallel_degree: 2 +pp_need_data: true +pp_need_data_degree: 2 +virtual_pp_degree: 1 +tensor_parallel_config: "sync_param sync_grad sync_moment" +pipeline_parallel_config: "enable_offload_queue enable_delay_scale_loss enable_overlap_p2p_comm best_unbalanced_scheduler" +disable_pipeline_warmup: false 
+sharding: "stage1" +sharding_parallel_config: "split_param enable_fuse_optimizer_states" +sharding_comm_buffer_size_MB: 2048 +save_sharding_stage1_model_include_freeze_params: true +offload_optim: false +tensorwise_offload_optimizer: false +unified_checkpoint_config: ignore_merge_optimizer +recompute: true +recompute_granularity: full +refined_recompute: "global:2" +pre_alloc_memory: 60 + +# amp +bf16: true +fp16_opt_level: "O2" +amp_master_grad: 1 + +# checkpoint +unified_checkpoint: true + +# device flag +FLAGS_use_stride_kernel: false diff --git a/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft_function_call/run_sft_8k.yaml b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft_function_call/run_sft_8k.yaml new file mode 100644 index 000000000..2e1be5df6 --- /dev/null +++ b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft_function_call/run_sft_8k.yaml @@ -0,0 +1,105 @@ +# +stage: VL-SFT + +# model +model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking/ +multimodal: true +fuse_linear: true +fuse_rms_norm: false +use_flash_attention: 1 +use_moe: true +fine_tuning: Full +use_sparse_head_and_loss_fn: false +use_recompute_loss_fn: false +moe_group: "mp" +moe_use_aux_free_update_coef: 0.0 +moe_aux_loss_lambda: 0.0 +moe_use_aux_free: true +moe_use_hard_gate: true +moe_multimodal_dispatch_use_allgather: v2-alltoall-unpad-text +pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer + +# data +dataset_name: "FunctionCallSFTReader" +train_dataset_path: "" +train_dataset_prob: "" +text_dataset_path: "examples/data/function-call-train.jsonl" +text_dataset_prob: "1.0" +max_seq_len: 8192 +num_samples_each_epoch: 10000000 +modality_ratio: "[1,1]" + +# preprocess +variable_resolution: 1 +pad_to_max_seqlen: 8192 +render_timestamp: true +serialize_output: false +one_sample_in_one_seq: true +chat_template: "ernie_vl_thinking" + +# dataloader +dataloader_num_workers: 1 + +# train +do_train: true +batch_size: 1 +prefetch_factor: 10 +seed: 42 
+gradient_accumulation_steps: 4 +max_steps: 8000 +save_steps: 10000 +logging_steps: 1 +weight_decay: 0.1 +warmup_steps: 100 +output_dir: ./output +add_sys_token: true +same_data: true +freeze_config: "freeze_vision" +trigger_data_prob: 1.0 +from_scratch: 0 +gc_interval: 100000 +drop_history_with_k: true +overwrite_output_dir: true + +# optim +lr_scheduler_type: "cosine" +learning_rate: 1.0e-05 +min_lr: 1.0e-06 +moe_gate_lr_ratio: 0.01 +visual_ld: 0.9 +vit_lr_ratio: 0.9 +adam_beta2: 0.95 +adam_beta1: 0.9 +adam_epsilon: 1.0e-08 +scale_loss: 4096 + +# performance +sequence_parallel: 1 +use_sp_callback: true +tensor_parallel_degree: 4 +pipeline_parallel_degree: 2 +pp_need_data: true +pp_need_data_degree: 2 +virtual_pp_degree: 1 +tensor_parallel_config: "sync_param sync_grad sync_moment" +pipeline_parallel_config: "enable_offload_queue enable_delay_scale_loss enable_overlap_p2p_comm best_unbalanced_scheduler" +disable_pipeline_warmup: false +sharding: "stage1" +sharding_parallel_config: "split_param enable_fuse_optimizer_states" +sharding_comm_buffer_size_MB: 2048 +save_sharding_stage1_model_include_freeze_params: true +offload_optim: false +tensorwise_offload_optimizer: false +unified_checkpoint_config: ignore_merge_optimizer +use_recompute_moe: false +recompute: true +recompute_granularity: core_attn +pre_alloc_memory: 60 + +# amp +bf16: true +fp16_opt_level: "O2" +amp_master_grad: 1 + +# checkpoint +unified_checkpoint: true diff --git a/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft_function_call/run_sft_lora_32k.yaml b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft_function_call/run_sft_lora_32k.yaml new file mode 100644 index 000000000..57e68672f --- /dev/null +++ b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B-Thinking/sft_function_call/run_sft_lora_32k.yaml @@ -0,0 +1,106 @@ +# +stage: VL-SFT + +# model +model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking/ +multimodal: true +fuse_linear: true +fuse_rms_norm: false +use_flash_attention: 1 +use_moe: 
true +fine_tuning: LoRA +lora_rank: 32 +use_sparse_head_and_loss_fn: true +use_recompute_loss_fn: true +moe_group: "mp" +moe_use_aux_free_update_coef: 0.0 +moe_aux_loss_lambda: 0.0 +moe_use_aux_free: true +moe_use_hard_gate: true +moe_multimodal_dispatch_use_allgather: v2-alltoall-unpad-text +pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer + +# data +dataset_name: "FunctionCallSFTReader" +train_dataset_path: "" +train_dataset_prob: "" +text_dataset_path: "examples/data/function-call-train.jsonl" +text_dataset_prob: "1.0" +max_seq_len: 32768 +num_samples_each_epoch: 10000000 +modality_ratio: "[1,1]" + +# preprocess +variable_resolution: 1 +pad_to_max_seqlen: 32768 +render_timestamp: true +serialize_output: false +one_sample_in_one_seq: true +chat_template: "ernie_vl_thinking" + +# dataloader +dataloader_num_workers: 1 + +# train +do_train: true +batch_size: 1 +prefetch_factor: 10 +seed: 42 +gradient_accumulation_steps: 4 +max_steps: 8000 +save_steps: 10000 +logging_steps: 1 +weight_decay: 0.1 +warmup_steps: 100 +output_dir: ./output +add_sys_token: true +same_data: true +freeze_config: "freeze_vision" +trigger_data_prob: 1.0 +from_scratch: 0 +gc_interval: 100000 +drop_history_with_k: true +overwrite_output_dir: true + +# optim +lr_scheduler_type: "cosine" +learning_rate: 3.0e-04 +min_lr: 1.0e-06 +moe_gate_lr_ratio: 0.01 +visual_ld: 0.9 +vit_lr_ratio: 0.9 +adam_beta2: 0.95 +adam_beta1: 0.9 +adam_epsilon: 1.0e-08 +scale_loss: 4096 + +# performance +sequence_parallel: 1 +use_sp_callback: true +tensor_parallel_degree: 2 +pipeline_parallel_degree: 2 +pp_need_data: true +pp_need_data_degree: 2 +virtual_pp_degree: 1 +tensor_parallel_config: "sync_param sync_grad sync_moment" +pipeline_parallel_config: "enable_offload_queue enable_delay_scale_loss enable_overlap_p2p_comm best_unbalanced_scheduler" +disable_pipeline_warmup: false +sharding: "stage1" +sharding_parallel_config: "split_param enable_fuse_optimizer_states" +sharding_comm_buffer_size_MB: 
2048 +save_sharding_stage1_model_include_freeze_params: true +offload_optim: false +tensorwise_offload_optimizer: false +unified_checkpoint_config: ignore_merge_optimizer +recompute: true +recompute_granularity: full +refined_recompute: "global:2" +pre_alloc_memory: 60 + +# amp +bf16: true +fp16_opt_level: "O2" +amp_master_grad: 1 + +# checkpoint +unified_checkpoint: true diff --git a/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B/sft/run_sft_32k.yaml b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B/sft/run_sft_32k.yaml index 4c97aa5ad..8d980f499 100644 --- a/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B/sft/run_sft_32k.yaml +++ b/examples/configs/xpu/ERNIE-4.5-VL-28B-A3B/sft/run_sft_32k.yaml @@ -17,6 +17,7 @@ moe_aux_loss_lambda: 0.0 moe_use_aux_free: true moe_use_hard_gate: true moe_multimodal_dispatch_use_allgather: v2-alltoall-unpad-text +pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer # data train_dataset_path: "examples/data/sft_vl-train_demo1.jsonl" @@ -86,7 +87,8 @@ sharding_parallel_config: "split_param enable_fuse_optimizer_states" sharding_comm_buffer_size_MB: 2048 save_sharding_stage1_model_include_freeze_params: true offload_optim: false -use_recompute_moe: false +tensorwise_offload_optimizer: false +unified_checkpoint_config: ignore_merge_optimizer recompute: true recompute_granularity: full refined_recompute: "global:3"