diff --git a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml new file mode 100644 index 00000000000..2bb001d40d2 --- /dev/null +++ b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml @@ -0,0 +1,69 @@ +### data +train_dataset_type: messages +eval_dataset_type: messages +train_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl +train_dataset_prob: "1.0" +eval_dataset_path: ./ocr_vl_sft-test_Bengali.jsonl +eval_dataset_prob: "1.0" +max_seq_len: 16384 +padding_free: True +truncate_packing: False +dataloader_num_workers: 8 +mix_strategy: concat +template_backend: custom +template: paddleocr_vl + +### model +model_name_or_path: PaddlePaddle/PaddleOCR-VL +attn_impl: flashmask + +### finetuning +# base +stage: VL-SFT +fine_tuning: full +seed: 23 +do_train: true +do_eval: true +per_device_eval_batch_size: 8 +per_device_train_batch_size: 8 +num_train_epochs: 2 +max_steps: -1 +max_estimate_samples: 500 +eval_steps: 400 +evaluation_strategy: steps +save_steps: 400 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 8 +logging_dir: ./PaddleOCR-VL-SFT-Bengali/visualdl_logs/ +output_dir: ./PaddleOCR-VL-SFT-Bengali +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +lr_scheduler_type: cosine +warmup_ratio: 0.01 +learning_rate: 5.0e-6 +min_lr: 5.0e-7 + +# optimizer +weight_decay: 0.1 +adam_epsilon: 1.0e-8 +adam_beta1: 0.9 +adam_beta2: 0.95 + +# performance +tensor_model_parallel_size: 1 +pipeline_model_parallel_size: 1 +sharding: stage1 +recompute_granularity: full +recompute_method: uniform +recompute_num_layers: 1 +bf16: true +fp16_opt_level: O2 +pre_alloc_memory: 24 + +# save +unified_checkpoint: False +save_checkpoint_format: "flex_checkpoint" +load_checkpoint_format: "flex_checkpoint" \ No newline at end of file diff --git a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml new file mode 100644 index 00000000000..6f4cbf00a0c --- /dev/null +++ b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml @@ -0,0 +1,71 @@ +### data +train_dataset_type: messages +eval_dataset_type: messages +train_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl +train_dataset_prob: "1.0" +eval_dataset_path: ./ocr_vl_sft-test_Bengali.jsonl +eval_dataset_prob: "1.0" +max_seq_len: 16384 +padding_free: True +truncate_packing: False +dataloader_num_workers: 8 +mix_strategy: concat +template_backend: custom +template: paddleocr_vl + +### model +model_name_or_path: PaddlePaddle/PaddleOCR-VL +attn_impl: flashmask +lora: true +lora_rank: 8 + +### finetuning +# base +stage: VL-SFT +fine_tuning: lora +seed: 23 +do_train: true +do_eval: true +per_device_eval_batch_size: 8 +per_device_train_batch_size: 8 +num_train_epochs: 2 +max_steps: -1 +max_estimate_samples: 500 +eval_steps: 400 +evaluation_strategy: steps +save_steps: 400 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 8 +logging_dir: ./PaddleOCR-VL-SFT-Bengali-lora/visualdl_logs/ +output_dir: ./PaddleOCR-VL-SFT-Bengali-lora +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +lr_scheduler_type: cosine +warmup_ratio: 0.01 +learning_rate: 5.0e-4 +min_lr: 5.0e-5 + +# optimizer +weight_decay: 0.1 +adam_epsilon: 1.0e-8 +adam_beta1: 0.9 +adam_beta2: 0.95 + +# performance +tensor_model_parallel_size: 1 +pipeline_model_parallel_size: 1 +sharding: stage1 +recompute_granularity: full +recompute_method: uniform +recompute_num_layers: 1 +bf16: true +fp16_opt_level: O2 +pre_alloc_memory: 16 + +# save +unified_checkpoint: false +save_checkpoint_format: "flex_checkpoint" +load_checkpoint_format: "flex_checkpoint" \ No newline at end of file diff --git a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_export.yaml b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_export.yaml new file mode 100644 index 00000000000..7ec0a8473de --- /dev/null +++ b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_export.yaml @@ -0,0 +1,4 @@ +### model +fine_tuning: LoRA +model_name_or_path: PaddlePaddle/PaddleOCR-VL +output_dir: ./PaddleOCR-VL-SFT-Bengali-lora \ No newline at end of file diff --git a/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_full_16k.sh b/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_full_16k.sh new file mode 100644 index 00000000000..311259a723d --- /dev/null +++ b/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_full_16k.sh @@ -0,0 +1,19 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-train_Bengali.jsonl +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-test_Bengali.jsonl + +CUDA_VISIBLE_DEVICES=0 \ +paddleformers-cli train examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml \ No newline at end of file diff --git a/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_full_16k_4090D.sh b/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_full_16k_4090D.sh new file mode 100644 index 00000000000..e0df24fa7e4 --- /dev/null +++ b/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_full_16k_4090D.sh @@ -0,0 +1,23 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-train_Bengali.jsonl +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-test_Bengali.jsonl + +CUDA_VISIBLE_DEVICES=0 \ +paddleformers-cli train examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml \ + per_device_train_batch_size=4 \ + per_device_eval_batch_size=4 \ + gradient_accumulation_steps=16 \ + pre_alloc_memory=18 \ No newline at end of file diff --git a/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_16k.sh b/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_16k.sh new file mode 100644 index 00000000000..29fa009d053 --- /dev/null +++ b/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_16k.sh @@ -0,0 +1,19 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-train_Bengali.jsonl +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-test_Bengali.jsonl + +CUDA_VISIBLE_DEVICES=0 \ +paddleformers-cli train examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml \ No newline at end of file diff --git a/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_16k_4090D.sh b/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_16k_4090D.sh new file mode 100644 index 00000000000..b07e31eec28 --- /dev/null +++ b/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_16k_4090D.sh @@ -0,0 +1,23 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-train_Bengali.jsonl +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-test_Bengali.jsonl + +CUDA_VISIBLE_DEVICES=0 \ +paddleformers-cli train examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml \ + per_device_train_batch_size=4 \ + per_device_eval_batch_size=4 \ + gradient_accumulation_steps=16 \ + pre_alloc_memory=12 \ No newline at end of file diff --git a/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_export.sh b/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_export.sh new file mode 100644 index 00000000000..6b0ea44bfde --- /dev/null +++ b/examples/best_practices/PaddleOCR-VL/run_paddleocr-vl_lora_export.sh @@ -0,0 +1,15 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +paddleformers-cli export examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_export.yaml \ No newline at end of file diff --git a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml new file mode 100644 index 00000000000..4fa8d6e0dfb --- /dev/null +++ b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml @@ -0,0 +1,72 @@ +### data +train_dataset_type: messages +eval_dataset_type: messages +train_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl +train_dataset_prob: "1.0" +eval_dataset_path: ./ocr_vl_sft-test_Bengali.jsonl +eval_dataset_prob: "1.0" +max_seq_len: 16384 +padding_free: True +truncate_packing: False +dataloader_num_workers: 8 +mix_strategy: concat +template_backend: custom +template: paddleocr_vl + +### model +model_name_or_path: PaddlePaddle/PaddleOCR-VL +attn_impl: sdpa + +### finetuning +# base +stage: VL-SFT +fine_tuning: full +seed: 23 +do_train: true +do_eval: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +num_train_epochs: 2 +max_steps: -1 +max_estimate_samples: 500 +eval_steps: 400 +evaluation_strategy: steps +save_steps: 400 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 32 +logging_dir: ./PaddleOCR-VL-SFT-Bengali/visualdl_logs/ +output_dir: ./PaddleOCR-VL-SFT-Bengali +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +lr_scheduler_type: cosine +warmup_ratio: 0.01 +learning_rate: 5.0e-6 +min_lr: 5.0e-7 + +# optimizer +weight_decay: 0.1 +adam_epsilon: 1.0e-8 +adam_beta1: 0.9 +adam_beta2: 0.95 + +# performance +tensor_model_parallel_size: 1 +pipeline_model_parallel_size: 1 +sharding: stage2 +recompute_granularity: full +recompute_method: uniform +recompute_num_layers: 1 +bf16: true +fp16_opt_level: O2 +pre_alloc_memory: 18 + +# save +unified_checkpoint: False +save_checkpoint_format: "flex_checkpoint" +load_checkpoint_format: "flex_checkpoint" + +# device +device: iluvatar_gpu \ No newline at end of file diff --git a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml new file mode 100644 index 00000000000..de7d9417d37 --- /dev/null +++ b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml @@ -0,0 +1,74 @@ +### data +train_dataset_type: messages +eval_dataset_type: messages +train_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl +train_dataset_prob: "1.0" +eval_dataset_path: ./ocr_vl_sft-test_Bengali.jsonl +eval_dataset_prob: "1.0" +max_seq_len: 16384 +padding_free: True +truncate_packing: False +dataloader_num_workers: 8 +mix_strategy: concat +template_backend: custom +template: paddleocr_vl + +### model +model_name_or_path: PaddlePaddle/PaddleOCR-VL +attn_impl: sdpa +lora: true +lora_rank: 8 + +### finetuning +# base +stage: VL-SFT +fine_tuning: lora +seed: 23 +do_train: true +do_eval: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +num_train_epochs: 2 +max_steps: -1 +max_estimate_samples: 500 +eval_steps: 400 +evaluation_strategy: steps +save_steps: 400 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 32 +logging_dir: ./PaddleOCR-VL-SFT-Bengali-lora/visualdl_logs/ +output_dir: ./PaddleOCR-VL-SFT-Bengali-lora +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +lr_scheduler_type: cosine +warmup_ratio: 0.01 +learning_rate: 5.0e-4 +min_lr: 5.0e-5 + +# optimizer +weight_decay: 0.1 +adam_epsilon: 1.0e-8 +adam_beta1: 0.9 +adam_beta2: 0.95 + +# performance +tensor_model_parallel_size: 1 +pipeline_model_parallel_size: 1 +sharding: stage2 +recompute_granularity: full +recompute_method: uniform +recompute_num_layers: 1 +bf16: true +fp16_opt_level: O2 +pre_alloc_memory: 12 + +# save +unified_checkpoint: false +save_checkpoint_format: "flex_checkpoint" +load_checkpoint_format: "flex_checkpoint" + +# device +device: iluvatar_gpu \ No newline at end of file diff --git a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_export.yaml b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_export.yaml new file mode 100644 index 00000000000..9fa099df864 --- /dev/null +++ b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_export.yaml @@ -0,0 +1,5 @@ +### model +fine_tuning: LoRA +model_name_or_path: PaddlePaddle/PaddleOCR-VL +output_dir: ./PaddleOCR-VL-SFT-Bengali-lora +device: iluvatar_gpu \ No newline at end of file diff --git a/examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_full_16k.sh b/examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_full_16k.sh new file mode 100644 index 00000000000..5703b3efc27 --- /dev/null +++ b/examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_full_16k.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-train_Bengali.jsonl +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-test_Bengali.jsonl + +CUDA_VISIBLE_DEVICES=0 \ +paddleformers-cli train examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml + \ No newline at end of file diff --git a/examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_lora_16k.sh b/examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_lora_16k.sh new file mode 100644 index 00000000000..7c5b48661b8 --- /dev/null +++ b/examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_lora_16k.sh @@ -0,0 +1,19 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-train_Bengali.jsonl +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-test_Bengali.jsonl + +CUDA_VISIBLE_DEVICES=0 \ +paddleformers-cli train examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml \ No newline at end of file diff --git a/examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_lora_export.sh b/examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_lora_export.sh new file mode 100644 index 00000000000..bbe7fd1d00e --- /dev/null +++ b/examples/config/iluvatar/PaddleOCR-VL/sft/run_paddleocr-vl_lora_export.sh @@ -0,0 +1,15 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +paddleformers-cli export examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_export.yaml \ No newline at end of file diff --git a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml new file mode 100644 index 00000000000..9aa887f5b50 --- /dev/null +++ b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml @@ -0,0 +1,72 @@ +### data +train_dataset_type: messages +eval_dataset_type: messages +train_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl +train_dataset_prob: "1.0" +eval_dataset_path: ./ocr_vl_sft-test_Bengali.jsonl +eval_dataset_prob: "1.0" +max_seq_len: 16384 +padding_free: True +truncate_packing: False +dataloader_num_workers: 8 +mix_strategy: concat +template_backend: custom +template: paddleocr_vl + +### model +model_name_or_path: PaddlePaddle/PaddleOCR-VL +attn_impl: flashmask + +### finetuning +# base +stage: VL-SFT +fine_tuning: full +seed: 23 +do_train: true +do_eval: true +per_device_eval_batch_size: 8 +per_device_train_batch_size: 8 +num_train_epochs: 2 +max_steps: -1 +max_estimate_samples: 500 +eval_steps: 400 +evaluation_strategy: steps +save_steps: 400 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 8 +logging_dir: ./PaddleOCR-VL-SFT-Bengali/visualdl_logs/ +output_dir: ./PaddleOCR-VL-SFT-Bengali +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +lr_scheduler_type: cosine +warmup_ratio: 0.01 +learning_rate: 5.0e-6 +min_lr: 5.0e-7 + +# optimizer +weight_decay: 0.1 +adam_epsilon: 1.0e-8 +adam_beta1: 0.9 +adam_beta2: 0.95 + +# performance +tensor_model_parallel_size: 1 +pipeline_model_parallel_size: 1 +sharding: stage1 +recompute_granularity: full +recompute_method: uniform +recompute_num_layers: 1 +bf16: true +fp16_opt_level: O2 +pre_alloc_memory: 24 + +# save +unified_checkpoint: False +save_checkpoint_format: "flex_checkpoint" +load_checkpoint_format: "flex_checkpoint" + +# device +device: xpu \ No newline at end of file diff --git a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml new file mode 100644 index 00000000000..093722c055f --- /dev/null +++ b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml @@ -0,0 +1,74 @@ +### data +train_dataset_type: messages +eval_dataset_type: messages +train_dataset_path: ./ocr_vl_sft-train_Bengali.jsonl +train_dataset_prob: "1.0" +eval_dataset_path: ./ocr_vl_sft-test_Bengali.jsonl +eval_dataset_prob: "1.0" +max_seq_len: 16384 +padding_free: True +truncate_packing: False +dataloader_num_workers: 8 +mix_strategy: concat +template_backend: custom +template: paddleocr_vl + +### model +model_name_or_path: PaddlePaddle/PaddleOCR-VL +attn_impl: flashmask +lora: true +lora_rank: 8 + +### finetuning +# base +stage: VL-SFT +fine_tuning: lora +seed: 23 +do_train: true +do_eval: true +per_device_eval_batch_size: 8 +per_device_train_batch_size: 8 +num_train_epochs: 2 +max_steps: -1 +max_estimate_samples: 500 +eval_steps: 400 +evaluation_strategy: steps +save_steps: 400 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 8 +logging_dir: ./PaddleOCR-VL-SFT-Bengali-lora/visualdl_logs/ +output_dir: ./PaddleOCR-VL-SFT-Bengali-lora +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +lr_scheduler_type: cosine +warmup_ratio: 0.01 +learning_rate: 5.0e-4 +min_lr: 5.0e-5 + +# optimizer +weight_decay: 0.1 +adam_epsilon: 1.0e-8 +adam_beta1: 0.9 +adam_beta2: 0.95 + +# performance +tensor_model_parallel_size: 1 +pipeline_model_parallel_size: 1 +sharding: stage1 +recompute_granularity: full +recompute_method: uniform +recompute_num_layers: 1 +bf16: true +fp16_opt_level: O2 +pre_alloc_memory: 16 + +# save +unified_checkpoint: false +save_checkpoint_format: "flex_checkpoint" +load_checkpoint_format: "flex_checkpoint" + +# device +device: xpu \ No newline at end of file diff --git a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_export.yaml b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_export.yaml new file mode 100644 index 00000000000..612e72bc19b --- /dev/null +++ b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_export.yaml @@ -0,0 +1,5 @@ +### model +fine_tuning: LoRA +model_name_or_path: PaddlePaddle/PaddleOCR-VL +output_dir: ./PaddleOCR-VL-SFT-Bengali-lora +device: xpu \ No newline at end of file diff --git a/examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_full_16k.sh b/examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_full_16k.sh new file mode 100644 index 00000000000..1eb9ce3c6be --- /dev/null +++ b/examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_full_16k.sh @@ -0,0 +1,22 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-train_Bengali.jsonl +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-test_Bengali.jsonl + +export FLAGS_use_stride_kernel=True + +CUDA_VISIBLE_DEVICES=0 \ +paddleformers-cli train examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml + \ No newline at end of file diff --git a/examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_lora_16k.sh b/examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_lora_16k.sh new file mode 100644 index 00000000000..e937b575bf5 --- /dev/null +++ b/examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_lora_16k.sh @@ -0,0 +1,21 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-train_Bengali.jsonl +wget https://paddleformers.bj.bcebos.com/datasets/ocr-vl/ocr_vl_sft-test_Bengali.jsonl + +export FLAGS_use_stride_kernel=True + +CUDA_VISIBLE_DEVICES=0 \ +paddleformers-cli train examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml \ No newline at end of file diff --git a/examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_lora_export.sh b/examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_lora_export.sh new file mode 100644 index 00000000000..58f38f1c9b3 --- /dev/null +++ b/examples/config/xpu/PaddleOCR-VL/sft/run_paddleocr-vl_lora_export.sh @@ -0,0 +1,15 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +paddleformers-cli export examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_export.yaml \ No newline at end of file diff --git a/paddleformers/utils/masking_utils.py b/paddleformers/utils/masking_utils.py index f0d41397eae..d5853604077 100644 --- a/paddleformers/utils/masking_utils.py +++ b/paddleformers/utils/masking_utils.py @@ -79,11 +79,11 @@ def _gen_from_sparse_attn_mask_indices( # [batch_size, k_num_heads, k_seq_len, {1, 2, 4}] -> [batch_size, k_num_heads, {1, 2, 4}, k_seq_len] mask_indices = attn_mask_startend_row_indices.transpose([0, 1, 3, 2]) - downstart_mask_indices = mask_indices[:, :, 0, :] + downstart_mask_indices = mask_indices[:, :, 0:1, :] downstart_mask_indices = downstart_mask_indices.expand([batch_size, num_head, seq_len, -1]) lower_tri = base < downstart_mask_indices if has_end: - downend_mask_indices = mask_indices[:, :, 1, :] + downend_mask_indices = mask_indices[:, :, 1:2, :] downend_mask_indices = downend_mask_indices.expand([batch_size, num_head, seq_len, -1]) lower_tri = paddle.logical_or(lower_tri, base >= downend_mask_indices) @@ -91,14 +91,14 @@ def _gen_from_sparse_attn_mask_indices( if not is_causal: if has_end: - upstart_mask_indices = mask_indices[:, :, 2, :] + upstart_mask_indices = mask_indices[:, :, 2:3, :] upstart_mask_indices = upstart_mask_indices.expand([batch_size, num_head, seq_len, -1]) - upend_mask_indices = mask_indices[:, :, 3, :] + upend_mask_indices = mask_indices[:, :, 3:4, :] upend_mask_indices = upend_mask_indices.expand([batch_size, num_head, seq_len, -1]) upper_tri = base >= upend_mask_indices upper_tri = paddle.logical_or(upper_tri, base < upstart_mask_indices) else: - upend_mask_indices = mask_indices[:, :, 1, :] + upend_mask_indices = mask_indices[:, :, 1:2, :] upend_mask_indices = upend_mask_indices.expand([batch_size, num_head, seq_len, -1]) upper_tri = base >= upend_mask_indices