From 9ec7337438d4dd6b6423eaede20094e55a7d757e Mon Sep 17 00:00:00 2001
From: caoyuanye <cao_dev@foxmail.com>
Date: Mon, 23 Mar 2026 19:57:22 +0800
Subject: [PATCH 1/8] =?UTF-8?q?=E6=94=AF=E6=8C=81interlm25?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddleformers/cli/utils/llm_utils.py          |    8 +
 paddleformers/transformers/__init__.py        |   12 +
 .../transformers/auto/configuration.py        |    3 +
 paddleformers/transformers/auto/modeling.py   |    1 +
 paddleformers/transformers/intern/__init__.py |   13 +
 .../transformers/intern/bert_padding_delte.py |  110 ++
 .../transformers/intern_lm2_5/__init__.py     |   45 +
 .../intern_lm2_5/configuration.py             |  117 ++
 .../transformers/intern_lm2_5/modeling.py     | 1459 +++++++++++++++++
 .../transformers/intern_lm2_5/tokenizer.py    |  219 +++
 tests/config/ci/interlm2_sft.yaml             |   59 +
 tests/integration_test/interlm_sft.sh         |   48 +
 .../intern_lm2_5/test_modeling.py             |  349 ++++
 .../intern_lm2_5/test_tokenizer.py            |   83 +
 14 files changed, 2526 insertions(+)
 create mode 100644 paddleformers/transformers/intern/__init__.py
 create mode 100644 paddleformers/transformers/intern/bert_padding_delte.py
 create mode 100644 paddleformers/transformers/intern_lm2_5/__init__.py
 create mode 100644 paddleformers/transformers/intern_lm2_5/configuration.py
 create mode 100644 paddleformers/transformers/intern_lm2_5/modeling.py
 create mode 100644 paddleformers/transformers/intern_lm2_5/tokenizer.py
 create mode 100644 tests/config/ci/interlm2_sft.yaml
 create mode 100644 tests/integration_test/interlm_sft.sh
 create mode 100644 tests/transformers/intern_lm2_5/test_modeling.py
 create mode 100644 tests/transformers/intern_lm2_5/test_tokenizer.py

diff --git a/paddleformers/cli/utils/llm_utils.py b/paddleformers/cli/utils/llm_utils.py
index ac158086ab3..1b50b050e71 100644
--- a/paddleformers/cli/utils/llm_utils.py
+++ b/paddleformers/cli/utils/llm_utils.py
@@ -416,6 +416,14 @@ def get_lora_target_modules(model):
             "model.visual.blocks.*mlp.up_proj.*",
             "model.visual.blocks.*mlp.down_proj.*",
         ]
+    elif model.config.model_type == "internlm2_5":
+        target_modules = [
+            ".*wqkv.*",
+            ".*wo.*",
+            ".*w1.*",
+            ".*w2.*",
+            ".*w3.*",
+        ]
     else:
         raise ValueError(f"Unknown base_model_prefix: {model.config.model_type}.")
     return target_modules
diff --git a/paddleformers/transformers/__init__.py b/paddleformers/transformers/__init__.py
index e9bbeb6dd0e..e42d5c6a42a 100644
--- a/paddleformers/transformers/__init__.py
+++ b/paddleformers/transformers/__init__.py
@@ -329,6 +329,17 @@
     ],
     "glm_ocr.processor": ["Glm46VProcessor"],
     "glm_ocr.image_processor": ["Glm46VImageProcessor"],
+    "intern_lm2_5.configuration": ["InternLM25Config"],
+    "intern_lm2_5.modeling": [
+        "InternLM25DecoderLayer",
+        "InternLM25Model",
+        "InternLM25ForCausalLM",
+        "InternLM25PretrainedModel",
+        "InternLM25ForSequenceClassification",
+        "InternLM25ForQuestionAnswering",
+        "InternLM25ForTokenClassification",
+    ],
+    "intern_lm2_5.tokenizer": ["InternLM25Tokenizer"],
 }
 
 if TYPE_CHECKING:
@@ -404,6 +415,7 @@
     from .phi3 import *
     from .gemma3_text import *
     from .glm_ocr import *
+    from .intern_lm2_5 import *
 else:
     sys.modules[__name__] = _LazyModule(
         __name__,
diff --git a/paddleformers/transformers/auto/configuration.py b/paddleformers/transformers/auto/configuration.py
index 289ff0fa46b..0fad6107864 100644
--- a/paddleformers/transformers/auto/configuration.py
+++ b/paddleformers/transformers/auto/configuration.py
@@ -57,6 +57,7 @@
         ("gemma3_text", "Gemma3TextConfig"),
         ("glm4v_moe", "Glm4vMoeConfig"),
         ("glm_ocr", "GlmOcrConfig"),
+        ("internlm2_5", "InternLM25Config"),
     ]
 )
 
@@ -83,6 +84,7 @@
         ("qwen3_vl_moe", "Qwen3VLMoe"),
         ("qwen3_vl_moe_text", "Qwen3VLMoeText"),
         ("glm_ocr", "GlmOcrForConditionalGeneration"),
+        ("internlm2_5", "InternLM25"),
     ]
 )
 
@@ -96,6 +98,7 @@
         ("qwen2_5_vl_text", "qwen2_5_vl"),
         ("qwen3_vl_text", "qwen3_vl"),
         ("qwen3_vl_moe_text", "qwen3_vl_moe"),
+        ("internlm2_5", "intern_lm2_5"),
     ]
 )
 
diff --git a/paddleformers/transformers/auto/modeling.py b/paddleformers/transformers/auto/modeling.py
index 887d76f3012..7502ef0be0d 100644
--- a/paddleformers/transformers/auto/modeling.py
+++ b/paddleformers/transformers/auto/modeling.py
@@ -74,6 +74,7 @@
         ("Gemma3", "gemma3_text"),
         ("Glm4vMoe", "glm4v_moe"),
         ("GlmOcr", "glm_ocr"),
+        ("InternLM2", "intern_lm2_5"),
     ]
 )
 
diff --git a/paddleformers/transformers/intern/__init__.py b/paddleformers/transformers/intern/__init__.py
new file mode 100644
index 00000000000..290f972cf31
--- /dev/null
+++ b/paddleformers/transformers/intern/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddleformers/transformers/intern/bert_padding_delte.py b/paddleformers/transformers/intern/bert_padding_delte.py
new file mode 100644
index 00000000000..f7aa97a6897
--- /dev/null
+++ b/paddleformers/transformers/intern/bert_padding_delte.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# reference from Dao-AILAB flash-attn
+# https://github.com/Dao-AILab/flash-attention/blob/74b0761ff7efc7b90d4e5aeb529c1b2a09a7458c/flash_attn/bert_padding.py#L38
+import operator
+from functools import reduce
+
+import paddle
+import paddle.nn.functional as F
+
+
+class IndexFirstAxis(paddle.autograd.PyLayer):
+    @staticmethod
+    def forward(ctx, input, indices):
+        ctx.save_for_backward(indices)
+        assert input.ndim >= 2
+        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
+        second_dim = reduce(operator.mul, other_shape, 1)
+        return paddle.take_along_axis(
+            arr=input.reshape([input.shape[0], -1]), axis=0, indices=indices.unsqueeze(-1).expand([-1, second_dim])
+        ).reshape([-1, *other_shape])
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Class Attribute: torch.autograd.function.FunctionCtx.saved_tensors, can not convert, please check whether it is torch.Tensor.*/torch.autograd.function.FunctionCtx.*/torch.distributions.Distribution.* and convert manually"""
+        (indices,) = ctx.saved_tensor()
+        assert grad_output.ndim >= 2
+        other_shape = grad_output.shape[1:]
+        grad_output = grad_output.reshape([grad_output.shape[0], -1])
+        grad_input = paddle.zeros(shape=[ctx.first_axis_dim, tuple(grad_output.shape)[1]], dtype=grad_output.dtype)
+
+        grad_input.put_along_axis_(
+            axis=0,
+            indices=indices.unsqueeze(-1).expand([-1, tuple(grad_output.shape)[1]]),
+            values=grad_output,
+        )
+        return grad_input.reshape([ctx.first_axis_dim, *other_shape]), None
+
+
+index_first_axis = IndexFirstAxis.apply
+
+
+class IndexPutFirstAxis(paddle.autograd.PyLayer):
+    @staticmethod
+    def forward(ctx, values, indices, first_axis_dim):
+        ctx.save_for_backward(indices)
+        assert indices.ndim == 1
+        assert values.ndim >= 2
+        output = paddle.zeros(shape=[first_axis_dim, *tuple(values.shape)[1:]], dtype=values.dtype)
+        output[indices] = values
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Class Attribute: torch.autograd.function.FunctionCtx.saved_tensors, can not convert, please check whether it is torch.Tensor.*/torch.autograd.function.FunctionCtx.*/torch.distributions.Distribution.* and convert manually"""
+        (indices,) = ctx.saved_tensor()
+        grad_values = grad_output[indices]
+        return grad_values, None, None
+
+
+index_put_first_axis = IndexPutFirstAxis.apply
+
+
+def unpad_input(hidden_states, attention_mask):
+    """
+    Arguments:
+        hidden_states: (batch, seqlen, ...)
+        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
+    Return:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
+        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
+        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
+        max_seqlen_in_batch: int
+    """
+    seqlens_in_batch = paddle.sum(attention_mask, axis=-1, dtype="int32")
+    indices = paddle.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = paddle.max(seqlens_in_batch).item()
+    cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, axis=0), [1, 0])
+
+    return (
+        index_first_axis(hidden_states.reshape([-1] + list(hidden_states.shape[2:])), indices),
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+def pad_input(hidden_states, indices, batch, seqlen):
+    """
+    Arguments:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
+        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
+        batch: int, batch size for the padded sequence.
+        seqlen: int, maximum sequence length for the padded sequence.
+    Return:
+        hidden_states: (batch, seqlen, ...)
+    """
+    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
+    return output.reshape([batch, seqlen] + list(output.shape[1:]))
diff --git a/paddleformers/transformers/intern_lm2_5/__init__.py b/paddleformers/transformers/intern_lm2_5/__init__.py
new file mode 100644
index 00000000000..f371a4ed966
--- /dev/null
+++ b/paddleformers/transformers/intern_lm2_5/__init__.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Package"""
+
+import sys
+from typing import TYPE_CHECKING
+
+from ...utils.lazy_import import _LazyModule
+
+import_structure = {
+    "tokenizer": ["InternLM25Tokenizer"],
+    "configuration": ["InternLM25Config"],
+    "modeling": [
+        "InternLM25DecoderLayer",
+        "InternLM25Model",
+        "InternLM25ForCausalLM",
+        "InternLM25ForSequenceClassification",
+        "InternLM25ForQuestionAnswering",
+        "InternLM25ForTokenClassification",
+        "InternLM25PretrainedModel",
+    ],
+}
+
+if TYPE_CHECKING:
+    from .configuration import *
+    from .modeling import *
+    from .tokenizer import *
+else:
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        import_structure,
+        module_spec=__spec__,
+    )
diff --git a/paddleformers/transformers/intern_lm2_5/configuration.py b/paddleformers/transformers/intern_lm2_5/configuration.py
new file mode 100644
index 00000000000..0a44bb36d9e
--- /dev/null
+++ b/paddleformers/transformers/intern_lm2_5/configuration.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" InternLM2.5 model configuration"""
+
+from paddleformers.transformers.configuration_utils import PretrainedConfig
+
+
+
+class InternLM25Config(PretrainedConfig):
+    model_type = "internlm2_5"
+    _auto_class = "AutoConfig"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=92550,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        bias=True,
+        rope_theta=10000,
+        rope_scaling=None,
+        attn_implementation=None,
+        dtype="bfloat16",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.bias = bias
+        import paddle
+        if isinstance(dtype, str):
+            dtype_map = {
+                "float32": paddle.float32,
+                "float16": paddle.float16,
+                "bfloat16": paddle.bfloat16,
+            }
+            self.dtype = dtype_map.get(dtype.lower(), paddle.float32)
+        else:
+            self.dtype = dtype
+
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.attn_implementation = attn_implementation
+        if self.attn_implementation is None:
+            self.attn_implementation = "eager"
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    # 检查ROPE位置算法的参数是否存在问题，提前终止异常参数
+    def _rope_scaling_validation(self):
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if (
+            rope_scaling_factor is None
+            or not isinstance(rope_scaling_factor, (float, int))
+            or rope_scaling_factor < 1.0
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor} "
+                f"of type {type(rope_scaling_factor)}"
+            )
diff --git a/paddleformers/transformers/intern_lm2_5/modeling.py b/paddleformers/transformers/intern_lm2_5/modeling.py
new file mode 100644
index 00000000000..c094c991af9
--- /dev/null
+++ b/paddleformers/transformers/intern_lm2_5/modeling.py
@@ -0,0 +1,1459 @@
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Paddle InternLM25 model."""
+import logging
+import math
+import queue
+import threading
+from typing import List, Optional, Tuple, Union
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+from paddle.distributed.fleet.recompute.recompute import recompute
+from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from paddleformers.transformers import PretrainedModel, register_base_model
+from paddleformers.transformers.activations import ACT2FN
+from paddleformers.transformers.model_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from .configuration import InternLM25Config
+from ..cache_utils import Cache, DynamicCache
+
+logger = logging.getLogger(__name__)
+
+try:
+    from paddleformers.generation.streamers import BaseStreamer
+except Exception:
+    BaseStreamer = None
+
+try:
+    from paddle.nn.functional.flash_attention import flash_attention as flash_attn_func
+    from paddle.nn.functional.flash_attention import flash_attn_unpadded as flash_attn_varlen_func
+    has_flash_attn = True
+except:
+    flash_attn_func, flash_attn_varlen_func = None, None
+    has_flash_attn = False
+
+try:
+    from ..intern.bert_padding_delte import index_first_axis, pad_input, unpad_input
+except ImportError:
+    def index_first_axis(tensor, index):
+        return tensor[index]
+
+    def pad_input(hidden_states, attention_mask):
+        return hidden_states
+
+    def unpad_input(hidden_states, attention_mask):
+        return hidden_states, attention_mask
+
+_CONFIG_FOR_DOC = "InternLM25Config"
+
+
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(axis=-1, dtype=paddle.int32)
+    indices = paddle.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, axis=0, dtype=paddle.int32), (1, 0))  
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+class InternLM25RMSNorm(nn.Layer):
+    """InternLM25RMSNorm is equivalent to T5LayerNorm."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        out_2 = paddle.create_parameter(
+            shape=paddle.ones(shape=hidden_size).shape,
+            dtype=paddle.ones(shape=hidden_size).numpy().dtype,
+            default_initializer=paddle.nn.initializer.Assign(paddle.ones(shape=hidden_size)),
+        )
+        out_2.stop_gradient = not True
+        self.weight = out_2
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.astype(paddle.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.astype(input_dtype)
+
+
+# 这里会有一些 bf16 到 float32的类型提升，是正常的，原版也是这样。最好不要优化这里了，如果不提升精度，会导致 准确率显著下降
+# 可以参考 https://github.com/huggingface/transformers/pull/29285
+
+class InternLM25RotaryEmbedding(nn.Layer):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        super().__init__()
+        self.scaling_factor = scaling_factor
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (paddle.arange(0, self.dim, 2, dtype=paddle.int64).astype("float32") / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistable=False)
+        self.max_seq_len_cached = max_position_embeddings
+
+
+    @paddle.no_grad()
+    def forward(self, x, position_ids):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        inv_freq_expanded = self.inv_freq[None, :, None].astype("float32").expand(
+            [position_ids.shape[0], self.inv_freq.shape[0], 1]
+        )
+        position_ids_expanded = position_ids[:, None, :].astype("float32")
+        freqs = (inv_freq_expanded @ position_ids_expanded).transpose([0, 2, 1])
+        emb = paddle.concat((freqs, freqs), axis=-1)
+        cos = emb.cos()
+        sin = emb.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class InternLM25LinearScalingRotaryEmbedding(InternLM25RotaryEmbedding):
+    def forward(self, x, position_ids):
+        position_ids = position_ids.astype("float32") / self.scaling_factor
+        cos, sin = super().forward(x, position_ids)
+        return cos, sin
+
+
+class InternLM25DynamicNTKScalingRotaryEmbedding(InternLM25RotaryEmbedding):
+    def forward(self, x, position_ids):
+        seq_len = paddle.max(position_ids) + 1
+        if seq_len > self.max_position_embeddings:
+            base = self.base * (
+                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            ) ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (paddle.arange(0, self.dim, 2, dtype=paddle.int64).astype("float32").to(x.place) / self.dim))
+            self.register_buffer("inv_freq", inv_freq, persistable=False)
+
+        cos, sin = super().forward(x, position_ids)
+        return cos, sin
+
+
+def rotate_half(x):
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return paddle.concat((-x2, x1), axis=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_axis=1):
+    cos = cos.unsqueeze(unsqueeze_axis)
+    sin = sin.unsqueeze(unsqueeze_axis)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class InternLM25MLP(nn.Layer):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False)
+        self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False)
+        self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x))
+
+        return down_proj
+
+
+def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor:
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand([batch, num_key_value_heads, n_rep, slen, head_dim])
+    return hidden_states.reshape([batch, num_key_value_heads * n_rep, slen, head_dim])
+
+
+class InternLM25Attention(nn.Layer):
+    def __init__(self, config: InternLM25Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.wqkv = nn.Linear(
+            self.hidden_size,
+            (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
+            bias_attr=config.bias,
+        )
+        self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias_attr=config.bias)
+
+        self._init_rope()
+
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
+            self.rotary_emb = InternLM25RotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "linear":
+                self.rotary_emb = InternLM25LinearScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            elif scaling_type == "dynamic":
+                self.rotary_emb = InternLM25DynamicNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        attention_mask: Optional[paddle.Tensor] = None,
+        position_ids: Optional[paddle.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,  
+        cache_position: Optional[paddle.Tensor] = None,
+    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
+        bsz, q_len, _ = hidden_states.shape
+
+        if self.config.pretraining_tp > 1:
+            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+            qkv_slices = self.wqkv.weight.split(key_value_slicing, axis=0)
+            qkv_states = paddle.concat(
+                [F.linear(hidden_states, qkv_slice) for qkv_slice in qkv_slices], axis=-1  
+            )
+        else:
+            qkv_states = self.wqkv(hidden_states)
+
+        gs = 2 + self.num_key_value_groups
+        d = self.head_dim
+        h = qkv_states.shape[-1] // (gs * d)
+        qkv_states = qkv_states.reshape([bsz, q_len, h, gs, d])
+
+        query_states = qkv_states[..., : self.num_key_value_groups, :]
+        query_states = query_states.reshape([bsz, q_len, -1, self.head_dim]).transpose([0, 2, 1, 3])
+        key_states = qkv_states[..., -2, :].transpose([0, 2, 1, 3])
+        value_states = qkv_states[..., -1, :].transpose([0, 2, 1, 3])
+
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).to(query_states.dtype)
+        attn_output = paddle.matmul(attn_weights, value_states)
+
+        if attn_output.shape != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.shape}"
+            )
+
+        attn_output = attn_output.transpose([0, 2, 1, 3])
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        if self.config.pretraining_tp > 1:
+            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, axis=2)
+            o_proj_slices = self.wo.weight.split(self.hidden_size // self.config.pretraining_tp, axis=1)
+            attn_output = sum(
+                [
+                    F.linear(attn_output[i], o_proj_slices[i])  
+                    for i in range(self.config.pretraining_tp)
+                ]
+            )
+        else:
+            attn_output = self.wo(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+class InternLM25FlashAttention2(InternLM25Attention):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._flash_attn_uses_top_left_mask = not False
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        attention_mask: Optional[paddle.Tensor] = None,
+        position_ids: Optional[paddle.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[paddle.Tensor] = None,
+    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.shape
+
+        qkv_states = self.wqkv(hidden_states)
+
+        gs = 2 + self.num_key_value_groups
+        d = self.head_dim
+        h = qkv_states.shape[-1] // (gs * d)
+        qkv_states = qkv_states.reshape([bsz, q_len, h, gs, d])
+
+        query_states = qkv_states[..., : self.num_key_value_groups, :]
+        query_states = query_states.reshape([bsz, q_len, -1, self.head_dim])
+        key_states = qkv_states[..., -2, :]
+        value_states = qkv_states[..., -1, :]
+
+        query_states = query_states.transpose([0, 2, 1, 3])
+        key_states = key_states.transpose([0, 2, 1, 3])
+        value_states = value_states.transpose([0, 2, 1, 3])
+
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        query_states = query_states.transpose([0, 2, 1, 3])
+        key_states = key_states.transpose([0, 2, 1, 3])
+        value_states = value_states.transpose([0, 2, 1, 3])
+
+        dropout_rate = 0.0
+
+        input_dtype = query_states.dtype
+        if input_dtype == paddle.float32:
+            if False:
+                target_dtype = paddle.float32
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.wqkv.weight.dtype
+
+            logger.warning(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = self._flash_attention_forward(
+            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.wo(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value  
+
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            causal = self.is_causal and query_length != 1
+
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            attn_output_unpad = flash_attn_varlen_func(  
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)  
+        else:
+            attn_output = flash_attn_func(  
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+
+        return attn_output
+
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+        key_layer = index_first_axis(  
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(  
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(  
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = paddle.arange(
+                batch_size + 1, dtype=paddle.int32
+            )
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(  
+                query_layer, attention_mask
+            )
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
+class InternLM25SdpaAttention(InternLM25Attention):
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        attention_mask: Optional[paddle.Tensor] = None,
+        position_ids: Optional[paddle.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[paddle.Tensor] = None,
+    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
+        if output_attentions:
+
+            logger.warning(
+                "InternLM25Model uses InternLM25SdpaAttention, but `paddle.nn.functional.scaled_dot_product_attention` "
+                "does not support `output_attentions=True`. "
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+
+        bsz, q_len, _ = hidden_states.shape
+
+        qkv_states = self.wqkv(hidden_states)
+
+        gs = 2 + self.num_key_value_groups
+        d = self.head_dim
+        h = qkv_states.shape[-1] // (gs * d)
+        qkv_states = qkv_states.reshape([bsz, q_len, h, gs, d])
+
+        query_states = qkv_states[..., : self.num_key_value_groups, :]
+        query_states = query_states.reshape([bsz, q_len, -1, self.head_dim])
+        key_states = qkv_states[..., -2, :]
+        value_states = qkv_states[..., -1, :]
+
+        query_states = query_states.transpose([0, 2, 1, 3])
+        key_states = key_states.transpose([0, 2, 1, 3])
+        value_states = value_states.transpose([0, 2, 1, 3])
+
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+        if query_states.place.type == "cuda" and causal_mask is not None:
+            query_states = query_states
+            key_states = key_states
+            value_states = value_states
+
+        is_causal = bool(causal_mask is None and q_len > 1)
+
+        attn_output = paddle.nn.functional.scaled_dot_product_attention(  
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=0.0,
+            is_causal=is_causal,
+        )
+
+        attn_output = attn_output.transpose([0, 2, 1, 3])
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.wo(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+INTERNLM25_ATTENTION_CLASSES = {
+    "eager": InternLM25Attention,
+    "flash_attention_2": InternLM25FlashAttention2,
+    "sdpa": InternLM25SdpaAttention,
+}
+
+
+class InternLM25DecoderLayer(nn.Layer):
+    def __init__(self, config: InternLM25Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.layer_idx = layer_idx
+
+        self.attention = INTERNLM25_ATTENTION_CLASSES[config.attn_implementation](config=config, layer_idx=layer_idx)
+
+        self.feed_forward = InternLM25MLP(config)
+        self.attention_norm = InternLM25RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.ffn_norm = InternLM25RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        attention_mask: Optional[paddle.Tensor] = None,
+        position_ids: Optional[paddle.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[paddle.Tensor] = None,
+    ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]:
+        residual = hidden_states
+
+        hidden_states = self.attention_norm(hidden_states)
+
+        hidden_states, self_attn_weights, present_key_value = self.attention(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.ffn_norm(hidden_states)
+        hidden_states = self.feed_forward(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+class InternLM25PretrainedModel(PretrainedModel):
+    config_class = InternLM25Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["InternLM25DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+    transpose_weight_keys = ["wqkv", "wo", "w1", "w2", "w3", "output"]
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            paddle.nn.initializer.Normal(mean=0.0, std=std)(module.weight)
+            if module.bias is not None:
+                paddle.nn.initializer.Constant(0.0)(module.bias)
+        elif isinstance(module, nn.Embedding):
+            paddle.nn.initializer.Normal(mean=0.0, std=std)(module.weight)
+            if module._padding_idx is not None:
+                module.weight[module._padding_idx].zero_()
+
+    @classmethod
+    def _gen_aoa_config(cls, config: InternLM25Config):
+        """Generate AOA (Auto-Transpose-Adapter) config for loading HuggingFace checkpoints."""
+        # 禁用AOA以解决tok_embeddings.weight未分配的问题
+        return {"aoa_statements": []}
+
+@register_base_model
+class InternLM25Model(InternLM25PretrainedModel):
+    _auto_class = "AutoModel"
+
+    def __init__(self, config: InternLM25Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.config = config
+
+        self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+
+        self.layers = nn.LayerList(
+            [InternLM25DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = InternLM25RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.enable_recompute = False
+
+    def get_input_embeddings(self):
+        return self.tok_embeddings
+
+    def set_input_embeddings(self, value):
+        self.tok_embeddings = value
+
+    # 原始代码使用 HF 的 self._gradient_checkpointing_func(decoder_layer.__call__, ...)
+    # PaddleFormers 使用 paddle.distributed.fleet.recompute.recompute 替代
+    @paddle.jit.not_to_static
+    def recompute_training_full(
+        self,
+        layer_module: nn.Layer,
+        hidden_states: paddle.Tensor,
+        causal_mask: Optional[paddle.Tensor],
+        position_ids: Optional[paddle.Tensor],
+        past_key_values: Optional[Cache],
+        output_attentions: bool,
+        use_cache: bool,
+        cache_position: Optional[paddle.Tensor],
+    ):
+        def create_custom_forward(module):
+            def custom_forward(*inputs):
+                return module(*inputs)
+            return custom_forward
+
+        layer_outputs = recompute(
+            create_custom_forward(layer_module),
+            hidden_states,
+            causal_mask,
+            position_ids,
+            past_key_values,
+            output_attentions,
+            use_cache,
+            cache_position,
+        )
+        return layer_outputs
+
+    def forward(
+        self,
+        input_ids: paddle.Tensor = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        position_ids: Optional[paddle.Tensor] = None,
+        past_key_values: Optional[Union[Cache, List[paddle.Tensor]]] = None,
+        inputs_embeds: Optional[paddle.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[paddle.Tensor] = None,
+        # PaddleFormers SFT trainer may pass extra kwargs like attn_mask_startend_row_indices;
+        # accept and ignore them here for compatibility.
+        **kwargs,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if self.enable_recompute and self.training and use_cache:
+            logger.warning(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.tok_embeddings(input_ids)
+
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None or len(past_key_values) == 0:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache(ddp_cache_data=past_key_values)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = paddle.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1]
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        hidden_states = inputs_embeds
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.enable_recompute and self.training:
+                # 原始HF代码: self._gradient_checkpointing_func(decoder_layer.__call__, ...)
+                # PaddleFormers使用paddle recompute替代
+                layer_outputs = self.recompute_training_full(
+                    decoder_layer,
+                    hidden_states,
+                    causal_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = tuple((layer.keys, layer.values) for layer in next_cache.layers)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    def _update_causal_mask(
+        self,
+        attention_mask: paddle.Tensor,
+        input_tensor: paddle.Tensor,
+        cache_position: paddle.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        if self.config.attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = False
+
+        if self.config.attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            pass
+
+        dtype, device = input_tensor.dtype, input_tensor.place
+        min_dtype = paddle.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, paddle.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        if attention_mask is not None and attention_mask.ndim == 4:
+            if attention_mask.max() != 0:
+                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
+            causal_mask = attention_mask
+        else:
+            causal_mask = paddle.full([sequence_length, target_length], fill_value=min_dtype, dtype=dtype)
+            if device is not None:
+                causal_mask = causal_mask.to(device)
+            if sequence_length != 1:
+                if dtype == paddle.float32:
+                    causal_mask = paddle.triu(causal_mask, diagonal=1)
+                else:
+                    triu_mask = paddle.triu(paddle.ones(causal_mask.shape).to(device), diagonal=1).astype("bool")
+                    causal_mask = paddle.where(triu_mask, causal_mask, paddle.zeros_like(causal_mask))
+            causal_mask *= (paddle.arange(target_length).to(device) > cache_position.reshape(-1, 1)).astype(dtype)
+            causal_mask = causal_mask[None, None, :, :].expand([input_tensor.shape[0], 1, -1, -1])
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].astype(causal_mask.dtype)
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+        if (
+            self.config.attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.place.type == "cuda"
+            and not output_attentions
+        ):
+            pass
+
+        return causal_mask
+
+
+class InternLM25ForCausalLM(InternLM25PretrainedModel):
+    _auto_class = "AutoModelForCausalLM"
+    _tied_weights_keys = ["output.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = InternLM25Model(config)
+        self.vocab_size = config.vocab_size
+        self.output = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False)
+
+    def get_input_embeddings(self):
+        return self.model.tok_embeddings
+
+    def set_input_embeddings(self, value):
+        self.model.tok_embeddings = value
+
+    def get_output_embeddings(self):
+        return self.output
+
+    def set_output_embeddings(self, new_embeddings):
+        self.output = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    def forward(
+        self,
+        input_ids: paddle.Tensor = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        position_ids: Optional[paddle.Tensor] = None,
+        past_key_values: Optional[Union[Cache, List[paddle.Tensor]]] = None,
+        inputs_embeds: Optional[paddle.Tensor] = None,
+        labels: Optional[paddle.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[paddle.Tensor] = None,
+        # PaddleFormers SFT trainer may pass extra kwargs like attn_mask_startend_row_indices;
+        # accept and ignore them here for compatibility.
+        **kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        hidden_states = outputs[0]
+        if self.config.pretraining_tp > 1:
+            output_slices = self.output.weight.split(self.vocab_size // self.config.pretraining_tp, axis=0)
+            logits = [
+                F.linear(hidden_states, output_slices[i])  
+                for i in range(self.config.pretraining_tp)
+            ]
+            logits = paddle.concat(logits, axis=-1)
+        else:
+            logits = self.output(hidden_states)
+        logits = logits.astype("float32")
+
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.reshape(-1, self.vocab_size)
+            shift_labels = shift_labels.reshape(-1)
+            shift_labels = shift_labels.to(shift_logits.place)
+            loss = loss_fct(shift_logits, shift_labels)
+            # DEBUG: log raw loss details for diagnosis
+            import os
+            if os.environ.get("INTERNLM25_DEBUG_LOSS"):
+                n_valid = (shift_labels != -100).sum().item()
+                n_total = shift_labels.shape[0]
+                print(
+                    f"[DEBUG LOSS] raw_loss={loss.item():.4f} n_valid={n_valid} n_total={n_total} "
+                    f"logits_shape={shift_logits.shape} labels_shape={shift_labels.shape} "
+                    f"logits_min={shift_logits.min().item():.4f} logits_max={shift_logits.max().item():.4f} "
+                    f"logits_mean={shift_logits.mean().item():.4f}",
+                    flush=True
+                )
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        past_length = 0
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
+                max_cache_length = (
+                    paddle.to_tensor(past_key_values.get_max_cache_shape(), place=input_ids.place if hasattr(input_ids, 'place') else None)
+                    if past_key_values.get_max_cache_shape() is not None
+                    else None
+                )
+                cache_length = past_length if max_cache_length is None else paddle.min(max_cache_length, past_length)
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]  
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            position_ids = attention_mask.astype("int64").cumsum(-1) - 1
+            position_ids = paddle.where(attention_mask == 0, paddle.ones_like(position_ids), position_ids)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
+        if cache_position is None:
+            cache_position = paddle.arange(past_length, past_length + input_length)
+        elif use_cache:
+            cache_position = cache_position[-input_length:]
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.place)) for past_state in layer_past),
+            )
+        return reordered_past
+
+    def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, meta_instruction=""):
+        if history is None:
+            history = []
+        if tokenizer.add_bos_token:
+            prompt = ""
+        else:
+            prompt = tokenizer.bos_token
+        if meta_instruction:
+            prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
+        for record in history:
+            prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
+        prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
+        return tokenizer([prompt], return_tensors="pd")
+
+    @paddle.no_grad()
+    def chat(
+        self,
+        tokenizer,
+        query: str,
+        history: Optional[List[Tuple[str, str]]] = None,
+        streamer: Optional[BaseStreamer] = None,
+        max_new_tokens: int = 1024,
+        do_sample: bool = True,
+        temperature: float = 0.8,
+        top_p: float = 0.8,
+        meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+        "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory "
+        "(上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+        "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such "
+        "as English and 中文.",
+        **kwargs,
+    ):
+        if history is None:
+            history = []
+        inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
+        inputs = {k: v for k, v in inputs.items() if paddle.is_tensor(v)}
+        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
+        outputs = self.generate(
+            **inputs,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=do_sample,
+            temperature=temperature,
+            top_p=top_p,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+        outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
+        response = tokenizer.decode(outputs, skip_special_tokens=True)
+        response = response.split("<|im_end|>")[0]
+        history = history + [(query, response)]
+        return response, history
+
+    @paddle.no_grad()
+    def stream_chat(
+        self,
+        tokenizer,
+        query: str,
+        history: List[Tuple[str, str]] = None,
+        max_new_tokens: int = 1024,
+        do_sample: bool = True,
+        temperature: float = 0.8,
+        top_p: float = 0.8,
+        **kwargs,
+    ):
+        if history is None:
+            history = []
+        if BaseStreamer is None:
+            raise ModuleNotFoundError(
+                "The version of `paddle` is too low."
+            )
+
+        response_queue = queue.Queue(maxsize=20)
+
+        class ChatStreamer(BaseStreamer):
+            def __init__(self, tokenizer) -> None:
+                super().__init__()
+                self.tokenizer = tokenizer
+                self.queue = response_queue
+                self.query = query
+                self.history = history
+                self.response = ""
+                self.cache = []
+                self.received_inputs = False
+                self.queue.put((self.response, history + [(self.query, self.response)]))
+
+            def put(self, value):
+                if len(value.shape) > 1 and value.shape[0] > 1:
+                    raise ValueError("ChatStreamer only supports batch size 1")
+                elif len(value.shape) > 1:
+                    value = value[0]
+
+                if not self.received_inputs:
+                    self.received_inputs = True
+                    return
+
+                self.cache.extend(value.tolist())
+                token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
+                if token.strip() != "<|im_end|>":
+                    self.response = self.response + token
+                    history = self.history + [(self.query, self.response)]
+                    self.queue.put((self.response, history))
+                    self.cache = []
+                else:
+                    self.end()
+
+            def end(self):
+                self.queue.put(None)
+
+        def stream_producer():
+            return self.chat(
+                tokenizer=tokenizer,
+                query=query,
+                streamer=ChatStreamer(tokenizer=tokenizer),
+                history=history,
+                max_new_tokens=max_new_tokens,
+                do_sample=do_sample,
+                temperature=temperature,
+                top_p=top_p,
+                **kwargs,
+            )
+
+        def consumer():
+            producer = threading.Thread(target=stream_producer)
+            producer.start()
+            while True:
+                res = response_queue.get()
+                if res is None:
+                    return
+                yield res
+
+        return consumer()
+
+
+class InternLM25ForSequenceClassification(InternLM25PretrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = InternLM25Model(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias_attr=False)
+    def get_input_embeddings(self):
+        return self.model.tok_embeddings
+
+    def set_input_embeddings(self, value):
+        self.model.tok_embeddings = value
+
+    def forward(
+        self,
+        input_ids: paddle.Tensor = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        position_ids: Optional[paddle.Tensor] = None,
+        past_key_values: Optional[Union[Cache, List[paddle.Tensor]]] = None,
+        inputs_embeds: Optional[paddle.Tensor] = None,
+        labels: Optional[paddle.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                sequence_lengths = paddle.equal(input_ids, self.config.pad_token_id).astype("int32").argmax(-1) - 1
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                sequence_lengths = sequence_lengths.to(logits.place)
+            else:
+                sequence_lengths = -1
+
+        pooled_logits = logits[paddle.arange(batch_size), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.place)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype in (paddle.int64, paddle.int32)):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.reshape(-1, self.num_labels), labels.reshape(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+class InternLM25ForQuestionAnswering(InternLM25PretrainedModel):
+    base_model_prefix = "transformer"
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = InternLM25Model(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+
+    def get_input_embeddings(self):
+        return self.transformer.tok_embeddings
+
+    def set_input_embeddings(self, value):
+        self.transformer.tok_embeddings = value
+
+    def forward(
+        self,
+        input_ids: Optional[paddle.Tensor] = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        position_ids: Optional[paddle.Tensor] = None,
+        past_key_values: Optional[Union[Cache, List[paddle.Tensor]]] = None,
+        inputs_embeds: Optional[paddle.Tensor] = None,
+        start_positions: Optional[paddle.Tensor] = None,
+        end_positions: Optional[paddle.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = paddle.split(logits, num_or_sections=2, axis=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            if len(start_positions.shape) > 1:
+                start_positions = start_positions.squeeze(-1).to(start_logits.place)
+            if len(end_positions.shape) > 1:
+                end_positions = end_positions.squeeze(-1).to(end_logits.place)
+            ignored_index = start_logits.shape[1]
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class InternLM25ForTokenClassification(InternLM25PretrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = InternLM25Model(config)
+        if getattr(config, "classifier_dropout", None) is not None:
+            classifier_dropout = config.classifier_dropout
+        elif getattr(config, "hidden_dropout", None) is not None:
+            classifier_dropout = config.hidden_dropout
+        else:
+            classifier_dropout = 0.1
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.score = nn.Linear(config.hidden_size, config.num_labels)
+
+    def get_input_embeddings(self):
+        return self.model.tok_embeddings
+
+    def set_input_embeddings(self, value):
+        self.model.tok_embeddings = value
+
+    def forward(
+        self,
+        input_ids: paddle.Tensor = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        position_ids: Optional[paddle.Tensor] = None,
+        past_key_values: Optional[List[paddle.Tensor]] = None,
+        inputs_embeds: Optional[paddle.Tensor] = None,
+        labels: Optional[paddle.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        sequence_output = self.dropout(sequence_output)
+        logits = self.score(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.reshape(-1, self.num_labels), labels.reshape(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# original config.json architectures field uses "InternLM2ForCausalLM" while the paddle
+# implementation class is InternLM25ForCausalLM; provide backward-compatible alias so
+# that Auto mapping (which does getattr(module, "InternLM2ForCausalLM")) resolves correctly.
+InternLM2ForCausalLM = InternLM25ForCausalLM
diff --git a/paddleformers/transformers/intern_lm2_5/tokenizer.py b/paddleformers/transformers/intern_lm2_5/tokenizer.py
new file mode 100644
index 00000000000..dfb4163e44f
--- /dev/null
+++ b/paddleformers/transformers/intern_lm2_5/tokenizer.py
@@ -0,0 +1,219 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization classes for InternLM2.5."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+try:
+    import sentencepiece as spm
+except ImportError:
+    spm = None
+
+from paddleformers.transformers.tokenizer_utils import PretrainedTokenizer
+from paddleformers.utils.log import logger
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+
+
+class InternLM25Tokenizer(PretrainedTokenizer):
+    resource_files_names = VOCAB_FILES_NAMES
+    # PretrainedTokenizer.from_pretrained() uses vocab_files_names to locate vocab files
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="</s>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        if spm is None:
+            raise ImportError(
+                "You need to install sentencepiece to use InternLM25Tokenizer. "
+                "See https://github.com/google/sentencepiece for installation."
+            )
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.decode_with_prefix_space = decode_with_prefix_space
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        self._no_prefix_space_tokens = None
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    @property
+    def no_prefix_space_tokens(self):
+        if self._no_prefix_space_tokens is None:
+            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
+        return self._no_prefix_space_tokens
+
+    @property
+    def vocab_size(self):
+        return self.sp_model.get_piece_size()
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text):
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        token = self.sp_model.IdToPiece(index)
+        return token
+
+    def _maybe_add_prefix_space(self, tokens, decoded):
+        if tokens and tokens[0] not in self.no_prefix_space_tokens:
+            return " " + decoded
+        else:
+            return decoded
+
+    def convert_tokens_to_string(self, tokens):
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for token in tokens:
+            if token in self.all_special_tokens:
+                if not prev_is_special:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        out_string = self.clean_up_tokenization(out_string)
+        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+        return out_string[1:]
+
+    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        if self.add_bos_token:
+            bos_token_ids = [self.bos_token_id]
+        else:
+            bos_token_ids = []
+
+        output = bos_token_ids + token_ids_0
+
+        if token_ids_1 is not None:
+            output = output + token_ids_1
+
+        if self.add_eos_token:
+            output = output + [self.eos_token_id]
+
+        return output
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        eos = [self.eos_token_id]
+
+        if token_ids_1 is None:
+            return len(token_ids_0 + eos) * [0]
+        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+
+    # 全参微调的时候，会调用到;
+    # 不要觉得没人调用，就删掉，因为 原版的transformers没有集成lm25的实现；
+    # 如果强行 使用 warp_tokenizer 会很多棘手问题处理，这里 直接借鉴了hg上原版的lm25的逻辑
+    def encode(
+            self,
+            text: None = None,
+            text_pair:  None = None,
+            add_special_tokens: bool = True,
+            padding: bool | str  = False,
+            truncation: bool | str | None = None,
+            max_length: int | None = None,
+            stride: int = 0,
+            padding_side: str | None = None,
+            return_tensors: str  | None = None,
+            **kwargs,
+    ) -> List[int]:
+        padding_strategy, truncation_strategy, max_length, kwargs_updated = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            **kwargs,
+        )
+
+        kwargs.update(kwargs_updated)
+
+        encoded_inputs = self._encode_plus(
+            text,
+            text_pair=text_pair,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        return encoded_inputs["input_ids"]
diff --git a/tests/config/ci/interlm2_sft.yaml b/tests/config/ci/interlm2_sft.yaml
new file mode 100644
index 00000000000..bc7f3401a30
--- /dev/null
+++ b/tests/config/ci/interlm2_sft.yaml
@@ -0,0 +1,59 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./tests/fixtures/dummy/sft/train.jsonl
+train_dataset_prob: "1.0"
+eval_dataset_path: ./tests/fixtures/dummy/sft/eval.jsonl
+eval_dataset_prob: "1.0"
+max_seq_len: 512
+packing: false
+dataloader_shuffle: false
+mix_strategy: concat
+template_backend: custom
+template: internlm2_5
+### model
+model_name_or_path: learncat/internlm2_5-1_8b-chat-paddle
+_attn_implementation: flashmask
+
+
+### finetuning
+# base
+stage: SFT
+fine_tuning: full
+seed: 23
+do_train: true
+do_eval: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: 500
+eval_steps: 1000
+evaluation_strategy: steps
+save_steps: 100000
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/qwen3-sft-full
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+
+# train
+warmup_steps: 5
+learning_rate: 1.0e-5
+
+# performance
+tensor_model_parallel_size: 1
+pipeline_model_parallel_size: 1
+sharding: stage1
+recompute_granularity: full
+recompute_method: uniform
+recompute_num_layers: 1
+bf16: true
+fp16_opt_level: O1
+unified_checkpoint: false
+# 注释之后跳过保存阶段
+# save_checkpoint_format: flex_checkpoint
+load_checkpoint_format: sharding_io
+continue_training: false
diff --git a/tests/integration_test/interlm_sft.sh b/tests/integration_test/interlm_sft.sh
new file mode 100644
index 00000000000..463aea59430
--- /dev/null
+++ b/tests/integration_test/interlm_sft.sh
@@ -0,0 +1,48 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TODO ，前期不在 .github/workflows/fleet-model-test.yml 中生效，避免直接卡死流程
+#  TODO，提交PR的时候，会提交loss对比材料
+
+set -exo pipefail
+export root_dir=$(pwd)
+
+if [ -f 'PaddleFleet/.venv/bin/activate' ]; then
+   source PaddleFleet/.venv/bin/activate
+fi
+
+config_sft_yaml=$root_dir/PaddleFormers/tests/config/ci/interlm2_sft.yaml
+
+if [[ ! -f "$config_sft_yaml" ]]; then
+  echo "Config file not found: $config_sft_yaml"
+  exit 1
+fi
+
+rm -rf ./outputs
+rm -rf paddleformers_dist_log
+master=$(hostname -i)
+port=36677
+
+export FLAGS_embedding_deterministic=1
+export FLAGS_cudnn_deterministic=1
+export FLAGS_use_stride_compute_kernel=False
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+unset http_proxy https_proxy
+
+log_file=interlm_sft.txt
+gt_loss_file=interlm_sft_multi_card_gt_loss.txt
+
+set +e
+NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port coverage run $(which paddleformers-cli) train $config_sft_yaml 2>&1 | tee ./${log_file}
\ No newline at end of file
diff --git a/tests/transformers/intern_lm2_5/test_modeling.py b/tests/transformers/intern_lm2_5/test_modeling.py
new file mode 100644
index 00000000000..7760cd5802d
--- /dev/null
+++ b/tests/transformers/intern_lm2_5/test_modeling.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import os
+import tempfile
+import unittest
+
+import numpy as np
+import paddle
+
+from paddleformers.transformers import (
+    InternLM25Config,
+    InternLM25ForCausalLM,
+    InternLM25Tokenizer,
+)
+from tests.testing_utils import slow, require_package
+
+aistudio_pt_lm25_model_location  = "learncat/internlm2_5-1_8b-chat-raw"
+aistudio_paddle_lm25_model_location = "learncat/internlm2_5-1_8b-chat-paddle"
+hg_lm25_model_location= "internlm/internlm2_5-1_8b-chat"
+
+# config层的常规测试
+class TestInternLM25Config(unittest.TestCase):
+    def test_config_custom_values(self):
+        config = InternLM25Config(
+            vocab_size=10000,
+            hidden_size=4096,
+            num_hidden_layers=32,
+            num_attention_heads=32,
+            intermediate_size=14336,
+        )
+        self.assertEqual(config.vocab_size, 10000)
+        self.assertEqual(config.hidden_size, 4096)
+        self.assertEqual(config.intermediate_size, 14336)
+
+    def test_config_save_and_load(self):
+        config = InternLM25Config(vocab_size=10000, hidden_size=4096)
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            config.save_pretrained(temp_dir)
+            loaded_config = InternLM25Config.from_pretrained(temp_dir)
+            self.assertEqual(config.vocab_size, loaded_config.vocab_size)
+            self.assertEqual(config.hidden_size, loaded_config.hidden_size)
+
+# model层的常规测试
+class InternLM25ModelTest(unittest.TestCase):
+    def setUp(self):
+        self.config = InternLM25Config(
+            vocab_size=1000,
+            hidden_size=256,
+            num_hidden_layers=2,
+            num_attention_heads=4,
+            num_key_value_heads=4,
+            intermediate_size=512,
+            max_position_embeddings=128,
+            use_cache=False,
+        )
+
+    def test_model_initialization(self):
+        model = InternLM25ForCausalLM(self.config)
+        self.assertIsNotNone(model)
+        self.assertEqual(model.config.vocab_size, 1000)
+        self.assertEqual(model.config.hidden_size, 256)
+
+    def test_model_forward(self):
+        model = InternLM25ForCausalLM(self.config)
+        model.eval()
+
+        batch_size = 2
+        seq_length = 10
+        input_ids = paddle.randint(0, self.config.vocab_size, [batch_size, seq_length])
+
+        with paddle.no_grad():
+            outputs = model(input_ids=input_ids, return_dict=True)
+
+        logits = outputs.logits
+        self.assertEqual(logits.shape, [batch_size, seq_length, self.config.vocab_size])
+
+    def test_model_generation(self):
+        model = InternLM25ForCausalLM(self.config)
+        model.eval()
+
+        input_ids = paddle.randint(0, self.config.vocab_size, [1, 5])
+
+        with paddle.no_grad():
+            generated_ids = model.generate(
+                input_ids=input_ids,
+                max_length=20,
+                min_length=10,
+                use_cache=False,
+            )
+
+        if isinstance(generated_ids, tuple):
+            generated_ids = generated_ids[0]
+
+        self.assertGreaterEqual(generated_ids.shape[1], 10)
+        self.assertLessEqual(generated_ids.shape[1], 20)
+
+    def test_model_save_and_load(self):
+        model = InternLM25ForCausalLM(self.config)
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            model.save_pretrained(temp_dir, save_checkpoint_format="", save_to_hf=False)
+
+            self.assertTrue(os.path.exists(os.path.join(temp_dir, "model_state.pdparams")))
+            self.assertTrue(os.path.exists(os.path.join(temp_dir, "config.json")))
+
+            loaded_model = InternLM25ForCausalLM.from_pretrained(temp_dir, load_checkpoint_format="")
+
+            self.assertEqual(model.config.vocab_size, loaded_model.config.vocab_size)
+            self.assertEqual(model.config.hidden_size, loaded_model.config.hidden_size)
+
+    def test_chat_method(self):
+        model = InternLM25ForCausalLM(self.config)
+        model.eval()
+        self.assertTrue(hasattr(model, "chat"))
+        self.assertTrue(hasattr(model, "build_inputs"))
+        self.assertTrue(hasattr(model, "stream_chat"))
+
+    def test_model_with_attention_mask(self):
+        model = InternLM25ForCausalLM(self.config)
+        model.eval()
+
+        batch_size = 2
+        seq_length = 10
+        input_ids = paddle.randint(0, self.config.vocab_size, [batch_size, seq_length])
+        attention_mask = paddle.ones([batch_size, seq_length])
+
+        with paddle.no_grad():
+            outputs = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
+
+        logits = outputs.logits
+        self.assertEqual(logits.shape, [batch_size, seq_length, self.config.vocab_size])
+
+    def test_model_with_past_key_values(self):
+        config = InternLM25Config(
+            vocab_size=1000,
+            hidden_size=256,
+            num_hidden_layers=2,
+            num_attention_heads=4,
+            num_key_value_heads=4,
+            intermediate_size=512,
+            max_position_embeddings=128,
+            use_cache=True,
+        )
+        model = InternLM25ForCausalLM(config)
+        model.eval()
+
+        batch_size = 1
+        seq_length = 5
+        input_ids = paddle.randint(0, config.vocab_size, [batch_size, seq_length])
+
+        with paddle.no_grad():
+            outputs = model(input_ids=input_ids, use_cache=True, return_dict=True)
+            past_key_values = outputs.past_key_values
+            next_input_ids = paddle.randint(0, config.vocab_size, [batch_size, 1])
+            outputs = model(
+                input_ids=next_input_ids,
+                past_key_values=past_key_values,
+                use_cache=True,
+                return_dict=True,
+            )
+
+        self.assertIsNotNone(outputs.past_key_values)
+
+
+# paddle直接加载 原始的 hg权重的测试，是否可以正常推理
+class InternLM25ConvertedWeightTest(unittest.TestCase):
+    def setUp(self):
+        self._original_dtype = paddle.get_default_dtype()
+        paddle.set_default_dtype("bfloat16")
+
+    def tearDown(self):
+        paddle.set_default_dtype(self._original_dtype)
+
+
+    # 使用paddle格式的权重，推理一次
+    @slow
+    def test_paddle_model_load_and_infer(self):
+        paddle.set_device("gpu")
+
+        model = InternLM25ForCausalLM.from_pretrained(
+            aistudio_paddle_lm25_model_location,
+            convert_from_hf=False,
+            dtype="bfloat16",
+            low_cpu_mem_usage=True,
+            load_checkpoint_format="",
+        )
+        model.eval()
+
+        tokenizer = InternLM25Tokenizer.from_pretrained(aistudio_paddle_lm25_model_location)
+
+        prompt = "猫和狗的区别是什么，列出主要的3点"
+        meta_instruction = "You are a helpful assistant. Please answer in plain text without markdown."
+        chat_inputs = model.build_inputs(
+            tokenizer, prompt, history=[], meta_instruction=meta_instruction
+        )
+
+        with paddle.no_grad():
+            out = model.generate(
+                input_ids=chat_inputs["input_ids"],
+                attention_mask=chat_inputs.get("attention_mask"),
+                max_new_tokens=128,
+                use_cache=True,
+                decode_strategy="greedy_search",
+            )
+
+        seq = out[0] if isinstance(out, (list, tuple)) else out
+
+        decoded = tokenizer.decode(seq.numpy().tolist()[0], skip_special_tokens=True)
+
+        print("\n" + "=" * 80)
+        print("Chinese Generation Test (Chat Mode)")
+        print("=" * 80)
+        print(f"Prompt: {prompt}")
+        print(f"Generated: {decoded}")
+        print("=" * 80 + "\n")
+
+        self.assertGreater(len(decoded.strip()), 0)
+
+    # 使用paddle框架，直接加载 pytorch原版的模型权重
+    @slow
+    def test_hf_direct_load_and_inference(self):
+        if not paddle.is_compiled_with_cuda():
+            self.skipTest("CUDA is required for this test")
+
+        paddle.set_device("gpu")
+        paddle.set_default_dtype("bfloat16")
+
+        model = InternLM25ForCausalLM.from_pretrained(
+            aistudio_pt_lm25_model_location,
+            convert_from_hf=True,
+            dtype="bfloat16",
+            low_cpu_mem_usage=True,
+            load_checkpoint_format="",
+        )
+        model.eval()
+        tokenizer = InternLM25Tokenizer.from_pretrained(aistudio_pt_lm25_model_location, load_checkpoint_format="")
+
+        prompt = "猫和狗的区别是什么，列出主要的3点"
+        meta_instruction = "You are a helpful assistant. Please answer in plain text without markdown."
+        inputs = model.build_inputs(tokenizer, prompt, history=[], meta_instruction=meta_instruction)
+        with paddle.no_grad():
+            outputs = model.generate(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs.get("attention_mask"),
+                max_new_tokens=128,
+                use_cache=True,
+                decode_strategy="greedy_search",
+            )
+
+        generated_ids = outputs[0] if isinstance(outputs, (tuple, list)) else outputs
+        decoded = tokenizer.decode(generated_ids[0].numpy().tolist(), skip_special_tokens=True)
+        print("\n[HF Direct Load] prompt:", prompt)
+        print("[HF Direct Load] response:", decoded)
+
+        self.assertIsNotNone(decoded)
+        self.assertGreater(len(decoded.strip()), 0)
+
+
+# 测试 paddle框架 和 transformers框架 的推理结果对比，直接固定随机数， 确认infer结果是否对齐
+class InternLM25CompatibilityTest(unittest.TestCase):
+    @classmethod
+    @require_package("transformers", "torch")
+    def setUpClass(cls) -> None:
+        import sys
+        if "transformers" in sys.modules:
+            del sys.modules["transformers"]
+
+        import torch
+        import shutil
+        cls.torch_model_path = tempfile.TemporaryDirectory().name
+
+        from transformers import AutoModelForCausalLM, AutoConfig
+
+        # 从远程获取 InternLM2.5 的配置类并创建小配置用于快速测试
+        # 远程加载 configuration_internlm2.py 中的 InternLM2Config 类
+        config = AutoConfig.from_pretrained(
+            hg_lm25_model_location,
+            trust_remote_code=True,
+            hidden_size=128,
+            intermediate_size=384,
+            num_hidden_layers=4,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            max_position_embeddings=128,
+            vocab_size=10000,
+            use_cache=True,
+        )
+
+        cls.torch_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
+        cls.torch_model.config.save_pretrained(cls.torch_model_path)
+        torch.save(cls.torch_model.state_dict(), f"{cls.torch_model_path}/pytorch_model.bin")
+
+    @require_package("transformers", "torch")
+    def test_intern_converter(self):
+        import torch
+        input_ids = np.random.randint(100, 200, [1, 20])
+
+        self.torch_model.eval()
+        torch_logit = self.torch_model(torch.tensor(input_ids), use_cache=False, return_dict=False)[0]
+
+        paddle_model = InternLM25ForCausalLM.from_pretrained(
+            self.torch_model_path, convert_from_hf=True, load_checkpoint_format=""
+        )
+        paddle_model.eval()
+        paddle_logit = paddle_model(paddle.to_tensor(input_ids), use_cache=False)[0]
+
+        paddle_out = paddle_logit.detach().cpu().reshape([-1])[:9].astype("float32").numpy()
+        torch_out = torch_logit.detach().cpu().reshape([-1])[:9].float().numpy()
+        max_diff = np.max(np.abs(paddle_out - torch_out))
+        print(f"\nMax diff: {max_diff}")
+
+        # !! 对齐前10个token是否一致！！
+        paddle_token_ids = paddle.argmax(paddle_logit, axis=-1).cpu().numpy()[0][:10]
+        torch_token_ids = torch.argmax(torch_logit, dim=-1).cpu().numpy()[0][:10]
+        print(f"Paddle token ids: {paddle_token_ids}")
+        print(f"Torch token ids:  {torch_token_ids}")
+        self.assertTrue(
+            np.array_equal(paddle_token_ids, torch_token_ids),
+            f"Token ids mismatch: paddle={paddle_token_ids}, torch={torch_token_ids}"
+        )
+
+        # 对齐推理的 1e-2 的容差
+        self.assertTrue(
+            np.allclose(paddle_out, torch_out, atol=1e-2, rtol=1e-2),
+            f"Max diff {max_diff} exceeds tolerance"
+        )
+
+
+
+
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/transformers/intern_lm2_5/test_tokenizer.py b/tests/transformers/intern_lm2_5/test_tokenizer.py
new file mode 100644
index 00000000000..f48f8d311b0
--- /dev/null
+++ b/tests/transformers/intern_lm2_5/test_tokenizer.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import tempfile
+import unittest
+
+from paddleformers.transformers import InternLM25Tokenizer
+
+model_path = "learncat/internlm2_5-1_8b-chat-paddle"
+
+
+class TestTokenizer(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        try:
+            cls.tokenizer = InternLM25Tokenizer.from_pretrained(model_path)
+        except Exception:
+            cls.tokenizer = None
+
+    def test_tokenizer_from_pretrained(self):
+        if self.tokenizer is None:
+            self.skipTest("Model path not available")
+        self.assertTrue(self.tokenizer is not None)
+
+    def test_tokenizer_save_pretrained(self):
+        if self.tokenizer is None:
+            self.skipTest("Model path not available")
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            special_tokens_dict = {"additional_special_tokens": ["[ENT_START]", "[ENT_END]"]}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+            self.tokenizer.add_tokens(["new_word", "another_word"])
+            self.tokenizer.model_max_length = 512
+            self.tokenizer.save_pretrained(tmpdir)
+            self.assertTrue(os.path.exists(os.path.join(tmpdir, "tokenizer_config.json")))
+
+    def test_tokenize(self):
+        if self.tokenizer is None:
+            self.skipTest("Model path not available")
+
+        text = "hello world, this is a tokenizer test"
+        output_dict = self.tokenizer(text)
+        decode_text = self.tokenizer.decode(output_dict, skip_special_tokens=True)
+        self.assertEqual(text, decode_text)
+
+    def test_tokenizer_vocab_size(self):
+        if self.tokenizer is None:
+            self.skipTest("Model path not available")
+
+        vocab_size = self.tokenizer.vocab_size
+        self.assertGreater(vocab_size, 0)
+
+    def test_tokenizer_bos_eos_tokens(self):
+        if self.tokenizer is None:
+            self.skipTest("Model path not available")
+
+        self.assertIsNotNone(self.tokenizer.bos_token_id)
+        self.assertIsNotNone(self.tokenizer.eos_token_id)
+
+    def test_tokenizer_build_inputs_with_special_tokens(self):
+        if self.tokenizer is None:
+            self.skipTest("Model path not available")
+
+        token_ids_0 = [1, 2, 3]
+        output = self.tokenizer.build_inputs_with_special_tokens(token_ids_0)
+        self.assertIsInstance(output, list)
+        self.assertGreater(len(output), len(token_ids_0))
+
+
+if __name__ == "__main__":
+    unittest.main()

From 53c9c855c536bc7e6c1b41174f1a3dc4db951521 Mon Sep 17 00:00:00 2001
From: caoyuanye <cao_dev@foxmail.com>
Date: Mon, 23 Mar 2026 20:15:48 +0800
Subject: [PATCH 2/8] =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../intern_lm2_5/configuration.py             |   2 +-
 .../transformers/intern_lm2_5/modeling.py     | 112 ++++++++++--------
 .../transformers/intern_lm2_5/tokenizer.py    |  24 ++--
 .../intern_lm2_5/test_modeling.py             |  28 ++---
 4 files changed, 87 insertions(+), 79 deletions(-)

diff --git a/paddleformers/transformers/intern_lm2_5/configuration.py b/paddleformers/transformers/intern_lm2_5/configuration.py
index 0a44bb36d9e..eb40311948c 100644
--- a/paddleformers/transformers/intern_lm2_5/configuration.py
+++ b/paddleformers/transformers/intern_lm2_5/configuration.py
@@ -17,7 +17,6 @@
 from paddleformers.transformers.configuration_utils import PretrainedConfig
 
 
-
 class InternLM25Config(PretrainedConfig):
     model_type = "internlm2_5"
     _auto_class = "AutoConfig"
@@ -56,6 +55,7 @@ def __init__(
         self.num_attention_heads = num_attention_heads
         self.bias = bias
         import paddle
+
         if isinstance(dtype, str):
             dtype_map = {
                 "float32": paddle.float32,
diff --git a/paddleformers/transformers/intern_lm2_5/modeling.py b/paddleformers/transformers/intern_lm2_5/modeling.py
index c094c991af9..e0d8c5922bb 100644
--- a/paddleformers/transformers/intern_lm2_5/modeling.py
+++ b/paddleformers/transformers/intern_lm2_5/modeling.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,8 +47,9 @@
     SequenceClassifierOutputWithPast,
     TokenClassifierOutput,
 )
-from .configuration import InternLM25Config
+
 from ..cache_utils import Cache, DynamicCache
+from .configuration import InternLM25Config
 
 logger = logging.getLogger(__name__)
 
@@ -45,7 +60,10 @@
 
 try:
     from paddle.nn.functional.flash_attention import flash_attention as flash_attn_func
-    from paddle.nn.functional.flash_attention import flash_attn_unpadded as flash_attn_varlen_func
+    from paddle.nn.functional.flash_attention import (
+        flash_attn_unpadded as flash_attn_varlen_func,
+    )
+
     has_flash_attn = True
 except:
     flash_attn_func, flash_attn_varlen_func = None, None
@@ -54,6 +72,7 @@
 try:
     from ..intern.bert_padding_delte import index_first_axis, pad_input, unpad_input
 except ImportError:
+
     def index_first_axis(tensor, index):
         return tensor[index]
 
@@ -63,6 +82,7 @@ def pad_input(hidden_states, attention_mask):
     def unpad_input(hidden_states, attention_mask):
         return hidden_states, attention_mask
 
+
 _CONFIG_FOR_DOC = "InternLM25Config"
 
 
@@ -70,7 +90,7 @@ def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(axis=-1, dtype=paddle.int32)
     indices = paddle.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
     max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, axis=0, dtype=paddle.int32), (1, 0))  
+    cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, axis=0, dtype=paddle.int32), (1, 0))
     return (
         indices,
         cu_seqlens,
@@ -103,6 +123,7 @@ def forward(self, hidden_states):
 # 这里会有一些 bf16 到 float32的类型提升，是正常的，原版也是这样。最好不要优化这里了，如果不提升精度，会导致 准确率显著下降
 # 可以参考 https://github.com/huggingface/transformers/pull/29285
 
+
 class InternLM25RotaryEmbedding(nn.Layer):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
         super().__init__()
@@ -110,16 +131,17 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, s
         self.dim = dim
         self.max_position_embeddings = max_position_embeddings
         self.base = base
-        inv_freq = 1.0 / (self.base ** (paddle.arange(0, self.dim, 2, dtype=paddle.int64).astype("float32") / self.dim))
+        inv_freq = 1.0 / (
+            self.base ** (paddle.arange(0, self.dim, 2, dtype=paddle.int64).astype("float32") / self.dim)
+        )
         self.register_buffer("inv_freq", inv_freq, persistable=False)
         self.max_seq_len_cached = max_position_embeddings
 
-
     @paddle.no_grad()
     def forward(self, x, position_ids):
         # x: [bs, num_attention_heads, seq_len, head_size]
-        inv_freq_expanded = self.inv_freq[None, :, None].astype("float32").expand(
-            [position_ids.shape[0], self.inv_freq.shape[0], 1]
+        inv_freq_expanded = (
+            self.inv_freq[None, :, None].astype("float32").expand([position_ids.shape[0], self.inv_freq.shape[0], 1])
         )
         position_ids_expanded = position_ids[:, None, :].astype("float32")
         freqs = (inv_freq_expanded @ position_ids_expanded).transpose([0, 2, 1])
@@ -143,7 +165,9 @@ def forward(self, x, position_ids):
             base = self.base * (
                 (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
             ) ** (self.dim / (self.dim - 2))
-            inv_freq = 1.0 / (base ** (paddle.arange(0, self.dim, 2, dtype=paddle.int64).astype("float32").to(x.place) / self.dim))
+            inv_freq = 1.0 / (
+                base ** (paddle.arange(0, self.dim, 2, dtype=paddle.int64).astype("float32").to(x.place) / self.dim)
+            )
             self.register_buffer("inv_freq", inv_freq, persistable=False)
 
         cos, sin = super().forward(x, position_ids)
@@ -259,7 +283,7 @@ def forward(
         position_ids: Optional[paddle.Tensor] = None,
         past_key_value: Optional[Cache] = None,
         output_attentions: bool = False,
-        use_cache: bool = False,  
+        use_cache: bool = False,
         cache_position: Optional[paddle.Tensor] = None,
     ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
         bsz, q_len, _ = hidden_states.shape
@@ -267,9 +291,7 @@ def forward(
         if self.config.pretraining_tp > 1:
             key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
             qkv_slices = self.wqkv.weight.split(key_value_slicing, axis=0)
-            qkv_states = paddle.concat(
-                [F.linear(hidden_states, qkv_slice) for qkv_slice in qkv_slices], axis=-1  
-            )
+            qkv_states = paddle.concat([F.linear(hidden_states, qkv_slice) for qkv_slice in qkv_slices], axis=-1)
         else:
             qkv_states = self.wqkv(hidden_states)
 
@@ -315,12 +337,7 @@ def forward(
         if self.config.pretraining_tp > 1:
             attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, axis=2)
             o_proj_slices = self.wo.weight.split(self.hidden_size // self.config.pretraining_tp, axis=1)
-            attn_output = sum(
-                [
-                    F.linear(attn_output[i], o_proj_slices[i])  
-                    for i in range(self.config.pretraining_tp)
-                ]
-            )
+            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
         else:
             attn_output = self.wo(attn_output)
 
@@ -407,7 +424,7 @@ def forward(
         if not output_attentions:
             attn_weights = None
 
-        return attn_output, attn_weights, past_key_value  
+        return attn_output, attn_weights, past_key_value
 
     def _flash_attention_forward(
         self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
@@ -426,7 +443,7 @@ def _flash_attention_forward(
             cu_seqlens_q, cu_seqlens_k = cu_seq_lens
             max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
 
-            attn_output_unpad = flash_attn_varlen_func(  
+            attn_output_unpad = flash_attn_varlen_func(
                 query_states,
                 key_states,
                 value_states,
@@ -439,9 +456,9 @@ def _flash_attention_forward(
                 causal=causal,
             )
 
-            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)  
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
         else:
-            attn_output = flash_attn_func(  
+            attn_output = flash_attn_func(
                 query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
             )
 
@@ -451,14 +468,14 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
         indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
         batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
 
-        key_layer = index_first_axis(  
+        key_layer = index_first_axis(
             key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
         )
-        value_layer = index_first_axis(  
+        value_layer = index_first_axis(
             value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
         )
         if query_length == kv_seq_len:
-            query_layer = index_first_axis(  
+            query_layer = index_first_axis(
                 query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
             )
             cu_seqlens_q = cu_seqlens_k
@@ -466,16 +483,12 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
             indices_q = indices_k
         elif query_length == 1:
             max_seqlen_in_batch_q = 1
-            cu_seqlens_q = paddle.arange(
-                batch_size + 1, dtype=paddle.int32
-            )
+            cu_seqlens_q = paddle.arange(batch_size + 1, dtype=paddle.int32)
             indices_q = cu_seqlens_q[:-1]
             query_layer = query_layer.squeeze(1)
         else:
             attention_mask = attention_mask[:, -query_length:]
-            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(  
-                query_layer, attention_mask
-            )
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
 
         return (
             query_layer,
@@ -553,7 +566,7 @@ def forward(
 
         is_causal = bool(causal_mask is None and q_len > 1)
 
-        attn_output = paddle.nn.functional.scaled_dot_product_attention(  
+        attn_output = paddle.nn.functional.scaled_dot_product_attention(
             query_states,
             key_states,
             value_states,
@@ -629,6 +642,7 @@ def forward(
 
         return outputs
 
+
 class InternLM25PretrainedModel(PretrainedModel):
     config_class = InternLM25Config
     base_model_prefix = "model"
@@ -659,6 +673,7 @@ def _gen_aoa_config(cls, config: InternLM25Config):
         # 禁用AOA以解决tok_embeddings.weight未分配的问题
         return {"aoa_statements": []}
 
+
 @register_base_model
 class InternLM25Model(InternLM25PretrainedModel):
     _auto_class = "AutoModel"
@@ -701,6 +716,7 @@ def recompute_training_full(
         def create_custom_forward(module):
             def custom_forward(*inputs):
                 return module(*inputs)
+
             return custom_forward
 
         layer_outputs = recompute(
@@ -744,9 +760,7 @@ def forward(
             )
 
         if self.enable_recompute and self.training and use_cache:
-            logger.warning(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
-            )
+            logger.warning("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.")
             use_cache = False
 
         if inputs_embeds is None:
@@ -762,9 +776,7 @@ def forward(
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            cache_position = paddle.arange(
-                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1]
-            )
+            cache_position = paddle.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1])
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
@@ -881,7 +893,9 @@ def _update_causal_mask(
             if attention_mask is not None:
                 causal_mask = causal_mask.clone()
                 mask_length = attention_mask.shape[-1]
-                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].astype(causal_mask.dtype)
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].astype(
+                    causal_mask.dtype
+                )
                 padding_mask = padding_mask == 0
                 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                     padding_mask, min_dtype
@@ -964,10 +978,7 @@ def forward(
         hidden_states = outputs[0]
         if self.config.pretraining_tp > 1:
             output_slices = self.output.weight.split(self.vocab_size // self.config.pretraining_tp, axis=0)
-            logits = [
-                F.linear(hidden_states, output_slices[i])  
-                for i in range(self.config.pretraining_tp)
-            ]
+            logits = [F.linear(hidden_states, output_slices[i]) for i in range(self.config.pretraining_tp)]
             logits = paddle.concat(logits, axis=-1)
         else:
             logits = self.output(hidden_states)
@@ -984,6 +995,7 @@ def forward(
             loss = loss_fct(shift_logits, shift_labels)
             # DEBUG: log raw loss details for diagnosis
             import os
+
             if os.environ.get("INTERNLM25_DEBUG_LOSS"):
                 n_valid = (shift_labels != -100).sum().item()
                 n_total = shift_labels.shape[0]
@@ -992,7 +1004,7 @@ def forward(
                     f"logits_shape={shift_logits.shape} labels_shape={shift_labels.shape} "
                     f"logits_min={shift_logits.min().item():.4f} logits_max={shift_logits.max().item():.4f} "
                     f"logits_mean={shift_logits.mean().item():.4f}",
-                    flush=True
+                    flush=True,
                 )
 
         if not return_dict:
@@ -1022,7 +1034,10 @@ def prepare_inputs_for_generation(
             if isinstance(past_key_values, Cache):
                 past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
                 max_cache_length = (
-                    paddle.to_tensor(past_key_values.get_max_cache_shape(), place=input_ids.place if hasattr(input_ids, 'place') else None)
+                    paddle.to_tensor(
+                        past_key_values.get_max_cache_shape(),
+                        place=input_ids.place if hasattr(input_ids, "place") else None,
+                    )
                     if past_key_values.get_max_cache_shape() is not None
                     else None
                 )
@@ -1041,7 +1056,7 @@ def prepare_inputs_for_generation(
                 and attention_mask is not None
                 and cache_length + input_ids.shape[1] > max_cache_length
             ):
-                attention_mask = attention_mask[:, -max_cache_length:]  
+                attention_mask = attention_mask[:, -max_cache_length:]
 
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
@@ -1149,9 +1164,7 @@ def stream_chat(
         if history is None:
             history = []
         if BaseStreamer is None:
-            raise ModuleNotFoundError(
-                "The version of `paddle` is too low."
-            )
+            raise ModuleNotFoundError("The version of `paddle` is too low.")
 
         response_queue = queue.Queue(maxsize=20)
 
@@ -1221,6 +1234,7 @@ def __init__(self, config):
         self.num_labels = config.num_labels
         self.model = InternLM25Model(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias_attr=False)
+
     def get_input_embeddings(self):
         return self.model.tok_embeddings
 
diff --git a/paddleformers/transformers/intern_lm2_5/tokenizer.py b/paddleformers/transformers/intern_lm2_5/tokenizer.py
index dfb4163e44f..3388144822f 100644
--- a/paddleformers/transformers/intern_lm2_5/tokenizer.py
+++ b/paddleformers/transformers/intern_lm2_5/tokenizer.py
@@ -26,11 +26,9 @@
 from paddleformers.transformers.tokenizer_utils import PretrainedTokenizer
 from paddleformers.utils.log import logger
 
-
 VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
 
 
-
 class InternLM25Tokenizer(PretrainedTokenizer):
     resource_files_names = VOCAB_FILES_NAMES
     # PretrainedTokenizer.from_pretrained() uses vocab_files_names to locate vocab files
@@ -182,17 +180,17 @@ def create_token_type_ids_from_sequences(
     # 不要觉得没人调用，就删掉，因为 原版的transformers没有集成lm25的实现；
     # 如果强行 使用 warp_tokenizer 会很多棘手问题处理，这里 直接借鉴了hg上原版的lm25的逻辑
     def encode(
-            self,
-            text: None = None,
-            text_pair:  None = None,
-            add_special_tokens: bool = True,
-            padding: bool | str  = False,
-            truncation: bool | str | None = None,
-            max_length: int | None = None,
-            stride: int = 0,
-            padding_side: str | None = None,
-            return_tensors: str  | None = None,
-            **kwargs,
+        self,
+        text: None = None,
+        text_pair: None = None,
+        add_special_tokens: bool = True,
+        padding: bool | str = False,
+        truncation: bool | str | None = None,
+        max_length: int | None = None,
+        stride: int = 0,
+        padding_side: str | None = None,
+        return_tensors: str | None = None,
+        **kwargs,
     ) -> List[int]:
         padding_strategy, truncation_strategy, max_length, kwargs_updated = self._get_padding_truncation_strategies(
             padding=padding,
diff --git a/tests/transformers/intern_lm2_5/test_modeling.py b/tests/transformers/intern_lm2_5/test_modeling.py
index 7760cd5802d..031f4e95520 100644
--- a/tests/transformers/intern_lm2_5/test_modeling.py
+++ b/tests/transformers/intern_lm2_5/test_modeling.py
@@ -26,11 +26,12 @@
     InternLM25ForCausalLM,
     InternLM25Tokenizer,
 )
-from tests.testing_utils import slow, require_package
+from tests.testing_utils import require_package, slow
 
-aistudio_pt_lm25_model_location  = "learncat/internlm2_5-1_8b-chat-raw"
+aistudio_pt_lm25_model_location = "learncat/internlm2_5-1_8b-chat-raw"
 aistudio_paddle_lm25_model_location = "learncat/internlm2_5-1_8b-chat-paddle"
-hg_lm25_model_location= "internlm/internlm2_5-1_8b-chat"
+hg_lm25_model_location = "internlm/internlm2_5-1_8b-chat"
+
 
 # config层的常规测试
 class TestInternLM25Config(unittest.TestCase):
@@ -55,6 +56,7 @@ def test_config_save_and_load(self):
             self.assertEqual(config.vocab_size, loaded_config.vocab_size)
             self.assertEqual(config.hidden_size, loaded_config.hidden_size)
 
+
 # model层的常规测试
 class InternLM25ModelTest(unittest.TestCase):
     def setUp(self):
@@ -186,7 +188,6 @@ def setUp(self):
     def tearDown(self):
         paddle.set_default_dtype(self._original_dtype)
 
-
     # 使用paddle格式的权重，推理一次
     @slow
     def test_paddle_model_load_and_infer(self):
@@ -205,9 +206,7 @@ def test_paddle_model_load_and_infer(self):
 
         prompt = "猫和狗的区别是什么，列出主要的3点"
         meta_instruction = "You are a helpful assistant. Please answer in plain text without markdown."
-        chat_inputs = model.build_inputs(
-            tokenizer, prompt, history=[], meta_instruction=meta_instruction
-        )
+        chat_inputs = model.build_inputs(tokenizer, prompt, history=[], meta_instruction=meta_instruction)
 
         with paddle.no_grad():
             out = model.generate(
@@ -277,14 +276,15 @@ class InternLM25CompatibilityTest(unittest.TestCase):
     @require_package("transformers", "torch")
     def setUpClass(cls) -> None:
         import sys
+
         if "transformers" in sys.modules:
             del sys.modules["transformers"]
 
         import torch
-        import shutil
+
         cls.torch_model_path = tempfile.TemporaryDirectory().name
 
-        from transformers import AutoModelForCausalLM, AutoConfig
+        from transformers import AutoConfig, AutoModelForCausalLM
 
         # 从远程获取 InternLM2.5 的配置类并创建小配置用于快速测试
         # 远程加载 configuration_internlm2.py 中的 InternLM2Config 类
@@ -308,6 +308,7 @@ def setUpClass(cls) -> None:
     @require_package("transformers", "torch")
     def test_intern_converter(self):
         import torch
+
         input_ids = np.random.randint(100, 200, [1, 20])
 
         self.torch_model.eval()
@@ -331,19 +332,14 @@ def test_intern_converter(self):
         print(f"Torch token ids:  {torch_token_ids}")
         self.assertTrue(
             np.array_equal(paddle_token_ids, torch_token_ids),
-            f"Token ids mismatch: paddle={paddle_token_ids}, torch={torch_token_ids}"
+            f"Token ids mismatch: paddle={paddle_token_ids}, torch={torch_token_ids}",
         )
 
         # 对齐推理的 1e-2 的容差
         self.assertTrue(
-            np.allclose(paddle_out, torch_out, atol=1e-2, rtol=1e-2),
-            f"Max diff {max_diff} exceeds tolerance"
+            np.allclose(paddle_out, torch_out, atol=1e-2, rtol=1e-2), f"Max diff {max_diff} exceeds tolerance"
         )
 
 
-
-
-
-
 if __name__ == "__main__":
     unittest.main()

From fe3c51934e658951a57f6f21446a7086a74ed7b8 Mon Sep 17 00:00:00 2001
From: caoyuanye <cao_dev@foxmail.com>
Date: Mon, 23 Mar 2026 20:19:45 +0800
Subject: [PATCH 3/8] =?UTF-8?q?internlm2=5F5=E7=9A=84template=E6=8F=90?=
 =?UTF-8?q?=E4=BA=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddleformers/datasets/template/template.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/paddleformers/datasets/template/template.py b/paddleformers/datasets/template/template.py
index 2ceea17a7c8..98b5903000e 100644
--- a/paddleformers/datasets/template/template.py
+++ b/paddleformers/datasets/template/template.py
@@ -921,3 +921,13 @@ def _get_gpt_oss_prefix():
     chat_sep="<|assistant|>\n",
     mm_plugin=get_mm_plugin(name="glm_ocr", image_token="<|image|>"),
 )
+register_template(
+    name="internlm2_5",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_prefix=EmptyFormatter(slots=["<s>"]),
+    chat_sep="<|im_end|>\n",
+    suffix=["<|im_end|>\n"],
+    enable_thinking=None,
+)
\ No newline at end of file

From 9e2a84a6b329db3b1f06aa3b9a6de8f15ad60c6c Mon Sep 17 00:00:00 2001
From: caoyuanye <cao_dev@foxmail.com>
Date: Tue, 24 Mar 2026 09:15:57 +0800
Subject: [PATCH 4/8] =?UTF-8?q?internlm2=5F5=E7=9A=84workflow=E6=8A=A5?=
 =?UTF-8?q?=E9=94=99=E4=BF=AE=E6=AD=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddleformers/datasets/template/template.py | 2 +-
 tests/requirements.txt                      | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddleformers/datasets/template/template.py b/paddleformers/datasets/template/template.py
index 98b5903000e..453feea29cf 100644
--- a/paddleformers/datasets/template/template.py
+++ b/paddleformers/datasets/template/template.py
@@ -930,4 +930,4 @@ def _get_gpt_oss_prefix():
     chat_sep="<|im_end|>\n",
     suffix=["<|im_end|>\n"],
     enable_thinking=None,
-)
\ No newline at end of file
+)
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 71ce415946f..80ff9910631 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -33,3 +33,4 @@ triton >= 3.1
 use_triton_in_paddle
 audioread
 librosa
+einops
\ No newline at end of file

From f53f4970140081960a9029b9b9a439aba85dcd09 Mon Sep 17 00:00:00 2001
From: caoyuanye <cao_dev@foxmail.com>
Date: Tue, 24 Mar 2026 09:56:30 +0800
Subject: [PATCH 5/8] =?UTF-8?q?=E5=9B=A0=E4=B8=BA=E7=BC=BA=E5=B0=91=5F=5Fi?=
 =?UTF-8?q?nit=5F=5F=E5=AF=BC=E8=87=B4package=E8=B7=AF=E5=BE=84=E5=86=B2?=
 =?UTF-8?q?=E7=AA=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/transformers/intern_lm2_5/__init__.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 tests/transformers/intern_lm2_5/__init__.py

diff --git a/tests/transformers/intern_lm2_5/__init__.py b/tests/transformers/intern_lm2_5/__init__.py
new file mode 100644
index 00000000000..a9cc79cc9d7
--- /dev/null
+++ b/tests/transformers/intern_lm2_5/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

From 1987e441a857ed445eea38c33eb193f3375823b9 Mon Sep 17 00:00:00 2001
From: caoyuanye <cao_dev@foxmail.com>
Date: Thu, 21 May 2026 17:21:29 +0800
Subject: [PATCH 6/8] =?UTF-8?q?=E6=A0=B9=E6=8D=AEPR=E6=8C=87=E5=AF=BC?=
 =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E5=A4=8DInternLM2.5=E7=9A=84=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../transformers/auto/configuration.py        |   5 +-
 paddleformers/transformers/auto/modeling.py   |   2 +-
 paddleformers/transformers/intern/__init__.py |  31 ++++
 .../transformers/intern/bert_padding_delte.py | 110 ------------
 .../transformers/intern/configuration.py      | 132 +++++++++++++++
 paddleformers/transformers/intern/modeling.py | 159 ++++++++++++++++++
 .../transformers/intern_lm2_5/__init__.py     |   2 +-
 .../intern_lm2_5/configuration.py             |   4 +-
 .../transformers/intern_lm2_5/modeling.py     |  61 +++----
 .../transformers/intern_lm2_5/tokenizer.py    |   7 +-
 tests/config/ci/interlm2_sft.yaml             |  59 -------
 tests/integration_test/interlm_sft.sh         |  48 ------
 tests/transformers/intern_lm2_5/__init__.py   |   2 +-
 .../intern_lm2_5/test_modeling.py             |  71 ++------
 .../intern_lm2_5/test_tokenizer.py            |   8 +-
 15 files changed, 369 insertions(+), 332 deletions(-)
 delete mode 100644 paddleformers/transformers/intern/bert_padding_delte.py
 create mode 100644 paddleformers/transformers/intern/configuration.py
 create mode 100644 paddleformers/transformers/intern/modeling.py
 delete mode 100644 tests/config/ci/interlm2_sft.yaml
 delete mode 100644 tests/integration_test/interlm_sft.sh

diff --git a/paddleformers/transformers/auto/configuration.py b/paddleformers/transformers/auto/configuration.py
index 9e6a209d3c2..5adf74eb4fe 100644
--- a/paddleformers/transformers/auto/configuration.py
+++ b/paddleformers/transformers/auto/configuration.py
@@ -61,7 +61,7 @@
         ("glm_ocr", "GlmOcrConfig"),
         ("qwen3_5", "Qwen3_5Config"),
         ("qwen3_5_moe", "Qwen3_5MoEConfig"),
-        ("internlm2_5", "InternLM25Config"),
+        ("internlm2", "InternLM2Config"),
     ]
 )
 
@@ -90,7 +90,7 @@
         ("glm_ocr", "GlmOcrForConditionalGeneration"),
         ("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"),
         ("qwen3_5", "Qwen3_5ForConditionalGeneration"),
-        ("internlm2_5", "InternLM25"),
+        ("internlm2", "InternLM2"),
     ]
 )
 
@@ -104,6 +104,7 @@
         ("qwen2_5_vl_text", "qwen2_5_vl"),
         ("qwen3_vl_text", "qwen3_vl"),
         ("qwen3_vl_moe_text", "qwen3_vl_moe"),
+        ("internlm2", "intern"),
     ]
 )
 
diff --git a/paddleformers/transformers/auto/modeling.py b/paddleformers/transformers/auto/modeling.py
index 0926a22185e..d8775511bc4 100644
--- a/paddleformers/transformers/auto/modeling.py
+++ b/paddleformers/transformers/auto/modeling.py
@@ -78,7 +78,7 @@
         ("Gemma3", "gemma3_text"),
         ("Glm4vMoe", "glm4v_moe"),
         ("GlmOcr", "glm_ocr"),
-        ("InternLM2", "intern_lm2_5"),
+        ("InternLM2", "intern"),
     ]
 )
 
diff --git a/paddleformers/transformers/intern/__init__.py b/paddleformers/transformers/intern/__init__.py
index 290f972cf31..d62ce3aa491 100644
--- a/paddleformers/transformers/intern/__init__.py
+++ b/paddleformers/transformers/intern/__init__.py
@@ -11,3 +11,34 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+"""
+InternLM2 Common Module
+
+This module provides unified access to both InternLM2 2.0 and 2.5 models.
+It automatically routes to the correct implementation based on the model configuration.
+"""
+
+from .configuration import InternLM2Config
+from .modeling import (
+    InternLM2ForCausalLM,
+    InternLM2ForSequenceClassification,
+    InternLM2ForQuestionAnswering,
+    InternLM2ForTokenClassification,
+    InternLM2Model,
+    InternLM2PretrainedModel,
+)
+
+# Alias for auto system compatibility
+InternLM2 = InternLM2Model
+
+__all__ = [
+    "InternLM2Config",
+    "InternLM2Model",
+    "InternLM2",
+    "InternLM2PretrainedModel",
+    "InternLM2ForCausalLM",
+    "InternLM2ForSequenceClassification",
+    "InternLM2ForQuestionAnswering",
+    "InternLM2ForTokenClassification",
+]
diff --git a/paddleformers/transformers/intern/bert_padding_delte.py b/paddleformers/transformers/intern/bert_padding_delte.py
deleted file mode 100644
index f7aa97a6897..00000000000
--- a/paddleformers/transformers/intern/bert_padding_delte.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# reference from Dao-AILAB flash-attn
-# https://github.com/Dao-AILab/flash-attention/blob/74b0761ff7efc7b90d4e5aeb529c1b2a09a7458c/flash_attn/bert_padding.py#L38
-import operator
-from functools import reduce
-
-import paddle
-import paddle.nn.functional as F
-
-
-class IndexFirstAxis(paddle.autograd.PyLayer):
-    @staticmethod
-    def forward(ctx, input, indices):
-        ctx.save_for_backward(indices)
-        assert input.ndim >= 2
-        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
-        second_dim = reduce(operator.mul, other_shape, 1)
-        return paddle.take_along_axis(
-            arr=input.reshape([input.shape[0], -1]), axis=0, indices=indices.unsqueeze(-1).expand([-1, second_dim])
-        ).reshape([-1, *other_shape])
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        """Class Attribute: torch.autograd.function.FunctionCtx.saved_tensors, can not convert, please check whether it is torch.Tensor.*/torch.autograd.function.FunctionCtx.*/torch.distributions.Distribution.* and convert manually"""
-        (indices,) = ctx.saved_tensor()
-        assert grad_output.ndim >= 2
-        other_shape = grad_output.shape[1:]
-        grad_output = grad_output.reshape([grad_output.shape[0], -1])
-        grad_input = paddle.zeros(shape=[ctx.first_axis_dim, tuple(grad_output.shape)[1]], dtype=grad_output.dtype)
-
-        grad_input.put_along_axis_(
-            axis=0,
-            indices=indices.unsqueeze(-1).expand([-1, tuple(grad_output.shape)[1]]),
-            values=grad_output,
-        )
-        return grad_input.reshape([ctx.first_axis_dim, *other_shape]), None
-
-
-index_first_axis = IndexFirstAxis.apply
-
-
-class IndexPutFirstAxis(paddle.autograd.PyLayer):
-    @staticmethod
-    def forward(ctx, values, indices, first_axis_dim):
-        ctx.save_for_backward(indices)
-        assert indices.ndim == 1
-        assert values.ndim >= 2
-        output = paddle.zeros(shape=[first_axis_dim, *tuple(values.shape)[1:]], dtype=values.dtype)
-        output[indices] = values
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        """Class Attribute: torch.autograd.function.FunctionCtx.saved_tensors, can not convert, please check whether it is torch.Tensor.*/torch.autograd.function.FunctionCtx.*/torch.distributions.Distribution.* and convert manually"""
-        (indices,) = ctx.saved_tensor()
-        grad_values = grad_output[indices]
-        return grad_values, None, None
-
-
-index_put_first_axis = IndexPutFirstAxis.apply
-
-
-def unpad_input(hidden_states, attention_mask):
-    """
-    Arguments:
-        hidden_states: (batch, seqlen, ...)
-        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
-    Return:
-        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
-        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
-        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
-        max_seqlen_in_batch: int
-    """
-    seqlens_in_batch = paddle.sum(attention_mask, axis=-1, dtype="int32")
-    indices = paddle.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
-    max_seqlen_in_batch = paddle.max(seqlens_in_batch).item()
-    cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, axis=0), [1, 0])
-
-    return (
-        index_first_axis(hidden_states.reshape([-1] + list(hidden_states.shape[2:])), indices),
-        indices,
-        cu_seqlens,
-        max_seqlen_in_batch,
-    )
-
-
-def pad_input(hidden_states, indices, batch, seqlen):
-    """
-    Arguments:
-        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
-        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
-        batch: int, batch size for the padded sequence.
-        seqlen: int, maximum sequence length for the padded sequence.
-    Return:
-        hidden_states: (batch, seqlen, ...)
-    """
-    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
-    return output.reshape([batch, seqlen] + list(output.shape[1:]))
diff --git a/paddleformers/transformers/intern/configuration.py b/paddleformers/transformers/intern/configuration.py
new file mode 100644
index 00000000000..3bf675d6dae
--- /dev/null
+++ b/paddleformers/transformers/intern/configuration.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+InternLM2 Common Configuration
+
+This module provides a unified configuration for both InternLM2 2.0 and 2.5 models.
+It detects the version based on the configuration fields and routes accordingly.
+"""
+
+from paddleformers.transformers.configuration_utils import PretrainedConfig
+
+
+class InternLM2Config(PretrainedConfig):
+    """
+    InternLM2 configuration. This is a unified config that handles both 2.0 and 2.5 versions.
+
+    When loading from HuggingFace, the `model_type` will be "internlm2" (not "internlm2_5").
+    This config detects the actual version and routes to the appropriate implementation.
+    """
+
+    model_type = "internlm2"  # Important: must match HuggingFace config
+    _auto_class = "AutoConfig"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=92550,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        bias=True,
+        rope_theta=10000,
+        rope_scaling=None,
+        attn_implementation=None,
+        dtype="bfloat16",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.bias = bias
+
+        import paddle
+
+        if isinstance(dtype, str):
+            dtype_map = {
+                "float32": paddle.float32,
+                "float16": paddle.float16,
+                "bfloat16": paddle.bfloat16,
+            }
+            self.dtype = dtype_map.get(dtype.lower(), paddle.float32)
+        else:
+            self.dtype = dtype
+
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.attn_implementation = attn_implementation
+        if self.attn_implementation is None:
+            self.attn_implementation = "eager"
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_factor is None:
+            raise ValueError(
+                "`rope_scaling` must contain 'type' and 'factor' keys, "
+                f"got {self.rope_scaling}"
+            )
+        if rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling` type must be 'linear' or 'dynamic', got '{rope_scaling_type}'"
+            )
+
+    @property
+    def is_version_2_5(self):
+        if hasattr(self, "auto_map") and self.auto_map is not None:
+            if "AutoModelForSequenceClassification" in self.auto_map:
+                return True
+        return False
diff --git a/paddleformers/transformers/intern/modeling.py b/paddleformers/transformers/intern/modeling.py
new file mode 100644
index 00000000000..037f99751d3
--- /dev/null
+++ b/paddleformers/transformers/intern/modeling.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+InternLM2 Common Modeling
+
+This module provides unified model classes that automatically route to the correct
+implementation (2.0 or 2.5) based on the model configuration.
+"""
+
+from paddleformers.transformers.model_utils import PretrainedModel
+from paddleformers.utils.log import logger
+
+from .configuration import InternLM2Config
+
+
+class InternLM2PretrainedModel(PretrainedModel):
+    """
+    Base class for all InternLM2 models.
+
+    This is a proxy that routes to the actual implementation (2.0 or 2.5).
+    """
+
+    config_class = InternLM2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["InternLM2DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+
+    def __init__(self, config: InternLM2Config):
+        """
+        Initialize the appropriate model implementation based on config.
+
+        Args:
+            config: InternLM2Config with version detection
+        """
+        super().__init__(config)
+
+        # Detect version and load appropriate implementation
+        if config.is_version_2_5:
+            logger.info("Detected InternLM2 2.5, loading 2.5 implementation")
+            from ..intern_lm2_5.modeling import InternLM25PretrainedModel as ImplModel
+        else:
+            logger.error("Detected InternLM2 2.0, but 2.0 implementation is not supported!")
+            raise NotImplementedError(
+                "InternLM2 2.0 is not supported in PaddleFormers. "
+                "Please use InternLM2 2.5 or later versions. "
+                "If you need to use 2.0, please implement `paddleformers/transformers/internlm2/` module first."
+            )
+
+        # Store the actual implementation
+        self._impl = ImplModel(config)
+
+        # Copy all attributes from implementation to self
+        # This makes the proxy transparent
+        for key, value in self._impl.__dict__.items():
+            if key not in self.__dict__:
+                self.__dict__[key] = value
+
+    def forward(self, *args, **kwargs):
+        """Forward to the actual implementation."""
+        return self._impl(*args, **kwargs)
+
+    def __getattr__(self, name):
+        """Proxy all attribute access to the actual implementation."""
+        if name.startswith("_") or name in ["_impl", "config"]:
+            return object.__getattribute__(self, name)
+        return getattr(self._impl, name)
+
+    def __setattr__(self, name, value):
+        """Proxy all attribute setting to the actual implementation."""
+        if name in ["_impl", "config"] or name.startswith("_"):
+            object.__setattr__(self, name, value)
+        elif hasattr(self, "_impl") and self._impl is not None:
+            setattr(self._impl, name, value)
+        else:
+            object.__setattr__(self, name, value)
+
+
+class InternLM2Model(InternLM2PretrainedModel):
+    """
+    The bare InternLM2 Model outputting raw hidden-states without any specific head.
+
+    This is a proxy that routes to InternLM2 2.0 or 2.5 implementation.
+    """
+
+    _auto_class = "AutoModel"
+
+    def __init__(self, config: InternLM2Config):
+        super().__init__(config)
+
+
+class InternLM2ForCausalLM(InternLM2PretrainedModel):
+    """
+    InternLM2 Model with a language modeling head on top.
+
+    This is a proxy that routes to InternLM2 2.0 or 2.5 implementation.
+    """
+
+    _auto_class = "AutoModelForCausalLM"
+    _tied_weights_keys = ["output.weight"]
+
+    def __init__(self, config: InternLM2Config):
+        super().__init__(config)
+
+
+class InternLM2ForSequenceClassification(InternLM2PretrainedModel):
+    """
+    InternLM2 Model with a sequence classification head on top.
+
+    This is a proxy that routes to InternLM2 2.0 or 2.5 implementation.
+    """
+
+    _auto_class = "AutoModelForSequenceClassification"
+
+    def __init__(self, config: InternLM2Config):
+        super().__init__(config)
+
+
+class InternLM2ForQuestionAnswering(InternLM2PretrainedModel):
+    """
+    InternLM2 Model with a question answering head on top.
+
+    This is a proxy that routes to InternLM2 2.0 or 2.5 implementation.
+    """
+
+    _auto_class = "AutoModelForQuestionAnswering"
+
+    def __init__(self, config: InternLM2Config):
+        super().__init__(config)
+
+
+class InternLM2ForTokenClassification(InternLM2PretrainedModel):
+    """
+    InternLM2 Model with a token classification head on top.
+
+    This is a proxy that routes to InternLM2 2.0 or 2.5 implementation.
+    """
+
+    _auto_class = "AutoModelForTokenClassification"
+
+    def __init__(self, config: InternLM2Config):
+        super().__init__(config)
diff --git a/paddleformers/transformers/intern_lm2_5/__init__.py b/paddleformers/transformers/intern_lm2_5/__init__.py
index f371a4ed966..a8e5e7f412d 100644
--- a/paddleformers/transformers/intern_lm2_5/__init__.py
+++ b/paddleformers/transformers/intern_lm2_5/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddleformers/transformers/intern_lm2_5/configuration.py b/paddleformers/transformers/intern_lm2_5/configuration.py
index eb40311948c..b84a9f0e944 100644
--- a/paddleformers/transformers/intern_lm2_5/configuration.py
+++ b/paddleformers/transformers/intern_lm2_5/configuration.py
@@ -1,5 +1,4 @@
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-# Copyright 2020 The HuggingFace Team. All rights reserved.
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -90,7 +89,6 @@ def __init__(
             **kwargs,
         )
 
-    # 检查ROPE位置算法的参数是否存在问题，提前终止异常参数
     def _rope_scaling_validation(self):
         if self.rope_scaling is None:
             return
diff --git a/paddleformers/transformers/intern_lm2_5/modeling.py b/paddleformers/transformers/intern_lm2_5/modeling.py
index e0d8c5922bb..0aa99541e17 100644
--- a/paddleformers/transformers/intern_lm2_5/modeling.py
+++ b/paddleformers/transformers/intern_lm2_5/modeling.py
@@ -11,20 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """ Paddle InternLM25 model."""
 import logging
 import math
@@ -120,10 +106,6 @@ def forward(self, hidden_states):
         return self.weight * hidden_states.astype(input_dtype)
 
 
-# 这里会有一些 bf16 到 float32的类型提升，是正常的，原版也是这样。最好不要优化这里了，如果不提升精度，会导致 准确率显著下降
-# 可以参考 https://github.com/huggingface/transformers/pull/29285
-
-
 class InternLM25RotaryEmbedding(nn.Layer):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
         super().__init__()
@@ -669,9 +651,31 @@ def _init_weights(self, module):
 
     @classmethod
     def _gen_aoa_config(cls, config: InternLM25Config):
-        """Generate AOA (Auto-Transpose-Adapter) config for loading HuggingFace checkpoints."""
-        # 禁用AOA以解决tok_embeddings.weight未分配的问题
-        return {"aoa_statements": []}
+        model_prefix = cls.base_model_prefix + "." if cls != cls.base_model_class else ""
+        aoa_statements = [
+            f"model.tok_embeddings.weight -> {model_prefix}tok_embeddings.weight",
+            f"model.norm.weight -> {model_prefix}norm.weight",
+            f"model.layers.$LAYER_ID.attention_norm.weight -> {model_prefix}layers.$LAYER_ID.attention_norm.weight",
+            f"model.layers.$LAYER_ID.ffn_norm.weight -> {model_prefix}layers.$LAYER_ID.ffn_norm.weight",
+        ]
+        aoa_statements.extend(
+            [
+                f"model.layers.$LAYER_ID.attention.{w}.weight^T -> {model_prefix}layers.$LAYER_ID.attention.{w}.weight"
+                for w in ["wqkv", "wo"]
+            ]
+        )
+        aoa_statements.extend(
+            [
+                f"model.layers.$LAYER_ID.feed_forward.{w}.weight^T -> {model_prefix}layers.$LAYER_ID.feed_forward.{w}.weight"
+                for w in ["w1", "w2", "w3"]
+            ]
+        )
+        if cls != cls.base_model_class:
+            if getattr(config, "tie_word_embeddings", False):
+                aoa_statements.append("model.tok_embeddings.weight -> output.weight")
+            else:
+                aoa_statements.append("output.weight^T -> output.weight")
+        return {"aoa_statements": aoa_statements}
 
 
 @register_base_model
@@ -875,8 +879,6 @@ def _update_causal_mask(
             )
 
         if attention_mask is not None and attention_mask.ndim == 4:
-            if attention_mask.max() != 0:
-                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
             causal_mask = attention_mask
         else:
             causal_mask = paddle.full([sequence_length, target_length], fill_value=min_dtype, dtype=dtype)
@@ -993,19 +995,6 @@ def forward(
             shift_labels = shift_labels.reshape(-1)
             shift_labels = shift_labels.to(shift_logits.place)
             loss = loss_fct(shift_logits, shift_labels)
-            # DEBUG: log raw loss details for diagnosis
-            import os
-
-            if os.environ.get("INTERNLM25_DEBUG_LOSS"):
-                n_valid = (shift_labels != -100).sum().item()
-                n_total = shift_labels.shape[0]
-                print(
-                    f"[DEBUG LOSS] raw_loss={loss.item():.4f} n_valid={n_valid} n_total={n_total} "
-                    f"logits_shape={shift_logits.shape} labels_shape={shift_labels.shape} "
-                    f"logits_min={shift_logits.min().item():.4f} logits_max={shift_logits.max().item():.4f} "
-                    f"logits_mean={shift_logits.mean().item():.4f}",
-                    flush=True,
-                )
 
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/paddleformers/transformers/intern_lm2_5/tokenizer.py b/paddleformers/transformers/intern_lm2_5/tokenizer.py
index 3388144822f..3102bdc3be6 100644
--- a/paddleformers/transformers/intern_lm2_5/tokenizer.py
+++ b/paddleformers/transformers/intern_lm2_5/tokenizer.py
@@ -1,5 +1,4 @@
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-# Copyright 2020 The HuggingFace Team. All rights reserved.
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,7 +30,6 @@
 
 class InternLM25Tokenizer(PretrainedTokenizer):
     resource_files_names = VOCAB_FILES_NAMES
-    # PretrainedTokenizer.from_pretrained() uses vocab_files_names to locate vocab files
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
 
@@ -176,9 +174,6 @@ def create_token_type_ids_from_sequences(
             return len(token_ids_0 + eos) * [0]
         return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
 
-    # 全参微调的时候，会调用到;
-    # 不要觉得没人调用，就删掉，因为 原版的transformers没有集成lm25的实现；
-    # 如果强行 使用 warp_tokenizer 会很多棘手问题处理，这里 直接借鉴了hg上原版的lm25的逻辑
     def encode(
         self,
         text: None = None,
diff --git a/tests/config/ci/interlm2_sft.yaml b/tests/config/ci/interlm2_sft.yaml
deleted file mode 100644
index bc7f3401a30..00000000000
--- a/tests/config/ci/interlm2_sft.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
-### data
-train_dataset_type: erniekit
-eval_dataset_type: erniekit
-train_dataset_path: ./tests/fixtures/dummy/sft/train.jsonl
-train_dataset_prob: "1.0"
-eval_dataset_path: ./tests/fixtures/dummy/sft/eval.jsonl
-eval_dataset_prob: "1.0"
-max_seq_len: 512
-packing: false
-dataloader_shuffle: false
-mix_strategy: concat
-template_backend: custom
-template: internlm2_5
-### model
-model_name_or_path: learncat/internlm2_5-1_8b-chat-paddle
-_attn_implementation: flashmask
-
-
-### finetuning
-# base
-stage: SFT
-fine_tuning: full
-seed: 23
-do_train: true
-do_eval: true
-per_device_eval_batch_size: 1
-per_device_train_batch_size: 1
-num_train_epochs: 1
-max_steps: 500
-eval_steps: 1000
-evaluation_strategy: steps
-save_steps: 100000
-save_strategy: steps
-logging_steps: 1
-gradient_accumulation_steps: 4
-logging_dir: ./vdl_log
-output_dir: ./checkpoints/qwen3-sft-full
-disable_tqdm: true
-eval_accumulation_steps: 16
-
-
-# train
-warmup_steps: 5
-learning_rate: 1.0e-5
-
-# performance
-tensor_model_parallel_size: 1
-pipeline_model_parallel_size: 1
-sharding: stage1
-recompute_granularity: full
-recompute_method: uniform
-recompute_num_layers: 1
-bf16: true
-fp16_opt_level: O1
-unified_checkpoint: false
-# 注释之后跳过保存阶段
-# save_checkpoint_format: flex_checkpoint
-load_checkpoint_format: sharding_io
-continue_training: false
diff --git a/tests/integration_test/interlm_sft.sh b/tests/integration_test/interlm_sft.sh
deleted file mode 100644
index 463aea59430..00000000000
--- a/tests/integration_test/interlm_sft.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# TODO ，前期不在 .github/workflows/fleet-model-test.yml 中生效，避免直接卡死流程
-#  TODO，提交PR的时候，会提交loss对比材料
-
-set -exo pipefail
-export root_dir=$(pwd)
-
-if [ -f 'PaddleFleet/.venv/bin/activate' ]; then
-   source PaddleFleet/.venv/bin/activate
-fi
-
-config_sft_yaml=$root_dir/PaddleFormers/tests/config/ci/interlm2_sft.yaml
-
-if [[ ! -f "$config_sft_yaml" ]]; then
-  echo "Config file not found: $config_sft_yaml"
-  exit 1
-fi
-
-rm -rf ./outputs
-rm -rf paddleformers_dist_log
-master=$(hostname -i)
-port=36677
-
-export FLAGS_embedding_deterministic=1
-export FLAGS_cudnn_deterministic=1
-export FLAGS_use_stride_compute_kernel=False
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-
-unset http_proxy https_proxy
-
-log_file=interlm_sft.txt
-gt_loss_file=interlm_sft_multi_card_gt_loss.txt
-
-set +e
-NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port coverage run $(which paddleformers-cli) train $config_sft_yaml 2>&1 | tee ./${log_file}
\ No newline at end of file
diff --git a/tests/transformers/intern_lm2_5/__init__.py b/tests/transformers/intern_lm2_5/__init__.py
index a9cc79cc9d7..290f972cf31 100644
--- a/tests/transformers/intern_lm2_5/__init__.py
+++ b/tests/transformers/intern_lm2_5/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tests/transformers/intern_lm2_5/test_modeling.py b/tests/transformers/intern_lm2_5/test_modeling.py
index 031f4e95520..1308346b4c5 100644
--- a/tests/transformers/intern_lm2_5/test_modeling.py
+++ b/tests/transformers/intern_lm2_5/test_modeling.py
@@ -1,5 +1,4 @@
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-# Copyright 2020 The HuggingFace Team. All rights reserved.
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,12 +27,9 @@
 )
 from tests.testing_utils import require_package, slow
 
-aistudio_pt_lm25_model_location = "learncat/internlm2_5-1_8b-chat-raw"
-aistudio_paddle_lm25_model_location = "learncat/internlm2_5-1_8b-chat-paddle"
-hg_lm25_model_location = "internlm/internlm2_5-1_8b-chat"
+modelscope_lm25_model_location = "Shanghai_AI_Laboratory/internlm2_5-1_8b-chat"
 
 
-# config层的常规测试
 class TestInternLM25Config(unittest.TestCase):
     def test_config_custom_values(self):
         config = InternLM25Config(
@@ -57,7 +53,6 @@ def test_config_save_and_load(self):
             self.assertEqual(config.hidden_size, loaded_config.hidden_size)
 
 
-# model层的常规测试
 class InternLM25ModelTest(unittest.TestCase):
     def setUp(self):
         self.config = InternLM25Config(
@@ -179,8 +174,7 @@ def test_model_with_past_key_values(self):
         self.assertIsNotNone(outputs.past_key_values)
 
 
-# paddle直接加载 原始的 hg权重的测试，是否可以正常推理
-class InternLM25ConvertedWeightTest(unittest.TestCase):
+class InternLM25ConvertedTest(unittest.TestCase):
     def setUp(self):
         self._original_dtype = paddle.get_default_dtype()
         paddle.set_default_dtype("bfloat16")
@@ -188,49 +182,6 @@ def setUp(self):
     def tearDown(self):
         paddle.set_default_dtype(self._original_dtype)
 
-    # 使用paddle格式的权重，推理一次
-    @slow
-    def test_paddle_model_load_and_infer(self):
-        paddle.set_device("gpu")
-
-        model = InternLM25ForCausalLM.from_pretrained(
-            aistudio_paddle_lm25_model_location,
-            convert_from_hf=False,
-            dtype="bfloat16",
-            low_cpu_mem_usage=True,
-            load_checkpoint_format="",
-        )
-        model.eval()
-
-        tokenizer = InternLM25Tokenizer.from_pretrained(aistudio_paddle_lm25_model_location)
-
-        prompt = "猫和狗的区别是什么，列出主要的3点"
-        meta_instruction = "You are a helpful assistant. Please answer in plain text without markdown."
-        chat_inputs = model.build_inputs(tokenizer, prompt, history=[], meta_instruction=meta_instruction)
-
-        with paddle.no_grad():
-            out = model.generate(
-                input_ids=chat_inputs["input_ids"],
-                attention_mask=chat_inputs.get("attention_mask"),
-                max_new_tokens=128,
-                use_cache=True,
-                decode_strategy="greedy_search",
-            )
-
-        seq = out[0] if isinstance(out, (list, tuple)) else out
-
-        decoded = tokenizer.decode(seq.numpy().tolist()[0], skip_special_tokens=True)
-
-        print("\n" + "=" * 80)
-        print("Chinese Generation Test (Chat Mode)")
-        print("=" * 80)
-        print(f"Prompt: {prompt}")
-        print(f"Generated: {decoded}")
-        print("=" * 80 + "\n")
-
-        self.assertGreater(len(decoded.strip()), 0)
-
-    # 使用paddle框架，直接加载 pytorch原版的模型权重
     @slow
     def test_hf_direct_load_and_inference(self):
         if not paddle.is_compiled_with_cuda():
@@ -240,16 +191,19 @@ def test_hf_direct_load_and_inference(self):
         paddle.set_default_dtype("bfloat16")
 
         model = InternLM25ForCausalLM.from_pretrained(
-            aistudio_pt_lm25_model_location,
+            modelscope_lm25_model_location,
             convert_from_hf=True,
             dtype="bfloat16",
             low_cpu_mem_usage=True,
             load_checkpoint_format="",
+            download_hub="modelscope",
         )
         model.eval()
-        tokenizer = InternLM25Tokenizer.from_pretrained(aistudio_pt_lm25_model_location, load_checkpoint_format="")
+        tokenizer = InternLM25Tokenizer.from_pretrained(
+            modelscope_lm25_model_location, load_checkpoint_format="", download_hub="modelscope"
+        )
 
-        prompt = "猫和狗的区别是什么，列出主要的3点"
+        prompt = "What are the differences between cats and dogs? Here are the three main points"
         meta_instruction = "You are a helpful assistant. Please answer in plain text without markdown."
         inputs = model.build_inputs(tokenizer, prompt, history=[], meta_instruction=meta_instruction)
         with paddle.no_grad():
@@ -270,7 +224,6 @@ def test_hf_direct_load_and_inference(self):
         self.assertGreater(len(decoded.strip()), 0)
 
 
-# 测试 paddle框架 和 transformers框架 的推理结果对比，直接固定随机数， 确认infer结果是否对齐
 class InternLM25CompatibilityTest(unittest.TestCase):
     @classmethod
     @require_package("transformers", "torch")
@@ -286,10 +239,8 @@ def setUpClass(cls) -> None:
 
         from transformers import AutoConfig, AutoModelForCausalLM
 
-        # 从远程获取 InternLM2.5 的配置类并创建小配置用于快速测试
-        # 远程加载 configuration_internlm2.py 中的 InternLM2Config 类
         config = AutoConfig.from_pretrained(
-            hg_lm25_model_location,
+            "internlm/internlm2_5-1_8b-chat",
             trust_remote_code=True,
             hidden_size=128,
             intermediate_size=384,
@@ -325,7 +276,6 @@ def test_intern_converter(self):
         max_diff = np.max(np.abs(paddle_out - torch_out))
         print(f"\nMax diff: {max_diff}")
 
-        # !! 对齐前10个token是否一致！！
         paddle_token_ids = paddle.argmax(paddle_logit, axis=-1).cpu().numpy()[0][:10]
         torch_token_ids = torch.argmax(torch_logit, dim=-1).cpu().numpy()[0][:10]
         print(f"Paddle token ids: {paddle_token_ids}")
@@ -335,7 +285,6 @@ def test_intern_converter(self):
             f"Token ids mismatch: paddle={paddle_token_ids}, torch={torch_token_ids}",
         )
 
-        # 对齐推理的 1e-2 的容差
         self.assertTrue(
             np.allclose(paddle_out, torch_out, atol=1e-2, rtol=1e-2), f"Max diff {max_diff} exceeds tolerance"
         )
diff --git a/tests/transformers/intern_lm2_5/test_tokenizer.py b/tests/transformers/intern_lm2_5/test_tokenizer.py
index f48f8d311b0..fa740887f94 100644
--- a/tests/transformers/intern_lm2_5/test_tokenizer.py
+++ b/tests/transformers/intern_lm2_5/test_tokenizer.py
@@ -1,5 +1,4 @@
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-# Copyright 2020 The HuggingFace Team. All rights reserved.
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,20 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import os
 import tempfile
 import unittest
 
 from paddleformers.transformers import InternLM25Tokenizer
 
-model_path = "learncat/internlm2_5-1_8b-chat-paddle"
+model_path = "Shanghai_AI_Laboratory/internlm2_5-1_8b-chat"
 
 
 class TestTokenizer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         try:
-            cls.tokenizer = InternLM25Tokenizer.from_pretrained(model_path)
+            cls.tokenizer = InternLM25Tokenizer.from_pretrained(model_path, download_hub="modelscope")
         except Exception:
             cls.tokenizer = None
 

From 252a16adc245292b6b356460f88b7fd46a610cf8 Mon Sep 17 00:00:00 2001
From: caoyuanye <cao_dev@foxmail.com>
Date: Thu, 21 May 2026 19:17:31 +0800
Subject: [PATCH 7/8] =?UTF-8?q?=E6=A0=B9=E6=8D=AEPR=E6=8C=87=E5=AF=BC?=
 =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E5=A4=8DInternLM2.5=E7=9A=84=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddleformers/transformers/intern/__init__.py |  2 +-
 .../transformers/intern/configuration.py      | 12 ++---
 .../intern_lm2_5/test_modeling.py             | 44 ++++++++++---------
 3 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/paddleformers/transformers/intern/__init__.py b/paddleformers/transformers/intern/__init__.py
index d62ce3aa491..72cdc1a5446 100644
--- a/paddleformers/transformers/intern/__init__.py
+++ b/paddleformers/transformers/intern/__init__.py
@@ -22,8 +22,8 @@
 from .configuration import InternLM2Config
 from .modeling import (
     InternLM2ForCausalLM,
-    InternLM2ForSequenceClassification,
     InternLM2ForQuestionAnswering,
+    InternLM2ForSequenceClassification,
     InternLM2ForTokenClassification,
     InternLM2Model,
     InternLM2PretrainedModel,
diff --git a/paddleformers/transformers/intern/configuration.py b/paddleformers/transformers/intern/configuration.py
index 3bf675d6dae..ac5ab00db0e 100644
--- a/paddleformers/transformers/intern/configuration.py
+++ b/paddleformers/transformers/intern/configuration.py
@@ -109,20 +109,14 @@ def _rope_scaling_validation(self):
 
         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
             raise ValueError(
-                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
-                f"got {self.rope_scaling}"
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
             )
         rope_scaling_type = self.rope_scaling.get("type", None)
         rope_scaling_factor = self.rope_scaling.get("factor", None)
         if rope_scaling_type is None or rope_scaling_factor is None:
-            raise ValueError(
-                "`rope_scaling` must contain 'type' and 'factor' keys, "
-                f"got {self.rope_scaling}"
-            )
+            raise ValueError("`rope_scaling` must contain 'type' and 'factor' keys, " f"got {self.rope_scaling}")
         if rope_scaling_type not in ["linear", "dynamic"]:
-            raise ValueError(
-                f"`rope_scaling` type must be 'linear' or 'dynamic', got '{rope_scaling_type}'"
-            )
+            raise ValueError(f"`rope_scaling` type must be 'linear' or 'dynamic', got '{rope_scaling_type}'")
 
     @property
     def is_version_2_5(self):
diff --git a/tests/transformers/intern_lm2_5/test_modeling.py b/tests/transformers/intern_lm2_5/test_modeling.py
index 1308346b4c5..8c963dd1616 100644
--- a/tests/transformers/intern_lm2_5/test_modeling.py
+++ b/tests/transformers/intern_lm2_5/test_modeling.py
@@ -27,6 +27,7 @@
 )
 from tests.testing_utils import require_package, slow
 
+# https://www.modelscope.cn/models/Shanghai_AI_Laboratory/internlm2_5-1_8b-chat/summary
 modelscope_lm25_model_location = "Shanghai_AI_Laboratory/internlm2_5-1_8b-chat"
 
 
@@ -228,34 +229,36 @@ class InternLM25CompatibilityTest(unittest.TestCase):
     @classmethod
     @require_package("transformers", "torch")
     def setUpClass(cls) -> None:
-        import sys
-
-        if "transformers" in sys.modules:
-            del sys.modules["transformers"]
+        import json
 
         import torch
+        from modelscope import AutoConfig
+        from transformers import AutoModelForCausalLM
 
-        cls.torch_model_path = tempfile.TemporaryDirectory().name
+        cls.torch_model_path = tempfile.mkdtemp()
 
-        from transformers import AutoConfig, AutoModelForCausalLM
+        config = AutoConfig.from_pretrained(modelscope_lm25_model_location, trust_remote_code=True)
 
-        config = AutoConfig.from_pretrained(
-            "internlm/internlm2_5-1_8b-chat",
-            trust_remote_code=True,
-            hidden_size=128,
-            intermediate_size=384,
-            num_hidden_layers=4,
-            num_attention_heads=4,
-            num_key_value_heads=2,
-            max_position_embeddings=128,
-            vocab_size=10000,
-            use_cache=True,
-        )
+        # Override with small test parameters,  accelerate calc
+        config.hidden_size = 128
+        config.intermediate_size = 384
+        config.num_hidden_layers = 4
+        config.num_attention_heads = 4
+        config.num_key_value_heads = 4
+        config.vocab_size = 10000
+        config.max_position_embeddings = 128
 
         cls.torch_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
-        cls.torch_model.config.save_pretrained(cls.torch_model_path)
+
         torch.save(cls.torch_model.state_dict(), f"{cls.torch_model_path}/pytorch_model.bin")
 
+        config_dict = config.to_dict()
+        for key in ["_commit_hash", "_name_or_path"]:
+            config_dict.pop(key, None)
+
+        with open(f"{cls.torch_model_path}/config.json", "w") as f:
+            json.dump(config_dict, f, indent=2)
+
     @require_package("transformers", "torch")
     def test_intern_converter(self):
         import torch
@@ -263,7 +266,8 @@ def test_intern_converter(self):
         input_ids = np.random.randint(100, 200, [1, 20])
 
         self.torch_model.eval()
-        torch_logit = self.torch_model(torch.tensor(input_ids), use_cache=False, return_dict=False)[0]
+        torch_output = self.torch_model(torch.tensor(input_ids), use_cache=False)
+        torch_logit = torch_output[0] if isinstance(torch_output, tuple) else torch_output.logits
 
         paddle_model = InternLM25ForCausalLM.from_pretrained(
             self.torch_model_path, convert_from_hf=True, load_checkpoint_format=""

From 87781174976075f7f764c1288912685775b11dc6 Mon Sep 17 00:00:00 2001
From: caoyuanye <cao_dev@foxmail.com>
Date: Thu, 21 May 2026 19:54:35 +0800
Subject: [PATCH 8/8] =?UTF-8?q?=E5=9B=BA=E5=AE=9A=E8=87=AA=E5=8A=A8?=
 =?UTF-8?q?=E5=AF=B9=E9=BD=90=E7=9A=84=E9=9A=8F=E6=9C=BA=E6=95=B0=E5=8F=91?=
 =?UTF-8?q?=E7=94=9F=E5=99=A8=EF=BC=8C=E5=87=8F=E5=B0=91=E4=B8=8D=E4=B8=80?=
 =?UTF-8?q?=E8=87=B4=E6=A6=82=E7=8E=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/transformers/intern_lm2_5/test_modeling.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/transformers/intern_lm2_5/test_modeling.py b/tests/transformers/intern_lm2_5/test_modeling.py
index 8c963dd1616..80a5e4275dc 100644
--- a/tests/transformers/intern_lm2_5/test_modeling.py
+++ b/tests/transformers/intern_lm2_5/test_modeling.py
@@ -231,10 +231,17 @@ class InternLM25CompatibilityTest(unittest.TestCase):
     def setUpClass(cls) -> None:
         import json
 
+        import numpy as np
         import torch
         from modelscope import AutoConfig
         from transformers import AutoModelForCausalLM
 
+        # Set random seeds for reproducibility
+        np.random.seed(42)
+        torch.manual_seed(42)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(42)
+
         cls.torch_model_path = tempfile.mkdtemp()
 
         config = AutoConfig.from_pretrained(modelscope_lm25_model_location, trust_remote_code=True)
@@ -261,8 +268,13 @@ def setUpClass(cls) -> None:
 
     @require_package("transformers", "torch")
     def test_intern_converter(self):
+        # Set seeds for reproducibility
+        import paddle
         import torch
 
+        paddle.seed(42)
+        np.random.seed(42)
+
         input_ids = np.random.randint(100, 200, [1, 20])
 
         self.torch_model.eval()