PaddlePaddle · zjjlivein · Jul 14, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/paddleformers/cli/utils/llm_utils.py b/paddleformers/cli/utils/llm_utils.py
@@ -440,6 +440,17 @@ def get_lora_target_modules(model):
             "model.visual.blocks.*mlp.up_proj.*",
             "model.visual.blocks.*mlp.down_proj.*",
         ]
+    elif model.config.model_type == "internlm2":
+        # Covers both InternLM2 2.0 and 2.5: both route through the unified
+        # `intern/` proxy with `model_type = "internlm2"` and share the same
+        # weight key names (wqkv/wo/w1/w2/w3).
+        target_modules = [
+            ".*wqkv.*",
+            ".*wo.*",
+            ".*w1.*",
+            ".*w2.*",
+            ".*w3.*",
+        ]
     else:
         raise ValueError(f"Unknown base_model_prefix: {model.config.model_type}.")
     return target_modules

diff --git a/paddleformers/datasets/template/template.py b/paddleformers/datasets/template/template.py
@@ -1004,6 +1004,16 @@ def _get_gpt_oss_prefix():
     chat_sep="<|assistant|>\n",
     mm_plugin=get_mm_plugin(name="glm_ocr", image_token="<|image|>"),
 )
+# Chat template registered under the name "internlm2_5".
+# System, user and assistant turns are each wrapped as
+# `<|im_start|>{role}\n{content}<|im_end|>\n`; the user slot additionally ends by
+# opening the next assistant turn (`<|im_start|>assistant\n`), and `<s>` is the prefix.
+# NOTE(review): `format_assistant` already ends with `<|im_end|>\n` and `suffix`
+# repeats the same marker — confirm the end marker is not emitted twice.
+register_template(
+    name="internlm2_5",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_prefix=EmptyFormatter(slots=["<s>"]),
+    chat_sep="<|im_end|>\n",
+    suffix=["<|im_end|>\n"],
+    enable_thinking=None,
+)
 
 
 register_template(

diff --git a/paddleformers/transformers/__init__.py b/paddleformers/transformers/__init__.py
@@ -348,6 +348,27 @@
     ],
     "glm_ocr.processor": ["Glm46VProcessor"],
     "glm_ocr.image_processor": ["Glm46VImageProcessor"],
+    "intern_lm2_5.configuration": ["InternLM25Config"],
+    "intern_lm2_5.modeling": [
+        "InternLM25DecoderLayer",
+        "InternLM25Model",
+        "InternLM25ForCausalLM",
+        "InternLM25PretrainedModel",
+        "InternLM25ForSequenceClassification",
+        "InternLM25ForQuestionAnswering",
+        "InternLM25ForTokenClassification",
+    ],
+    "intern_lm2_5.tokenizer": ["InternLM25Tokenizer"],
+    "intern.configuration": ["InternLM2Config"],
+    "intern.modeling": [
+        "InternLM2PretrainedModel",
+        "InternLM2Model",
+        "InternLM2ForCausalLM",
+        "InternLM2ForSequenceClassification",
+        "InternLM2ForQuestionAnswering",
+        "InternLM2ForTokenClassification",
+    ],
+    "intern_lm2.tokenizer": ["InternLM2Tokenizer"],
     "gemma4_moe.configuration": ["Gemma4MoeConfig"],
     "gemma4_moe.modeling": ["Gemma4MoeForCausalLM"],
     "gemma4_moe": [],
@@ -433,6 +454,9 @@
     from .phi3 import *
     from .gemma3_text import *
     from .glm_ocr import *
+    from .intern_lm2_5 import *
+    from .intern import *
+    from .intern_lm2 import InternLM2Tokenizer
     from .gemma4_moe import *
     from .phi4 import *
 else:

diff --git a/paddleformers/transformers/auto/configuration.py b/paddleformers/transformers/auto/configuration.py
@@ -64,6 +64,7 @@
         ("glm_ocr", "GlmOcrConfig"),
         ("qwen3_5", "Qwen3_5Config"),
         ("qwen3_5_moe", "Qwen3_5MoEConfig"),
+        ("internlm2", "InternLM2Config"),
         # TODO(VL): When Gemma4 VL is implemented, "gemma4" should point to Gemma4Config (VL wrapper)
         ("gemma4_text", "Gemma4MoeConfig"),
         ("gemma4", "Gemma4MoeConfig"),  # Temporary: no standalone text ckpt, extract text_config in from_dict
@@ -98,6 +99,7 @@
         ("minicpm", "MiniCPM"),
         ("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"),
         ("qwen3_5", "Qwen3_5ForConditionalGeneration"),
+        ("internlm2", "InternLM2"),
         ("gemma4_moe", "Gemma4MoeForCausalLM"),
         ("gemma4_text", "Gemma4MoeForCausalLM"),
         ("gemma4", "Gemma4MoeForCausalLM"),
@@ -116,6 +118,7 @@
         ("qwen2_5_vl_text", "qwen2_5_vl"),
         ("qwen3_vl_text", "qwen3_vl"),
         ("qwen3_vl_moe_text", "qwen3_vl_moe"),
+        ("internlm2", "intern"),
         # TODO(VL): Remove these when Gemma4 VL module (gemma4/) is created
         ("gemma4_text", "gemma4_moe"),
         ("gemma4", "gemma4_moe"),

diff --git a/paddleformers/transformers/auto/modeling.py b/paddleformers/transformers/auto/modeling.py
@@ -83,6 +83,7 @@
         ("Gemma4Moe", "gemma4_moe"),
         ("Glm4vMoe", "glm4v_moe"),
         ("GlmOcr", "glm_ocr"),
+        ("InternLM2", "intern"),
     ]
 )
 

diff --git a/paddleformers/transformers/intern/__init__.py b/paddleformers/transformers/intern/__init__.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+InternLM2 Common Module
+
+This module provides unified access to both InternLM2 2.0 and 2.5 models.
+It automatically routes to the correct implementation based on the model configuration.
+"""
+
+from .configuration import InternLM2Config
+from .modeling import (
+    InternLM2ForCausalLM,
+    InternLM2ForQuestionAnswering,
+    InternLM2ForSequenceClassification,
+    InternLM2ForTokenClassification,
+    InternLM2Model,
+    InternLM2PretrainedModel,
+)
+
+# Alias for auto system compatibility: the auto mappings added in this change
+# refer to the architecture by the bare name "InternLM2"
+# (`("internlm2", "InternLM2")` and `("InternLM2", "intern")`), so the base
+# model proxy is also exposed under that name.
+InternLM2 = InternLM2Model
+
+# Public names of the proxy package, including the `InternLM2` alias above.
+__all__ = [
+    "InternLM2Config",
+    "InternLM2Model",
+    "InternLM2",
+    "InternLM2PretrainedModel",
+    "InternLM2ForCausalLM",
+    "InternLM2ForSequenceClassification",
+    "InternLM2ForQuestionAnswering",
+    "InternLM2ForTokenClassification",
+]
diff --git a/paddleformers/transformers/intern/configuration.py b/paddleformers/transformers/intern/configuration.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+InternLM2 Common Configuration
+
+This module provides a unified configuration for both InternLM2 2.0 and 2.5 models.
+It exposes `is_version_2_5`, which the proxies in `intern/modeling.py` read to pick the 2.0 or 2.5 implementation.
+"""
+
+from paddleformers.transformers.configuration_utils import PretrainedConfig
+
+
+class InternLM2Config(PretrainedConfig):
+    """
+    InternLM2 configuration. This is a unified config that handles both 2.0 and 2.5 versions.
+
+    When loading from HuggingFace, the `model_type` will be "internlm2" (not "internlm2_5"),
+    so the two versions cannot be told apart by `model_type`. This class stores the
+    hyper-parameters and exposes `is_version_2_5`; it does not select an implementation itself.
+
+    Args:
+        vocab_size, hidden_size, intermediate_size, num_hidden_layers, num_attention_heads,
+        hidden_act, max_position_embeddings, initializer_range, rms_norm_eps, use_cache,
+        pretraining_tp, bias, rope_theta:
+            Stored unchanged as attributes of the same name.
+        num_key_value_heads: Falls back to `num_attention_heads` when `None`.
+        rope_scaling: `None`, or a dict with a "type" ("linear" or "dynamic") and a numeric
+            "factor" >= 1. Checked by `_rope_scaling_validation`.
+        attn_implementation: Stored as given; `None` is replaced by "eager".
+        dtype: A string is mapped to the matching paddle dtype ("float32", "float16",
+            "bfloat16", case-insensitive); any other string becomes `paddle.float32`.
+            Non-string values are stored as given.
+        pad_token_id, bos_token_id, eos_token_id, tie_word_embeddings, **kwargs:
+            Forwarded to `PretrainedConfig.__init__`.
+
+    Raises:
+        ValueError: If `rope_scaling` is not `None` and fails validation.
+    """
+
+    model_type = "internlm2"  # Important: must match HuggingFace config
+    _auto_class = "AutoConfig"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=92550,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        bias=True,
+        rope_theta=10000,
+        rope_scaling=None,
+        attn_implementation=None,
+        dtype="bfloat16",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.bias = bias
+
+        # Local import: paddle is only used here, to resolve dtype names.
+        import paddle
+
+        if isinstance(dtype, str):
+            dtype_map = {
+                "float32": paddle.float32,
+                "float16": paddle.float16,
+                "bfloat16": paddle.bfloat16,
+            }
+            # NOTE(review): any string outside the map (e.g. "float64") silently
+            # becomes float32 — confirm this fallback is intended.
+            self.dtype = dtype_map.get(dtype.lower(), paddle.float32)
+        else:
+            self.dtype = dtype
+
+        # No explicit KV-head count: use one KV head per attention head.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Fail early on a malformed `rope_scaling`, before the base class is initialised.
+        self._rope_scaling_validation()
+        self.attn_implementation = attn_implementation
+        if self.attn_implementation is None:
+            self.attn_implementation = "eager"
+
+        # NOTE(review): `dtype` is a named parameter, so it is not part of the
+        # `**kwargs` forwarded here — confirm the base `__init__` does not reset
+        # the `self.dtype` assigned above.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """Validate `self.rope_scaling`.
+
+        `None` is accepted as-is. Otherwise the value must be a dict whose "type" is
+        "linear" or "dynamic" and whose "factor" is an int or float >= 1.
+
+        Raises:
+            ValueError: If any of these conditions is not met.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict):
+            raise ValueError(f"`rope_scaling` must be a dictionary, got {self.rope_scaling}")
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_factor is None:
+            raise ValueError("`rope_scaling` must contain 'type' and 'factor' keys, " f"got {self.rope_scaling}")
+        if rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(f"`rope_scaling` type must be 'linear' or 'dynamic', got '{rope_scaling_type}'")
+        # NOTE: `bool` is a subclass of `int`, so a factor of `True` passes this check.
+        if not isinstance(rope_scaling_factor, (int, float)) or rope_scaling_factor < 1.0:
+            raise ValueError(f"`rope_scaling` factor must be a number >= 1, got {rope_scaling_factor}")
+
+    @property
+    def is_version_2_5(self):
+        """Whether this config is treated as InternLM2 2.5.
+
+        Returns `True` only when `auto_map` exists, is not `None`, and contains the key
+        "AutoModelForSequenceClassification"; otherwise `False` (treated as 2.0).
+        NOTE(review): this is a heuristic on checkpoint metadata — confirm that 2.0
+        checkpoints never carry that `auto_map` entry and that all 2.5 checkpoints do.
+        """
+        if hasattr(self, "auto_map") and self.auto_map is not None:
+            if "AutoModelForSequenceClassification" in self.auto_map:
+                return True
+        return False
diff --git a/paddleformers/transformers/intern/modeling.py b/paddleformers/transformers/intern/modeling.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+InternLM2 Common Modeling
+
+Factory-routing entry point. The classes here are never instantiated as
+themselves: `__new__` (direct construction) and `from_pretrained` (loading)
+both return a real implementation-class instance from `intern_lm2` (2.0) or
+`intern_lm2_5` (2.5), selected via `config.is_version_2_5`.
+
+"""
+
+from paddleformers.transformers.model_utils import PretrainedModel
+from paddleformers.utils.log import logger
+
+from .configuration import InternLM2Config
+
+
+def _select_impl_cls(cls_name, config):
+    """Return the version-specific implementation class that backs a proxy class.
+
+    Args:
+        cls_name: Name of the proxy class, e.g. "InternLM2ForCausalLM".
+        config: Config object exposing the `is_version_2_5` property.
+
+    Returns:
+        The class object found in `intern_lm2_5.modeling` (2.5) or
+        `intern_lm2.modeling` (2.0).
+
+    Raises:
+        NotImplementedError: If the selected module defines no matching class.
+    """
+    # Read the detection property once so the log line, the import and the
+    # error message cannot disagree with each other.
+    is_v25 = config.is_version_2_5
+    version = "2.5" if is_v25 else "2.0"
+    logger.info(f"Detected InternLM2 {version}, loading {version} implementation")
+
+    if is_v25:
+        from ..intern_lm2_5 import modeling as _impl_module
+
+        # The 2.5 package is registered with `InternLM25*` class names
+        # (InternLM25Model, InternLM25ForCausalLM, ...). Try the proxy's own
+        # name first, then the 2.5 spelling, instead of failing outright.
+        candidate_names = (cls_name, cls_name.replace("InternLM2", "InternLM25", 1))
+    else:
+        from ..intern_lm2 import modeling as _impl_module
+
+        candidate_names = (cls_name,)
+
+    for name in candidate_names:
+        impl_cls = getattr(_impl_module, name, None)
+        if impl_cls is not None:
+            return impl_cls
+    raise NotImplementedError(f"{cls_name} is not implemented for InternLM2 {version} in PaddleFormers.")
+
+
+class InternLM2PretrainedModel(PretrainedModel):
+    """Version-routing proxy shared by every InternLM2 entry-point class.
+
+    Nothing here builds a network itself. Direct construction, `from_pretrained`
+    and the two AOA-config generators each look up the implementation class
+    named after the calling proxy class via `_select_impl_cls` and hand the
+    call over to it.
+    """
+
+    config_class = InternLM2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["InternLM2DecoderLayer", "InternLM25DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    transpose_weight_keys = ["wqkv", "wo", "w1", "w2", "w3", "output"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+
+    def __new__(cls, config, *args, **kwargs):
+        """Construct the implementation class selected for `config` and return that instance.
+
+        The returned object is whatever the implementation class produces; no
+        object is allocated for the proxy class itself.
+        """
+        return _select_impl_cls(cls.__name__, config)(config, *args, **kwargs)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        """Load a checkpoint with the implementation class matching its detected version.
+
+        The config is loaded separately first, only to decide the version; the
+        actual load is then done by the implementation class with the caller's
+        original positional and keyword arguments.
+        """
+        # Only these loader options are passed to the preliminary config load.
+        # NOTE(review): a caller-supplied `config=` is not consulted here — the
+        # version always comes from the config found at the given path.
+        forwarded_keys = ("download_hub", "cache_dir", "subfolder", "trust_remote_code")
+        config_load_kwargs = {key: kwargs[key] for key in forwarded_keys if key in kwargs}
+        probe_config = InternLM2Config.from_pretrained(pretrained_model_name_or_path, **config_load_kwargs)
+        target_cls = _select_impl_cls(cls.__name__, probe_config)
+        return target_cls.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+
+    @classmethod
+    def _gen_aoa_config(cls, config):
+        """Delegate AOA config generation to the implementation class chosen for `config`."""
+        return _select_impl_cls(cls.__name__, config)._gen_aoa_config(config)
+
+    @classmethod
+    def _gen_inv_aoa_config(cls, config):
+        """Delegate inverse AOA config generation to the implementation class chosen for `config`."""
+        return _select_impl_cls(cls.__name__, config)._gen_inv_aoa_config(config)
+
+
+class InternLM2Model(InternLM2PretrainedModel):
+    """Proxy for the base model.
+
+    Defines no behaviour of its own: construction and loading are inherited from
+    `InternLM2PretrainedModel`, which looks up the implementation by this class's name.
+    """
+
+    _auto_class = "AutoModel"
+
+
+class InternLM2ForCausalLM(InternLM2PretrainedModel):
+    """Proxy for the causal-LM head model.
+
+    Defines no behaviour of its own: construction and loading are inherited from
+    `InternLM2PretrainedModel`, which looks up the implementation by this class's name.
+    """
+
+    _auto_class = "AutoModelForCausalLM"
+    # NOTE(review): presumably marks the output projection weight as tied to the
+    # embeddings — confirm against how the base class consumes `_tied_weights_keys`.
+    _tied_weights_keys = ["output.weight"]
+
+
+class InternLM2ForSequenceClassification(InternLM2PretrainedModel):
+    """Proxy for the sequence-classification head model.
+
+    Defines no behaviour of its own: construction and loading are inherited from
+    `InternLM2PretrainedModel`, which looks up the implementation by this class's name.
+    """
+
+    _auto_class = "AutoModelForSequenceClassification"
+
+
+class InternLM2ForQuestionAnswering(InternLM2PretrainedModel):
+    """Proxy for the question-answering head model.
+
+    Defines no behaviour of its own: construction and loading are inherited from
+    `InternLM2PretrainedModel`, which looks up the implementation by this class's name.
+    """
+
+    _auto_class = "AutoModelForQuestionAnswering"
+
+
+class InternLM2ForTokenClassification(InternLM2PretrainedModel):
+    """Proxy for the token-classification head model.
+
+    Defines no behaviour of its own: construction and loading are inherited from
+    `InternLM2PretrainedModel`, which looks up the implementation by this class's name.
+    """
+
+    _auto_class = "AutoModelForTokenClassification"