
Commit bb133dd (parent: b0fdc8d)
feat: add ViT activation_offload for InternS1

3 files changed: 25 additions & 6 deletions


xtuner/v1/model/compose/intern_s1/intern_s1_config.py

Lines changed: 6 additions & 0 deletions
@@ -49,6 +49,7 @@ class InternS1VisionConfig(XTunerBaseModelConfig):
     use_mask_token: bool = False
     use_mean_pooling: bool = True
     attn_impl: Literal["flash_attention", "flex_attention", "eager_attention"] = "flash_attention"
+    text_hidden_layers: int = 0
 
     def model_post_init(self, _):
         if self.attn_impl == "flash_attention" and get_device() == "cuda":
@@ -143,6 +144,11 @@ class InternS1Config(InternS1BaseConfig):
         vocab_size=153216, hf_key_mapping={r"^model.": "model.language_model."}
     )
 
+    # For activation offload, the vision and text configs need to exchange num_hidden_layers with each other.
+    def model_post_init(self, __context) -> None:
+        self.vision_config.text_hidden_layers = self.text_config.num_hidden_layers
+        self.text_config.vision_hidden_layers = self.vision_config.num_hidden_layers
+
 
 class InternS1MiniConfig(InternS1BaseConfig):
     vision_config: InternS1VisionConfig = InternS1VisionConfig()
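The two assignments in model_post_init are what let each sub-model see the combined depth that activation offload schedules against. Below is a minimal sketch of the idea using simplified stand-in config classes and hypothetical layer counts (24 vision, 48 text), not xtuner's real InternS1 configs:

from dataclasses import dataclass

@dataclass
class VisionCfg:
    num_hidden_layers: int = 24      # hypothetical ViT depth
    text_hidden_layers: int = 0      # filled in by the composed config

@dataclass
class TextCfg:
    num_hidden_layers: int = 48      # hypothetical decoder depth
    vision_hidden_layers: int = 0    # filled in by the composed config

vision, text = VisionCfg(), TextCfg()

# What InternS1Config.model_post_init does: each side learns the other's depth.
vision.text_hidden_layers = text.num_hidden_layers
text.vision_hidden_layers = vision.num_hidden_layers

# Both sub-models now compute the same combined depth for the offload scheduler.
assert vision.num_hidden_layers + vision.text_hidden_layers == 72
assert text.num_hidden_layers + text.vision_hidden_layers == 72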

xtuner/v1/model/compose/intern_s1/modeling_vision.py

Lines changed: 14 additions & 2 deletions
@@ -36,6 +36,8 @@
 from xtuner.v1.ops.act_fn import get_act_fn
 from xtuner.v1.utils import get_logger
 from xtuner.v1.module import AttnOutputs
+import os
+from xtuner.v1.utils.activation_offload import async_save_on_cpu
 
 DEVICE = get_device()
 DEVICE_MODULE = get_torch_device_module()
@@ -230,6 +232,7 @@ def __init__(self, config: InternS1VisionConfig) -> None:
         dpr = np.linspace(0.0, float(config.drop_path_rate), int(config.num_hidden_layers))
         self.layer = nn.ModuleList([
             InternS1VisionLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
+        self.offload_stream = torch.cuda.Stream()
 
     def forward(
         self,
@@ -241,8 +244,17 @@
         for i, layer_module in enumerate(self.layer):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)  # type: ignore
-
-            hidden_states = layer_module(hidden_states)
+            if int(os.getenv("XTUNER_ACTIVATION_OFFLOAD", "0")) == 1:
+                with async_save_on_cpu(
+                    h2d_stream=self.offload_stream,
+                    d2h_stream=self.offload_stream,
+                    block_idx=int(i),
+                    depth=len(self.layer) + self.config.text_hidden_layers,
+                    custom_check_fn=lambda x: x.data_ptr() == hidden_states.data_ptr(),
+                ):
+                    hidden_states = layer_module(hidden_states)
+            else:
+                hidden_states = layer_module(hidden_states)
 
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)  # type: ignore
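The feature is opt-in: the existing code path is untouched unless the XTUNER_ACTIVATION_OFFLOAD environment variable is set to 1, in which case each ViT block runs under async_save_on_cpu so its input activation is parked in CPU memory between forward and backward. The sketch below is a simplified stand-in for that context manager, built on torch.autograd.graph.saved_tensors_hooks, to illustrate the pack/unpack mechanics; it omits the prefetching and per-block scheduling that the real utility's block_idx/depth arguments drive, and the helper name save_on_cpu_sketch is made up for illustration.

import contextlib
import torch

@contextlib.contextmanager
def save_on_cpu_sketch(offload_stream, check_fn=lambda t: True):
    """Park qualifying saved activations in pinned CPU memory during forward
    and copy them back to the GPU when backward needs them."""

    def pack(tensor):
        if not (tensor.is_cuda and check_fn(tensor)):
            return tensor  # keep small or non-matching tensors on device
        offload_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(offload_stream):
            cpu_copy = torch.empty(tensor.size(), dtype=tensor.dtype, pin_memory=True)
            cpu_copy.copy_(tensor, non_blocking=True)
        tensor.record_stream(offload_stream)  # keep the GPU buffer alive until the d2h copy finishes
        return tensor.device, cpu_copy

    def unpack(packed):
        if isinstance(packed, torch.Tensor):
            return packed
        device, cpu_copy = packed
        offload_stream.synchronize()  # make sure the d2h copy completed
        return cpu_copy.to(device, non_blocking=True)

    with torch.autograd.graph.saved_tensors_hooks(pack, unpack):
        yield

Matching the diff's custom_check_fn, a caller would pass check_fn=lambda t: t.data_ptr() == hidden_states.data_ptr() so that only the block's input activation is offloaded rather than every tensor autograd saves.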

xtuner/v1/model/moe/moe.py

Lines changed: 5 additions & 4 deletions
@@ -154,6 +154,7 @@ class MoEConfig(TransformerConfig):
     moe_bias: bool = False
     moe_act_fn_cfg: MoEActFnConfig = MoEActFnConfig()
     freeze_routers: bool = False
+    vision_hidden_layers: int = 0
 
     def build(self) -> "MoE":
         from xtuner.v1.model.moe.moe import MoE
@@ -430,8 +431,8 @@ def _micro_batch_forward(
             with async_save_on_cpu(
                 h2d_stream=self.offload_stream,
                 d2h_stream=self.offload_stream,
-                block_idx=layer_idx - self.config.first_k_dense_replace,
-                depth=len(self.layers) - self.config.first_k_dense_replace,
+                block_idx=layer_idx - self.config.first_k_dense_replace + self.config.vision_hidden_layers,
+                depth=len(self.layers) - self.config.first_k_dense_replace + self.config.vision_hidden_layers,
                 custom_check_fn=lambda x: x.data_ptr()
                 in [hidden_states.data_ptr() for hidden_states in hidden_states_list],
                 prefetch=True,
@@ -577,8 +578,8 @@ def _forward(
             with async_save_on_cpu(
                 h2d_stream=self.offload_stream,
                 d2h_stream=self.offload_stream,
-                block_idx=int(idx),
-                depth=len(self.layers),
+                block_idx=int(idx) + self.config.vision_hidden_layers,
+                depth=len(self.layers) + self.config.vision_hidden_layers,
                 custom_check_fn=lambda x: x.data_ptr() == hidden_states.data_ptr(),
             ):
                 layer_results = decoder_layer(
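With vision_hidden_layers in hand, the language model's offload calls index into one global stack of blocks rather than restarting at zero: the ViT blocks occupy indices [0, vision_hidden_layers) and every decoder block is shifted by that offset, while depth grows to cover both. A small worked example with hypothetical layer counts (24 vision, 48 text; first_k_dense_replace ignored for simplicity):

# Hypothetical sizes for illustration; the real values come from the configs.
vision_hidden_layers = 24
text_hidden_layers = 48

depth = vision_hidden_layers + text_hidden_layers  # 72 blocks seen by the offload scheduler

def global_block_idx(local_idx: int, is_text_block: bool) -> int:
    """Vision blocks keep their local index; text blocks are shifted past them."""
    return local_idx + (vision_hidden_layers if is_text_block else 0)

assert global_block_idx(0, is_text_block=False) == 0    # first ViT block
assert global_block_idx(23, is_text_block=False) == 23  # last ViT block
assert global_block_idx(0, is_text_block=True) == 24    # first decoder block
assert global_block_idx(47, is_text_block=True) == 71   # last block == depth - 1

This matches the diff: the vision encoder passes block_idx=int(i) with depth=len(self.layer) + text_hidden_layers, and the MoE decoder passes block_idx=int(idx) + vision_hidden_layers with depth=len(self.layers) + vision_hidden_layers, so both sides agree on where each block sits in the combined schedule.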
