apply trim memory

RangiLyu · RangiLyu · commit 6032f208d163 · 2026-03-31T18:57:31.000+08:00
diff --git a/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py b/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py
@@ -9,7 +9,7 @@
 
 from xtuner.v1.data_proto.messages import ChatMessages
 from xtuner.v1.data_proto.templates import ChatTemplate, HybridChatTemplate
-from xtuner.v1.utils import get_logger
+from xtuner.v1.utils import get_logger, trim_memory
 
 from ..data_item import BaseMLLMDataItem, CacheItem
 from ..utils import CachableTokenizeFunction, tokenizer_xxhash, with_proxy_attention_flops
@@ -118,7 +118,8 @@ def replace_image_token(
 
 
 def load_image(image_path: str):
-    return Image.open(image_path).convert("RGB")
+    with Image.open(image_path) as img:
+        return img.convert("RGB")
 
 
 def get_image_path(image_path: str, media_root: str):
@@ -144,6 +145,7 @@ def __init__(
         data_name: str | None = None,
         llm_pack_weight: float = 1.0,
         visual_pack_weight: float = 0.0,
+        trim_memory_step: int = 1,
     ):
         self.max_length = max_length
         self._tokenizer_hash = tokenizer_hash
@@ -157,10 +159,17 @@ def __init__(
         self._image_wh_list: list[list] = []
         self._video_wh_list: list[list] = []
         self._video_extra_info_list: list[dict] = []
+        self._trim_memory_step = max(1, trim_memory_step)
+        self._trim_memory_count = 0
 
         self._hash_str += f"llm_pack_weight:{llm_pack_weight}_visual_pack_weight:{visual_pack_weight}"
         super().__init__(tokenizer, llm_pack_weight=llm_pack_weight, visual_pack_weight=visual_pack_weight)
 
+    def _maybe_trim_memory(self):
+        self._trim_memory_count += 1
+        if self._trim_memory_count % self._trim_memory_step == 0:
+            trim_memory(logger)
+
     def calc_num_tokens_multi_modal_get_item(self, data_item: dict) -> CacheItem:
         raise NotImplementedError
 
@@ -213,11 +222,13 @@ def __call__(self, item: dict, media_root: str = "", **kwargs) -> T | CacheItem:
                 ret = self.calc_num_tokens_multi_modal_get_item(item)
             else:
                 ret = self.multi_modal_get_item(item, media_root)
+                self._maybe_trim_memory()
         elif len(self._video_path) > 0:
             if self.state == "cache":
                 ret = self.calc_num_tokens_video_get_item(item)
             else:
                 ret = self.video_get_item(item, media_root)
+                self._maybe_trim_memory()
         else:
             if self.state == "cache":
                 ret = self.calc_num_tokens_pure_text_get_item(item)
@@ -257,6 +268,7 @@ class BaseMLLMTokenizeFnConfig(BaseModel):
     add_bos_token: bool = False  # for mllm pretrain
     llm_pack_weight: float = 1.0
     visual_pack_weight: float = 0.0
+    trim_memory_step: int = 1
 
     def build(
         self, tokenizer, tokenizer_hash: str | None = None, anno_name: str = "", **kwargs
diff --git a/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py b/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py
@@ -233,6 +233,7 @@ def __init__(
         hash: str | None = None,
         add_eos_token: bool = True,  # for mllm pretrain
         add_bos_token: bool = False,  # for mllm pretrain
+        trim_memory_step: int = 1,
     ):
         self.oss_loader = None
         self.debug = debug
@@ -335,6 +336,7 @@ def __init__(
             data_name=self.data_name,
             llm_pack_weight=llm_pack_weight,
             visual_pack_weight=visual_pack_weight,
+            trim_memory_step=trim_memory_step,
         )
 
     def _truncated_data_item(
@@ -903,6 +905,7 @@ class Qwen3VLTokenizeFnConfig(BaseMLLMTokenizeFnConfig):
     # When handling multiple images or multiple videos,
     # it's helpful to add labels to the images and videos for better reference.
     add_vision_id: bool = True
+    trim_memory_step: int = 1
 
     def build(
         self, tokenizer, tokenizer_hash: str | None = None, anno_name: str = "", **kwargs
@@ -932,4 +935,5 @@ def build(
             oss_time_log_thr=self.oss_time_log_thr,
             add_eos_token=self.add_eos_token,  # for mllm pretrain
             add_bos_token=self.add_bos_token,  # for mllm pretrain
+            trim_memory_step=self.trim_memory_step,
         )
diff --git a/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_utils.py b/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_utils.py
@@ -22,9 +22,9 @@
 
 
 def pil_loader(img_str):
-    buff = io.BytesIO(img_str)
-    img = Image.open(buff)
-    return img.convert("RGB")
+    with io.BytesIO(img_str) as buff:
+        with Image.open(buff) as img:
+            return img.convert("RGB")
 
 
 def extract_frame_number(filename):
@@ -109,12 +109,13 @@ def read_frames_folder(
             start_time = time.time()
             image_byte = client.get(image_list[frame_index])
             oss_read_time += time.time() - start_time
-            frame = Image.open(io.BytesIO(image_byte))
-            frame_list.append(np.array(frame))
+            with io.BytesIO(image_byte) as buff:
+                with Image.open(buff) as frame:
+                    frame_list.append(np.array(frame))
         else:
             fp = os.path.join(video_path, image_list[frame_index])
-            frame = Image.open(fp).convert("RGB")
-            frame_list.append(np.array(frame))
+            with Image.open(fp) as frame:
+                frame_list.append(np.array(frame.convert("RGB")))
 
     frames = numpy_to_tensor(frame_list)
     return frames, oss_read_time, len(frames), frames_indices, timestamps
diff --git a/xtuner/v1/utils/__init__.py b/xtuner/v1/utils/__init__.py
@@ -17,6 +17,7 @@
     is_hf_model_path,
     is_local_rank0,
     record_git_info,
+    trim_memory,
 )
 from .pad import pad_to_max_length, pad_to_multiple_of
 from .profile import profile_time, profile_time_and_memory, timer, timer_logger
@@ -61,4 +62,5 @@
     "ray_method",
     "profile_time",
     "clean_param_name",
+    "trim_memory",
 ]
diff --git a/xtuner/v1/utils/misc.py b/xtuner/v1/utils/misc.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import sys
 import threading
@@ -214,3 +215,23 @@ def clean_param_name(name: str) -> str:
     if "_orig_mod." in name:
         name = name.replace("_orig_mod.", "")
     return name
+
+
+_TRIM_MEMORY_WARNED = False
+
+
+def trim_memory(logger: logging.Logger | None = None):
+    """Try to return free heap pages to OS."""
+    global _TRIM_MEMORY_WARNED
+    if logger is None:
+        logger = get_logger()
+    try:
+        import ctypes
+
+        libc = ctypes.CDLL("libc.so.6")
+        return libc.malloc_trim(0)
+    except Exception as e:
+        if not _TRIM_MEMORY_WARNED:
+            logger.warning(f" >>>>>>>>> [trim_memory] Failed to trim memory: {e} <<<<<<<<")
+            _TRIM_MEMORY_WARNED = True
+        return False

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -17,6 +17,7 @@` |
| `17` | `17` | `is_hf_model_path,` |
| `18` | `18` | `is_local_rank0,` |
| `19` | `19` | `record_git_info,` |
| | `20` | `+ trim_memory,` |
| `20` | `21` | `)` |
| `21` | `22` | `from .pad import pad_to_max_length, pad_to_multiple_of` |
| `22` | `23` | `from .profile import profile_time, profile_time_and_memory, timer, timer_logger` |
| | | `@@ -61,4 +62,5 @@` |
| `61` | `62` | `"ray_method",` |
| `62` | `63` | `"profile_time",` |
| `63` | `64` | `"clean_param_name",` |
| | `65` | `+ "trim_memory",` |
| `64` | `66` | `]` |