Merge branch 'main' of https://github.com/EvolvingLMMs-Lab/OneVision-Encoder

anxiangsir · anxiangsir · commit e1537014a8c0 · 2025-12-24T23:22:31.000+08:00
diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@
 </p>
 
 <p align="center">
-  <strong>Fully HEVC-Style Vision Transformer </strong>
+  <strong>OneVision Encoder</strong>
 </p>
 
 ## 📖 Table of Contents
@@ -29,11 +29,11 @@ OneVision Encoder is a vision encoder designed for multimodal large language mod
 ### Input Method Comparison
 
 <table>
-  <caption style="caption-side: top; text-align: center; font-weight: bold; margin-bottom: 10px;">Comparison of Frame Sampling Input vs Codec Input</caption>
+  <caption style="caption-side: top; text-align: center; font-weight: bold; margin-bottom: 10px;">Frame Sampling Input vs Codec Input</caption>
   <tr>
     <td align="center">
       <img src="pages/images/example.gif" alt="Animated demonstration of traditional uniform frame sampling method for video processing" width="400"><br>
-      <b>抽帧输入 (Frame Sampling Input)</b><br>
+      <b>Frame Sampling Input</b><br>
       Traditional uniform frame sampling approach
     </td>
     <td align="center">
diff --git a/dataloader/ap_dataloader_dali_ip_mv.py b/dataloader/ap_dataloader_dali_ip_mv.py
@@ -55,6 +55,8 @@ def _mv_energy_norm(
     pct: float = 95.0,
 ):
     """Return (norm_HxW_float32_in_[0,1], scale_max_px). No gamma/colormap."""
+    if not _HAS_CV2:  # fix: check cv2 availability before use
+        raise ImportError("cv2 is required for _mv_energy_norm but not available")
     vx = mvx.astype(np.float32) / float(mv_unit_div)
     vy = mvy.astype(np.float32) / float(mv_unit_div)
     mag = np.sqrt(vx * vx + vy * vy)  # pixels
@@ -315,7 +317,7 @@ def __call__(self, sample_info):
         video_path, video_label = example_info
         try:
             combined_data, duration, frame_id_list = self.get_frame_id_list(video_path, self.sequence_length)
-        except:
+        except Exception:  # fix: avoid bare except to allow KeyboardInterrupt/SystemExit to propagate
             video_path, video_label = self.replace_example_info
             combined_data, duration, frame_id_list = self.get_frame_id_list(video_path, self.sequence_length)
         
diff --git a/onevision_encoder/configuration_onevision_encoder.py b/onevision_encoder/configuration_onevision_encoder.py
@@ -77,6 +77,7 @@ def __init__(
         attention_dropout=0.0,
         initializer_range=0.02,
         rope_theta=10000.0,
+        rope_temporal_size=None,
         use_head=True,
         **kwargs,
     ):
@@ -94,4 +95,5 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.initializer_range = initializer_range
         self.rope_theta = rope_theta
+        self.rope_temporal_size = rope_temporal_size  # None=use actual frames, int=fixed size (legacy: 64)
         self.use_head = use_head
diff --git a/onevision_encoder/modeling_onevision_encoder.py b/onevision_encoder/modeling_onevision_encoder.py
@@ -547,7 +547,10 @@ def forward(
         # Determine video dimensions for RoPE
         # Note: pixel_values passed to embeddings can be 4D or 5D
         if pixel_values.dim() == 5:
-             t_frames = 64
+             # fix: use config.rope_temporal_size when it is set to a non-zero value; otherwise fall back to the actual frame count
+             # legacy behavior hardcoded t_frames=64 (for videos padded to 64 frames)
+             actual_frames = pixel_values.shape[2]
+             t_frames = self.config.rope_temporal_size if self.config.rope_temporal_size else actual_frames
              height = pixel_values.shape[3]
              width = pixel_values.shape[4]
         else:
@@ -578,6 +581,14 @@ def forward(
         # 4. Pre-Norm & Encoder
         hidden_states = self.layernorm_pre(hidden_states)
 
+        # fix: gather hidden_states to match freqs_visible when using sparse visible_indices
+        num_visible = visible_indices.shape[1]
+        if num_visible != total_patches:
+            # sparse mode: select only visible patches
+            hidden_states = hidden_states.gather(
+                1, visible_indices.unsqueeze(-1).expand(-1, -1, hidden_states.shape[-1])
+            )
+
         encoder_outputs = self.encoder(
             hidden_states,
             attention_mask=None,
diff --git a/tools/tools_for_hevc/hevc_feature_decoder_mv.py b/tools/tools_for_hevc/hevc_feature_decoder_mv.py
@@ -64,36 +64,6 @@ def ffprobe(filename):
 
 
 
-# ---------------- YUV plane parsers ----------------
-def _split_yuv420_planes(buf: bytes, H: int, W: int, layout: str):
-    """Return Y (H,W), U (H/2,W/2), V (H/2,W/2) for layout in {i420,yv12,nv12,nv21}."""
-    nY = H*W
-    nUV = (H//2)*(W//2)
-    arr = np.frombuffer(buf, dtype=np.uint8)
-    if layout in ("i420","yv12"):
-        Y = arr[:nY].reshape(H, W)
-        UV = arr[nY:]
-        # planar U and V (each nUV)
-        U_planar, V_planar = (UV[:nUV], UV[nUV:]) if layout=="i420" else (UV[nUV:], UV[:nUV])
-        U = U_planar.reshape(H//2, W//2)
-        V = V_planar.reshape(H//2, W//2)
-        return Y, U, V
-    elif layout in ("nv12","nv21"):
-        Y = arr[:nY].reshape(H, W)
-        UVint = arr[nY:].reshape(H//2, W)  # interleaved per row: UVUV or VUVU
-        U = np.empty((H//2, W//2), dtype=np.uint8)
-        V = np.empty((H//2, W//2), dtype=np.uint8)
-        if layout == "nv12":  # UVUV...
-            U[:] = UVint[:, 0::2]
-            V[:] = UVint[:, 1::2]
-        else:                  # nv21: VUVU...
-            V[:] = UVint[:, 0::2]
-            U[:] = UVint[:, 1::2]
-        return Y, U, V
-    else:
-        raise ValueError(layout)
-
-
 # ---------------- YUV plane parsers ----------------
 def _split_yuv420_planes(buf: bytes, H: int, W: int, layout: str):
     """Return Y (H,W), U (H/2,W/2), V (H/2,W/2) for layout in {i420,yv12,nv12,nv21}."""
@@ -564,9 +534,16 @@ def close(self):
         if self._proc is not None and self._proc.poll() is None:
             self._proc.stdin.close()
             self._proc.stdout.close()
-            # self._proc.stderr.close()
+            # stderr is redirected to DEVNULL, not a pipe
             self._terminate(0.2)
         self._proc = None
+        # fix: close DEVNULL file handle to prevent resource leak
+        if hasattr(self, 'DEVNULL') and self.DEVNULL is not None:
+            try:
+                self.DEVNULL.close()
+            except Exception:
+                pass
+            self.DEVNULL = None
 
     def _terminate(self, timeout=1.0):
         """Terminate the sub process."""
diff --git a/training/train.py b/training/train.py
@@ -20,7 +20,7 @@
 from training.lr_scheduler import PolynomialLRWarmup
 from onevision_encoder import OneVisionEncoderModel, OneVisionEncoderConfig
 
-torch._dynamo.config.optimize_ddp = True
+# fix: removed the redundant `optimize_ddp = True` assignment, which the line below immediately overwrote with False
 torch._dynamo.config.optimize_ddp = False
 
 parser = argparse.ArgumentParser(description="Multi-dataset video training")
@@ -657,12 +657,15 @@ def wrap_ddp(model):
 
                 # 按 batch 固定划分：前50% residual, 中37.5% frame_sampling, 后12.5% collage
                 n1 = int(bs * 0.5)
-                n2 = int(bs * 0.375)
+                # fix: n2 must be a cumulative threshold, not a standalone percentage
+                # the bug was n2 = int(bs * 0.375), which gives n2=37 when bs=100
+                # this made mask_frame_sampling = (idx >= 50) & (idx < 37) always be False
+                n2 = int(bs * 0.875)  # cumulative: 50% + 37.5% = 87.5%
 
                 idx_range = torch.arange(bs, device=dev)
-                mask_residual = idx_range < n1
-                mask_frame_sampling = (idx_range >= n1) & (idx_range < n2)
-                mask_collage = idx_range >= n2
+                mask_residual = idx_range < n1                               # idx in [0, n1)
+                mask_frame_sampling = (idx_range >= n1) & (idx_range < n2)   # idx in [n1, n2)
+                mask_collage = idx_range >= n2                               # idx in [n2, bs)
 
                 # ---------- residual（前50%）: 生成 out 行 ----------
                 if mask_residual.any():
@@ -831,8 +834,8 @@ def wrap_ddp(model):
             opt.step()
             opt.zero_grad()
 
-        # 学习率更新
-        lr_scheduler.step()
+            # fix: lr update should only happen after opt.step(), not every micro-batch
+            lr_scheduler.step()
 
         batch_end_callback(
             global_step=global_step,