@@ -122,8 +122,8 @@ def __init__(self, mode, source_params):
122122
123123 def get_frame_id_list (self , video_path , sequence_length ,
124124 mv_unit_div : float = 4.0 , # quarter-pel -> pixel
125- mv_pct : float = 95.0 , # MV 归一化分位数(传给 _mv_energy_norm)
126- res_pct : float = 95.0 , # 残差归一化分位数(传给 _residual_energy_norm)
125+ mv_pct : float = 95.0 , # MV normalization percentile (passed to _mv_energy_norm)
126+ res_pct : float = 95.0 , # Residual normalization percentile (passed to _residual_energy_norm)
127127 fuse_mode : str = "weighted" ,
128128 w_mv : float = 1.0 ,
129129 w_res : float = 1.0 ,):
@@ -133,7 +133,7 @@ def get_frame_id_list(self, video_path, sequence_length,
133133
134134 if self .mode in ["train" , "val" ]:
135135
136- # 按照每一个seq进行分group
136+ # Group by each sequence
137137 # average_duration = duration // sequence_length
138138
139139 # if average_duration > 0:
@@ -153,24 +153,24 @@ def get_frame_id_list(self, video_path, sequence_length,
153153 elif hasattr (decord_vr , "get_keyframes" ):
154154 key_idx = decord_vr .get_keyframes ()
155155 if key_idx is not None :
156- # key_idx 可能是 NDArray;转成 Python list 的整型帧号集合 只保留一帧为I帧
156+ # key_idx may be an NDArray; convert it to a Python list of integer frame numbers (first row only if it is multi-dimensional) and keep just the ones present in frame_id_list
157157 I_list = np .asarray (key_idx )
158158 I_list = I_list .tolist ()[0 ] if I_list .ndim > 1 else I_list .tolist ()
159159 I_list = [int (i ) for i in I_list if int (i ) in frame_id_list ]
160160 if len (I_list ) >= self .tokeq_target_frames :
161- # 如果 I 帧过多,优先保留前面的
161+ # If there are too many I-frames, prioritize keeping the earlier ones
162162 I_list = I_list [:self .tokeq_target_frames ]
163163 P_list = []
164164 else :
165165 P_list = [i for i in range (len (frame_id_list )) if i not in I_list ]
166166 except Exception :
167- # 保底处理:忽略异常,后续用默认策略
168- print ("没有读取成功 " )
167+ # Fallback: ignore exception, use default strategy later
168+ print ("Failed to read " )
169169 # gop = max(1,int(self.gop_size))
170170 # I_list = [i for i, fid in enumerate(frame_id_list)if(int(fid)% gop)== 0]
171- # 第一帧为I帧
171+ # First frame is I-frame
172172 I_list = [0 ]
173- # 其余为 P 帧
173+ # Rest are P-frames
174174 P_list = [i for i in range (len (frame_id_list ))if i not in I_list ]
175175 # Map absolute frame id -> position in the sampled sequence
176176 frame_ids = frame_id_list
@@ -179,11 +179,11 @@ def get_frame_id_list(self, video_path, sequence_length,
179179 frame_ids = frame_id_list
180180 pos_map = {fid : i for i , fid in enumerate (frame_ids )}
181181
182- # 读取视频帧
182+ # Read video frames
183183 decord_vr .seek (0 )
184184 video_data = decord_vr .get_batch (frame_id_list ).asnumpy ()
185185
186- # 转成 numpy array
186+ # Convert to numpy array
187187 I_list = np .array (I_list , dtype = np .int64 )
188188 P_list = np .array (P_list , dtype = np .int64 )
189189 I_pos_set = set (I_list .tolist ())
@@ -226,47 +226,47 @@ def get_frame_id_list(self, video_path, sequence_length,
226226 residual ,
227227 ) = frame_tuple
228228
229- # I 帧:直接置 0(与你 residual 逻辑一致)
229+ # I-frame: store an all-zero map directly (consistent with the residual logic)
230230 if pos in I_pos_set :
231231 if H0 is None :
232- # 用残差Y来确定输出尺寸/类型
232+ # Use residual Y to determine output size/type
233233 y0 = residual if residual .ndim == 2 else cv2 .cvtColor (residual , cv2 .COLOR_BGR2YUV )[:, :, 0 ]
234234 y0 = np .asarray (y0 )
235235 H0 , W0 , dtype0 = int (y0 .shape [0 ]), int (y0 .shape [1 ]), y0 .dtype
236236 residuals_y [pos ] = np .zeros ((H0 , W0 ), dtype = dtype0 or np .uint8 )
237237
238238 else :
239- # 1) 取 MV (L0) 并上采样到 H×W
239+ # 1) Get MV (L0) and upsample to H×W
240240 mvx_hw = rdr ._upsample_mv_to_hw (mv_x_L0 .astype (np .float32 ))
241241 mvy_hw = rdr ._upsample_mv_to_hw (mv_y_L0 .astype (np .float32 ))
242242
243- # 2) 取残差 Y
243+ # 2) Get residual Y
244244 Y_res = residual if residual .ndim == 2 else cv2 .cvtColor (residual , cv2 .COLOR_BGR2YUV )[:, :, 0 ]
245245
246- # 初始化输出尺寸/类型(只在第一次命中时做)
246+ # Initialize output size/type (only done on first hit)
247247 if H0 is None :
248248 H0 , W0 , dtype0 = int (Y_res .shape [0 ]), int (Y_res .shape [1 ]), Y_res .dtype
249249
250- # 若当前帧的尺寸与 H0×W0 不一致,做一次 resize 对齐(极少见,兜底)
250+ # If current frame size does not match H0×W0, do a resize alignment (rare, fallback)
251251 if (Y_res .shape [0 ] != H0 ) or (Y_res .shape [1 ] != W0 ):
252252 Y_res = cv2 .resize (Y_res , (W0 , H0 ), interpolation = cv2 .INTER_AREA )
253253 if (mvx_hw .shape [0 ] != H0 ) or (mvx_hw .shape [1 ] != W0 ):
254254 mvx_hw = cv2 .resize (mvx_hw , (W0 , H0 ), interpolation = cv2 .INTER_NEAREST )
255255 mvy_hw = cv2 .resize (mvy_hw , (W0 , H0 ), interpolation = cv2 .INTER_NEAREST )
256256
257- # 3) 归一化到 [0,1]
258- # 下面这些超参请确保在外层有定义;如果没有,你也可以给个默认值:
257+ # 3) Normalize to [0,1]
258+ # The hyperparameters below are keyword parameters of get_frame_id_list (with defaults in its signature):
259259 # mv_unit_div, mv_pct, res_pct, fuse_mode, w_mv, w_res
260260 mv_norm , _ = _mv_energy_norm (mvx_hw , mvy_hw , H0 , W0 , mv_unit_div = mv_unit_div , pct = mv_pct )
261261 res_norm , _ = _residual_energy_norm (Y_res , pct = res_pct )
262262
263- # 4) 融合( weighted/sum/max/geomean 均可,默认 weighted)
263+ # 4) Fusion ( weighted/sum/max/geomean all work, default weighted)
264264 fused = _fuse_energy (mv_norm , res_norm , mode = fuse_mode , w_mv = w_mv , w_res = w_res )
265265
266- # 写回你原来的容器(保持最小改动,用 uint8 存)
266+ # Write back to the original container: clip to [0, 1], scale to [0, 255], cast to dtype0 (np.uint8 when dtype0 is falsy)
267267 residuals_y [pos ] = (np .clip (fused , 0.0 , 1.0 ) * 255.0 ).astype (dtype0 or np .uint8 )
268268
269- # 结束条件
269+ # End condition
270270 if all (x is not None for x in residuals_y ):
271271 break
272272
@@ -289,9 +289,9 @@ def get_frame_id_list(self, video_path, sequence_length,
289289 combined_data = np .concatenate ([video_data , residuals_y ], axis = - 1 )
290290
291291 if H0 != video_data .shape [1 ] or W0 != video_data .shape [2 ]:
292- print ("[warn] residual尺寸与视频不一致 : res=(%d,%d) video=(%d,%d)" % (H0 , W0 , video_data .shape [1 ], video_data .shape [2 ]))
292+ print ("[warn] residual size does not match video : res=(%d,%d) video=(%d,%d)" % (H0 , W0 , video_data .shape [1 ], video_data .shape [2 ]))
293293 finally :
294- # 恢复环境变量
294+ # Restore environment variables
295295 if _prev_y_only is None :
296296 os .environ .pop ("UMT_HEVC_Y_ONLY" , None )
297297 else :
@@ -348,13 +348,13 @@ def dali_pipeline(mode, source_params):
348348 combined_data ,
349349 device = "gpu" ,
350350 crop = [input_size , input_size ],
351- crop_pos_x = 0.5 , # 中心裁剪
351+ crop_pos_x = 0.5 , # Center crop
352352 crop_pos_y = 0.5 ,
353353 dtype = types .UINT8 ,
354354 output_layout = "FHWC"
355355 )
356356
357- video_channels = source_params .get ('video_channels' , 3 ) # 例如 RGB=3
357+ video_channels = source_params .get ('video_channels' , 3 ) # e.g. RGB=3
358358 videos = fn .slice (combined_data , start = [0 ], shape = [video_channels ], axes = [3 ])
359359
360360 res_zero_masks = fn .slice (combined_data , start = [video_channels ], shape = [1 ], axes = [3 ])
@@ -395,7 +395,7 @@ def dali_pipeline(mode, source_params):
395395 combined_data = fn .resize (combined_data , device = "gpu" , resize_shorter = input_size , interp_type = types .INTERP_CUBIC )
396396 combined_data = fn .crop_mirror_normalize (combined_data , device = "gpu" , crop = [input_size , input_size ], crop_pos_x = 0.5 , crop_pos_y = 0.5 , dtype = types .UINT8 , output_layout = "FHWC" )
397397
398- video_channels = source_params .get ('video_channels' , 3 ) # 例如 RGB=3
398+ video_channels = source_params .get ('video_channels' , 3 ) # e.g. RGB=3
399399 videos = fn .slice (combined_data , start = [0 ], shape = [video_channels ], axes = [3 ])
400400
401401 res_zero_masks = fn .slice (combined_data , start = [video_channels ], shape = [1 ], axes = [3 ])
0 commit comments