Skip to content

Commit b33ab31

Browse files
committed
[save-images] Fix VFR seek accuracy for OpenCV and image position generation
OpenCV's CAP_PROP_POS_FRAMES does not map linearly to time in VFR video (e.g. at the same timestamp, PyAV and OpenCV can report frame indices that differ by 35+ frames), causing thumbnails to land in the wrong scene. Two fixes: (1) VideoStreamCv2.seek(): switch from CAP_PROP_POS_FRAMES to CAP_PROP_POS_MSEC for time-accurate seeking on both CFR and VFR video; seeking one nominal frame before the target ensures the subsequent read() returns the frame at the target. (2) ImageSaver.generate_timecode_list(): rewrite to use seconds-based arithmetic instead of frame-number ranges, avoiding the frame_num approximation (round(seconds * avg_fps)), which yields wrong indices for VFR video.
1 parent 590df64 commit b33ab31

File tree

5 files changed

+104
-53
lines changed

5 files changed

+104
-53
lines changed

scenedetect/backends/opencv.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -230,24 +230,22 @@ def seek(self, target: ty.Union[FrameTimecode, float, int]):
230230
if target < 0:
231231
raise ValueError("Target seek position cannot be negative!")
232232

233-
# Seeking is done via frame number since OpenCV doesn't support PTS-based seeking.
234-
# After seeking, position returns actual PTS from CAP_PROP_POS_MSEC.
235-
# Have to seek one behind and call grab() after so that the VideoCapture
236-
# returns a valid timestamp when using CAP_PROP_POS_MSEC.
237-
target_frame_cv2 = (self.base_timecode + target).frame_num
238-
if target_frame_cv2 > 0:
239-
target_frame_cv2 -= 1
240-
self._cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame_cv2)
233+
target_secs = (self.base_timecode + target).seconds
241234
self._has_grabbed = False
242-
# Preemptively grab the frame behind the target position if possible.
243-
if target > 0:
235+
if target_secs > 0:
236+
# Use CAP_PROP_POS_MSEC for time-accurate seeking (correct for both CFR and VFR).
237+
# Seek one frame before the target so the next read() returns the frame at target.
238+
one_frame_ms = 1000.0 / float(self._frame_rate)
239+
seek_ms = max(0.0, target_secs * 1000.0 - one_frame_ms)
240+
self._cap.set(cv2.CAP_PROP_POS_MSEC, seek_ms)
244241
self._has_grabbed = self._cap.grab()
245-
# If we seeked past the end of the video, need to seek one frame backwards
246-
# from the current position and grab that frame instead.
242+
# If we seeked past the end, back up one frame.
247243
if not self._has_grabbed:
248244
seek_pos = round(self._cap.get(cv2.CAP_PROP_POS_FRAMES) - 1.0)
249245
self._cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, seek_pos))
250246
self._has_grabbed = self._cap.grab()
247+
else:
248+
self._cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
251249

252250
def reset(self):
253251
"""Close and re-open the VideoStream (should be equivalent to calling `seek(0)`)."""

scenedetect/detector.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,11 @@ def _filter_merge(self, timecode: FrameTimecode, above_threshold: bool) -> ty.Li
168168
self._last_above = timecode
169169
if self._merge_triggered:
170170
# This frame was under the threshold, see if enough frames passed to disable the filter.
171-
if min_length_met and not above_threshold and (self._last_above - self._merge_start) >= self._filter_secs:
171+
if (
172+
min_length_met
173+
and not above_threshold
174+
and (self._last_above - self._merge_start) >= self._filter_secs
175+
):
172176
self._merge_triggered = False
173177
return [self._last_above]
174178
# Keep merging until enough frames pass below the threshold.

scenedetect/output/image.py

Lines changed: 29 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -290,48 +290,37 @@ def image_save_thread(self, save_queue: queue.Queue, progress_bar: tqdm):
290290
if progress_bar is not None:
291291
progress_bar.update(1)
292292

293-
def generate_timecode_list(self, scene_list: SceneList) -> ty.List[ty.Iterable[FrameTimecode]]:
293+
def generate_timecode_list(self, scene_list: SceneList) -> ty.List[ty.List[FrameTimecode]]:
294294
"""Generates a list of timecodes for each scene in `scene_list` based on the current config
295-
parameters."""
296-
# TODO(v0.7): This needs to be fixed as part of PTS overhaul.
295+
parameters.
296+
297+
Uses PTS-accurate seconds-based timing so results are correct for both CFR and VFR video.
298+
"""
297299
framerate = scene_list[0][0].framerate
298-
# TODO(v1.0): Split up into multiple sub-expressions so auto-formatter works correctly.
299-
return [
300-
(
301-
FrameTimecode(int(f), fps=framerate)
302-
for f in (
303-
# middle frames
304-
a[len(a) // 2]
305-
if (0 < j < self._num_images - 1) or self._num_images == 1
306-
# first frame
307-
else min(a[0] + self._frame_margin, a[-1])
308-
if j == 0
309-
# last frame
310-
else max(a[-1] - self._frame_margin, a[0])
311-
# for each evenly-split array of frames in the scene list
312-
for j, a in enumerate(np.array_split(r, self._num_images))
313-
)
314-
)
315-
for r in (
316-
# pad ranges to number of images
317-
r
318-
if 1 + r[-1] - r[0] >= self._num_images
319-
else list(r) + [r[-1]] * (self._num_images - len(r))
320-
# create range of frames in scene
321-
for r in (
322-
range(
323-
start.frame_num,
324-
start.frame_num
325-
+ max(
326-
1, # guard against zero length scenes
327-
end.frame_num - start.frame_num,
328-
),
329-
)
330-
# for each scene in scene list
331-
for start, end in scene_list
332-
)
333-
)
334-
]
300+
# Convert frame_margin to seconds using the nominal framerate.
301+
margin_secs = self._frame_margin / framerate
302+
result = []
303+
for start, end in scene_list:
304+
duration_secs = (end - start).seconds
305+
if duration_secs <= 0:
306+
result.append([start] * self._num_images)
307+
continue
308+
segment_secs = duration_secs / self._num_images
309+
timecodes = []
310+
for j in range(self._num_images):
311+
seg_start = start.seconds + j * segment_secs
312+
seg_end = start.seconds + (j + 1) * segment_secs
313+
if self._num_images == 1:
314+
t = start.seconds + duration_secs / 2.0
315+
elif j == 0:
316+
t = min(seg_start + margin_secs, seg_end)
317+
elif j == self._num_images - 1:
318+
t = max(seg_end - margin_secs, seg_start)
319+
else:
320+
t = (seg_start + seg_end) / 2.0
321+
timecodes.append(FrameTimecode(t, fps=framerate))
322+
result.append(timecodes)
323+
return result
335324

336325
def resize_image(
337326
self,

tests/conftest.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,17 @@ def test_vfr_video() -> str:
114114
return check_exists("tests/resources/goldeneye-vfr.mp4")
115115

116116

117+
@pytest.fixture
118+
def test_vfr_drop3_video() -> str:
119+
"""Synthetic VFR video created from goldeneye.mp4 by dropping every 3rd frame.
120+
121+
Frame pattern: keeps frames where (n+1) % 3 != 0 (i.e. drops frames 2,5,8,...).
122+
Resulting PTS durations alternate: 1001, 2002, 1001, 2002, ... (time_base=1/24000).
123+
Nominal fps: 24000/1001. Average fps: ~16 fps. Duration: ~10s, 160 frames.
124+
"""
125+
return check_exists("tests/resources/goldeneye-vfr-drop3.mp4")
126+
127+
117128
@pytest.fixture
118129
def corrupt_video_file() -> str:
119130
"""Video containing a corrupted frame causing a decode failure."""

tests/test_vfr.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,15 @@
3131
("00:00:03.921", "00:00:09.676"),
3232
]
3333

34+
# Expected scene cuts for `goldeneye-vfr-drop3.mp4` — a synthetic VFR clip created from the first
35+
# 10s of goldeneye.mp4 by dropping every 3rd frame (frames 2,5,8,...). PTS durations alternate
36+
# between 1001 and 2002 (time_base=1/24000), nominal fps=24000/1001, avg fps≈16. The last scene
37+
# ends at the clip boundary and may vary slightly between backends.
38+
EXPECTED_SCENES_VFR_DROP3: ty.List[ty.Tuple[str, str]] = [
39+
("00:00:00.000", "00:00:03.754"),
40+
("00:00:03.754", "00:00:08.759"),
41+
]
42+
3443

3544
class TestVFR:
3645
"""Test VFR video handling."""
@@ -142,6 +151,46 @@ def test_vfr_csv_output(self, test_vfr_video: str, tmp_path):
142151
rows = list(reader)
143152
assert len(rows) >= 3 # 2 header rows + data
144153

154+
@pytest.mark.parametrize("backend", ["pyav", "opencv"])
155+
def test_vfr_drop3_scene_detection(self, test_vfr_drop3_video: str, backend: str):
156+
"""Synthetic VFR video (drop every 3rd frame, alternating 1x/2x durations) should produce
157+
timecodes matching known ground truth with both backends."""
158+
video = open_video(test_vfr_drop3_video, backend=backend)
159+
sm = SceneManager()
160+
sm.add_detector(ContentDetector())
161+
sm.detect_scenes(video=video, show_progress=False)
162+
scene_list = sm.get_scene_list()
163+
164+
assert len(scene_list) >= len(EXPECTED_SCENES_VFR_DROP3), (
165+
f"[{backend}] Expected at least {len(EXPECTED_SCENES_VFR_DROP3)} scenes, got {len(scene_list)}"
166+
)
167+
for i, ((start, end), (exp_start_tc, exp_end_tc)) in enumerate(
168+
zip(scene_list, EXPECTED_SCENES_VFR_DROP3, strict=False)
169+
):
170+
assert start.get_timecode() == exp_start_tc, (
171+
f"[{backend}] Scene {i + 1} start: expected {exp_start_tc!r}, got {start.get_timecode()!r}"
172+
)
173+
assert end.get_timecode() == exp_end_tc, (
174+
f"[{backend}] Scene {i + 1} end: expected {exp_end_tc!r}, got {end.get_timecode()!r}"
175+
)
176+
177+
@pytest.mark.parametrize("backend", ["pyav", "opencv"])
178+
def test_vfr_drop3_position_monotonic(self, test_vfr_drop3_video: str, backend: str):
179+
"""PTS-based position should be monotonically non-decreasing on synthetic VFR video."""
180+
video = open_video(test_vfr_drop3_video, backend=backend)
181+
last_seconds = -1.0
182+
frame_count = 0
183+
while True:
184+
if video.read() is False:
185+
break
186+
current = video.position.seconds
187+
assert current >= last_seconds, (
188+
f"[{backend}] Position decreased at frame {frame_count}: {current} < {last_seconds}"
189+
)
190+
last_seconds = current
191+
frame_count += 1
192+
assert frame_count == 160 # 2/3 of original 240 frames in 10s at 24000/1001
193+
145194
def test_cfr_position_is_timecode(self, test_movie_clip: str):
146195
"""CFR video positions should also be Timecode-backed with PTS support."""
147196
video = open_video(test_movie_clip, backend="pyav")

0 commit comments

Comments (0)