Skip to content

Commit b33ab31

Browse files
committed
[save-images] Fix VFR seek accuracy for OpenCV and image position generation
OpenCV's CAP_PROP_POS_FRAMES does not map linearly to time in VFR video (e.g. at the same timestamp, PyAV and OpenCV can report frame indices that differ by 35+ frames), causing thumbnails to land in the wrong scene. Two fixes: (1) VideoStreamCv2.seek(): switch from CAP_PROP_POS_FRAMES to CAP_PROP_POS_MSEC for time-accurate seeking on both CFR and VFR video; seeking one nominal frame before the target ensures the subsequent read() returns the frame at the target. (2) ImageSaver.generate_timecode_list(): rewrite to use seconds-based arithmetic instead of frame-number ranges, avoiding the frame_num approximation (round(seconds * avg_fps)), which yields wrong indices for VFR video.
1 parent 590df64 commit b33ab31

File tree

5 files changed

+104
-53
lines changed

5 files changed

+104
-53
lines changed

scenedetect/backends/opencv.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -230,24 +230,22 @@ def seek(self, target: ty.Union[FrameTimecode, float, int]):
230230
if target < 0:
231231
raise ValueError("Target seek position cannot be negative!")
232232

233-
# Seeking is done via frame number since OpenCV doesn't support PTS-based seeking.
234-
# After seeking, position returns actual PTS from CAP_PROP_POS_MSEC.
235-
# Have to seek one behind and call grab() after so that the VideoCapture
236-
# returns a valid timestamp when using CAP_PROP_POS_MSEC.
237-
target_frame_cv2 = (self.base_timecode + target).frame_num
238-
if target_frame_cv2 > 0:
239-
target_frame_cv2 -= 1
240-
self._cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame_cv2)
233+
target_secs = (self.base_timecode + target).seconds
241234
self._has_grabbed = False
242-
# Preemptively grab the frame behind the target position if possible.
243-
if target > 0:
235+
if target_secs > 0:
236+
# Use CAP_PROP_POS_MSEC for time-accurate seeking (correct for both CFR and VFR).
237+
# Seek one frame before the target so the next read() returns the frame at target.
238+
one_frame_ms = 1000.0 / float(self._frame_rate)
239+
seek_ms = max(0.0, target_secs * 1000.0 - one_frame_ms)
240+
self._cap.set(cv2.CAP_PROP_POS_MSEC, seek_ms)
244241
self._has_grabbed = self._cap.grab()
245-
# If we seeked past the end of the video, need to seek one frame backwards
246-
# from the current position and grab that frame instead.
242+
# If we seeked past the end, back up one frame.
247243
if not self._has_grabbed:
248244
seek_pos = round(self._cap.get(cv2.CAP_PROP_POS_FRAMES) - 1.0)
249245
self._cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, seek_pos))
250246
self._has_grabbed = self._cap.grab()
247+
else:
248+
self._cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
251249

252250
def reset(self):
253251
"""Close and re-open the VideoStream (should be equivalent to calling `seek(0)`)."""

scenedetect/detector.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,11 @@ def _filter_merge(self, timecode: FrameTimecode, above_threshold: bool) -> ty.Li
168168
self._last_above = timecode
169169
if self._merge_triggered:
170170
# This frame was under the threshold, see if enough frames passed to disable the filter.
171-
if min_length_met and not above_threshold and (self._last_above - self._merge_start) >= self._filter_secs:
171+
if (
172+
min_length_met
173+
and not above_threshold
174+
and (self._last_above - self._merge_start) >= self._filter_secs
175+
):
172176
self._merge_triggered = False
173177
return [self._last_above]
174178
# Keep merging until enough frames pass below the threshold.

scenedetect/output/image.py

Lines changed: 29 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -290,48 +290,37 @@ def image_save_thread(self, save_queue: queue.Queue, progress_bar: tqdm):
290290
if progress_bar is not None:
291291
progress_bar.update(1)
292292

293-
def generate_timecode_list(self, scene_list: SceneList) -> ty.List[ty.Iterable[FrameTimecode]]:
293+
def generate_timecode_list(self, scene_list: SceneList) -> ty.List[ty.List[FrameTimecode]]:
294294
"""Generates a list of timecodes for each scene in `scene_list` based on the current config
295-
parameters."""
296-
# TODO(v0.7): This needs to be fixed as part of PTS overhaul.
295+
parameters.
296+
297+
Uses PTS-accurate seconds-based timing so results are correct for both CFR and VFR video.
298+
"""
297299
framerate = scene_list[0][0].framerate
298-
# TODO(v1.0): Split up into multiple sub-expressions so auto-formatter works correctly.
299-
return [
300-
(
301-
FrameTimecode(int(f), fps=framerate)
302-
for f in (
303-
# middle frames
304-
a[len(a) // 2]
305-
if (0 < j < self._num_images - 1) or self._num_images == 1
306-
# first frame
307-
else min(a[0] + self._frame_margin, a[-1])
308-
if j == 0
309-
# last frame
310-
else max(a[-1] - self._frame_margin, a[0])
311-
# for each evenly-split array of frames in the scene list
312-
for j, a in enumerate(np.array_split(r, self._num_images))
313-
)
314-
)
315-
for r in (
316-
# pad ranges to number of images
317-
r
318-
if 1 + r[-1] - r[0] >= self._num_images
319-
else list(r) + [r[-1]] * (self._num_images - len(r))
320-
# create range of frames in scene
321-
for r in (
322-
range(
323-
start.frame_num,
324-
start.frame_num
325-
+ max(
326-
1, # guard against zero length scenes
327-
end.frame_num - start.frame_num,
328-
),
329-
)
330-
# for each scene in scene list
331-
for start, end in scene_list
332-
)
333-
)
334-
]
300+
# Convert frame_margin to seconds using the nominal framerate.
301+
margin_secs = self._frame_margin / framerate
302+
result = []
303+
for start, end in scene_list:
304+
duration_secs = (end - start).seconds
305+
if duration_secs <= 0:
306+
result.append([start] * self._num_images)
307+
continue
308+
segment_secs = duration_secs / self._num_images
309+
timecodes = []
310+
for j in range(self._num_images):
311+
seg_start = start.seconds + j * segment_secs
312+
seg_end = start.seconds + (j + 1) * segment_secs
313+
if self._num_images == 1:
314+
t = start.seconds + duration_secs / 2.0
315+
elif j == 0:
316+
t = min(seg_start + margin_secs, seg_end)
317+
elif j == self._num_images - 1:
318+
t = max(seg_end - margin_secs, seg_start)
319+
else:
320+
t = (seg_start + seg_end) / 2.0
321+
timecodes.append(FrameTimecode(t, fps=framerate))
322+
result.append(timecodes)
323+
return result
335324

336325
def resize_image(
337326
self,

tests/conftest.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,17 @@ def test_vfr_video() -> str:
114114
return check_exists("tests/resources/goldeneye-vfr.mp4")
115115

116116

117+
@pytest.fixture
118+
def test_vfr_drop3_video() -> str:
119+
"""Synthetic VFR video created from goldeneye.mp4 by dropping every 3rd frame.
120+
121+
Frame pattern: keeps frames where (n+1) % 3 != 0 (i.e. drops frames 2,5,8,...).
122+
Resulting PTS durations alternate: 1001, 2002, 1001, 2002, ... (time_base=1/24000).
123+
Nominal fps: 24000/1001. Average fps: ~16 fps. Duration: ~10s, 160 frames.
124+
"""
125+
return check_exists("tests/resources/goldeneye-vfr-drop3.mp4")
126+
127+
117128
@pytest.fixture
118129
def corrupt_video_file() -> str:
119130
"""Video containing a corrupted frame causing a decode failure."""

tests/test_vfr.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,15 @@
3131
("00:00:03.921", "00:00:09.676"),
3232
]
3333

34+
# Expected scene cuts for `goldeneye-vfr-drop3.mp4` — a synthetic VFR clip created from the first
35+
# 10s of goldeneye.mp4 by dropping every 3rd frame (frames 2,5,8,...). PTS durations alternate
36+
# between 1001 and 2002 (time_base=1/24000), nominal fps=24000/1001, avg fps≈16. The last scene
37+
# ends at the clip boundary and may vary slightly between backends.
38+
EXPECTED_SCENES_VFR_DROP3: ty.List[ty.Tuple[str, str]] = [
39+
("00:00:00.000", "00:00:03.754"),
40+
("00:00:03.754", "00:00:08.759"),
41+
]
42+
3443

3544
class TestVFR:
3645
"""Test VFR video handling."""
@@ -142,6 +151,46 @@ def test_vfr_csv_output(self, test_vfr_video: str, tmp_path):
142151
rows = list(reader)
143152
assert len(rows) >= 3 # 2 header rows + data
144153

154+
@pytest.mark.parametrize("backend", ["pyav", "opencv"])
155+
def test_vfr_drop3_scene_detection(self, test_vfr_drop3_video: str, backend: str):
156+
"""Synthetic VFR video (drop every 3rd frame, alternating 1x/2x durations) should produce
157+
timecodes matching known ground truth with both backends."""
158+
video = open_video(test_vfr_drop3_video, backend=backend)
159+
sm = SceneManager()
160+
sm.add_detector(ContentDetector())
161+
sm.detect_scenes(video=video, show_progress=False)
162+
scene_list = sm.get_scene_list()
163+
164+
assert len(scene_list) >= len(EXPECTED_SCENES_VFR_DROP3), (
165+
f"[{backend}] Expected at least {len(EXPECTED_SCENES_VFR_DROP3)} scenes, got {len(scene_list)}"
166+
)
167+
for i, ((start, end), (exp_start_tc, exp_end_tc)) in enumerate(
168+
zip(scene_list, EXPECTED_SCENES_VFR_DROP3, strict=False)
169+
):
170+
assert start.get_timecode() == exp_start_tc, (
171+
f"[{backend}] Scene {i + 1} start: expected {exp_start_tc!r}, got {start.get_timecode()!r}"
172+
)
173+
assert end.get_timecode() == exp_end_tc, (
174+
f"[{backend}] Scene {i + 1} end: expected {exp_end_tc!r}, got {end.get_timecode()!r}"
175+
)
176+
177+
@pytest.mark.parametrize("backend", ["pyav", "opencv"])
178+
def test_vfr_drop3_position_monotonic(self, test_vfr_drop3_video: str, backend: str):
179+
"""PTS-based position should be monotonically non-decreasing on synthetic VFR video."""
180+
video = open_video(test_vfr_drop3_video, backend=backend)
181+
last_seconds = -1.0
182+
frame_count = 0
183+
while True:
184+
if video.read() is False:
185+
break
186+
current = video.position.seconds
187+
assert current >= last_seconds, (
188+
f"[{backend}] Position decreased at frame {frame_count}: {current} < {last_seconds}"
189+
)
190+
last_seconds = current
191+
frame_count += 1
192+
assert frame_count == 160 # 2/3 of original 240 frames in 10s at 24000/1001
193+
145194
def test_cfr_position_is_timecode(self, test_movie_clip: str):
146195
"""CFR video positions should also be Timecode-backed with PTS support."""
147196
video = open_video(test_movie_clip, backend="pyav")

0 commit comments

Comments (0)