Skip to content

Commit 968e242

Browse files
authored
Merge branch 'main' into fix-ffmpeg-install-flake
2 parents 20181f5 + fda8c58 commit 968e242

29 files changed

Lines changed: 1001 additions & 369 deletions

docs/cli/backends.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,17 @@ It is mostly reliable and fast, although can occasionally run into issues proces
2121

2222
The OpenCV backend also supports image sequences as inputs (e.g. ``frame%02d.jpg`` if you want to load frame001.jpg, frame002.jpg, frame003.jpg...). Make sure to specify the framerate manually (``-f``/``--framerate``) to ensure accurate timing calculations.
2323

24+
Variable framerate (VFR) video is supported. Scene detection uses PTS-derived timestamps from ``CAP_PROP_POS_MSEC`` for accurate timecodes. Seeking compensates for OpenCV's average-fps-based internal seek approximation, so output timecodes remain accurate across the full video.
25+
2426

2527
=======================================================================
2628
PyAV
2729
=======================================================================
2830

2931
The `PyAV <https://github.com/PyAV-Org/PyAV>`_ backend (`av package <https://pypi.org/project/av/>`_) is a more robust backend that handles multiple audio tracks and frame decode errors gracefully.
3032

33+
Variable framerate (VFR) video is fully supported. PyAV uses native PTS timestamps directly from the container, giving the most accurate timecodes for VFR content.
34+
3135
This backend can be used by specifying ``-b pyav`` via command line, or setting ``backend = pyav`` under the ``[global]`` section of your :ref:`config file <scenedetect_cli-config_file>`.
3236

3337

@@ -41,4 +45,6 @@ MoviePy launches ffmpeg as a subprocess, and can be used with various types of i
4145

4246
The MoviePy backend is still under development and is not included with current Windows distribution. To enable MoviePy support, you must install PySceneDetect using `python` and `pip`.
4347

48+
Variable framerate (VFR) video is **not supported**. MoviePy assumes a fixed framerate, so timecodes for VFR content will be inaccurate. Use the PyAV or OpenCV backend instead.
49+
4450
This backend can be used by specifying ``-b moviepy`` via command line, or setting ``backend = moviepy`` under the ``[global]`` section of your :ref:`config file <scenedetect_cli-config_file>`.

scenedetect.cfg

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,8 @@
227227
# Compression amount for png images (0 to 9). Only affects size, not quality.
228228
#compression = 3
229229

230-
# Number of frames to ignore around each scene cut when selecting frames.
230+
# Padding around each scene cut when selecting frames. Accepts a number of frames (1),
231+
# seconds with `s` suffix (0.1s), or timecode (00:00:00.100).
231232
#frame-margin = 1
232233

233234
# Resize by scale factor (0.5 = half, 1.0 = same, 2.0 = double).

scenedetect/_cli/__init__.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1397,11 +1397,11 @@ def split_video_command(
13971397
@click.option(
13981398
"-m",
13991399
"--frame-margin",
1400-
metavar="N",
1400+
metavar="DURATION",
14011401
default=None,
1402-
type=click.INT,
1403-
help="Number of frames to ignore at beginning/end of scenes when saving images. Controls temporal padding on scene boundaries.%s"
1404-
% (USER_CONFIG.get_help_string("save-images", "num-images")),
1402+
type=click.STRING,
1403+
help="Padding around the beginning/end of each scene used when selecting which frames to extract. DURATION can be specified in frames (-m 1), in seconds with `s` suffix (-m 0.1s), or timecode (-m 00:00:00.100).%s"
1404+
% (USER_CONFIG.get_help_string("save-images", "frame-margin")),
14051405
)
14061406
@click.option(
14071407
"--scale",
@@ -1441,7 +1441,7 @@ def save_images_command(
14411441
quality: ty.Optional[int] = None,
14421442
png: bool = False,
14431443
compression: ty.Optional[int] = None,
1444-
frame_margin: ty.Optional[int] = None,
1444+
frame_margin: ty.Optional[str] = None,
14451445
scale: ty.Optional[float] = None,
14461446
height: ty.Optional[int] = None,
14471447
width: ty.Optional[int] = None,

scenedetect/_cli/commands.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -401,17 +401,19 @@ def _save_xml_fcp(
401401
sequence = ElementTree.SubElement(project, "sequence")
402402
ElementTree.SubElement(sequence, "name").text = context.video_stream.name
403403

404+
fps = float(context.video_stream.frame_rate)
405+
ntsc = "True" if context.video_stream.frame_rate.denominator != 1 else "False"
404406
duration = scenes[-1][1] - scenes[0][0]
405-
ElementTree.SubElement(sequence, "duration").text = f"{duration.frame_num}"
407+
ElementTree.SubElement(sequence, "duration").text = str(round(duration.seconds * fps))
406408

407409
rate = ElementTree.SubElement(sequence, "rate")
408-
ElementTree.SubElement(rate, "timebase").text = str(context.video_stream.frame_rate)
409-
ElementTree.SubElement(rate, "ntsc").text = "False"
410+
ElementTree.SubElement(rate, "timebase").text = str(round(fps))
411+
ElementTree.SubElement(rate, "ntsc").text = ntsc
410412

411413
timecode = ElementTree.SubElement(sequence, "timecode")
412414
tc_rate = ElementTree.SubElement(timecode, "rate")
413-
ElementTree.SubElement(tc_rate, "timebase").text = str(context.video_stream.frame_rate)
414-
ElementTree.SubElement(tc_rate, "ntsc").text = "False"
415+
ElementTree.SubElement(tc_rate, "timebase").text = str(round(fps))
416+
ElementTree.SubElement(tc_rate, "ntsc").text = ntsc
415417
ElementTree.SubElement(timecode, "frame").text = "0"
416418
ElementTree.SubElement(timecode, "displayformat").text = "NDF"
417419

@@ -427,13 +429,13 @@ def _save_xml_fcp(
427429
ElementTree.SubElement(clip, "name").text = f"Shot {i + 1}"
428430
ElementTree.SubElement(clip, "enabled").text = "TRUE"
429431
ElementTree.SubElement(clip, "rate").append(
430-
ElementTree.fromstring(f"<timebase>{context.video_stream.frame_rate}</timebase>")
432+
ElementTree.fromstring(f"<timebase>{round(fps)}</timebase>")
431433
)
432-
# TODO: Are these supposed to be frame numbers or another format?
433-
ElementTree.SubElement(clip, "start").text = str(start.frame_num)
434-
ElementTree.SubElement(clip, "end").text = str(end.frame_num)
435-
ElementTree.SubElement(clip, "in").text = str(start.frame_num)
436-
ElementTree.SubElement(clip, "out").text = str(end.frame_num)
434+
# Frame numbers relative to the declared <timebase> fps, computed from PTS seconds.
435+
ElementTree.SubElement(clip, "start").text = str(round(start.seconds * fps))
436+
ElementTree.SubElement(clip, "end").text = str(round(end.seconds * fps))
437+
ElementTree.SubElement(clip, "in").text = str(round(start.seconds * fps))
438+
ElementTree.SubElement(clip, "out").text = str(round(end.seconds * fps))
437439

438440
file_ref = ElementTree.SubElement(clip, "file", id=f"file{i + 1}")
439441
ElementTree.SubElement(file_ref, "name").text = context.video_stream.name
@@ -485,6 +487,9 @@ def save_xml(
485487
logger.error(f"Unknown format: {format}")
486488

487489

490+
# TODO: We have to export framerate as a float for OTIO's current format. When OTIO supports
491+
# fractional timecodes, we should export the framerate as a rational number instead.
492+
# https://github.com/AcademySoftwareFoundation/OpenTimelineIO/issues/190
488493
def save_otio(
489494
context: CliContext,
490495
scenes: SceneList,
@@ -501,7 +506,7 @@ def save_otio(
501506
video_name = context.video_stream.name
502507
video_path = os.path.abspath(context.video_stream.path)
503508
video_base_name = os.path.basename(context.video_stream.path)
504-
frame_rate = context.video_stream.frame_rate
509+
frame_rate = float(context.video_stream.frame_rate)
505510

506511
# List of track mapping to resource type.
507512
# TODO(https://scenedetect.com/issues/497): Allow OTIO export without an audio track.
@@ -534,12 +539,12 @@ def save_otio(
534539
"duration": {
535540
"OTIO_SCHEMA": "RationalTime.1",
536541
"rate": frame_rate,
537-
"value": float((end - start).frame_num),
542+
"value": round((end - start).seconds * frame_rate, 6),
538543
},
539544
"start_time": {
540545
"OTIO_SCHEMA": "RationalTime.1",
541546
"rate": frame_rate,
542-
"value": float(start.frame_num),
547+
"value": round(start.seconds * frame_rate, 6),
543548
},
544549
},
545550
"enabled": True,

scenedetect/_cli/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ class XmlFormat(Enum):
412412
"compression": RangeValue(3, min_val=0, max_val=9),
413413
"filename": "$VIDEO_NAME-Scene-$SCENE_NUMBER-$IMAGE_NUMBER",
414414
"format": "jpeg",
415-
"frame-margin": 1,
415+
"frame-margin": TimecodeValue(1),
416416
"height": 0,
417417
"num-images": 3,
418418
"output": None,

scenedetect/backends/moviepy.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@
1717
"""
1818

1919
import typing as ty
20+
from fractions import Fraction
2021
from logging import getLogger
2122

2223
import cv2
2324
import numpy as np
2425
from moviepy.video.io.ffmpeg_reader import FFMPEG_VideoReader
2526

2627
from scenedetect.backends.opencv import VideoStreamCv2
27-
from scenedetect.common import _USE_PTS_IN_DEVELOPMENT, FrameTimecode
28+
from scenedetect.common import FrameTimecode, Timecode, framerate_to_fraction
2829
from scenedetect.platform import get_file_name
2930
from scenedetect.video_stream import SeekError, VideoOpenFailure, VideoStream
3031

@@ -83,9 +84,9 @@ def __init__(
8384
"""Unique name used to identify this backend."""
8485

8586
@property
86-
def frame_rate(self) -> float:
87-
"""Framerate in frames/sec."""
88-
return self._reader.fps
87+
def frame_rate(self) -> Fraction:
88+
"""Framerate in frames/sec as a rational Fraction."""
89+
return framerate_to_fraction(self._reader.fps)
8990

9091
@property
9192
def path(self) -> ty.Union[bytes, str]:
@@ -135,7 +136,14 @@ def position(self) -> FrameTimecode:
135136
calling `read`. This will always return 0 (e.g. be equal to `base_timecode`) if no frames
136137
have been `read` yet."""
137138
frame_number = max(self._frame_number - 1, 0)
138-
return FrameTimecode(frame_number, self.frame_rate)
139+
# Synthesize a Timecode from the frame count and rational framerate.
140+
# MoviePy assumes CFR, so this is equivalent to frame-based timing.
141+
# Use the framerate denominator as the time_base denominator for exact timing.
142+
fps = self.frame_rate
143+
time_base = Fraction(1, fps.numerator)
144+
pts = frame_number * fps.denominator
145+
timecode = Timecode(pts=pts, time_base=time_base)
146+
return FrameTimecode(timecode=timecode, fps=fps)
139147

140148
@property
141149
def position_ms(self) -> float:
@@ -173,10 +181,6 @@ def seek(self, target: ty.Union[FrameTimecode, float, int]):
173181
ValueError: `target` is not a valid value (i.e. it is negative).
174182
"""
175183
success = False
176-
if _USE_PTS_IN_DEVELOPMENT:
177-
# TODO(https://scenedetect.com/issue/168): Need to handle PTS here.
178-
raise NotImplementedError()
179-
180184
if not isinstance(target, FrameTimecode):
181185
target = FrameTimecode(target, self.frame_rate)
182186
try:

scenedetect/backends/opencv.py

Lines changed: 47 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import cv2
2828
import numpy as np
2929

30-
from scenedetect.common import _USE_PTS_IN_DEVELOPMENT, MAX_FPS_DELTA, FrameTimecode, Timecode
30+
from scenedetect.common import MAX_FPS_DELTA, FrameTimecode, Timecode, framerate_to_fraction
3131
from scenedetect.platform import get_file_name
3232
from scenedetect.video_stream import (
3333
FrameRateUnavailable,
@@ -111,7 +111,7 @@ def __init__(
111111
self._cap: ty.Optional[cv2.VideoCapture] = (
112112
None # Reference to underlying cv2.VideoCapture object.
113113
)
114-
self._frame_rate: ty.Optional[float] = None
114+
self._frame_rate: ty.Optional[Fraction] = None
115115

116116
# VideoCapture state
117117
self._has_grabbed = False
@@ -144,7 +144,7 @@ def capture(self) -> cv2.VideoCapture:
144144
"""Unique name used to identify this backend."""
145145

146146
@property
147-
def frame_rate(self) -> float:
147+
def frame_rate(self) -> Fraction:
148148
assert self._frame_rate
149149
return self._frame_rate
150150

@@ -196,30 +196,25 @@ def aspect_ratio(self) -> float:
196196

197197
@property
198198
def timecode(self) -> Timecode:
199-
"""Current position within stream as a Timecode. This is not frame accurate."""
199+
"""Current position within stream as a Timecode."""
200200
# *NOTE*: Although OpenCV has `CAP_PROP_PTS`, it doesn't seem to be reliable. For now, we
201-
# use `CAP_PROP_POS_MSEC` instead, with a time base of 1/1000. Unfortunately this means that
202-
# rounding errors will affect frame accuracy with this backend.
203-
pts = self._cap.get(cv2.CAP_PROP_POS_MSEC)
204-
time_base = Fraction(1, 1000)
205-
return Timecode(pts=round(pts), time_base=time_base)
201+
# use `CAP_PROP_POS_MSEC` instead, converting to microseconds for sufficient precision to
202+
# avoid frame-boundary rounding errors at common framerates like 24000/1001.
203+
ms = self._cap.get(cv2.CAP_PROP_POS_MSEC)
204+
time_base = Fraction(1, 1000000)
205+
return Timecode(pts=round(ms * 1000), time_base=time_base)
206206

207207
@property
208208
def position(self) -> FrameTimecode:
209-
# TODO(https://scenedetect.com/issue/168): See if there is a better way to do this, or
210-
# add a config option before landing this.
211-
if _USE_PTS_IN_DEVELOPMENT:
212-
timecode = self.timecode
213-
# If PTS is 0 but we've read frames, derive from frame number.
214-
# This handles image sequences and cases where CAP_PROP_POS_MSEC is unreliable.
215-
if timecode.pts == 0 and self.frame_number > 0:
216-
time_sec = (self.frame_number - 1) / self.frame_rate
217-
pts = round(time_sec * 1000)
218-
timecode = Timecode(pts=pts, time_base=Fraction(1, 1000))
219-
return FrameTimecode(timecode=timecode, fps=self.frame_rate)
220-
if self.frame_number < 1:
221-
return self.base_timecode
222-
return self.base_timecode + (self.frame_number - 1)
209+
timecode = self.timecode
210+
# If PTS is 0 but we've read frames, derive from frame number.
211+
# This handles image sequences and cases where CAP_PROP_POS_MSEC is unreliable.
212+
if timecode.pts == 0 and self.frame_number > 0:
213+
fps = self.frame_rate
214+
time_base = Fraction(1, fps.numerator)
215+
pts = (self.frame_number - 1) * fps.denominator
216+
timecode = Timecode(pts=pts, time_base=time_base)
217+
return FrameTimecode(timecode=timecode, fps=self.frame_rate)
223218

224219
@property
225220
def position_ms(self) -> float:
@@ -235,23 +230,32 @@ def seek(self, target: ty.Union[FrameTimecode, float, int]):
235230
if target < 0:
236231
raise ValueError("Target seek position cannot be negative!")
237232

238-
# TODO(https://scenedetect.com/issue/168): Shouldn't use frames for VFR video here.
239-
# Have to seek one behind and call grab() after to that the VideoCapture
240-
# returns a valid timestamp when using CAP_PROP_POS_MSEC.
241-
target_frame_cv2 = (self.base_timecode + target).frame_num
242-
if target_frame_cv2 > 0:
243-
target_frame_cv2 -= 1
244-
self._cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame_cv2)
233+
target_secs = (self.base_timecode + target).seconds
245234
self._has_grabbed = False
246-
# Preemptively grab the frame behind the target position if possible.
247-
if target > 0:
235+
if target_secs > 0:
236+
# Seek one frame before target so the next read() returns the frame at target.
237+
one_frame_ms = 1000.0 / float(self._frame_rate)
238+
seek_ms = max(0.0, target_secs * 1000.0 - one_frame_ms)
239+
self._cap.set(cv2.CAP_PROP_POS_MSEC, seek_ms)
248240
self._has_grabbed = self._cap.grab()
249-
# If we seeked past the end of the video, need to seek one frame backwards
250-
# from the current position and grab that frame instead.
241+
if self._has_grabbed:
242+
# VFR correction: set(CAP_PROP_POS_MSEC) converts time using avg_fps internally,
243+
# which can land ~1s too early for VFR video. Read forward until we reach the
244+
# intended position. The threshold (2x one_frame_ms) never triggers for CFR.
245+
actual_ms = self._cap.get(cv2.CAP_PROP_POS_MSEC)
246+
corrections = 0
247+
while actual_ms < seek_ms - 2.0 * one_frame_ms and corrections < 100:
248+
if not self._cap.grab():
249+
break
250+
actual_ms = self._cap.get(cv2.CAP_PROP_POS_MSEC)
251+
corrections += 1
252+
# If we seeked past the end, back up one frame.
251253
if not self._has_grabbed:
252254
seek_pos = round(self._cap.get(cv2.CAP_PROP_POS_FRAMES) - 1.0)
253255
self._cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, seek_pos))
254256
self._has_grabbed = self._cap.grab()
257+
else:
258+
self._cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
255259

256260
def reset(self):
257261
"""Close and re-open the VideoStream (should be equivalent to calling `seek(0)`)."""
@@ -329,14 +333,11 @@ def _open_capture(self, framerate: ty.Optional[float] = None):
329333
raise FrameRateUnavailable()
330334

331335
self._cap = cap
332-
self._frame_rate = framerate
336+
self._frame_rate = framerate_to_fraction(framerate)
333337
self._has_grabbed = False
334338
cap.set(cv2.CAP_PROP_ORIENTATION_AUTO, 1.0) # https://github.com/opencv/opencv/issues/26795
335339

336340

337-
# TODO(https://scenedetect.com/issues/168): Support non-monotonic timing for `position`. VFR timecode
338-
# support is a prerequisite for this. Timecodes are currently calculated by multiplying the
339-
# framerate by number of frames. Actual elapsed time can be obtained via `position_ms` for now.
340341
class VideoCaptureAdapter(VideoStream):
341342
"""Adapter for existing VideoCapture objects. Unlike VideoStreamCv2, this class supports
342343
VideoCaptures which may not support seeking.
@@ -378,7 +379,7 @@ def __init__(
378379
raise FrameRateUnavailable()
379380

380381
self._cap = cap
381-
self._frame_rate: float = framerate
382+
self._frame_rate: Fraction = framerate_to_fraction(framerate)
382383
self._num_frames = 0
383384
self._max_read_attempts = max_read_attempts
384385
self._decode_failures = 0
@@ -408,7 +409,7 @@ def capture(self) -> cv2.VideoCapture:
408409
"""Unique name used to identify this backend."""
409410

410411
@property
411-
def frame_rate(self) -> float:
412+
def frame_rate(self) -> Fraction:
412413
"""Framerate in frames/sec."""
413414
assert self._frame_rate
414415
return self._frame_rate
@@ -439,8 +440,6 @@ def frame_size(self) -> ty.Tuple[int, int]:
439440
@property
440441
def duration(self) -> ty.Optional[FrameTimecode]:
441442
"""Duration of the stream as a FrameTimecode, or None if non terminating."""
442-
# TODO(https://scenedetect.com/issue/168): This will be incorrect for VFR. See if there is
443-
# another property we can use to estimate the video length correctly.
444443
frame_count = math.trunc(self._cap.get(cv2.CAP_PROP_FRAME_COUNT))
445444
if frame_count > 0:
446445
return self.base_timecode + frame_count
@@ -455,7 +454,12 @@ def aspect_ratio(self) -> float:
455454
def position(self) -> FrameTimecode:
456455
if self.frame_number < 1:
457456
return self.base_timecode
458-
return self.base_timecode + (self.frame_number - 1)
457+
# Synthesize a Timecode from frame count and rational framerate.
458+
fps = self.frame_rate
459+
time_base = Fraction(1, fps.numerator)
460+
pts = (self.frame_number - 1) * fps.denominator
461+
timecode = Timecode(pts=pts, time_base=time_base)
462+
return FrameTimecode(timecode=timecode, fps=fps)
459463

460464
@property
461465
def position_ms(self) -> float:

0 commit comments

Comments
 (0)