Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/cli/backends.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,17 @@ It is mostly reliable and fast, although can occasionally run into issues proces

The OpenCV backend also supports image sequences as inputs (e.g. ``frame%02d.jpg`` if you want to load frame001.jpg, frame002.jpg, frame003.jpg...). Make sure to specify the framerate manually (``-f``/``--framerate``) to ensure accurate timing calculations.

Variable framerate (VFR) video is supported. Scene detection uses PTS-derived timestamps from ``CAP_PROP_POS_MSEC`` for accurate timecodes. Seeking compensates for OpenCV's average-fps-based internal seek approximation, so output timecodes remain accurate across the full video.


=======================================================================
PyAV
=======================================================================

The `PyAV <https://github.com/PyAV-Org/PyAV>`_ backend (`av package <https://pypi.org/project/av/>`_) is a more robust backend that handles multiple audio tracks and frame decode errors gracefully.

Variable framerate (VFR) video is fully supported. PyAV uses native PTS timestamps directly from the container, giving the most accurate timecodes for VFR content.

This backend can be used by specifying ``-b pyav`` via command line, or setting ``backend = pyav`` under the ``[global]`` section of your :ref:`config file <scenedetect_cli-config_file>`.


Expand All @@ -41,4 +45,6 @@ MoviePy launches ffmpeg as a subprocess, and can be used with various types of i

The MoviePy backend is still under development and is not included with the current Windows distribution. To enable MoviePy support, you must install PySceneDetect using `python` and `pip`.

Variable framerate (VFR) video is **not supported**. MoviePy assumes a fixed framerate, so timecodes for VFR content will be inaccurate. Use the PyAV or OpenCV backend instead.

This backend can be used by specifying ``-b moviepy`` via command line, or setting ``backend = moviepy`` under the ``[global]`` section of your :ref:`config file <scenedetect_cli-config_file>`.
33 changes: 19 additions & 14 deletions scenedetect/_cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,17 +401,19 @@ def _save_xml_fcp(
sequence = ElementTree.SubElement(project, "sequence")
ElementTree.SubElement(sequence, "name").text = context.video_stream.name

fps = float(context.video_stream.frame_rate)
ntsc = "True" if context.video_stream.frame_rate.denominator != 1 else "False"
duration = scenes[-1][1] - scenes[0][0]
ElementTree.SubElement(sequence, "duration").text = f"{duration.frame_num}"
ElementTree.SubElement(sequence, "duration").text = str(round(duration.seconds * fps))

rate = ElementTree.SubElement(sequence, "rate")
ElementTree.SubElement(rate, "timebase").text = str(context.video_stream.frame_rate)
ElementTree.SubElement(rate, "ntsc").text = "False"
ElementTree.SubElement(rate, "timebase").text = str(round(fps))
ElementTree.SubElement(rate, "ntsc").text = ntsc

timecode = ElementTree.SubElement(sequence, "timecode")
tc_rate = ElementTree.SubElement(timecode, "rate")
ElementTree.SubElement(tc_rate, "timebase").text = str(context.video_stream.frame_rate)
ElementTree.SubElement(tc_rate, "ntsc").text = "False"
ElementTree.SubElement(tc_rate, "timebase").text = str(round(fps))
ElementTree.SubElement(tc_rate, "ntsc").text = ntsc
ElementTree.SubElement(timecode, "frame").text = "0"
ElementTree.SubElement(timecode, "displayformat").text = "NDF"

Expand All @@ -427,13 +429,13 @@ def _save_xml_fcp(
ElementTree.SubElement(clip, "name").text = f"Shot {i + 1}"
ElementTree.SubElement(clip, "enabled").text = "TRUE"
ElementTree.SubElement(clip, "rate").append(
ElementTree.fromstring(f"<timebase>{context.video_stream.frame_rate}</timebase>")
ElementTree.fromstring(f"<timebase>{round(fps)}</timebase>")
)
# TODO: Are these supposed to be frame numbers or another format?
ElementTree.SubElement(clip, "start").text = str(start.frame_num)
ElementTree.SubElement(clip, "end").text = str(end.frame_num)
ElementTree.SubElement(clip, "in").text = str(start.frame_num)
ElementTree.SubElement(clip, "out").text = str(end.frame_num)
# Frame numbers relative to the declared <timebase> fps, computed from PTS seconds.
ElementTree.SubElement(clip, "start").text = str(round(start.seconds * fps))
ElementTree.SubElement(clip, "end").text = str(round(end.seconds * fps))
ElementTree.SubElement(clip, "in").text = str(round(start.seconds * fps))
ElementTree.SubElement(clip, "out").text = str(round(end.seconds * fps))

file_ref = ElementTree.SubElement(clip, "file", id=f"file{i + 1}")
ElementTree.SubElement(file_ref, "name").text = context.video_stream.name
Expand Down Expand Up @@ -485,6 +487,9 @@ def save_xml(
logger.error(f"Unknown format: {format}")


# TODO: We have to export framerate as a float for OTIO's current format. When OTIO supports
# fractional timecodes, we should export the framerate as a rational number instead.
# https://github.com/AcademySoftwareFoundation/OpenTimelineIO/issues/190
def save_otio(
context: CliContext,
scenes: SceneList,
Expand All @@ -501,7 +506,7 @@ def save_otio(
video_name = context.video_stream.name
video_path = os.path.abspath(context.video_stream.path)
video_base_name = os.path.basename(context.video_stream.path)
frame_rate = context.video_stream.frame_rate
frame_rate = float(context.video_stream.frame_rate)

# List of track mapping to resource type.
# TODO(https://scenedetect.com/issues/497): Allow OTIO export without an audio track.
Expand Down Expand Up @@ -534,12 +539,12 @@ def save_otio(
"duration": {
"OTIO_SCHEMA": "RationalTime.1",
"rate": frame_rate,
"value": float((end - start).frame_num),
"value": round((end - start).seconds * frame_rate, 6),
},
"start_time": {
"OTIO_SCHEMA": "RationalTime.1",
"rate": frame_rate,
"value": float(start.frame_num),
"value": round(start.seconds * frame_rate, 6),
},
},
"enabled": True,
Expand Down
22 changes: 13 additions & 9 deletions scenedetect/backends/moviepy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@
"""

import typing as ty
from fractions import Fraction
from logging import getLogger

import cv2
import numpy as np
from moviepy.video.io.ffmpeg_reader import FFMPEG_VideoReader

from scenedetect.backends.opencv import VideoStreamCv2
from scenedetect.common import _USE_PTS_IN_DEVELOPMENT, FrameTimecode
from scenedetect.common import FrameTimecode, Timecode, framerate_to_fraction
from scenedetect.platform import get_file_name
from scenedetect.video_stream import SeekError, VideoOpenFailure, VideoStream

Expand Down Expand Up @@ -83,9 +84,9 @@ def __init__(
"""Unique name used to identify this backend."""

@property
def frame_rate(self) -> float:
"""Framerate in frames/sec."""
return self._reader.fps
def frame_rate(self) -> Fraction:
"""Framerate in frames/sec as a rational Fraction."""
return framerate_to_fraction(self._reader.fps)

@property
def path(self) -> ty.Union[bytes, str]:
Expand Down Expand Up @@ -135,7 +136,14 @@ def position(self) -> FrameTimecode:
calling `read`. This will always return 0 (i.e. be equal to `base_timecode`) if no frames
have been `read` yet."""
frame_number = max(self._frame_number - 1, 0)
return FrameTimecode(frame_number, self.frame_rate)
# Synthesize a Timecode from the frame count and rational framerate.
# MoviePy assumes CFR, so this is equivalent to frame-based timing.
# Use the framerate denominator as the time_base denominator for exact timing.
fps = self.frame_rate
time_base = Fraction(1, fps.numerator)
pts = frame_number * fps.denominator
timecode = Timecode(pts=pts, time_base=time_base)
return FrameTimecode(timecode=timecode, fps=fps)

@property
def position_ms(self) -> float:
Expand Down Expand Up @@ -173,10 +181,6 @@ def seek(self, target: ty.Union[FrameTimecode, float, int]):
ValueError: `target` is not a valid value (i.e. it is negative).
"""
success = False
if _USE_PTS_IN_DEVELOPMENT:
# TODO(https://scenedetect.com/issue/168): Need to handle PTS here.
raise NotImplementedError()

if not isinstance(target, FrameTimecode):
target = FrameTimecode(target, self.frame_rate)
try:
Expand Down
90 changes: 47 additions & 43 deletions scenedetect/backends/opencv.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import cv2
import numpy as np

from scenedetect.common import _USE_PTS_IN_DEVELOPMENT, MAX_FPS_DELTA, FrameTimecode, Timecode
from scenedetect.common import MAX_FPS_DELTA, FrameTimecode, Timecode, framerate_to_fraction
from scenedetect.platform import get_file_name
from scenedetect.video_stream import (
FrameRateUnavailable,
Expand Down Expand Up @@ -111,7 +111,7 @@ def __init__(
self._cap: ty.Optional[cv2.VideoCapture] = (
None # Reference to underlying cv2.VideoCapture object.
)
self._frame_rate: ty.Optional[float] = None
self._frame_rate: ty.Optional[Fraction] = None

# VideoCapture state
self._has_grabbed = False
Expand Down Expand Up @@ -144,7 +144,7 @@ def capture(self) -> cv2.VideoCapture:
"""Unique name used to identify this backend."""

@property
def frame_rate(self) -> float:
def frame_rate(self) -> Fraction:
assert self._frame_rate
return self._frame_rate

Expand Down Expand Up @@ -196,30 +196,25 @@ def aspect_ratio(self) -> float:

@property
def timecode(self) -> Timecode:
"""Current position within stream as a Timecode. This is not frame accurate."""
"""Current position within stream as a Timecode."""
# *NOTE*: Although OpenCV has `CAP_PROP_PTS`, it doesn't seem to be reliable. For now, we
# use `CAP_PROP_POS_MSEC` instead, with a time base of 1/1000. Unfortunately this means that
# rounding errors will affect frame accuracy with this backend.
pts = self._cap.get(cv2.CAP_PROP_POS_MSEC)
time_base = Fraction(1, 1000)
return Timecode(pts=round(pts), time_base=time_base)
# use `CAP_PROP_POS_MSEC` instead, converting to microseconds for sufficient precision to
# avoid frame-boundary rounding errors at common framerates like 24000/1001.
ms = self._cap.get(cv2.CAP_PROP_POS_MSEC)
time_base = Fraction(1, 1000000)
return Timecode(pts=round(ms * 1000), time_base=time_base)

@property
def position(self) -> FrameTimecode:
# TODO(https://scenedetect.com/issue/168): See if there is a better way to do this, or
# add a config option before landing this.
if _USE_PTS_IN_DEVELOPMENT:
timecode = self.timecode
# If PTS is 0 but we've read frames, derive from frame number.
# This handles image sequences and cases where CAP_PROP_POS_MSEC is unreliable.
if timecode.pts == 0 and self.frame_number > 0:
time_sec = (self.frame_number - 1) / self.frame_rate
pts = round(time_sec * 1000)
timecode = Timecode(pts=pts, time_base=Fraction(1, 1000))
return FrameTimecode(timecode=timecode, fps=self.frame_rate)
if self.frame_number < 1:
return self.base_timecode
return self.base_timecode + (self.frame_number - 1)
timecode = self.timecode
# If PTS is 0 but we've read frames, derive from frame number.
# This handles image sequences and cases where CAP_PROP_POS_MSEC is unreliable.
if timecode.pts == 0 and self.frame_number > 0:
fps = self.frame_rate
time_base = Fraction(1, fps.numerator)
pts = (self.frame_number - 1) * fps.denominator
timecode = Timecode(pts=pts, time_base=time_base)
return FrameTimecode(timecode=timecode, fps=self.frame_rate)

@property
def position_ms(self) -> float:
Expand All @@ -235,23 +230,32 @@ def seek(self, target: ty.Union[FrameTimecode, float, int]):
if target < 0:
raise ValueError("Target seek position cannot be negative!")

# TODO(https://scenedetect.com/issue/168): Shouldn't use frames for VFR video here.
# Have to seek one behind and call grab() after so that the VideoCapture
# returns a valid timestamp when using CAP_PROP_POS_MSEC.
target_frame_cv2 = (self.base_timecode + target).frame_num
if target_frame_cv2 > 0:
target_frame_cv2 -= 1
self._cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame_cv2)
target_secs = (self.base_timecode + target).seconds
self._has_grabbed = False
# Preemptively grab the frame behind the target position if possible.
if target > 0:
if target_secs > 0:
# Seek one frame before target so the next read() returns the frame at target.
one_frame_ms = 1000.0 / float(self._frame_rate)
seek_ms = max(0.0, target_secs * 1000.0 - one_frame_ms)
self._cap.set(cv2.CAP_PROP_POS_MSEC, seek_ms)
self._has_grabbed = self._cap.grab()
# If we seeked past the end of the video, need to seek one frame backwards
# from the current position and grab that frame instead.
if self._has_grabbed:
# VFR correction: set(CAP_PROP_POS_MSEC) converts time using avg_fps internally,
# which can land ~1s too early for VFR video. Read forward until we reach the
# intended position. The threshold (2x one_frame_ms) never triggers for CFR.
actual_ms = self._cap.get(cv2.CAP_PROP_POS_MSEC)
corrections = 0
while actual_ms < seek_ms - 2.0 * one_frame_ms and corrections < 100:
if not self._cap.grab():
break
actual_ms = self._cap.get(cv2.CAP_PROP_POS_MSEC)
corrections += 1
# If we seeked past the end, back up one frame.
if not self._has_grabbed:
seek_pos = round(self._cap.get(cv2.CAP_PROP_POS_FRAMES) - 1.0)
self._cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, seek_pos))
self._has_grabbed = self._cap.grab()
else:
self._cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

def reset(self):
"""Close and re-open the VideoStream (should be equivalent to calling `seek(0)`)."""
Expand Down Expand Up @@ -329,14 +333,11 @@ def _open_capture(self, framerate: ty.Optional[float] = None):
raise FrameRateUnavailable()

self._cap = cap
self._frame_rate = framerate
self._frame_rate = framerate_to_fraction(framerate)
self._has_grabbed = False
cap.set(cv2.CAP_PROP_ORIENTATION_AUTO, 1.0) # https://github.com/opencv/opencv/issues/26795


# TODO(https://scenedetect.com/issues/168): Support non-monotonic timing for `position`. VFR timecode
# support is a prerequisite for this. Timecodes are currently calculated by multiplying the
# framerate by number of frames. Actual elapsed time can be obtained via `position_ms` for now.
class VideoCaptureAdapter(VideoStream):
"""Adapter for existing VideoCapture objects. Unlike VideoStreamCv2, this class supports
VideoCaptures which may not support seeking.
Expand Down Expand Up @@ -378,7 +379,7 @@ def __init__(
raise FrameRateUnavailable()

self._cap = cap
self._frame_rate: float = framerate
self._frame_rate: Fraction = framerate_to_fraction(framerate)
self._num_frames = 0
self._max_read_attempts = max_read_attempts
self._decode_failures = 0
Expand Down Expand Up @@ -408,7 +409,7 @@ def capture(self) -> cv2.VideoCapture:
"""Unique name used to identify this backend."""

@property
def frame_rate(self) -> float:
def frame_rate(self) -> Fraction:
"""Framerate in frames/sec."""
assert self._frame_rate
return self._frame_rate
Expand Down Expand Up @@ -439,8 +440,6 @@ def frame_size(self) -> ty.Tuple[int, int]:
@property
def duration(self) -> ty.Optional[FrameTimecode]:
"""Duration of the stream as a FrameTimecode, or None if non terminating."""
# TODO(https://scenedetect.com/issue/168): This will be incorrect for VFR. See if there is
# another property we can use to estimate the video length correctly.
frame_count = math.trunc(self._cap.get(cv2.CAP_PROP_FRAME_COUNT))
if frame_count > 0:
return self.base_timecode + frame_count
Expand All @@ -455,7 +454,12 @@ def aspect_ratio(self) -> float:
def position(self) -> FrameTimecode:
if self.frame_number < 1:
return self.base_timecode
return self.base_timecode + (self.frame_number - 1)
# Synthesize a Timecode from frame count and rational framerate.
fps = self.frame_rate
time_base = Fraction(1, fps.numerator)
pts = (self.frame_number - 1) * fps.denominator
timecode = Timecode(pts=pts, time_base=time_base)
return FrameTimecode(timecode=timecode, fps=fps)

@property
def position_ms(self) -> float:
Expand Down
Loading
Loading