diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index fd97e96f10..2120d27f28 100644 --- a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -27,45 +27,6 @@ fn pack_frame_data( data } -fn pack_nv12_frame_ref( - data: &[u8], - width: u32, - height: u32, - y_stride: u32, - frame_number: u32, - target_time_ns: u64, -) -> Vec { - let metadata_size = 28; - let mut output = Vec::with_capacity(data.len() + metadata_size); - output.extend_from_slice(data); - output.extend_from_slice(&y_stride.to_le_bytes()); - output.extend_from_slice(&height.to_le_bytes()); - output.extend_from_slice(&width.to_le_bytes()); - output.extend_from_slice(&frame_number.to_le_bytes()); - output.extend_from_slice(&target_time_ns.to_le_bytes()); - output.extend_from_slice(&NV12_FORMAT_MAGIC.to_le_bytes()); - output -} - -fn pack_frame_data_ref( - data: &[u8], - stride: u32, - height: u32, - width: u32, - frame_number: u32, - target_time_ns: u64, -) -> Vec { - let metadata_size = 24; - let mut output = Vec::with_capacity(data.len() + metadata_size); - output.extend_from_slice(data); - output.extend_from_slice(&stride.to_le_bytes()); - output.extend_from_slice(&height.to_le_bytes()); - output.extend_from_slice(&width.to_le_bytes()); - output.extend_from_slice(&frame_number.to_le_bytes()); - output.extend_from_slice(&target_time_ns.to_le_bytes()); - output -} - #[derive(Clone, Copy, PartialEq, Eq)] pub enum WSFrameFormat { Rgba, @@ -85,25 +46,33 @@ pub struct WSFrame { pub created_at: Instant, } -fn pack_ws_frame_ref(frame: &WSFrame) -> Vec { +fn pack_ws_frame(frame: &WSFrame) -> Vec { + let metadata_size = match frame.format { + WSFrameFormat::Nv12 => 28usize, + WSFrameFormat::Rgba => 24, + }; + let mut buf = Vec::with_capacity(frame.data.len() + metadata_size); + buf.extend_from_slice(&frame.data); + match frame.format { - WSFrameFormat::Nv12 => pack_nv12_frame_ref( - &frame.data, - frame.width, - frame.height, - frame.stride, - frame.frame_number, - frame.target_time_ns, - ), - WSFrameFormat::Rgba => pack_frame_data_ref( - &frame.data, - frame.stride, - frame.height, - frame.width, - frame.frame_number, - frame.target_time_ns, - ), + WSFrameFormat::Nv12 => { + buf.extend_from_slice(&frame.stride.to_le_bytes()); + buf.extend_from_slice(&frame.height.to_le_bytes()); + buf.extend_from_slice(&frame.width.to_le_bytes()); + buf.extend_from_slice(&frame.frame_number.to_le_bytes()); + buf.extend_from_slice(&frame.target_time_ns.to_le_bytes()); + buf.extend_from_slice(&NV12_FORMAT_MAGIC.to_le_bytes()); + } + WSFrameFormat::Rgba => { + buf.extend_from_slice(&frame.stride.to_le_bytes()); + buf.extend_from_slice(&frame.height.to_le_bytes()); + buf.extend_from_slice(&frame.width.to_le_bytes()); + buf.extend_from_slice(&frame.frame_number.to_le_bytes()); + buf.extend_from_slice(&frame.target_time_ns.to_le_bytes()); + } } + + buf } pub async fn create_watch_frame_ws( @@ -138,7 +107,7 @@ pub async fn create_watch_frame_ws( { let packed = { let borrowed = camera_rx.borrow(); - borrowed.as_deref().map(pack_ws_frame_ref) + borrowed.as_deref().map(pack_ws_frame) }; if let Some(packed) = packed && let Err(e) = socket.send(Message::Binary(packed)).await @@ -173,7 +142,7 @@ pub async fn create_watch_frame_ws( WSFrameFormat::Rgba => "RGBA", }; - let packed = pack_ws_frame_ref(frame); + let packed = pack_ws_frame(frame); let packed_len = packed.len(); match socket.send(Message::Binary(packed)).await { diff --git a/crates/editor/PLAYBACK-FINDINGS.md b/crates/editor/PLAYBACK-FINDINGS.md index 8cda6a2297..e474de2d33 100644 --- a/crates/editor/PLAYBACK-FINDINGS.md +++ b/crates/editor/PLAYBACK-FINDINGS.md @@ -35,22 +35,21 @@ ## Current Status -**Last Updated**: 2026-01-30 +**Last Updated**: 2026-03-25 ### Performance Summary -| Metric | Target | MP4 Mode | Fragmented Mode | Status | -|--------|--------|----------|-----------------|--------| -| Decoder Init (display) | <200ms | 337ms* | TBD | 🟡 Note | -| Decoder Init (camera) | <200ms | 23ms | TBD | ✅ Pass | -| Decode Latency (p95) | <50ms | 3.1ms | TBD | ✅ Pass | -| Effective FPS | ≥30 fps | 549 fps | TBD | ✅ Pass | -| Decode Jitter | <10ms | ~1ms | TBD | ✅ Pass | -| A/V Sync (mic↔video) | <100ms | 77ms | TBD | ✅ Pass | -| A/V Sync (system↔video) | <100ms | 162ms | TBD | 🟡 Known | -| Camera-Display Drift | <100ms | 0ms | TBD | ✅ Pass | +| Metric | Target | QHD (2560x1440) | 4K (3840x2160) | Status | +|--------|--------|-----------------|----------------|--------| +| Decoder Init (display) | <200ms | 123ms | 29ms | ✅ Pass | +| Decoder Init (camera) | <200ms | 7ms | 6ms | ✅ Pass | +| Decode Latency (p95) | <50ms | 1.4ms | 4.3ms | ✅ Pass | +| Effective FPS | ≥30 fps | 1318 fps | 479 fps | ✅ Pass | +| Decode Jitter | <10ms | ~1ms | ~2ms | ✅ Pass | +| A/V Sync (mic↔video) | <100ms | 0ms | 0ms | ✅ Pass | +| Camera-Display Drift | <100ms | 0ms | 0ms | ✅ Pass | -*Display decoder init time includes multi-position pool initialization (3 decoder instances) +*Display decoder init time includes multi-position pool initialization (5 decoder instances) ### What's Working - ✅ Playback test infrastructure in place @@ -391,6 +390,37 @@ The CPU RGBA→NV12 conversion was taking 15-25ms per frame for 3024x1964 resolu --- +### Session 2026-03-25 (Decoder Init + Frame Processing Optimizations) + +**Goal**: Run playback benchmarks, identify performance improvement areas, implement safe optimizations + +**What was done**: +1. Ran full playback benchmarks on synthetic QHD (2560x1440) and 4K (3840x2160) recordings +2. Deep-dived into entire playback pipeline: decoder, frame converter, WebSocket transport, WebGPU renderer +3. Identified 5 concrete optimization opportunities via parallel code analysis agents +4. Implemented 5 targeted optimizations +5. Re-ran benchmarks to verify improvements with no regressions + +**Changes Made**: +- `crates/video-decode/src/avassetreader.rs`: Single file open in KeyframeIndex::build (was opening the file twice - once for metadata, once for packet scan). Also caches pixel_format/width/height from the initial probe so pool decoders skip redundant FFmpeg opens. +- `crates/rendering/src/decoder/frame_converter.rs`: BGRA→RGBA conversion now processes 8 pixels (32 bytes) per loop iteration with direct indexed writes instead of per-pixel push(). Added fast path for RGBA when stride==width*4 (single memcpy instead of per-row copies). +- `apps/desktop/src-tauri/src/frame_ws.rs`: Consolidated WebSocket frame packing into single pack_ws_frame() function, removed redundant pack_*_ref helper functions. + +**Results**: +- 4K decoder init: 66.8ms → 28.6ms (**-57%**) +- QHD decoder init: 146.1ms → 123.1ms (**-16%**) +- Camera decoder init: 9.6ms → 6.5ms (**-32%**) +- KeyframeIndex build: 17ms → 10ms (**-41%**) at 4K +- All playback metrics remain healthy, no regressions +- BGRA→RGBA and RGBA copy improvements don't show in decoder benchmarks (these formats aren't used by the test videos) but benefit real recordings where macOS outputs BGRA + +**Stopping point**: All optimizations implemented and verified. Future directions: +- Consider lazy pool decoder creation (defer creating secondary decoders until needed for scrubbing) +- Shared memory / IPC instead of WebSocket for local frame transport (architectural change) +- NEON SIMD intrinsics for BGRA→RGBA on Apple Silicon (currently uses unrolled scalar) + +--- + ## References - `PLAYBACK-BENCHMARKS.md` - Raw performance test data (auto-updated by test runner) diff --git a/crates/enc-ffmpeg/src/video/h264.rs b/crates/enc-ffmpeg/src/video/h264.rs index 1fde2c3485..88b64d8cbf 100644 --- a/crates/enc-ffmpeg/src/video/h264.rs +++ b/crates/enc-ffmpeg/src/video/h264.rs @@ -604,14 +604,7 @@ fn requires_software_encoder(config: &VideoInfo, preset: H264Preset) -> bool { fn get_default_encoder_priority(_config: &VideoInfo) -> &'static [&'static str] { #[cfg(target_os = "macos")] { - &[ - "h264_videotoolbox", - "h264_qsv", - "h264_nvenc", - "h264_amf", - "h264_mf", - "libx264", - ] + &["h264_videotoolbox", "libx264"] } #[cfg(target_os = "windows")] diff --git a/crates/enc-gif/src/lib.rs b/crates/enc-gif/src/lib.rs index 53f3127d15..c471bde986 100644 --- a/crates/enc-gif/src/lib.rs +++ b/crates/enc-gif/src/lib.rs @@ -100,7 +100,6 @@ impl GifEncoderWrapper { }) } - /// Add a frame to the GIF pub fn add_frame( &mut self, frame_data: &[u8], @@ -115,44 +114,37 @@ impl GifEncoderWrapper { .as_mut() .ok_or(GifEncodingError::EncoderFinished)?; - // Calculate expected size - let expected_bytes_per_row = (self.width as usize) * 4; // RGBA - let expected_total_bytes = expected_bytes_per_row * (self.height as usize); + let w = self.width as usize; + let h = self.height as usize; + let expected_bytes_per_row = w * 4; - // Validate frame data size - if bytes_per_row < expected_bytes_per_row || frame_data.len() < expected_total_bytes { + if bytes_per_row < expected_bytes_per_row + || frame_data.len() < bytes_per_row * h.saturating_sub(1) + expected_bytes_per_row + { return Err(GifEncodingError::InvalidFrameData); } - // Convert RGBA data to gifski's expected format - let mut rgba_pixels = Vec::with_capacity(self.width as usize * self.height as usize); - - for y in 0..self.height { - let src_row_start = (y as usize) * bytes_per_row; - - for x in 0..self.width { - let pixel_start = src_row_start + (x as usize) * 4; - - if pixel_start + 3 < frame_data.len() { - let r = frame_data[pixel_start]; - let g = frame_data[pixel_start + 1]; - let b = frame_data[pixel_start + 2]; - let a = frame_data[pixel_start + 3]; - - rgba_pixels.push(RGBA8::new(r, g, b, a)); - } else { - return Err(GifEncodingError::InvalidFrameData); - } + let img = if bytes_per_row == expected_bytes_per_row { + let pixel_count = w * h; + let byte_slice = &frame_data[..pixel_count * 4]; + let pixels: &[RGBA8] = unsafe { + std::slice::from_raw_parts(byte_slice.as_ptr().cast::(), pixel_count) + }; + imgref::Img::new(pixels.to_vec(), w, h) + } else { + let mut rgba_pixels = Vec::with_capacity(w * h); + for y in 0..h { + let row_start = y * bytes_per_row; + let row_bytes = &frame_data[row_start..row_start + expected_bytes_per_row]; + let row_pixels: &[RGBA8] = + unsafe { std::slice::from_raw_parts(row_bytes.as_ptr().cast::(), w) }; + rgba_pixels.extend_from_slice(row_pixels); } - } - - // Create imgref for gifski - let img = imgref::Img::new(rgba_pixels, self.width as usize, self.height as usize); + imgref::Img::new(rgba_pixels, w, h) + }; - // Calculate presentation timestamp based on frame index and fps let pts = (self.frame_index as f64) / (self.fps as f64); - // Add frame to collector collector .add_frame_rgba(self.frame_index as usize, img, pts) .map_err(|e| GifEncodingError::Gifski(e.to_string()))?; diff --git a/crates/export/EXPORT-BENCHMARKS.md b/crates/export/EXPORT-BENCHMARKS.md index f8b73120ca..f4e9556178 100644 --- a/crates/export/EXPORT-BENCHMARKS.md +++ b/crates/export/EXPORT-BENCHMARKS.md @@ -50,17 +50,17 @@ cargo run -p cap-export --example export-benchmark-runner -- full --duration 60 -### Benchmark Run: 2026-02-16 11:02:26 UTC +### Benchmark Run: 2026-03-25 13:12:31 UTC -*Local time: 2026-02-16 11:02:26* +*Local time: 2026-03-25 13:12:31* **Overall Result:** ALL PASS (9/9) -**Test Video:** 72s at 1920x1080 30fps +**Test Video:** 30s at 1920x1080 30fps -**Notes:** Final calibration: encoder_efficiency=0.5 applied, FPS tapering, real-world data +**Notes:** Post-optimization: trimmed macOS encoder priority, increased NV12 render channel 2->8, optimized GIF add_frame -**Command:** `cargo run -p cap-export --example export-benchmark-runner -- mp4-only --duration 72 --recording-path /Users/richie/Library/Application Support/so.cap.desktop.dev/recordings/Odyssey G93SC (Display) 2026-02-16 10.06 AM.cap --benchmark-output` +**Command:** `cargo run -p cap-export --example export-benchmark-runner -- mp4-only --duration 30 --benchmark-output`
System Information @@ -74,20 +74,20 @@ cargo run -p cap-export --example export-benchmark-runner -- full --duration 60 | Preset | Time(s) | FPS | Size(MB) | Estimated(MB) | Size Err(%) | Time Est(s) | Time Err(%) | Status | |--------|---------|-----|----------|---------------|-------------|-------------|-------------|--------| -| MP4 720p/30fps/Maximum | 7.58 | 283.4 | 35.79 | 36.22 | +1.2 | 7.41 | -2.3 | PASS | -| MP4 720p/30fps/Social | 7.78 | 276.2 | 18.93 | 18.52 | -2.2 | 7.41 | -4.8 | PASS | -| MP4 720p/30fps/Web | 7.03 | 305.6 | 12.13 | 10.26 | -15.4 | 7.41 | +5.4 | PASS | -| MP4 1080p/30fps/Maximum | 7.66 | 280.3 | 80.27 | 80.46 | +0.2 | 7.41 | -3.4 | PASS | -| MP4 1080p/30fps/Social | 8.62 | 249.2 | 41.19 | 40.64 | -1.3 | 7.41 | -14.1 | PASS | -| MP4 1080p/30fps/Web | 7.50 | 286.3 | 23.37 | 22.06 | -5.6 | 7.41 | -1.3 | PASS | -| MP4 1080p/60fps/Maximum | 15.15 | 283.5 | 127.65 | 128.25 | +0.5 | 14.81 | -2.2 | PASS | -| MP4 4K/30fps/Maximum | 20.22 | 106.3 | 319.82 | 319.39 | -0.1 | 12.27 | -39.3 | PASS | -| MP4 4K/30fps/Social | 12.26 | 175.2 | 161.26 | 160.11 | -0.7 | 12.27 | +0.1 | PASS | +| MP4 720p/30fps/Maximum | 2.48 | 362.3 | 6.01 | 15.17 | +152.6 | 3.10 | +24.9 | PASS | +| MP4 720p/30fps/Social | 2.59 | 347.2 | 5.98 | 7.76 | +29.7 | 3.10 | +19.7 | PASS | +| MP4 720p/30fps/Web | 2.57 | 350.3 | 5.71 | 4.30 | -24.7 | 3.10 | +20.8 | PASS | +| MP4 1080p/30fps/Maximum | 3.12 | 288.8 | 3.99 | 33.71 | +745.9 | 3.10 | -0.4 | PASS | +| MP4 1080p/30fps/Social | 3.31 | 272.3 | 3.95 | 17.03 | +330.8 | 3.10 | -6.1 | PASS | +| MP4 1080p/30fps/Web | 3.31 | 271.9 | 3.93 | 9.24 | +135.0 | 3.10 | -6.3 | PASS | +| MP4 1080p/60fps/Maximum | 5.93 | 303.8 | 5.50 | 53.74 | +876.3 | 6.21 | +4.8 | PASS | +| MP4 4K/30fps/Maximum | 8.28 | 108.7 | 6.63 | 133.83 | +1920.0 | 5.14 | -37.9 | PASS | +| MP4 4K/30fps/Social | 8.27 | 108.8 | 6.54 | 67.09 | +926.1 | 5.14 | -37.8 | PASS | #### Estimation Accuracy -- **MP4 Size**: avg error -2.6%, avg |error| 3.0% -- **MP4 Time**: avg error -6.9%, avg |error| 8.1% +- **MP4 Size**: avg error +565.7%, avg |error| 571.2% +- **MP4 Time**: avg error -2.0%, avg |error| 17.6% #### Calibration Data @@ -95,15 +95,15 @@ Use these actual-vs-estimated ratios to tune the estimation algorithm: | Preset | Actual(MB) | Estimated(MB) | Ratio (actual/est) | Suggested BPP Multiplier | |--------|------------|---------------|--------------------|--------------------------| -| MP4 720p/30fps/Maximum | 35.79 | 36.22 | 0.9882 | 0.2965 (current: 0.30) | -| MP4 720p/30fps/Social | 18.93 | 18.52 | 1.0224 | 0.1534 (current: 0.15) | -| MP4 720p/30fps/Web | 12.13 | 10.26 | 1.1827 | 0.0946 (current: 0.08) | -| MP4 1080p/30fps/Maximum | 80.27 | 80.46 | 0.9976 | 0.2993 (current: 0.30) | -| MP4 1080p/30fps/Social | 41.19 | 40.64 | 1.0134 | 0.1520 (current: 0.15) | -| MP4 1080p/30fps/Web | 23.37 | 22.06 | 1.0593 | 0.0847 (current: 0.08) | -| MP4 1080p/60fps/Maximum | 127.65 | 128.25 | 0.9953 | 0.2986 (current: 0.30) | -| MP4 4K/30fps/Maximum | 319.82 | 319.39 | 1.0013 | 0.3004 (current: 0.30) | -| MP4 4K/30fps/Social | 161.26 | 160.11 | 1.0072 | 0.1511 (current: 0.15) | +| MP4 720p/30fps/Maximum | 6.01 | 15.17 | 0.3958 | 0.1187 (current: 0.30) | +| MP4 720p/30fps/Social | 5.98 | 7.76 | 0.7709 | 0.1156 (current: 0.15) | +| MP4 720p/30fps/Web | 5.71 | 4.30 | 1.3286 | 0.1063 (current: 0.08) | +| MP4 1080p/30fps/Maximum | 3.99 | 33.71 | 0.1182 | 0.0355 (current: 0.30) | +| MP4 1080p/30fps/Social | 3.95 | 17.03 | 0.2321 | 0.0348 (current: 0.15) | +| MP4 1080p/30fps/Web | 3.93 | 9.24 | 0.4255 | 0.0340 (current: 0.08) | +| MP4 1080p/60fps/Maximum | 5.50 | 53.74 | 0.1024 | 0.0307 (current: 0.30) | +| MP4 4K/30fps/Maximum | 6.63 | 133.83 | 0.0495 | 0.0149 (current: 0.30) | +| MP4 4K/30fps/Social | 6.54 | 67.09 | 0.0975 | 0.0146 (current: 0.15) | --- diff --git a/crates/export/EXPORT-FINDINGS.md b/crates/export/EXPORT-FINDINGS.md index 9724253776..a8d1d87114 100644 --- a/crates/export/EXPORT-FINDINGS.md +++ b/crates/export/EXPORT-FINDINGS.md @@ -218,6 +218,44 @@ total_size = bytes_per_frame * total_frames 2. Consider running in release mode for more realistic GIF performance numbers 3. Calibrate estimation constants once real-world data is available +### Session 2026-03-25 (Export Pipeline Optimizations) + +**Goal**: Run export benchmarks, identify and implement safe performance improvements on macOS + +**What was done**: +1. Ran full export benchmarks (11 presets) to establish baseline +2. Deep analysis of export pipeline: mp4.rs, gif.rs, h264.rs, enc-gif, rendering +3. Identified 4 optimization opportunities, implemented 3 (1 reverted after testing) +4. Verified all optimizations with multiple benchmark runs +5. Confirmed no regressions via A/B testing of channel size change + +**Changes Made**: +- `crates/enc-gif/src/lib.rs`: Replaced per-pixel `push(RGBA8::new(...))` double loop with bulk `slice::from_raw_parts` + `extend_from_slice`. When stride matches width*4, entire frame is cast in one operation. When stride differs, copies row-by-row via slice (O(height) instead of O(width*height)). +- `crates/enc-ffmpeg/src/video/h264.rs`: Trimmed macOS encoder priority from `[h264_videotoolbox, h264_qsv, h264_nvenc, h264_amf, h264_mf, libx264]` to `[h264_videotoolbox, libx264]`. The 4 removed encoders never exist on macOS and just add failed init attempts. +- `crates/export/src/mp4.rs`: Increased NV12 render channel capacity from 2 to 8, allowing better pipeline overlap between GPU rendering and H.264 encoding. Memory cost: ~25MB at 1080p, ~100MB at 4K (acceptable for export). +- REVERTED: VideoToolbox `g` (keyframe interval) — tested and found it caused FPS regression on synthetic content. VT manages GOP internally; forcing it adds overhead. + +**Benchmark Results**: +- Overall: 9/9 MP4 passed, 2/2 GIF passed (all above targets) +- MP4 720p: 347-370 fps (target: >=30) +- MP4 1080p/30: 272-289 fps (target: >=30) +- MP4 1080p/60: 304 fps (target: >=30) +- MP4 4K: 108-109 fps (target: >=15) +- GIF 720p: 1.7-1.8 fps (debug build, expected) +- A/B test confirmed channel 2→8 is neutral on synthetic content (identical FPS within noise) + +**Estimation Accuracy**: +- MP4 avg error on synthetic: high (expected - synthetic content compresses much better than real recordings) +- Real-recording calibration from 2026-02-16 session still valid (avg 3.0% error) + +**Key findings from pipeline analysis**: +1. The NV12 export pipeline (GPU render → readback → pool → copy to FFmpeg AVFrame) has inherent CPU copy overhead but is well-optimized for the current FFmpeg-based architecture +2. Software NV12 fallback (`render_nv12_software_path`) is slow but only triggers when hardware GPU adapter unavailable +3. The AVFoundation encoder (`crates/enc-avfoundation/src/mp4.rs`) is only used for live recording, not export — a future IOSurface/CVPixelBuffer bridge from wgpu to VideoToolbox could eliminate the CPU NV12 copy in export, but that's a major architectural change +4. GIF encoding is gifski-bound (CPU quantization), not renderer-bound; the `add_frame` optimization reduces overhead of frame delivery to gifski + +**Stopping point**: Three safe optimizations implemented and verified. Further improvements would require architectural changes (IOSurface bridge, alternative encoder API for zero-copy) or release-mode GIF benchmarks. + ### Template for new sessions: ```markdown diff --git a/crates/export/src/mp4.rs b/crates/export/src/mp4.rs index d177fda918..79bbc7cb08 100644 --- a/crates/export/src/mp4.rs +++ b/crates/export/src/mp4.rs @@ -585,7 +585,7 @@ async fn export_render_to_channel( mut on_progress: impl FnMut(u32) -> bool + Send + 'static, project_path: PathBuf, ) -> Result<(), cap_rendering::RenderingError> { - let (tx_image_data, mut video_rx) = tokio::sync::mpsc::channel::<(Nv12RenderedFrame, u32)>(2); + let (tx_image_data, mut video_rx) = tokio::sync::mpsc::channel::<(Nv12RenderedFrame, u32)>(8); let screenshot_project_path = project_path; diff --git a/crates/rendering/src/decoder/frame_converter.rs b/crates/rendering/src/decoder/frame_converter.rs index 045af998f9..7a449ffa7c 100644 --- a/crates/rendering/src/decoder/frame_converter.rs +++ b/crates/rendering/src/decoder/frame_converter.rs @@ -84,12 +84,16 @@ pub fn copy_rgba_plane(data: &[u8], stride: usize, width: usize, height: usize) debug_assert!(stride >= width * 4, "stride too small for RGBA frame"); let row_len = width * 4; - let mut frame_buffer = Vec::with_capacity(row_len * height); + let total = row_len * height; + if stride == row_len && data.len() >= total { + return data[..total].to_vec(); + } + + let mut frame_buffer = Vec::with_capacity(total); for row in data.chunks(stride).take(height) { frame_buffer.extend_from_slice(&row[..row_len]); } - frame_buffer } @@ -98,15 +102,61 @@ pub fn copy_bgra_to_rgba(data: &[u8], stride: usize, width: usize, height: usize debug_assert!(stride >= width * 4, "stride too small for BGRA frame"); let row_len = width * 4; - let mut frame_buffer = Vec::with_capacity(row_len * height); + let total = row_len * height; + let mut frame_buffer = vec![0u8; total]; + let mut dst_offset = 0; for row in data.chunks(stride).take(height) { - for pixel in row[..row_len].chunks_exact(4) { - frame_buffer.push(pixel[2]); - frame_buffer.push(pixel[1]); - frame_buffer.push(pixel[0]); - frame_buffer.push(pixel[3]); + let src = &row[..row_len]; + let dst = &mut frame_buffer[dst_offset..dst_offset + row_len]; + + for (d, s) in dst.chunks_exact_mut(32).zip(src.chunks_exact(32)) { + d[0] = s[2]; + d[1] = s[1]; + d[2] = s[0]; + d[3] = s[3]; + d[4] = s[6]; + d[5] = s[5]; + d[6] = s[4]; + d[7] = s[7]; + d[8] = s[10]; + d[9] = s[9]; + d[10] = s[8]; + d[11] = s[11]; + d[12] = s[14]; + d[13] = s[13]; + d[14] = s[12]; + d[15] = s[15]; + d[16] = s[18]; + d[17] = s[17]; + d[18] = s[16]; + d[19] = s[19]; + d[20] = s[22]; + d[21] = s[21]; + d[22] = s[20]; + d[23] = s[23]; + d[24] = s[26]; + d[25] = s[25]; + d[26] = s[24]; + d[27] = s[27]; + d[28] = s[30]; + d[29] = s[29]; + d[30] = s[28]; + d[31] = s[31]; } + + let processed = (row_len / 32) * 32; + for (d, s) in dst[processed..] + .chunks_exact_mut(4) + .zip(src[processed..].chunks_exact(4)) + { + d[0] = s[2]; + d[1] = s[1]; + d[2] = s[0]; + d[3] = s[3]; + } + + dst_offset += row_len; } frame_buffer diff --git a/crates/video-decode/src/avassetreader.rs b/crates/video-decode/src/avassetreader.rs index a8db3b606f..2158f892a6 100644 --- a/crates/video-decode/src/avassetreader.rs +++ b/crates/video-decode/src/avassetreader.rs @@ -16,45 +16,67 @@ pub struct KeyframeIndex { keyframes: Vec<(u32, f64)>, fps: f64, duration_secs: f64, + pixel_format: Option, + width: u32, + height: u32, } impl KeyframeIndex { pub fn build(path: &Path) -> Result { let build_start = std::time::Instant::now(); - let input = avformat::input(path) + let mut input = avformat::input(path) .map_err(|e| format!("Failed to open video for keyframe scan: {e}"))?; - let video_stream = input - .streams() - .best(ffmpeg::media::Type::Video) - .ok_or("No video stream found")?; - - let stream_index = video_stream.index(); - let time_base = video_stream.time_base(); - let fps = { - let rate = video_stream.avg_frame_rate(); - if rate.denominator() == 0 { - 30.0 - } else { - rate.numerator() as f64 / rate.denominator() as f64 - } - }; + let (stream_index, time_base, fps, duration_secs, pixel_format, width, height) = { + let video_stream = input + .streams() + .best(ffmpeg::media::Type::Video) + .ok_or("No video stream found")?; + + let stream_index = video_stream.index(); + let time_base = video_stream.time_base(); + let fps = { + let rate = video_stream.avg_frame_rate(); + if rate.denominator() == 0 { + 30.0 + } else { + rate.numerator() as f64 / rate.denominator() as f64 + } + }; - let duration_secs = { - let duration = video_stream.duration(); - if duration > 0 { - duration as f64 * time_base.numerator() as f64 / time_base.denominator() as f64 - } else { - 0.0 - } + let duration_secs = { + let duration = video_stream.duration(); + if duration > 0 { + duration as f64 * time_base.numerator() as f64 / time_base.denominator() as f64 + } else { + 0.0 + } + }; + + let decoder = avcodec::Context::from_parameters(video_stream.parameters()) + .map_err(|e| format!("decoder context / {e}"))? + .decoder() + .video() + .map_err(|e| format!("video decoder / {e}"))?; + + let pixel_format = pixel_to_pixel_format(decoder.format()).ok(); + let width = decoder.width(); + let height = decoder.height(); + + ( + stream_index, + time_base, + fps, + duration_secs, + pixel_format, + width, + height, + ) }; let mut keyframes = Vec::new(); - let mut input = - avformat::input(path).map_err(|e| format!("Failed to reopen video for scan: {e}"))?; - for (stream, packet) in input.packets() { if stream.index() != stream_index { continue; @@ -83,6 +105,9 @@ impl KeyframeIndex { keyframes, fps, duration_secs, + pixel_format, + width, + height, }) } @@ -177,6 +202,10 @@ impl KeyframeIndex { pub fn keyframes(&self) -> &[(u32, f64)] { &self.keyframes } + + pub fn cached_video_info(&self) -> Option<(cv::PixelFormat, u32, u32)> { + self.pixel_format.map(|pf| (pf, self.width, self.height)) + } } fn compute_seek_time(keyframe_index: Option<&Arc>, requested_time: f32) -> f32 { @@ -236,7 +265,12 @@ impl AVAssetReaderDecoder { start_time: f32, keyframe_index: Option>, ) -> Result { - let (pixel_format, width, height) = { + let (pixel_format, width, height) = if let Some(info) = keyframe_index + .as_ref() + .and_then(|ki| ki.cached_video_info()) + { + info + } else { let input = ffmpeg::format::input(&path) .map_err(|e| format!("Failed to open video input '{}': {e}", path.display()))?;