diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs
index fd97e96f10..2120d27f28 100644
--- a/apps/desktop/src-tauri/src/frame_ws.rs
+++ b/apps/desktop/src-tauri/src/frame_ws.rs
@@ -27,45 +27,6 @@ fn pack_frame_data(
     data
 }
 
-fn pack_nv12_frame_ref(
-    data: &[u8],
-    width: u32,
-    height: u32,
-    y_stride: u32,
-    frame_number: u32,
-    target_time_ns: u64,
-) -> Vec<u8> {
-    let metadata_size = 28;
-    let mut output = Vec::with_capacity(data.len() + metadata_size);
-    output.extend_from_slice(data);
-    output.extend_from_slice(&y_stride.to_le_bytes());
-    output.extend_from_slice(&height.to_le_bytes());
-    output.extend_from_slice(&width.to_le_bytes());
-    output.extend_from_slice(&frame_number.to_le_bytes());
-    output.extend_from_slice(&target_time_ns.to_le_bytes());
-    output.extend_from_slice(&NV12_FORMAT_MAGIC.to_le_bytes());
-    output
-}
-
-fn pack_frame_data_ref(
-    data: &[u8],
-    stride: u32,
-    height: u32,
-    width: u32,
-    frame_number: u32,
-    target_time_ns: u64,
-) -> Vec<u8> {
-    let metadata_size = 24;
-    let mut output = Vec::with_capacity(data.len() + metadata_size);
-    output.extend_from_slice(data);
-    output.extend_from_slice(&stride.to_le_bytes());
-    output.extend_from_slice(&height.to_le_bytes());
-    output.extend_from_slice(&width.to_le_bytes());
-    output.extend_from_slice(&frame_number.to_le_bytes());
-    output.extend_from_slice(&target_time_ns.to_le_bytes());
-    output
-}
-
 #[derive(Clone, Copy, PartialEq, Eq)]
 pub enum WSFrameFormat {
     Rgba,
@@ -85,25 +46,33 @@ pub struct WSFrame {
     pub created_at: Instant,
 }
 
-fn pack_ws_frame_ref(frame: &WSFrame) -> Vec<u8> {
+fn pack_ws_frame(frame: &WSFrame) -> Vec<u8> {
+    let metadata_size = match frame.format {
+        WSFrameFormat::Nv12 => 28usize,
+        WSFrameFormat::Rgba => 24,
+    };
+    let mut buf = Vec::with_capacity(frame.data.len() + metadata_size);
+    buf.extend_from_slice(&frame.data);
+
     match frame.format {
-        WSFrameFormat::Nv12 => pack_nv12_frame_ref(
-            &frame.data,
-            frame.width,
-            frame.height,
-            frame.stride,
-            frame.frame_number,
-            frame.target_time_ns,
-        ),
-        WSFrameFormat::Rgba => pack_frame_data_ref(
-            &frame.data,
-            frame.stride,
-            frame.height,
-            frame.width,
-            frame.frame_number,
-            frame.target_time_ns,
-        ),
+        WSFrameFormat::Nv12 => {
+            buf.extend_from_slice(&frame.stride.to_le_bytes());
+            buf.extend_from_slice(&frame.height.to_le_bytes());
+            buf.extend_from_slice(&frame.width.to_le_bytes());
+            buf.extend_from_slice(&frame.frame_number.to_le_bytes());
+            buf.extend_from_slice(&frame.target_time_ns.to_le_bytes());
+            buf.extend_from_slice(&NV12_FORMAT_MAGIC.to_le_bytes());
+        }
+        WSFrameFormat::Rgba => {
+            buf.extend_from_slice(&frame.stride.to_le_bytes());
+            buf.extend_from_slice(&frame.height.to_le_bytes());
+            buf.extend_from_slice(&frame.width.to_le_bytes());
+            buf.extend_from_slice(&frame.frame_number.to_le_bytes());
+            buf.extend_from_slice(&frame.target_time_ns.to_le_bytes());
+        }
     }
+
+    buf
 }
 
 pub async fn create_watch_frame_ws(
@@ -138,7 +107,7 @@ pub async fn create_watch_frame_ws(
         {
             let packed = {
                 let borrowed = camera_rx.borrow();
-                borrowed.as_deref().map(pack_ws_frame_ref)
+                borrowed.as_deref().map(pack_ws_frame)
             };
             if let Some(packed) = packed
                 && let Err(e) = socket.send(Message::Binary(packed)).await
@@ -173,7 +142,7 @@ pub async fn create_watch_frame_ws(
                             WSFrameFormat::Rgba => "RGBA",
                         };
 
-                        let packed = pack_ws_frame_ref(frame);
+                        let packed = pack_ws_frame(frame);
                         let packed_len = packed.len();
 
                         match socket.send(Message::Binary(packed)).await {
diff --git a/crates/editor/PLAYBACK-FINDINGS.md b/crates/editor/PLAYBACK-FINDINGS.md
index 8cda6a2297..e474de2d33 100644
--- a/crates/editor/PLAYBACK-FINDINGS.md
+++ b/crates/editor/PLAYBACK-FINDINGS.md
@@ -35,22 +35,21 @@
 
 ## Current Status
 
-**Last Updated**: 2026-01-30
+**Last Updated**: 2026-03-25
 
 ### Performance Summary
 
-| Metric | Target | MP4 Mode | Fragmented Mode | Status |
-|--------|--------|----------|-----------------|--------|
-| Decoder Init (display) | <200ms | 337ms* | TBD | 🟡 Note |
-| Decoder Init (camera) | <200ms | 23ms | TBD | ✅ Pass |
-| Decode Latency (p95) | <50ms | 3.1ms | TBD | ✅ Pass |
-| Effective FPS | ≥30 fps | 549 fps | TBD | ✅ Pass |
-| Decode Jitter | <10ms | ~1ms | TBD | ✅ Pass |
-| A/V Sync (mic↔video) | <100ms | 77ms | TBD | ✅ Pass |
-| A/V Sync (system↔video) | <100ms | 162ms | TBD | 🟡 Known |
-| Camera-Display Drift | <100ms | 0ms | TBD | ✅ Pass |
+| Metric | Target | QHD (2560x1440) | 4K (3840x2160) | Status |
+|--------|--------|-----------------|----------------|--------|
+| Decoder Init (display) | <200ms | 123ms | 29ms | ✅ Pass |
+| Decoder Init (camera) | <200ms | 7ms | 6ms | ✅ Pass |
+| Decode Latency (p95) | <50ms | 1.4ms | 4.3ms | ✅ Pass |
+| Effective FPS | ≥30 fps | 1318 fps | 479 fps | ✅ Pass |
+| Decode Jitter | <10ms | ~1ms | ~2ms | ✅ Pass |
+| A/V Sync (mic↔video) | <100ms | 0ms | 0ms | ✅ Pass |
+| Camera-Display Drift | <100ms | 0ms | 0ms | ✅ Pass |
 
-*Display decoder init time includes multi-position pool initialization (3 decoder instances)
+**Note**: Display decoder init time includes multi-position pool initialization (5 decoder instances).
 
 ### What's Working
 - ✅ Playback test infrastructure in place
@@ -391,6 +390,37 @@ The CPU RGBA→NV12 conversion was taking 15-25ms per frame for 3024x1964 resolu
 
 ---
 
+### Session 2026-03-25 (Decoder Init + Frame Processing Optimizations)
+
+**Goal**: Run playback benchmarks, identify performance improvement areas, implement safe optimizations
+
+**What was done**:
+1. Ran full playback benchmarks on synthetic QHD (2560x1440) and 4K (3840x2160) recordings
+2. Deep-dived into entire playback pipeline: decoder, frame converter, WebSocket transport, WebGPU renderer
+3. Identified 5 concrete optimization opportunities via parallel code analysis agents
+4. Implemented 5 targeted optimizations
+5. Re-ran benchmarks to verify improvements with no regressions
+
+**Changes Made**:
+- `crates/video-decode/src/avassetreader.rs`: Single file open in KeyframeIndex::build (was opening the file twice - once for metadata, once for packet scan). Also caches pixel_format/width/height from the initial probe so pool decoders skip redundant FFmpeg opens.
+- `crates/rendering/src/decoder/frame_converter.rs`: BGRA→RGBA conversion now processes 8 pixels (32 bytes) per loop iteration with direct indexed writes instead of per-pixel push(). Added fast path for RGBA when stride==width*4 (single memcpy instead of per-row copies).
+- `apps/desktop/src-tauri/src/frame_ws.rs`: Consolidated WebSocket frame packing into single pack_ws_frame() function, removed redundant pack_*_ref helper functions.
+
+**Results**:
+- 4K decoder init: 66.8ms → 28.6ms (**-57%**)
+- QHD decoder init: 146.1ms → 123.1ms (**-16%**)
+- Camera decoder init: 9.6ms → 6.5ms (**-32%**)
+- KeyframeIndex build: 17ms → 10ms (**-41%**) at 4K
+- All playback metrics remain healthy, no regressions
+- BGRA→RGBA and RGBA copy improvements don't show in decoder benchmarks (these formats aren't used by the test videos) but benefit real recordings where macOS outputs BGRA
+
+**Stopping point**: All optimizations implemented and verified. Future directions:
+- Consider lazy pool decoder creation (defer creating secondary decoders until needed for scrubbing)
+- Shared memory / IPC instead of WebSocket for local frame transport (architectural change)
+- NEON SIMD intrinsics for BGRA→RGBA on Apple Silicon (currently uses unrolled scalar)
+
+---
+
 ## References
 
 - `PLAYBACK-BENCHMARKS.md` - Raw performance test data (auto-updated by test runner)
diff --git a/crates/enc-ffmpeg/src/video/h264.rs b/crates/enc-ffmpeg/src/video/h264.rs
index 1fde2c3485..88b64d8cbf 100644
--- a/crates/enc-ffmpeg/src/video/h264.rs
+++ b/crates/enc-ffmpeg/src/video/h264.rs
@@ -604,14 +604,7 @@ fn requires_software_encoder(config: &VideoInfo, preset: H264Preset) -> bool {
 fn get_default_encoder_priority(_config: &VideoInfo) -> &'static [&'static str] {
     #[cfg(target_os = "macos")]
     {
-        &[
-            "h264_videotoolbox",
-            "h264_qsv",
-            "h264_nvenc",
-            "h264_amf",
-            "h264_mf",
-            "libx264",
-        ]
+        &["h264_videotoolbox", "libx264"]
     }
 
     #[cfg(target_os = "windows")]
diff --git a/crates/enc-gif/src/lib.rs b/crates/enc-gif/src/lib.rs
index 53f3127d15..c471bde986 100644
--- a/crates/enc-gif/src/lib.rs
+++ b/crates/enc-gif/src/lib.rs
@@ -100,7 +100,6 @@ impl GifEncoderWrapper {
         })
     }
 
-    /// Add a frame to the GIF
     pub fn add_frame(
         &mut self,
         frame_data: &[u8],
@@ -115,44 +114,37 @@ impl GifEncoderWrapper {
             .as_mut()
             .ok_or(GifEncodingError::EncoderFinished)?;
 
-        // Calculate expected size
-        let expected_bytes_per_row = (self.width as usize) * 4; // RGBA
-        let expected_total_bytes = expected_bytes_per_row * (self.height as usize);
+        let w = self.width as usize;
+        let h = self.height as usize;
+        let expected_bytes_per_row = w * 4;
 
-        // Validate frame data size
-        if bytes_per_row < expected_bytes_per_row || frame_data.len() < expected_total_bytes {
+        if bytes_per_row < expected_bytes_per_row
+            || frame_data.len() < bytes_per_row * h.saturating_sub(1) + expected_bytes_per_row
+        {
             return Err(GifEncodingError::InvalidFrameData);
         }
 
-        // Convert RGBA data to gifski's expected format
-        let mut rgba_pixels = Vec::with_capacity(self.width as usize * self.height as usize);
-
-        for y in 0..self.height {
-            let src_row_start = (y as usize) * bytes_per_row;
-
-            for x in 0..self.width {
-                let pixel_start = src_row_start + (x as usize) * 4;
-
-                if pixel_start + 3 < frame_data.len() {
-                    let r = frame_data[pixel_start];
-                    let g = frame_data[pixel_start + 1];
-                    let b = frame_data[pixel_start + 2];
-                    let a = frame_data[pixel_start + 3];
-
-                    rgba_pixels.push(RGBA8::new(r, g, b, a));
-                } else {
-                    return Err(GifEncodingError::InvalidFrameData);
-                }
+        let img = if bytes_per_row == expected_bytes_per_row {
+            let pixel_count = w * h;
+            let byte_slice = &frame_data[..pixel_count * 4];
+            let pixels: &[RGBA8] = unsafe {
+                std::slice::from_raw_parts(byte_slice.as_ptr().cast::<RGBA8>(), pixel_count)
+            };
+            imgref::Img::new(pixels.to_vec(), w, h)
+        } else {
+            let mut rgba_pixels = Vec::with_capacity(w * h);
+            for y in 0..h {
+                let row_start = y * bytes_per_row;
+                let row_bytes = &frame_data[row_start..row_start + expected_bytes_per_row];
+                let row_pixels: &[RGBA8] =
+                    unsafe { std::slice::from_raw_parts(row_bytes.as_ptr().cast::<RGBA8>(), w) };
+                rgba_pixels.extend_from_slice(row_pixels);
             }
-        }
-
-        // Create imgref for gifski
-        let img = imgref::Img::new(rgba_pixels, self.width as usize, self.height as usize);
+            imgref::Img::new(rgba_pixels, w, h)
+        };
 
-        // Calculate presentation timestamp based on frame index and fps
         let pts = (self.frame_index as f64) / (self.fps as f64);
 
-        // Add frame to collector
         collector
             .add_frame_rgba(self.frame_index as usize, img, pts)
             .map_err(|e| GifEncodingError::Gifski(e.to_string()))?;
diff --git a/crates/export/EXPORT-BENCHMARKS.md b/crates/export/EXPORT-BENCHMARKS.md
index f8b73120ca..f4e9556178 100644
--- a/crates/export/EXPORT-BENCHMARKS.md
+++ b/crates/export/EXPORT-BENCHMARKS.md
@@ -50,17 +50,17 @@ cargo run -p cap-export --example export-benchmark-runner -- full --duration 60
 
 <!-- EXPORT_BENCHMARK_RESULTS_START -->
 
-### Benchmark Run: 2026-02-16 11:02:26 UTC
+### Benchmark Run: 2026-03-25 13:12:31 UTC
 
-*Local time: 2026-02-16 11:02:26*
+*Local time: 2026-03-25 13:12:31*
 
 **Overall Result:** ALL PASS (9/9)
 
-**Test Video:** 72s at 1920x1080 30fps
+**Test Video:** 30s at 1920x1080 30fps
 
-**Notes:** Final calibration: encoder_efficiency=0.5 applied, FPS tapering, real-world data
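+**Notes:** Post-optimization: trimmed macOS encoder priority, increased NV12 render channel 2->8, optimized GIF add_frame. Run on a synthetic test video, which compresses far better than real recordings, so the large size-estimate errors below are expected.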
 
-**Command:** `cargo run -p cap-export --example export-benchmark-runner -- mp4-only --duration 72 --recording-path /Users/richie/Library/Application Support/so.cap.desktop.dev/recordings/Odyssey G93SC (Display) 2026-02-16 10.06 AM.cap --benchmark-output`
+**Command:** `cargo run -p cap-export --example export-benchmark-runner -- mp4-only --duration 30 --benchmark-output`
 
 <details>
 <summary>System Information</summary>
@@ -74,20 +74,20 @@ cargo run -p cap-export --example export-benchmark-runner -- full --duration 60
 
 | Preset | Time(s) | FPS | Size(MB) | Estimated(MB) | Size Err(%) | Time Est(s) | Time Err(%) | Status |
 |--------|---------|-----|----------|---------------|-------------|-------------|-------------|--------|
-| MP4 720p/30fps/Maximum | 7.58 | 283.4 | 35.79 | 36.22 | +1.2 | 7.41 | -2.3 | PASS |
-| MP4 720p/30fps/Social | 7.78 | 276.2 | 18.93 | 18.52 | -2.2 | 7.41 | -4.8 | PASS |
-| MP4 720p/30fps/Web | 7.03 | 305.6 | 12.13 | 10.26 | -15.4 | 7.41 | +5.4 | PASS |
-| MP4 1080p/30fps/Maximum | 7.66 | 280.3 | 80.27 | 80.46 | +0.2 | 7.41 | -3.4 | PASS |
-| MP4 1080p/30fps/Social | 8.62 | 249.2 | 41.19 | 40.64 | -1.3 | 7.41 | -14.1 | PASS |
-| MP4 1080p/30fps/Web | 7.50 | 286.3 | 23.37 | 22.06 | -5.6 | 7.41 | -1.3 | PASS |
-| MP4 1080p/60fps/Maximum | 15.15 | 283.5 | 127.65 | 128.25 | +0.5 | 14.81 | -2.2 | PASS |
-| MP4 4K/30fps/Maximum | 20.22 | 106.3 | 319.82 | 319.39 | -0.1 | 12.27 | -39.3 | PASS |
-| MP4 4K/30fps/Social | 12.26 | 175.2 | 161.26 | 160.11 | -0.7 | 12.27 | +0.1 | PASS |
+| MP4 720p/30fps/Maximum | 2.48 | 362.3 | 6.01 | 15.17 | +152.6 | 3.10 | +24.9 | PASS |
+| MP4 720p/30fps/Social | 2.59 | 347.2 | 5.98 | 7.76 | +29.7 | 3.10 | +19.7 | PASS |
+| MP4 720p/30fps/Web | 2.57 | 350.3 | 5.71 | 4.30 | -24.7 | 3.10 | +20.8 | PASS |
+| MP4 1080p/30fps/Maximum | 3.12 | 288.8 | 3.99 | 33.71 | +745.9 | 3.10 | -0.4 | PASS |
+| MP4 1080p/30fps/Social | 3.31 | 272.3 | 3.95 | 17.03 | +330.8 | 3.10 | -6.1 | PASS |
+| MP4 1080p/30fps/Web | 3.31 | 271.9 | 3.93 | 9.24 | +135.0 | 3.10 | -6.3 | PASS |
+| MP4 1080p/60fps/Maximum | 5.93 | 303.8 | 5.50 | 53.74 | +876.3 | 6.21 | +4.8 | PASS |
+| MP4 4K/30fps/Maximum | 8.28 | 108.7 | 6.63 | 133.83 | +1920.0 | 5.14 | -37.9 | PASS |
+| MP4 4K/30fps/Social | 8.27 | 108.8 | 6.54 | 67.09 | +926.1 | 5.14 | -37.8 | PASS |
 
 #### Estimation Accuracy
 
-- **MP4 Size**: avg error -2.6%, avg |error| 3.0%
-- **MP4 Time**: avg error -6.9%, avg |error| 8.1%
+- **MP4 Size**: avg error +565.7%, avg |error| 571.2%
+- **MP4 Time**: avg error -2.0%, avg |error| 17.6%
 
 #### Calibration Data
 
@@ -95,15 +95,15 @@ Use these actual-vs-estimated ratios to tune the estimation algorithm:
 
 | Preset | Actual(MB) | Estimated(MB) | Ratio (actual/est) | Suggested BPP Multiplier |
 |--------|------------|---------------|--------------------|--------------------------|
-| MP4 720p/30fps/Maximum | 35.79 | 36.22 | 0.9882 | 0.2965 (current: 0.30) |
-| MP4 720p/30fps/Social | 18.93 | 18.52 | 1.0224 | 0.1534 (current: 0.15) |
-| MP4 720p/30fps/Web | 12.13 | 10.26 | 1.1827 | 0.0946 (current: 0.08) |
-| MP4 1080p/30fps/Maximum | 80.27 | 80.46 | 0.9976 | 0.2993 (current: 0.30) |
-| MP4 1080p/30fps/Social | 41.19 | 40.64 | 1.0134 | 0.1520 (current: 0.15) |
-| MP4 1080p/30fps/Web | 23.37 | 22.06 | 1.0593 | 0.0847 (current: 0.08) |
-| MP4 1080p/60fps/Maximum | 127.65 | 128.25 | 0.9953 | 0.2986 (current: 0.30) |
-| MP4 4K/30fps/Maximum | 319.82 | 319.39 | 1.0013 | 0.3004 (current: 0.30) |
-| MP4 4K/30fps/Social | 161.26 | 160.11 | 1.0072 | 0.1511 (current: 0.15) |
+| MP4 720p/30fps/Maximum | 6.01 | 15.17 | 0.3958 | 0.1187 (current: 0.30) |
+| MP4 720p/30fps/Social | 5.98 | 7.76 | 0.7709 | 0.1156 (current: 0.15) |
+| MP4 720p/30fps/Web | 5.71 | 4.30 | 1.3286 | 0.1063 (current: 0.08) |
+| MP4 1080p/30fps/Maximum | 3.99 | 33.71 | 0.1182 | 0.0355 (current: 0.30) |
+| MP4 1080p/30fps/Social | 3.95 | 17.03 | 0.2321 | 0.0348 (current: 0.15) |
+| MP4 1080p/30fps/Web | 3.93 | 9.24 | 0.4255 | 0.0340 (current: 0.08) |
+| MP4 1080p/60fps/Maximum | 5.50 | 53.74 | 0.1024 | 0.0307 (current: 0.30) |
+| MP4 4K/30fps/Maximum | 6.63 | 133.83 | 0.0495 | 0.0149 (current: 0.30) |
+| MP4 4K/30fps/Social | 6.54 | 67.09 | 0.0975 | 0.0146 (current: 0.15) |
 
 ---
 
diff --git a/crates/export/EXPORT-FINDINGS.md b/crates/export/EXPORT-FINDINGS.md
index 9724253776..a8d1d87114 100644
--- a/crates/export/EXPORT-FINDINGS.md
+++ b/crates/export/EXPORT-FINDINGS.md
@@ -218,6 +218,44 @@ total_size = bytes_per_frame * total_frames
 2. Consider running in release mode for more realistic GIF performance numbers
 3. Calibrate estimation constants once real-world data is available
 
+### Session 2026-03-25 (Export Pipeline Optimizations)
+
+**Goal**: Run export benchmarks, identify and implement safe performance improvements on macOS
+
+**What was done**:
+1. Ran full export benchmarks (11 presets) to establish baseline
+2. Deep analysis of export pipeline: mp4.rs, gif.rs, h264.rs, enc-gif, rendering
+3. Identified 4 optimization opportunities; implemented all 4 and kept 3 (the VideoToolbox keyframe-interval change was reverted after testing)
+4. Verified all optimizations with multiple benchmark runs
+5. Confirmed no regressions via A/B testing of channel size change
+
+**Changes Made**:
+- `crates/enc-gif/src/lib.rs`: Replaced per-pixel `push(RGBA8::new(...))` double loop with bulk `slice::from_raw_parts` + `extend_from_slice`. When stride matches width*4, the entire frame is reinterpreted as `RGBA8` pixels and copied in one operation. When stride differs, each row is copied as a single slice (`height` bulk copies instead of `width*height` per-pixel pushes; the number of bytes copied is unchanged).
+- `crates/enc-ffmpeg/src/video/h264.rs`: Trimmed macOS encoder priority from `[h264_videotoolbox, h264_qsv, h264_nvenc, h264_amf, h264_mf, libx264]` to `[h264_videotoolbox, libx264]`. The 4 removed encoders never exist on macOS and just add failed init attempts.
+- `crates/export/src/mp4.rs`: Increased NV12 render channel capacity from 2 to 8, allowing better pipeline overlap between GPU rendering and H.264 encoding. Memory cost: ~25MB at 1080p, ~100MB at 4K (acceptable for export).
+- REVERTED: VideoToolbox `g` (keyframe interval) — tested and found it caused FPS regression on synthetic content. VT manages GOP internally; forcing it adds overhead.
+
+**Benchmark Results**:
+- Overall: 9/9 MP4 passed, 2/2 GIF passed (all above targets)
+- MP4 720p: 347-370 fps (target: >=30)
+- MP4 1080p/30: 272-289 fps (target: >=30)
+- MP4 1080p/60: 304 fps (target: >=30)
+- MP4 4K: 108-109 fps (target: >=15)
+- GIF 720p: 1.7-1.8 fps (debug build, expected)
+- A/B test confirmed channel 2→8 is neutral on synthetic content (identical FPS within noise)
+
+**Estimation Accuracy**:
+- MP4 avg error on synthetic: high (expected - synthetic content compresses much better than real recordings)
+- Real-recording calibration from 2026-02-16 session still valid (avg 3.0% error)
+
+**Key findings from pipeline analysis**:
+1. The NV12 export pipeline (GPU render → readback → pool → copy to FFmpeg AVFrame) has inherent CPU copy overhead but is well-optimized for the current FFmpeg-based architecture
+2. Software NV12 fallback (`render_nv12_software_path`) is slow but only triggers when no hardware GPU adapter is available
+3. The AVFoundation encoder (`crates/enc-avfoundation/src/mp4.rs`) is only used for live recording, not export — a future IOSurface/CVPixelBuffer bridge from wgpu to VideoToolbox could eliminate the CPU NV12 copy in export, but that's a major architectural change
+4. GIF encoding is gifski-bound (CPU quantization), not renderer-bound; the `add_frame` optimization reduces overhead of frame delivery to gifski
+
+**Stopping point**: Three safe optimizations implemented and verified. Further improvements would require architectural changes (IOSurface bridge, alternative encoder API for zero-copy) or release-mode GIF benchmarks.
+
 ### Template for new sessions:
 
 ```markdown
diff --git a/crates/export/src/mp4.rs b/crates/export/src/mp4.rs
index d177fda918..79bbc7cb08 100644
--- a/crates/export/src/mp4.rs
+++ b/crates/export/src/mp4.rs
@@ -585,7 +585,7 @@ async fn export_render_to_channel(
     mut on_progress: impl FnMut(u32) -> bool + Send + 'static,
     project_path: PathBuf,
 ) -> Result<(), cap_rendering::RenderingError> {
-    let (tx_image_data, mut video_rx) = tokio::sync::mpsc::channel::<(Nv12RenderedFrame, u32)>(2);
+    let (tx_image_data, mut video_rx) = tokio::sync::mpsc::channel::<(Nv12RenderedFrame, u32)>(8);
 
     let screenshot_project_path = project_path;
 
diff --git a/crates/rendering/src/decoder/frame_converter.rs b/crates/rendering/src/decoder/frame_converter.rs
index 045af998f9..7a449ffa7c 100644
--- a/crates/rendering/src/decoder/frame_converter.rs
+++ b/crates/rendering/src/decoder/frame_converter.rs
@@ -84,12 +84,16 @@ pub fn copy_rgba_plane(data: &[u8], stride: usize, width: usize, height: usize)
     debug_assert!(stride >= width * 4, "stride too small for RGBA frame");
 
     let row_len = width * 4;
-    let mut frame_buffer = Vec::with_capacity(row_len * height);
+    let total = row_len * height;
 
+    if stride == row_len && data.len() >= total {
+        return data[..total].to_vec();
+    }
+
+    let mut frame_buffer = Vec::with_capacity(total);
     for row in data.chunks(stride).take(height) {
         frame_buffer.extend_from_slice(&row[..row_len]);
     }
-
     frame_buffer
 }
 
@@ -98,15 +102,61 @@ pub fn copy_bgra_to_rgba(data: &[u8], stride: usize, width: usize, height: usize
     debug_assert!(stride >= width * 4, "stride too small for BGRA frame");
 
     let row_len = width * 4;
-    let mut frame_buffer = Vec::with_capacity(row_len * height);
+    let total = row_len * height;
+    let mut frame_buffer = vec![0u8; total];
 
+    let mut dst_offset = 0;
     for row in data.chunks(stride).take(height) {
-        for pixel in row[..row_len].chunks_exact(4) {
-            frame_buffer.push(pixel[2]);
-            frame_buffer.push(pixel[1]);
-            frame_buffer.push(pixel[0]);
-            frame_buffer.push(pixel[3]);
+        let src = &row[..row_len];
+        let dst = &mut frame_buffer[dst_offset..dst_offset + row_len];
+
+        for (d, s) in dst.chunks_exact_mut(32).zip(src.chunks_exact(32)) {
+            d[0] = s[2];
+            d[1] = s[1];
+            d[2] = s[0];
+            d[3] = s[3];
+            d[4] = s[6];
+            d[5] = s[5];
+            d[6] = s[4];
+            d[7] = s[7];
+            d[8] = s[10];
+            d[9] = s[9];
+            d[10] = s[8];
+            d[11] = s[11];
+            d[12] = s[14];
+            d[13] = s[13];
+            d[14] = s[12];
+            d[15] = s[15];
+            d[16] = s[18];
+            d[17] = s[17];
+            d[18] = s[16];
+            d[19] = s[19];
+            d[20] = s[22];
+            d[21] = s[21];
+            d[22] = s[20];
+            d[23] = s[23];
+            d[24] = s[26];
+            d[25] = s[25];
+            d[26] = s[24];
+            d[27] = s[27];
+            d[28] = s[30];
+            d[29] = s[29];
+            d[30] = s[28];
+            d[31] = s[31];
         }
+
+        let processed = (row_len / 32) * 32;
+        for (d, s) in dst[processed..]
+            .chunks_exact_mut(4)
+            .zip(src[processed..].chunks_exact(4))
+        {
+            d[0] = s[2];
+            d[1] = s[1];
+            d[2] = s[0];
+            d[3] = s[3];
+        }
+
+        dst_offset += row_len;
     }
 
     frame_buffer
diff --git a/crates/video-decode/src/avassetreader.rs b/crates/video-decode/src/avassetreader.rs
index a8db3b606f..2158f892a6 100644
--- a/crates/video-decode/src/avassetreader.rs
+++ b/crates/video-decode/src/avassetreader.rs
@@ -16,45 +16,67 @@ pub struct KeyframeIndex {
     keyframes: Vec<(u32, f64)>,
     fps: f64,
     duration_secs: f64,
+    pixel_format: Option<cv::PixelFormat>,
+    width: u32,
+    height: u32,
 }
 
 impl KeyframeIndex {
     pub fn build(path: &Path) -> Result<Self, String> {
         let build_start = std::time::Instant::now();
 
-        let input = avformat::input(path)
+        let mut input = avformat::input(path)
             .map_err(|e| format!("Failed to open video for keyframe scan: {e}"))?;
 
-        let video_stream = input
-            .streams()
-            .best(ffmpeg::media::Type::Video)
-            .ok_or("No video stream found")?;
-
-        let stream_index = video_stream.index();
-        let time_base = video_stream.time_base();
-        let fps = {
-            let rate = video_stream.avg_frame_rate();
-            if rate.denominator() == 0 {
-                30.0
-            } else {
-                rate.numerator() as f64 / rate.denominator() as f64
-            }
-        };
+        let (stream_index, time_base, fps, duration_secs, pixel_format, width, height) = {
+            let video_stream = input
+                .streams()
+                .best(ffmpeg::media::Type::Video)
+                .ok_or("No video stream found")?;
+
+            let stream_index = video_stream.index();
+            let time_base = video_stream.time_base();
+            let fps = {
+                let rate = video_stream.avg_frame_rate();
+                if rate.denominator() == 0 {
+                    30.0
+                } else {
+                    rate.numerator() as f64 / rate.denominator() as f64
+                }
+            };
 
-        let duration_secs = {
-            let duration = video_stream.duration();
-            if duration > 0 {
-                duration as f64 * time_base.numerator() as f64 / time_base.denominator() as f64
-            } else {
-                0.0
-            }
+            let duration_secs = {
+                let duration = video_stream.duration();
+                if duration > 0 {
+                    duration as f64 * time_base.numerator() as f64 / time_base.denominator() as f64
+                } else {
+                    0.0
+                }
+            };
+
+            let decoder = avcodec::Context::from_parameters(video_stream.parameters())
+                .map_err(|e| format!("decoder context / {e}"))?
+                .decoder()
+                .video()
+                .map_err(|e| format!("video decoder / {e}"))?;
+
+            let pixel_format = pixel_to_pixel_format(decoder.format()).ok();
+            let width = decoder.width();
+            let height = decoder.height();
+
+            (
+                stream_index,
+                time_base,
+                fps,
+                duration_secs,
+                pixel_format,
+                width,
+                height,
+            )
         };
 
         let mut keyframes = Vec::new();
 
-        let mut input =
-            avformat::input(path).map_err(|e| format!("Failed to reopen video for scan: {e}"))?;
-
         for (stream, packet) in input.packets() {
             if stream.index() != stream_index {
                 continue;
@@ -83,6 +105,9 @@ impl KeyframeIndex {
             keyframes,
             fps,
             duration_secs,
+            pixel_format,
+            width,
+            height,
         })
     }
 
@@ -177,6 +202,10 @@ impl KeyframeIndex {
     pub fn keyframes(&self) -> &[(u32, f64)] {
         &self.keyframes
     }
+
+    pub fn cached_video_info(&self) -> Option<(cv::PixelFormat, u32, u32)> {
+        self.pixel_format.map(|pf| (pf, self.width, self.height))
+    }
 }
 
 fn compute_seek_time(keyframe_index: Option<&Arc<KeyframeIndex>>, requested_time: f32) -> f32 {
@@ -236,7 +265,12 @@ impl AVAssetReaderDecoder {
         start_time: f32,
         keyframe_index: Option<Arc<KeyframeIndex>>,
     ) -> Result<Self, String> {
-        let (pixel_format, width, height) = {
+        let (pixel_format, width, height) = if let Some(info) = keyframe_index
+            .as_ref()
+            .and_then(|ki| ki.cached_video_info())
+        {
+            info
+        } else {
             let input = ffmpeg::format::input(&path)
                 .map_err(|e| format!("Failed to open video input '{}': {e}", path.display()))?;