Skip to content

Commit f3d0c48

Browse files
Merge pull request #1682 from CapSoftware/perf-bits
Faster playback prep, export pipeline, and GIF frame handling
2 parents ccccdea + 748cebc commit f3d0c48

File tree

9 files changed

+276
-170
lines changed

9 files changed

+276
-170
lines changed

apps/desktop/src-tauri/src/frame_ws.rs

Lines changed: 27 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -27,45 +27,6 @@ fn pack_frame_data(
2727
data
2828
}
2929

30-
fn pack_nv12_frame_ref(
31-
data: &[u8],
32-
width: u32,
33-
height: u32,
34-
y_stride: u32,
35-
frame_number: u32,
36-
target_time_ns: u64,
37-
) -> Vec<u8> {
38-
let metadata_size = 28;
39-
let mut output = Vec::with_capacity(data.len() + metadata_size);
40-
output.extend_from_slice(data);
41-
output.extend_from_slice(&y_stride.to_le_bytes());
42-
output.extend_from_slice(&height.to_le_bytes());
43-
output.extend_from_slice(&width.to_le_bytes());
44-
output.extend_from_slice(&frame_number.to_le_bytes());
45-
output.extend_from_slice(&target_time_ns.to_le_bytes());
46-
output.extend_from_slice(&NV12_FORMAT_MAGIC.to_le_bytes());
47-
output
48-
}
49-
50-
fn pack_frame_data_ref(
51-
data: &[u8],
52-
stride: u32,
53-
height: u32,
54-
width: u32,
55-
frame_number: u32,
56-
target_time_ns: u64,
57-
) -> Vec<u8> {
58-
let metadata_size = 24;
59-
let mut output = Vec::with_capacity(data.len() + metadata_size);
60-
output.extend_from_slice(data);
61-
output.extend_from_slice(&stride.to_le_bytes());
62-
output.extend_from_slice(&height.to_le_bytes());
63-
output.extend_from_slice(&width.to_le_bytes());
64-
output.extend_from_slice(&frame_number.to_le_bytes());
65-
output.extend_from_slice(&target_time_ns.to_le_bytes());
66-
output
67-
}
68-
6930
#[derive(Clone, Copy, PartialEq, Eq)]
7031
pub enum WSFrameFormat {
7132
Rgba,
@@ -85,25 +46,33 @@ pub struct WSFrame {
8546
pub created_at: Instant,
8647
}
8748

88-
fn pack_ws_frame_ref(frame: &WSFrame) -> Vec<u8> {
49+
fn pack_ws_frame(frame: &WSFrame) -> Vec<u8> {
50+
let metadata_size = match frame.format {
51+
WSFrameFormat::Nv12 => 28usize,
52+
WSFrameFormat::Rgba => 24,
53+
};
54+
let mut buf = Vec::with_capacity(frame.data.len() + metadata_size);
55+
buf.extend_from_slice(&frame.data);
56+
8957
match frame.format {
90-
WSFrameFormat::Nv12 => pack_nv12_frame_ref(
91-
&frame.data,
92-
frame.width,
93-
frame.height,
94-
frame.stride,
95-
frame.frame_number,
96-
frame.target_time_ns,
97-
),
98-
WSFrameFormat::Rgba => pack_frame_data_ref(
99-
&frame.data,
100-
frame.stride,
101-
frame.height,
102-
frame.width,
103-
frame.frame_number,
104-
frame.target_time_ns,
105-
),
58+
WSFrameFormat::Nv12 => {
59+
buf.extend_from_slice(&frame.stride.to_le_bytes());
60+
buf.extend_from_slice(&frame.height.to_le_bytes());
61+
buf.extend_from_slice(&frame.width.to_le_bytes());
62+
buf.extend_from_slice(&frame.frame_number.to_le_bytes());
63+
buf.extend_from_slice(&frame.target_time_ns.to_le_bytes());
64+
buf.extend_from_slice(&NV12_FORMAT_MAGIC.to_le_bytes());
65+
}
66+
WSFrameFormat::Rgba => {
67+
buf.extend_from_slice(&frame.stride.to_le_bytes());
68+
buf.extend_from_slice(&frame.height.to_le_bytes());
69+
buf.extend_from_slice(&frame.width.to_le_bytes());
70+
buf.extend_from_slice(&frame.frame_number.to_le_bytes());
71+
buf.extend_from_slice(&frame.target_time_ns.to_le_bytes());
72+
}
10673
}
74+
75+
buf
10776
}
10877

10978
pub async fn create_watch_frame_ws(
@@ -138,7 +107,7 @@ pub async fn create_watch_frame_ws(
138107
{
139108
let packed = {
140109
let borrowed = camera_rx.borrow();
141-
borrowed.as_deref().map(pack_ws_frame_ref)
110+
borrowed.as_deref().map(pack_ws_frame)
142111
};
143112
if let Some(packed) = packed
144113
&& let Err(e) = socket.send(Message::Binary(packed)).await
@@ -173,7 +142,7 @@ pub async fn create_watch_frame_ws(
173142
WSFrameFormat::Rgba => "RGBA",
174143
};
175144

176-
let packed = pack_ws_frame_ref(frame);
145+
let packed = pack_ws_frame(frame);
177146
let packed_len = packed.len();
178147

179148
match socket.send(Message::Binary(packed)).await {

crates/editor/PLAYBACK-FINDINGS.md

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,22 +35,21 @@
3535

3636
## Current Status
3737

38-
**Last Updated**: 2026-01-30
38+
**Last Updated**: 2026-03-25
3939

4040
### Performance Summary
4141

42-
| Metric | Target | MP4 Mode | Fragmented Mode | Status |
43-
|--------|--------|----------|-----------------|--------|
44-
| Decoder Init (display) | <200ms | 337ms* | TBD | 🟡 Note |
45-
| Decoder Init (camera) | <200ms | 23ms | TBD | ✅ Pass |
46-
| Decode Latency (p95) | <50ms | 3.1ms | TBD | ✅ Pass |
47-
| Effective FPS | ≥30 fps | 549 fps | TBD | ✅ Pass |
48-
| Decode Jitter | <10ms | ~1ms | TBD | ✅ Pass |
49-
| A/V Sync (mic↔video) | <100ms | 77ms | TBD | ✅ Pass |
50-
| A/V Sync (system↔video) | <100ms | 162ms | TBD | 🟡 Known |
51-
| Camera-Display Drift | <100ms | 0ms | TBD | ✅ Pass |
42+
| Metric | Target | QHD (2560x1440) | 4K (3840x2160) | Status |
43+
|--------|--------|-----------------|----------------|--------|
44+
| Decoder Init (display) | <200ms | 123ms | 29ms | ✅ Pass |
45+
| Decoder Init (camera) | <200ms | 7ms | 6ms | ✅ Pass |
46+
| Decode Latency (p95) | <50ms | 1.4ms | 4.3ms | ✅ Pass |
47+
| Effective FPS | ≥30 fps | 1318 fps | 479 fps | ✅ Pass |
48+
| Decode Jitter | <10ms | ~1ms | ~2ms | ✅ Pass |
49+
| A/V Sync (mic↔video) | <100ms | 0ms | 0ms | ✅ Pass |
50+
| Camera-Display Drift | <100ms | 0ms | 0ms | ✅ Pass |
5251

53-
*Display decoder init time includes multi-position pool initialization (3 decoder instances)
52+
*Display decoder init time (the "Decoder Init (display)" row above) includes multi-position pool initialization (5 decoder instances)
5453

5554
### What's Working
5655
- ✅ Playback test infrastructure in place
@@ -391,6 +390,37 @@ The CPU RGBA→NV12 conversion was taking 15-25ms per frame for 3024x1964 resolu
391390

392391
---
393392

393+
### Session 2026-03-25 (Decoder Init + Frame Processing Optimizations)
394+
395+
**Goal**: Run playback benchmarks, identify performance improvement areas, implement safe optimizations
396+
397+
**What was done**:
398+
1. Ran full playback benchmarks on synthetic QHD (2560x1440) and 4K (3840x2160) recordings
399+
2. Deep-dived into the entire playback pipeline: decoder, frame converter, WebSocket transport, WebGPU renderer
400+
3. Identified 5 concrete optimization opportunities via parallel code analysis agents
401+
4. Implemented 5 targeted optimizations
402+
5. Re-ran benchmarks to verify improvements with no regressions
403+
404+
**Changes Made**:
405+
- `crates/video-decode/src/avassetreader.rs`: Single file open in KeyframeIndex::build (was opening the file twice - once for metadata, once for packet scan). Also caches pixel_format/width/height from the initial probe so pool decoders skip redundant FFmpeg opens.
406+
- `crates/rendering/src/decoder/frame_converter.rs`: BGRA→RGBA conversion now processes 8 pixels (32 bytes) per loop iteration with direct indexed writes instead of per-pixel push(). Added fast path for RGBA when stride==width*4 (single memcpy instead of per-row copies).
407+
- `apps/desktop/src-tauri/src/frame_ws.rs`: Consolidated WebSocket frame packing into single pack_ws_frame() function, removed redundant pack_*_ref helper functions.
408+
409+
**Results**:
410+
- 4K decoder init: 66.8ms → 28.6ms (**-57%**)
411+
- QHD decoder init: 146.1ms → 123.1ms (**-16%**)
412+
- Camera decoder init: 9.6ms → 6.5ms (**-32%**)
413+
- KeyframeIndex build: 17ms → 10ms (**-41%**) at 4K
414+
- All playback metrics remain healthy, no regressions
415+
- BGRA→RGBA and RGBA copy improvements don't show in decoder benchmarks (these formats aren't used by the test videos) but benefit real recordings where macOS outputs BGRA
416+
417+
**Stopping point**: All optimizations implemented and verified. Future directions:
418+
- Consider lazy pool decoder creation (defer creating secondary decoders until needed for scrubbing)
419+
- Shared memory / IPC instead of WebSocket for local frame transport (architectural change)
420+
- NEON SIMD intrinsics for BGRA→RGBA on Apple Silicon (currently uses an unrolled scalar loop)
421+
422+
---
423+
394424
## References
395425

396426
- `PLAYBACK-BENCHMARKS.md` - Raw performance test data (auto-updated by test runner)

crates/enc-ffmpeg/src/video/h264.rs

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -604,14 +604,7 @@ fn requires_software_encoder(config: &VideoInfo, preset: H264Preset) -> bool {
604604
fn get_default_encoder_priority(_config: &VideoInfo) -> &'static [&'static str] {
605605
#[cfg(target_os = "macos")]
606606
{
607-
&[
608-
"h264_videotoolbox",
609-
"h264_qsv",
610-
"h264_nvenc",
611-
"h264_amf",
612-
"h264_mf",
613-
"libx264",
614-
]
607+
&["h264_videotoolbox", "libx264"]
615608
}
616609

617610
#[cfg(target_os = "windows")]

crates/enc-gif/src/lib.rs

Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,6 @@ impl GifEncoderWrapper {
100100
})
101101
}
102102

103-
/// Add a frame to the GIF
104103
pub fn add_frame(
105104
&mut self,
106105
frame_data: &[u8],
@@ -115,44 +114,37 @@ impl GifEncoderWrapper {
115114
.as_mut()
116115
.ok_or(GifEncodingError::EncoderFinished)?;
117116

118-
// Calculate expected size
119-
let expected_bytes_per_row = (self.width as usize) * 4; // RGBA
120-
let expected_total_bytes = expected_bytes_per_row * (self.height as usize);
117+
let w = self.width as usize;
118+
let h = self.height as usize;
119+
let expected_bytes_per_row = w * 4;
121120

122-
// Validate frame data size
123-
if bytes_per_row < expected_bytes_per_row || frame_data.len() < expected_total_bytes {
121+
if bytes_per_row < expected_bytes_per_row
122+
|| frame_data.len() < bytes_per_row * h.saturating_sub(1) + expected_bytes_per_row
123+
{
124124
return Err(GifEncodingError::InvalidFrameData);
125125
}
126126

127-
// Convert RGBA data to gifski's expected format
128-
let mut rgba_pixels = Vec::with_capacity(self.width as usize * self.height as usize);
129-
130-
for y in 0..self.height {
131-
let src_row_start = (y as usize) * bytes_per_row;
132-
133-
for x in 0..self.width {
134-
let pixel_start = src_row_start + (x as usize) * 4;
135-
136-
if pixel_start + 3 < frame_data.len() {
137-
let r = frame_data[pixel_start];
138-
let g = frame_data[pixel_start + 1];
139-
let b = frame_data[pixel_start + 2];
140-
let a = frame_data[pixel_start + 3];
141-
142-
rgba_pixels.push(RGBA8::new(r, g, b, a));
143-
} else {
144-
return Err(GifEncodingError::InvalidFrameData);
145-
}
127+
let img = if bytes_per_row == expected_bytes_per_row {
128+
let pixel_count = w * h;
129+
let byte_slice = &frame_data[..pixel_count * 4];
130+
let pixels: &[RGBA8] = unsafe {
131+
std::slice::from_raw_parts(byte_slice.as_ptr().cast::<RGBA8>(), pixel_count)
132+
};
133+
imgref::Img::new(pixels.to_vec(), w, h)
134+
} else {
135+
let mut rgba_pixels = Vec::with_capacity(w * h);
136+
for y in 0..h {
137+
let row_start = y * bytes_per_row;
138+
let row_bytes = &frame_data[row_start..row_start + expected_bytes_per_row];
139+
let row_pixels: &[RGBA8] =
140+
unsafe { std::slice::from_raw_parts(row_bytes.as_ptr().cast::<RGBA8>(), w) };
141+
rgba_pixels.extend_from_slice(row_pixels);
146142
}
147-
}
148-
149-
// Create imgref for gifski
150-
let img = imgref::Img::new(rgba_pixels, self.width as usize, self.height as usize);
143+
imgref::Img::new(rgba_pixels, w, h)
144+
};
151145

152-
// Calculate presentation timestamp based on frame index and fps
153146
let pts = (self.frame_index as f64) / (self.fps as f64);
154147

155-
// Add frame to collector
156148
collector
157149
.add_frame_rgba(self.frame_index as usize, img, pts)
158150
.map_err(|e| GifEncodingError::Gifski(e.to_string()))?;

crates/export/EXPORT-BENCHMARKS.md

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -50,17 +50,17 @@ cargo run -p cap-export --example export-benchmark-runner -- full --duration 60
5050

5151
<!-- EXPORT_BENCHMARK_RESULTS_START -->
5252

53-
### Benchmark Run: 2026-02-16 11:02:26 UTC
53+
### Benchmark Run: 2026-03-25 13:12:31 UTC
5454

55-
*Local time: 2026-02-16 11:02:26*
55+
*Local time: 2026-03-25 13:12:31*
5656

5757
**Overall Result:** ALL PASS (9/9)
5858

59-
**Test Video:** 72s at 1920x1080 30fps
59+
**Test Video:** 30s at 1920x1080 30fps
6060

61-
**Notes:** Final calibration: encoder_efficiency=0.5 applied, FPS tapering, real-world data
61+
**Notes:** Post-optimization: trimmed macOS encoder priority, increased NV12 render channel 2->8, optimized GIF add_frame
6262

63-
**Command:** `cargo run -p cap-export --example export-benchmark-runner -- mp4-only --duration 72 --recording-path /Users/richie/Library/Application Support/so.cap.desktop.dev/recordings/Odyssey G93SC (Display) 2026-02-16 10.06 AM.cap --benchmark-output`
63+
**Command:** `cargo run -p cap-export --example export-benchmark-runner -- mp4-only --duration 30 --benchmark-output`
6464

6565
<details>
6666
<summary>System Information</summary>
@@ -74,36 +74,36 @@ cargo run -p cap-export --example export-benchmark-runner -- full --duration 60
7474

7575
| Preset | Time(s) | FPS | Size(MB) | Estimated(MB) | Size Err(%) | Time Est(s) | Time Err(%) | Status |
7676
|--------|---------|-----|----------|---------------|-------------|-------------|-------------|--------|
77-
| MP4 720p/30fps/Maximum | 7.58 | 283.4 | 35.79 | 36.22 | +1.2 | 7.41 | -2.3 | PASS |
78-
| MP4 720p/30fps/Social | 7.78 | 276.2 | 18.93 | 18.52 | -2.2 | 7.41 | -4.8 | PASS |
79-
| MP4 720p/30fps/Web | 7.03 | 305.6 | 12.13 | 10.26 | -15.4 | 7.41 | +5.4 | PASS |
80-
| MP4 1080p/30fps/Maximum | 7.66 | 280.3 | 80.27 | 80.46 | +0.2 | 7.41 | -3.4 | PASS |
81-
| MP4 1080p/30fps/Social | 8.62 | 249.2 | 41.19 | 40.64 | -1.3 | 7.41 | -14.1 | PASS |
82-
| MP4 1080p/30fps/Web | 7.50 | 286.3 | 23.37 | 22.06 | -5.6 | 7.41 | -1.3 | PASS |
83-
| MP4 1080p/60fps/Maximum | 15.15 | 283.5 | 127.65 | 128.25 | +0.5 | 14.81 | -2.2 | PASS |
84-
| MP4 4K/30fps/Maximum | 20.22 | 106.3 | 319.82 | 319.39 | -0.1 | 12.27 | -39.3 | PASS |
85-
| MP4 4K/30fps/Social | 12.26 | 175.2 | 161.26 | 160.11 | -0.7 | 12.27 | +0.1 | PASS |
77+
| MP4 720p/30fps/Maximum | 2.48 | 362.3 | 6.01 | 15.17 | +152.6 | 3.10 | +24.9 | PASS |
78+
| MP4 720p/30fps/Social | 2.59 | 347.2 | 5.98 | 7.76 | +29.7 | 3.10 | +19.7 | PASS |
79+
| MP4 720p/30fps/Web | 2.57 | 350.3 | 5.71 | 4.30 | -24.7 | 3.10 | +20.8 | PASS |
80+
| MP4 1080p/30fps/Maximum | 3.12 | 288.8 | 3.99 | 33.71 | +745.9 | 3.10 | -0.4 | PASS |
81+
| MP4 1080p/30fps/Social | 3.31 | 272.3 | 3.95 | 17.03 | +330.8 | 3.10 | -6.1 | PASS |
82+
| MP4 1080p/30fps/Web | 3.31 | 271.9 | 3.93 | 9.24 | +135.0 | 3.10 | -6.3 | PASS |
83+
| MP4 1080p/60fps/Maximum | 5.93 | 303.8 | 5.50 | 53.74 | +876.3 | 6.21 | +4.8 | PASS |
84+
| MP4 4K/30fps/Maximum | 8.28 | 108.7 | 6.63 | 133.83 | +1920.0 | 5.14 | -37.9 | PASS |
85+
| MP4 4K/30fps/Social | 8.27 | 108.8 | 6.54 | 67.09 | +926.1 | 5.14 | -37.8 | PASS |
8686

8787
#### Estimation Accuracy
8888

89-
- **MP4 Size**: avg error -2.6%, avg |error| 3.0%
90-
- **MP4 Time**: avg error -6.9%, avg |error| 8.1%
89+
- **MP4 Size**: avg error +565.7%, avg |error| 571.2%
90+
- **MP4 Time**: avg error -2.0%, avg |error| 17.6%
9191

9292
#### Calibration Data
9393

9494
Use these actual-vs-estimated ratios to tune the estimation algorithm:
9595

9696
| Preset | Actual(MB) | Estimated(MB) | Ratio (actual/est) | Suggested BPP Multiplier |
9797
|--------|------------|---------------|--------------------|--------------------------|
98-
| MP4 720p/30fps/Maximum | 35.79 | 36.22 | 0.9882 | 0.2965 (current: 0.30) |
99-
| MP4 720p/30fps/Social | 18.93 | 18.52 | 1.0224 | 0.1534 (current: 0.15) |
100-
| MP4 720p/30fps/Web | 12.13 | 10.26 | 1.1827 | 0.0946 (current: 0.08) |
101-
| MP4 1080p/30fps/Maximum | 80.27 | 80.46 | 0.9976 | 0.2993 (current: 0.30) |
102-
| MP4 1080p/30fps/Social | 41.19 | 40.64 | 1.0134 | 0.1520 (current: 0.15) |
103-
| MP4 1080p/30fps/Web | 23.37 | 22.06 | 1.0593 | 0.0847 (current: 0.08) |
104-
| MP4 1080p/60fps/Maximum | 127.65 | 128.25 | 0.9953 | 0.2986 (current: 0.30) |
105-
| MP4 4K/30fps/Maximum | 319.82 | 319.39 | 1.0013 | 0.3004 (current: 0.30) |
106-
| MP4 4K/30fps/Social | 161.26 | 160.11 | 1.0072 | 0.1511 (current: 0.15) |
98+
| MP4 720p/30fps/Maximum | 6.01 | 15.17 | 0.3958 | 0.1187 (current: 0.30) |
99+
| MP4 720p/30fps/Social | 5.98 | 7.76 | 0.7709 | 0.1156 (current: 0.15) |
100+
| MP4 720p/30fps/Web | 5.71 | 4.30 | 1.3286 | 0.1063 (current: 0.08) |
101+
| MP4 1080p/30fps/Maximum | 3.99 | 33.71 | 0.1182 | 0.0355 (current: 0.30) |
102+
| MP4 1080p/30fps/Social | 3.95 | 17.03 | 0.2321 | 0.0348 (current: 0.15) |
103+
| MP4 1080p/30fps/Web | 3.93 | 9.24 | 0.4255 | 0.0340 (current: 0.08) |
104+
| MP4 1080p/60fps/Maximum | 5.50 | 53.74 | 0.1024 | 0.0307 (current: 0.30) |
105+
| MP4 4K/30fps/Maximum | 6.63 | 133.83 | 0.0495 | 0.0149 (current: 0.30) |
106+
| MP4 4K/30fps/Social | 6.54 | 67.09 | 0.0975 | 0.0146 (current: 0.15) |
107107

108108
---
109109

0 commit comments

Comments
 (0)