Skip to content

Commit c40b853

Browse files
perf(rendering): add software-optimized NV12 path bypassing GPU compute
When running on a software wgpu adapter (lavapipe, llvmpipe, etc.), skip the GPU RGBA→NV12 compute shader and instead:

1. Read back RGBA via the standard pipelined readback path.
2. Convert RGBA→NV12 directly on the CPU with inline color conversion.

This avoids the overhead of:

- GPU compute shader dispatch through software Vulkan
- Extra GPU storage buffer allocation for NV12 data
- GPU command encoder submission for the compute pass
- Additional GPU buffer readback through wgpu

On software adapters, the GPU compute path has significant overhead from the wgpu abstraction layer, making direct CPU conversion faster.

Co-authored-by: Richie McIlroy <richiemcilroy@users.noreply.github.com>
1 parent dc5530b commit c40b853

1 file changed

Lines changed: 106 additions & 0 deletions

File tree

crates/rendering/src/lib.rs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2882,6 +2882,112 @@ impl<'a> FrameRenderer<'a> {
28822882
uniforms: ProjectUniforms,
28832883
cursor: &CursorEvents,
28842884
layers: &mut RendererLayers,
2885+
) -> Result<Option<frame_pipeline::Nv12RenderedFrame>, RenderingError> {
2886+
if self.constants.is_software_adapter {
2887+
return self
2888+
.render_nv12_software_path(segment_frames, uniforms, cursor, layers)
2889+
.await;
2890+
}
2891+
2892+
self.render_nv12_gpu_path(segment_frames, uniforms, cursor, layers)
2893+
.await
2894+
}
2895+
2896+
async fn render_nv12_software_path(
2897+
&mut self,
2898+
segment_frames: DecodedSegmentFrames,
2899+
uniforms: ProjectUniforms,
2900+
cursor: &CursorEvents,
2901+
layers: &mut RendererLayers,
2902+
) -> Result<Option<frame_pipeline::Nv12RenderedFrame>, RenderingError> {
2903+
let rgba_frame = self
2904+
.render(segment_frames, uniforms.clone(), cursor, layers)
2905+
.await?;
2906+
2907+
let Some(rgba_frame) = rgba_frame else {
2908+
return Ok(None);
2909+
};
2910+
2911+
let width = rgba_frame.width;
2912+
let height = rgba_frame.height;
2913+
let padded_bytes_per_row = rgba_frame.padded_bytes_per_row;
2914+
let frame_number = rgba_frame.frame_number;
2915+
let target_time_ns = rgba_frame.target_time_ns;
2916+
2917+
let nv12_size = (width as usize) * (height as usize) * 3 / 2;
2918+
let mut nv12_buf = self.nv12_buffer_pool.acquire(nv12_size);
2919+
2920+
let y_stride = width as usize;
2921+
let uv_stride = width as usize;
2922+
let y_plane_size = y_stride * height as usize;
2923+
let uv_plane_size = uv_stride * (height as usize / 2);
2924+
nv12_buf.resize(y_plane_size + uv_plane_size, 0);
2925+
2926+
let src_data = &rgba_frame.data;
2927+
let src_stride = padded_bytes_per_row as usize;
2928+
2929+
for row in 0..height as usize {
2930+
let src_row = &src_data[row * src_stride..row * src_stride + width as usize * 4];
2931+
let y_row = &mut nv12_buf[row * y_stride..(row + 1) * y_stride];
2932+
for col in 0..width as usize {
2933+
let r = src_row[col * 4] as i32;
2934+
let g = src_row[col * 4 + 1] as i32;
2935+
let b = src_row[col * 4 + 2] as i32;
2936+
y_row[col] = ((16 + ((65 * r + 129 * g + 25 * b + 128) >> 8)) as u8).clamp(16, 235);
2937+
}
2938+
}
2939+
2940+
let uv_offset = y_plane_size;
2941+
for row in 0..(height as usize / 2) {
2942+
let src_row0 =
2943+
&src_data[row * 2 * src_stride..row * 2 * src_stride + width as usize * 4];
2944+
let src_row1 = &src_data
2945+
[(row * 2 + 1) * src_stride..(row * 2 + 1) * src_stride + width as usize * 4];
2946+
let uv_row =
2947+
&mut nv12_buf[uv_offset + row * uv_stride..uv_offset + (row + 1) * uv_stride];
2948+
for col in 0..(width as usize / 2) {
2949+
let r = (src_row0[col * 8] as i32
2950+
+ src_row0[col * 8 + 4] as i32
2951+
+ src_row1[col * 8] as i32
2952+
+ src_row1[col * 8 + 4] as i32
2953+
+ 2)
2954+
/ 4;
2955+
let g = (src_row0[col * 8 + 1] as i32
2956+
+ src_row0[col * 8 + 5] as i32
2957+
+ src_row1[col * 8 + 1] as i32
2958+
+ src_row1[col * 8 + 5] as i32
2959+
+ 2)
2960+
/ 4;
2961+
let b = (src_row0[col * 8 + 2] as i32
2962+
+ src_row0[col * 8 + 6] as i32
2963+
+ src_row1[col * 8 + 2] as i32
2964+
+ src_row1[col * 8 + 6] as i32
2965+
+ 2)
2966+
/ 4;
2967+
uv_row[col * 2] =
2968+
((128 + ((-38 * r - 74 * g + 112 * b + 128) >> 8)) as u8).clamp(16, 240);
2969+
uv_row[col * 2 + 1] =
2970+
((128 + ((112 * r - 94 * g - 18 * b + 128) >> 8)) as u8).clamp(16, 240);
2971+
}
2972+
}
2973+
2974+
Ok(Some(frame_pipeline::Nv12RenderedFrame {
2975+
data: Arc::new(nv12_buf),
2976+
width,
2977+
height,
2978+
y_stride: width,
2979+
frame_number,
2980+
target_time_ns,
2981+
format: frame_pipeline::GpuOutputFormat::Nv12,
2982+
}))
2983+
}
2984+
2985+
async fn render_nv12_gpu_path(
2986+
&mut self,
2987+
segment_frames: DecodedSegmentFrames,
2988+
uniforms: ProjectUniforms,
2989+
cursor: &CursorEvents,
2990+
layers: &mut RendererLayers,
28852991
) -> Result<Option<frame_pipeline::Nv12RenderedFrame>, RenderingError> {
28862992
let mut last_error = None;
28872993

0 commit comments

Comments
 (0)