Skip to content

Commit 581e18b

Browse files
committed
feat(splat3d): render-depth certification P2 — SIMD batch kernels
Add F32x16-wide `camera_depth_variance_batch` (16-wide quat→R→W·Σ·Wᵀ depth spine, same convention as project_chunk_x16) and `certify_batch_simd` (SIMD interval math + per-lane branchy writeback, the project.rs SIMD-bulk/scalar-tail split). Share ZERO_CERT for culled slots across scalar+SIMD paths. Consumes crate::simd::F32x16 only — no dispatch changes. Two Tier-2 parity tests: depth-variance SIMD-vs-scalar within 1e-3, and full certify_batch SIMD-vs-scalar end-to-end (incl. culled ZERO_CERT slots). https://claude.ai/code/session_017GFLBnDy23AWBqvkbHHC41
1 parent d67cfda commit 581e18b

1 file changed

Lines changed: 320 additions & 13 deletions

File tree

src/hpc/splat3d/depth_cert.rs

Lines changed: 320 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,10 @@
5151
//! path (P2) consumes [`super::project::ProjectedBatch`] and must match this
5252
//! reference within tolerance.
5353
54-
use super::project::ProjectedBatch;
54+
use super::gaussian::GaussianBatch;
55+
use super::project::{Camera, ProjectedBatch};
5556
use super::spd3::Spd3;
57+
use crate::simd::F32x16;
5658

5759
/// Cesium / OGC 3D Tiles screen-space error for a feature of world-space
5860
/// extent `geometric_error` seen at camera-space `distance`, lifted from
@@ -239,33 +241,242 @@ pub fn certify_batch_scalar(
239241
out.reserve(batch.len);
240242
for i in 0..batch.len {
241243
if batch.valid[i] == 0 {
242-
out.push(RenderDepthCertificate {
243-
min_depth: 0.0,
244-
max_depth: 0.0,
245-
depth_variance: 0.0,
246-
projected_radius_px: 0.0,
247-
occlusion_confidence: 0.0,
248-
ordering_uncertainty: 0.0,
249-
quantization_depth_error: 0.0,
250-
covariance_depth_error: 0.0,
251-
total_depth_error: 0.0,
252-
passed: false,
253-
});
244+
out.push(ZERO_CERT);
254245
continue;
255246
}
256247
let cov_zz = depth_var.get(i).copied().unwrap_or(0.0);
257248
out.push(certify_depth_scalar(batch.depth[i], cov_zz, batch.radius[i], params));
258249
}
259250
}
260251

252+
// ════════════════════════════════════════════════════════════════════════════
253+
// P2 — SIMD batch kernels (consume `crate::simd::F32x16`; no dispatch changes)
254+
// ════════════════════════════════════════════════════════════════════════════
255+
256+
/// SIMD chunk width — 16 lanes regardless of native tier, mirroring
257+
/// `project::project_chunk_x16`.
258+
const CHUNK: usize = 16;
259+
260+
/// Stage one logical 16-wide window of a SoA `f32` column into a padded
261+
/// `[f32; 16]`; lanes beyond `len` are left zero.
262+
#[inline]
263+
fn stage16(col: &[f32], start: usize, len: usize) -> [f32; 16] {
264+
let mut buf = [0.0f32; 16];
265+
let end = (start + CHUNK).min(len);
266+
for (k, slot) in buf.iter_mut().enumerate().take(end - start) {
267+
*slot = col[start + k];
268+
}
269+
buf
270+
}
271+
272+
/// SIMD (16-wide) camera-space depth variance `Σ_cam[2][2]` for every gaussian,
273+
/// written into `out[0..gaussians.len]`. Vectorized analogue of
274+
/// [`camera_depth_variance`]; uses the same quaternion→rotation→`W·Σ·Wᵀ`
275+
/// convention as `project::project_chunk_x16` (so it matches that kernel's
276+
/// covariance spine within the same tolerance).
277+
///
278+
/// Does not allocate; `out` must have length ≥ `gaussians.len`.
279+
pub fn camera_depth_variance_batch(gaussians: &GaussianBatch, camera: &Camera, out: &mut [f32]) {
280+
let len = gaussians.len;
281+
assert!(out.len() >= len, "camera_depth_variance_batch: out.len() {} < gaussians.len {len}", out.len());
282+
if len == 0 {
283+
return;
284+
}
285+
let v = &camera.view;
286+
let w20 = F32x16::splat(v[2][0]);
287+
let w21 = F32x16::splat(v[2][1]);
288+
let w22 = F32x16::splat(v[2][2]);
289+
let one = F32x16::splat(1.0);
290+
let two = F32x16::splat(2.0);
291+
let zero = F32x16::splat(0.0);
292+
293+
let mut start = 0;
294+
while start < len {
295+
let qw = F32x16::from_slice(&stage16(&gaussians.quat_w, start, len));
296+
let qx = F32x16::from_slice(&stage16(&gaussians.quat_x, start, len));
297+
let qy = F32x16::from_slice(&stage16(&gaussians.quat_y, start, len));
298+
let qz = F32x16::from_slice(&stage16(&gaussians.quat_z, start, len));
299+
let scx = F32x16::from_slice(&stage16(&gaussians.scale_x, start, len));
300+
let scy = F32x16::from_slice(&stage16(&gaussians.scale_y, start, len));
301+
let scz = F32x16::from_slice(&stage16(&gaussians.scale_z, start, len));
302+
303+
// Quaternion → rotation matrix (mirrors project_chunk_x16).
304+
let xx = qx * qx;
305+
let yy = qy * qy;
306+
let zz = qz * qz;
307+
let xy = qx * qy;
308+
let xz = qx * qz;
309+
let yz = qy * qz;
310+
let wx = qw * qx;
311+
let wy = qw * qy;
312+
let wz = qw * qz;
313+
let r00 = one - two * (yy + zz);
314+
let r01 = two * (xy - wz);
315+
let r02 = two * (xz + wy);
316+
let r10 = two * (xy + wz);
317+
let r11 = one - two * (xx + zz);
318+
let r12 = two * (yz - wx);
319+
let r20 = two * (xz - wy);
320+
let r21 = two * (yz + wx);
321+
let r22 = one - two * (xx + yy);
322+
323+
// M = R · diag(scale²); Σ_world upper triangle = M · Rᵀ.
324+
let s0 = scx * scx;
325+
let s1 = scy * scy;
326+
let s2 = scz * scz;
327+
let m00 = r00 * s0;
328+
let m01 = r01 * s1;
329+
let m02 = r02 * s2;
330+
let m10 = r10 * s0;
331+
let m11 = r11 * s1;
332+
let m12 = r12 * s2;
333+
let m20 = r20 * s0;
334+
let m21 = r21 * s1;
335+
let m22 = r22 * s2;
336+
let sw11 = m00 * r00 + m01 * r01 + m02 * r02;
337+
let sw12 = m00 * r10 + m01 * r11 + m02 * r12;
338+
let sw13 = m00 * r20 + m01 * r21 + m02 * r22;
339+
let sw22 = m10 * r10 + m11 * r11 + m12 * r12;
340+
let sw23 = m10 * r20 + m11 * r21 + m12 * r22;
341+
let sw33 = m20 * r20 + m21 * r21 + m22 * r22;
342+
343+
// Σ_cam[2][2] = w2 · Σ_world · w2ᵀ (w2 = third view row).
344+
let cov = w20 * w20 * sw11
345+
+ w21 * w21 * sw22
346+
+ w22 * w22 * sw33
347+
+ two * (w20 * w21 * sw12 + w20 * w22 * sw13 + w21 * w22 * sw23);
348+
349+
let mut arr = [0.0f32; 16];
350+
cov.simd_max(zero).copy_to_slice(&mut arr);
351+
let end = (start + CHUNK).min(len);
352+
out[start..end].copy_from_slice(&arr[..end - start]);
353+
start += CHUNK;
354+
}
355+
}
356+
357+
/// SIMD (16-wide) certificate reduction over a projected batch — the
358+
/// vectorized twin of [`certify_batch_scalar`]. The element-wise interval
359+
/// math (`σ_z`, `k·σ_z`, total, min/max depth) runs through `F32x16`; the two
360+
/// branchy fields (ordering/occlusion) are finished per lane, mirroring the
361+
/// SIMD-bulk / scalar-writeback split used by `project::project_chunk_x16`.
362+
///
363+
/// `depth_var[i]` is `Σ_cam[2][2]` for splat `i` (from
364+
/// [`camera_depth_variance_batch`]). Culled slots get a zeroed,
365+
/// `passed = false` certificate. Does not panic on malformed input.
366+
pub fn certify_batch_simd(
367+
batch: &ProjectedBatch, depth_var: &[f32], params: &DepthCertParams, out: &mut Vec<RenderDepthCertificate>,
368+
) {
369+
out.clear();
370+
let len = batch.len;
371+
if len == 0 {
372+
return;
373+
}
374+
out.reserve(len);
375+
376+
// All caller budgets are batch constants → one broadcast.
377+
let const_terms = params.camera_transform_error
378+
+ params.quantization_depth_error
379+
+ params.splat_support_overlap_error
380+
+ 0.5 * params.sort_bucket_width
381+
+ params.lod_substitution_error
382+
+ params.sampling_discrepancy;
383+
let const_v = F32x16::splat(const_terms);
384+
let k = F32x16::splat(params.k_sigma);
385+
let zero = F32x16::splat(0.0);
386+
387+
let mut start = 0;
388+
while start < len {
389+
let depth_v = F32x16::from_slice(&stage16(&batch.depth, start, len));
390+
let radius_v = F32x16::from_slice(&stage16(&batch.radius, start, len));
391+
let var_v = F32x16::from_slice(&stage16(depth_var, start, len)).simd_max(zero);
392+
393+
let sigma = var_v.sqrt();
394+
let cov_err = k * sigma;
395+
let total = const_v + cov_err;
396+
let min_d = (depth_v - cov_err).simd_max(zero);
397+
let max_d = depth_v + cov_err;
398+
399+
let mut var_a = [0.0f32; 16];
400+
let mut cov_a = [0.0f32; 16];
401+
let mut tot_a = [0.0f32; 16];
402+
let mut min_a = [0.0f32; 16];
403+
let mut max_a = [0.0f32; 16];
404+
let mut rad_a = [0.0f32; 16];
405+
var_v.copy_to_slice(&mut var_a);
406+
cov_err.copy_to_slice(&mut cov_a);
407+
total.copy_to_slice(&mut tot_a);
408+
min_d.copy_to_slice(&mut min_a);
409+
max_d.copy_to_slice(&mut max_a);
410+
radius_v.copy_to_slice(&mut rad_a);
411+
412+
let end = (start + CHUNK).min(len);
413+
for k in 0..(end - start) {
414+
let idx = start + k;
415+
if batch.valid[idx] == 0 {
416+
out.push(ZERO_CERT);
417+
continue;
418+
}
419+
let cov_e = cov_a[k];
420+
let ordering_uncertainty = if params.sort_bucket_width > 0.0 {
421+
(2.0 * cov_e / params.sort_bucket_width).clamp(0.0, 1.0)
422+
} else {
423+
0.0
424+
};
425+
let occlusion_confidence = if params.splat_support_overlap_error > 0.0 {
426+
(1.0 - params.splat_support_overlap_error / cov_e.max(1e-6)).clamp(0.0, 1.0)
427+
} else {
428+
1.0
429+
};
430+
let total_depth_error = tot_a[k];
431+
out.push(RenderDepthCertificate {
432+
min_depth: min_a[k],
433+
max_depth: max_a[k],
434+
depth_variance: var_a[k],
435+
projected_radius_px: rad_a[k],
436+
occlusion_confidence,
437+
ordering_uncertainty,
438+
quantization_depth_error: params.quantization_depth_error,
439+
covariance_depth_error: cov_e,
440+
total_depth_error,
441+
passed: total_depth_error.is_finite() && total_depth_error <= params.max_total_depth_error,
442+
});
443+
}
444+
start += CHUNK;
445+
}
446+
}
447+
448+
/// Zeroed certificate for culled slots (shared by scalar + SIMD batch paths).
449+
const ZERO_CERT: RenderDepthCertificate = RenderDepthCertificate {
450+
min_depth: 0.0,
451+
max_depth: 0.0,
452+
depth_variance: 0.0,
453+
projected_radius_px: 0.0,
454+
occlusion_confidence: 0.0,
455+
ordering_uncertainty: 0.0,
456+
quantization_depth_error: 0.0,
457+
covariance_depth_error: 0.0,
458+
total_depth_error: 0.0,
459+
passed: false,
460+
};
461+
261462
#[cfg(test)]
262463
mod tests {
464+
use super::super::gaussian::{Gaussian3D, GaussianBatch};
465+
use super::super::project::{project_batch, Camera, ProjectedBatch};
263466
use super::*;
264467

265468
fn approx(a: f32, b: f32, tol: f32) -> bool {
266469
(a - b).abs() <= tol
267470
}
268471

472+
/// xorshift32 → [0,1), deterministic test data.
473+
fn rng(s: &mut u32) -> f32 {
474+
*s ^= *s << 13;
475+
*s ^= *s >> 17;
476+
*s ^= *s << 5;
477+
(*s as f32) / (u32::MAX as f32)
478+
}
479+
269480
const IDENTITY_VIEW: [[f32; 4]; 4] =
270481
[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]];
271482

@@ -405,4 +616,100 @@ mod tests {
405616
assert!(approx(c1.occlusion_confidence, 0.5, 1e-5), "got {}", c1.occlusion_confidence);
406617
assert!((0.0..=1.0).contains(&c1.occlusion_confidence));
407618
}
619+
620+
// ── P2: SIMD batch parity (validation plan Tier 2) ────────────────────────
621+
622+
#[test]
623+
fn depth_variance_batch_matches_scalar_reference() {
624+
// Varied scales + normalized non-identity quaternions under a rotated
625+
// view — exercises the full quat→R→W·Σ·Wᵀ depth spine in SIMD.
626+
let mut batch = GaussianBatch::with_capacity(40);
627+
let mut st = 0x1234_5678u32;
628+
for _ in 0..40 {
629+
let mut g = Gaussian3D::unit();
630+
g.mean = [0.0, 0.0, 2.0];
631+
g.scale = [0.1 + rng(&mut st) * 1.5, 0.1 + rng(&mut st) * 1.5, 0.1 + rng(&mut st) * 1.5];
632+
let q = [rng(&mut st) * 2.0 - 1.0, rng(&mut st) * 2.0 - 1.0, rng(&mut st) * 2.0 - 1.0, rng(&mut st) * 2.0 - 1.0];
633+
let n = (q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]).sqrt().max(1e-6);
634+
g.quat = [q[0] / n, q[1] / n, q[2] / n, q[3] / n];
635+
g.opacity = 1.0;
636+
batch.push(g);
637+
}
638+
// 90° about +Y so the look-axis is non-trivial.
639+
let view = [[0.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 0.0], [-1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0]];
640+
let cam = Camera {
641+
view, fx: 512.0, fy: 512.0, cx: 256.0, cy: 256.0, near: 0.01, far: 1000.0,
642+
width: 512, height: 512, position: [0.0, 0.0, 0.0],
643+
};
644+
let mut got = vec![0.0f32; batch.len];
645+
camera_depth_variance_batch(&batch, &cam, &mut got);
646+
for i in 0..batch.len {
647+
let want = camera_depth_variance(
648+
[batch.scale_x[i], batch.scale_y[i], batch.scale_z[i]],
649+
[batch.quat_w[i], batch.quat_x[i], batch.quat_y[i], batch.quat_z[i]],
650+
&view,
651+
);
652+
// Same tolerance project.rs uses for SIMD-vs-Spd3 covariance.
653+
assert!((got[i] - want).abs() <= 1e-3 * (1.0 + want.abs()), "i={i} simd={} scalar={want}", got[i]);
654+
}
655+
}
656+
657+
#[test]
658+
fn certify_batch_simd_matches_scalar_end_to_end() {
659+
// Mix of visible, behind-camera, and off-screen splats so both the
660+
// certified and the culled (ZERO_CERT) paths are exercised.
661+
let mut batch = GaussianBatch::with_capacity(50);
662+
let mut st = 0xC0FF_EE11u32;
663+
for i in 0..50 {
664+
let mut g = Gaussian3D::unit();
665+
let z = if i % 7 == 0 { -1.0 } else { 1.0 + rng(&mut st) * 6.0 };
666+
let x = if i % 11 == 0 { 9000.0 } else { rng(&mut st) * 2.0 - 1.0 };
667+
g.mean = [x, rng(&mut st) * 2.0 - 1.0, z];
668+
g.scale = [0.1 + rng(&mut st) * 0.8, 0.1 + rng(&mut st) * 0.8, 0.1 + rng(&mut st) * 0.8];
669+
g.quat = [1.0, 0.0, 0.0, 0.0];
670+
g.opacity = rng(&mut st);
671+
batch.push(g);
672+
}
673+
let cam = Camera::identity_at_origin(256, 256);
674+
let mut proj = ProjectedBatch::with_capacity(batch.capacity);
675+
project_batch(&batch, &cam, &mut proj);
676+
677+
let mut dv = vec![0.0f32; batch.len];
678+
camera_depth_variance_batch(&batch, &cam, &mut dv);
679+
680+
let params = DepthCertParams {
681+
k_sigma: 2.0,
682+
quantization_depth_error: 0.05,
683+
splat_support_overlap_error: 0.2,
684+
sort_bucket_width: 1.0,
685+
sampling_discrepancy: 0.01,
686+
max_total_depth_error: 3.0,
687+
..Default::default()
688+
};
689+
let mut want = Vec::new();
690+
let mut got = Vec::new();
691+
certify_batch_scalar(&proj, &dv, &params, &mut want);
692+
certify_batch_simd(&proj, &dv, &params, &mut got);
693+
694+
assert_eq!(want.len(), got.len(), "certificate count mismatch");
695+
assert_eq!(want.len(), batch.len);
696+
let mut culled = 0;
697+
for i in 0..want.len() {
698+
let (w, g) = (want[i], got[i]);
699+
assert_eq!(w.passed, g.passed, "i={i} passed");
700+
assert!(approx(w.min_depth, g.min_depth, 1e-4), "i={i} min_depth {} vs {}", w.min_depth, g.min_depth);
701+
assert!(approx(w.max_depth, g.max_depth, 1e-4), "i={i} max_depth");
702+
assert!(approx(w.depth_variance, g.depth_variance, 1e-4), "i={i} depth_variance");
703+
assert!(approx(w.projected_radius_px, g.projected_radius_px, 1e-4), "i={i} radius");
704+
assert!(approx(w.occlusion_confidence, g.occlusion_confidence, 1e-5), "i={i} occlusion");
705+
assert!(approx(w.ordering_uncertainty, g.ordering_uncertainty, 1e-5), "i={i} ordering");
706+
assert!(approx(w.covariance_depth_error, g.covariance_depth_error, 1e-4), "i={i} cov_err");
707+
assert!(approx(w.total_depth_error, g.total_depth_error, 1e-4), "i={i} total");
708+
if proj.valid[i] == 0 {
709+
culled += 1;
710+
assert_eq!(g, ZERO_CERT, "i={i} culled slot must be ZERO_CERT");
711+
}
712+
}
713+
assert!(culled > 0, "test should exercise at least one culled slot");
714+
}
408715
}

0 commit comments

Comments
 (0)