|
51 | 51 | //! path (P2) consumes [`super::project::ProjectedBatch`] and must match this |
52 | 52 | //! reference within tolerance. |
53 | 53 |
|
54 | | -use super::project::ProjectedBatch; |
| 54 | +use super::gaussian::GaussianBatch; |
| 55 | +use super::project::{Camera, ProjectedBatch}; |
55 | 56 | use super::spd3::Spd3; |
| 57 | +use crate::simd::F32x16; |
56 | 58 |
|
57 | 59 | /// Cesium / OGC 3D Tiles screen-space error for a feature of world-space |
58 | 60 | /// extent `geometric_error` seen at camera-space `distance`, lifted from |
@@ -239,33 +241,242 @@ pub fn certify_batch_scalar( |
239 | 241 | out.reserve(batch.len); |
240 | 242 | for i in 0..batch.len { |
241 | 243 | if batch.valid[i] == 0 { |
242 | | - out.push(RenderDepthCertificate { |
243 | | - min_depth: 0.0, |
244 | | - max_depth: 0.0, |
245 | | - depth_variance: 0.0, |
246 | | - projected_radius_px: 0.0, |
247 | | - occlusion_confidence: 0.0, |
248 | | - ordering_uncertainty: 0.0, |
249 | | - quantization_depth_error: 0.0, |
250 | | - covariance_depth_error: 0.0, |
251 | | - total_depth_error: 0.0, |
252 | | - passed: false, |
253 | | - }); |
| 244 | + out.push(ZERO_CERT); |
254 | 245 | continue; |
255 | 246 | } |
256 | 247 | let cov_zz = depth_var.get(i).copied().unwrap_or(0.0); |
257 | 248 | out.push(certify_depth_scalar(batch.depth[i], cov_zz, batch.radius[i], params)); |
258 | 249 | } |
259 | 250 | } |
260 | 251 |
|
| 252 | +// ════════════════════════════════════════════════════════════════════════════ |
| 253 | +// P2 — SIMD batch kernels (consume `crate::simd::F32x16`; no dispatch changes) |
| 254 | +// ════════════════════════════════════════════════════════════════════════════ |
| 255 | + |
| 256 | +/// SIMD chunk width — 16 lanes regardless of native tier, mirroring |
| 257 | +/// `project::project_chunk_x16`. |
| 258 | +const CHUNK: usize = 16; |
| 259 | + |
| 260 | +/// Stage one logical 16-wide window of a SoA `f32` column into a padded |
| 261 | +/// `[f32; 16]`; lanes beyond `len` are left zero. |
| 262 | +#[inline] |
| 263 | +fn stage16(col: &[f32], start: usize, len: usize) -> [f32; 16] { |
| 264 | + let mut buf = [0.0f32; 16]; |
| 265 | + let end = (start + CHUNK).min(len); |
| 266 | + for (k, slot) in buf.iter_mut().enumerate().take(end - start) { |
| 267 | + *slot = col[start + k]; |
| 268 | + } |
| 269 | + buf |
| 270 | +} |
| 271 | + |
| 272 | +/// SIMD (16-wide) camera-space depth variance `Σ_cam[2][2]` for every gaussian, |
| 273 | +/// written into `out[0..gaussians.len]`. Vectorized analogue of |
| 274 | +/// [`camera_depth_variance`]; uses the same quaternion→rotation→`W·Σ·Wᵀ` |
| 275 | +/// convention as `project::project_chunk_x16` (so it matches that kernel's |
| 276 | +/// covariance spine within the same tolerance). |
| 277 | +/// |
| 278 | +/// Does not allocate; `out` must have length ≥ `gaussians.len`. |
| 279 | +pub fn camera_depth_variance_batch(gaussians: &GaussianBatch, camera: &Camera, out: &mut [f32]) { |
| 280 | + let len = gaussians.len; |
| 281 | + assert!(out.len() >= len, "camera_depth_variance_batch: out.len() {} < gaussians.len {len}", out.len()); |
| 282 | + if len == 0 { |
| 283 | + return; |
| 284 | + } |
| 285 | + let v = &camera.view; |
| 286 | + let w20 = F32x16::splat(v[2][0]); |
| 287 | + let w21 = F32x16::splat(v[2][1]); |
| 288 | + let w22 = F32x16::splat(v[2][2]); |
| 289 | + let one = F32x16::splat(1.0); |
| 290 | + let two = F32x16::splat(2.0); |
| 291 | + let zero = F32x16::splat(0.0); |
| 292 | + |
| 293 | + let mut start = 0; |
| 294 | + while start < len { |
| 295 | + let qw = F32x16::from_slice(&stage16(&gaussians.quat_w, start, len)); |
| 296 | + let qx = F32x16::from_slice(&stage16(&gaussians.quat_x, start, len)); |
| 297 | + let qy = F32x16::from_slice(&stage16(&gaussians.quat_y, start, len)); |
| 298 | + let qz = F32x16::from_slice(&stage16(&gaussians.quat_z, start, len)); |
| 299 | + let scx = F32x16::from_slice(&stage16(&gaussians.scale_x, start, len)); |
| 300 | + let scy = F32x16::from_slice(&stage16(&gaussians.scale_y, start, len)); |
| 301 | + let scz = F32x16::from_slice(&stage16(&gaussians.scale_z, start, len)); |
| 302 | + |
| 303 | + // Quaternion → rotation matrix (mirrors project_chunk_x16). |
| 304 | + let xx = qx * qx; |
| 305 | + let yy = qy * qy; |
| 306 | + let zz = qz * qz; |
| 307 | + let xy = qx * qy; |
| 308 | + let xz = qx * qz; |
| 309 | + let yz = qy * qz; |
| 310 | + let wx = qw * qx; |
| 311 | + let wy = qw * qy; |
| 312 | + let wz = qw * qz; |
| 313 | + let r00 = one - two * (yy + zz); |
| 314 | + let r01 = two * (xy - wz); |
| 315 | + let r02 = two * (xz + wy); |
| 316 | + let r10 = two * (xy + wz); |
| 317 | + let r11 = one - two * (xx + zz); |
| 318 | + let r12 = two * (yz - wx); |
| 319 | + let r20 = two * (xz - wy); |
| 320 | + let r21 = two * (yz + wx); |
| 321 | + let r22 = one - two * (xx + yy); |
| 322 | + |
| 323 | + // M = R · diag(scale²); Σ_world upper triangle = M · Rᵀ. |
| 324 | + let s0 = scx * scx; |
| 325 | + let s1 = scy * scy; |
| 326 | + let s2 = scz * scz; |
| 327 | + let m00 = r00 * s0; |
| 328 | + let m01 = r01 * s1; |
| 329 | + let m02 = r02 * s2; |
| 330 | + let m10 = r10 * s0; |
| 331 | + let m11 = r11 * s1; |
| 332 | + let m12 = r12 * s2; |
| 333 | + let m20 = r20 * s0; |
| 334 | + let m21 = r21 * s1; |
| 335 | + let m22 = r22 * s2; |
| 336 | + let sw11 = m00 * r00 + m01 * r01 + m02 * r02; |
| 337 | + let sw12 = m00 * r10 + m01 * r11 + m02 * r12; |
| 338 | + let sw13 = m00 * r20 + m01 * r21 + m02 * r22; |
| 339 | + let sw22 = m10 * r10 + m11 * r11 + m12 * r12; |
| 340 | + let sw23 = m10 * r20 + m11 * r21 + m12 * r22; |
| 341 | + let sw33 = m20 * r20 + m21 * r21 + m22 * r22; |
| 342 | + |
| 343 | + // Σ_cam[2][2] = w2 · Σ_world · w2ᵀ (w2 = third view row). |
| 344 | + let cov = w20 * w20 * sw11 |
| 345 | + + w21 * w21 * sw22 |
| 346 | + + w22 * w22 * sw33 |
| 347 | + + two * (w20 * w21 * sw12 + w20 * w22 * sw13 + w21 * w22 * sw23); |
| 348 | + |
| 349 | + let mut arr = [0.0f32; 16]; |
| 350 | + cov.simd_max(zero).copy_to_slice(&mut arr); |
| 351 | + let end = (start + CHUNK).min(len); |
| 352 | + out[start..end].copy_from_slice(&arr[..end - start]); |
| 353 | + start += CHUNK; |
| 354 | + } |
| 355 | +} |
| 356 | + |
| 357 | +/// SIMD (16-wide) certificate reduction over a projected batch — the |
| 358 | +/// vectorized twin of [`certify_batch_scalar`]. The element-wise interval |
| 359 | +/// math (`σ_z`, `k·σ_z`, total, min/max depth) runs through `F32x16`; the two |
| 360 | +/// branchy fields (ordering/occlusion) are finished per lane, mirroring the |
| 361 | +/// SIMD-bulk / scalar-writeback split used by `project::project_chunk_x16`. |
| 362 | +/// |
| 363 | +/// `depth_var[i]` is `Σ_cam[2][2]` for splat `i` (from |
| 364 | +/// [`camera_depth_variance_batch`]). Culled slots get a zeroed, |
| 365 | +/// `passed = false` certificate. Does not panic on malformed input. |
| 366 | +pub fn certify_batch_simd( |
| 367 | + batch: &ProjectedBatch, depth_var: &[f32], params: &DepthCertParams, out: &mut Vec<RenderDepthCertificate>, |
| 368 | +) { |
| 369 | + out.clear(); |
| 370 | + let len = batch.len; |
| 371 | + if len == 0 { |
| 372 | + return; |
| 373 | + } |
| 374 | + out.reserve(len); |
| 375 | + |
| 376 | + // All caller budgets are batch constants → one broadcast. |
| 377 | + let const_terms = params.camera_transform_error |
| 378 | + + params.quantization_depth_error |
| 379 | + + params.splat_support_overlap_error |
| 380 | + + 0.5 * params.sort_bucket_width |
| 381 | + + params.lod_substitution_error |
| 382 | + + params.sampling_discrepancy; |
| 383 | + let const_v = F32x16::splat(const_terms); |
| 384 | + let k = F32x16::splat(params.k_sigma); |
| 385 | + let zero = F32x16::splat(0.0); |
| 386 | + |
| 387 | + let mut start = 0; |
| 388 | + while start < len { |
| 389 | + let depth_v = F32x16::from_slice(&stage16(&batch.depth, start, len)); |
| 390 | + let radius_v = F32x16::from_slice(&stage16(&batch.radius, start, len)); |
| 391 | + let var_v = F32x16::from_slice(&stage16(depth_var, start, len)).simd_max(zero); |
| 392 | + |
| 393 | + let sigma = var_v.sqrt(); |
| 394 | + let cov_err = k * sigma; |
| 395 | + let total = const_v + cov_err; |
| 396 | + let min_d = (depth_v - cov_err).simd_max(zero); |
| 397 | + let max_d = depth_v + cov_err; |
| 398 | + |
| 399 | + let mut var_a = [0.0f32; 16]; |
| 400 | + let mut cov_a = [0.0f32; 16]; |
| 401 | + let mut tot_a = [0.0f32; 16]; |
| 402 | + let mut min_a = [0.0f32; 16]; |
| 403 | + let mut max_a = [0.0f32; 16]; |
| 404 | + let mut rad_a = [0.0f32; 16]; |
| 405 | + var_v.copy_to_slice(&mut var_a); |
| 406 | + cov_err.copy_to_slice(&mut cov_a); |
| 407 | + total.copy_to_slice(&mut tot_a); |
| 408 | + min_d.copy_to_slice(&mut min_a); |
| 409 | + max_d.copy_to_slice(&mut max_a); |
| 410 | + radius_v.copy_to_slice(&mut rad_a); |
| 411 | + |
| 412 | + let end = (start + CHUNK).min(len); |
| 413 | + for k in 0..(end - start) { |
| 414 | + let idx = start + k; |
| 415 | + if batch.valid[idx] == 0 { |
| 416 | + out.push(ZERO_CERT); |
| 417 | + continue; |
| 418 | + } |
| 419 | + let cov_e = cov_a[k]; |
| 420 | + let ordering_uncertainty = if params.sort_bucket_width > 0.0 { |
| 421 | + (2.0 * cov_e / params.sort_bucket_width).clamp(0.0, 1.0) |
| 422 | + } else { |
| 423 | + 0.0 |
| 424 | + }; |
| 425 | + let occlusion_confidence = if params.splat_support_overlap_error > 0.0 { |
| 426 | + (1.0 - params.splat_support_overlap_error / cov_e.max(1e-6)).clamp(0.0, 1.0) |
| 427 | + } else { |
| 428 | + 1.0 |
| 429 | + }; |
| 430 | + let total_depth_error = tot_a[k]; |
| 431 | + out.push(RenderDepthCertificate { |
| 432 | + min_depth: min_a[k], |
| 433 | + max_depth: max_a[k], |
| 434 | + depth_variance: var_a[k], |
| 435 | + projected_radius_px: rad_a[k], |
| 436 | + occlusion_confidence, |
| 437 | + ordering_uncertainty, |
| 438 | + quantization_depth_error: params.quantization_depth_error, |
| 439 | + covariance_depth_error: cov_e, |
| 440 | + total_depth_error, |
| 441 | + passed: total_depth_error.is_finite() && total_depth_error <= params.max_total_depth_error, |
| 442 | + }); |
| 443 | + } |
| 444 | + start += CHUNK; |
| 445 | + } |
| 446 | +} |
| 447 | + |
| 448 | +/// Zeroed certificate for culled slots (shared by scalar + SIMD batch paths). |
| 449 | +const ZERO_CERT: RenderDepthCertificate = RenderDepthCertificate { |
| 450 | + min_depth: 0.0, |
| 451 | + max_depth: 0.0, |
| 452 | + depth_variance: 0.0, |
| 453 | + projected_radius_px: 0.0, |
| 454 | + occlusion_confidence: 0.0, |
| 455 | + ordering_uncertainty: 0.0, |
| 456 | + quantization_depth_error: 0.0, |
| 457 | + covariance_depth_error: 0.0, |
| 458 | + total_depth_error: 0.0, |
| 459 | + passed: false, |
| 460 | +}; |
| 461 | + |
261 | 462 | #[cfg(test)] |
262 | 463 | mod tests { |
| 464 | + use super::super::gaussian::{Gaussian3D, GaussianBatch}; |
| 465 | + use super::super::project::{project_batch, Camera, ProjectedBatch}; |
263 | 466 | use super::*; |
264 | 467 |
|
265 | 468 | fn approx(a: f32, b: f32, tol: f32) -> bool { |
266 | 469 | (a - b).abs() <= tol |
267 | 470 | } |
268 | 471 |
|
| 472 | + /// xorshift32 → [0,1), deterministic test data. |
| 473 | + fn rng(s: &mut u32) -> f32 { |
| 474 | + *s ^= *s << 13; |
| 475 | + *s ^= *s >> 17; |
| 476 | + *s ^= *s << 5; |
| 477 | + (*s as f32) / (u32::MAX as f32) |
| 478 | + } |
| 479 | + |
269 | 480 | const IDENTITY_VIEW: [[f32; 4]; 4] = |
270 | 481 | [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]; |
271 | 482 |
|
@@ -405,4 +616,100 @@ mod tests { |
405 | 616 | assert!(approx(c1.occlusion_confidence, 0.5, 1e-5), "got {}", c1.occlusion_confidence); |
406 | 617 | assert!((0.0..=1.0).contains(&c1.occlusion_confidence)); |
407 | 618 | } |
| 619 | + |
| 620 | + // ── P2: SIMD batch parity (validation plan Tier 2) ──────────────────────── |
| 621 | + |
| 622 | + #[test] |
| 623 | + fn depth_variance_batch_matches_scalar_reference() { |
| 624 | + // Varied scales + normalized non-identity quaternions under a rotated |
| 625 | + // view — exercises the full quat→R→W·Σ·Wᵀ depth spine in SIMD. |
| 626 | + let mut batch = GaussianBatch::with_capacity(40); |
| 627 | + let mut st = 0x1234_5678u32; |
| 628 | + for _ in 0..40 { |
| 629 | + let mut g = Gaussian3D::unit(); |
| 630 | + g.mean = [0.0, 0.0, 2.0]; |
| 631 | + g.scale = [0.1 + rng(&mut st) * 1.5, 0.1 + rng(&mut st) * 1.5, 0.1 + rng(&mut st) * 1.5]; |
| 632 | + let q = [rng(&mut st) * 2.0 - 1.0, rng(&mut st) * 2.0 - 1.0, rng(&mut st) * 2.0 - 1.0, rng(&mut st) * 2.0 - 1.0]; |
| 633 | + let n = (q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]).sqrt().max(1e-6); |
| 634 | + g.quat = [q[0] / n, q[1] / n, q[2] / n, q[3] / n]; |
| 635 | + g.opacity = 1.0; |
| 636 | + batch.push(g); |
| 637 | + } |
| 638 | + // 90° about +Y so the look-axis is non-trivial. |
| 639 | + let view = [[0.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 0.0], [-1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0]]; |
| 640 | + let cam = Camera { |
| 641 | + view, fx: 512.0, fy: 512.0, cx: 256.0, cy: 256.0, near: 0.01, far: 1000.0, |
| 642 | + width: 512, height: 512, position: [0.0, 0.0, 0.0], |
| 643 | + }; |
| 644 | + let mut got = vec![0.0f32; batch.len]; |
| 645 | + camera_depth_variance_batch(&batch, &cam, &mut got); |
| 646 | + for i in 0..batch.len { |
| 647 | + let want = camera_depth_variance( |
| 648 | + [batch.scale_x[i], batch.scale_y[i], batch.scale_z[i]], |
| 649 | + [batch.quat_w[i], batch.quat_x[i], batch.quat_y[i], batch.quat_z[i]], |
| 650 | + &view, |
| 651 | + ); |
| 652 | + // Same tolerance project.rs uses for SIMD-vs-Spd3 covariance. |
| 653 | + assert!((got[i] - want).abs() <= 1e-3 * (1.0 + want.abs()), "i={i} simd={} scalar={want}", got[i]); |
| 654 | + } |
| 655 | + } |
| 656 | + |
| 657 | + #[test] |
| 658 | + fn certify_batch_simd_matches_scalar_end_to_end() { |
| 659 | + // Mix of visible, behind-camera, and off-screen splats so both the |
| 660 | + // certified and the culled (ZERO_CERT) paths are exercised. |
| 661 | + let mut batch = GaussianBatch::with_capacity(50); |
| 662 | + let mut st = 0xC0FF_EE11u32; |
| 663 | + for i in 0..50 { |
| 664 | + let mut g = Gaussian3D::unit(); |
| 665 | + let z = if i % 7 == 0 { -1.0 } else { 1.0 + rng(&mut st) * 6.0 }; |
| 666 | + let x = if i % 11 == 0 { 9000.0 } else { rng(&mut st) * 2.0 - 1.0 }; |
| 667 | + g.mean = [x, rng(&mut st) * 2.0 - 1.0, z]; |
| 668 | + g.scale = [0.1 + rng(&mut st) * 0.8, 0.1 + rng(&mut st) * 0.8, 0.1 + rng(&mut st) * 0.8]; |
| 669 | + g.quat = [1.0, 0.0, 0.0, 0.0]; |
| 670 | + g.opacity = rng(&mut st); |
| 671 | + batch.push(g); |
| 672 | + } |
| 673 | + let cam = Camera::identity_at_origin(256, 256); |
| 674 | + let mut proj = ProjectedBatch::with_capacity(batch.capacity); |
| 675 | + project_batch(&batch, &cam, &mut proj); |
| 676 | + |
| 677 | + let mut dv = vec![0.0f32; batch.len]; |
| 678 | + camera_depth_variance_batch(&batch, &cam, &mut dv); |
| 679 | + |
| 680 | + let params = DepthCertParams { |
| 681 | + k_sigma: 2.0, |
| 682 | + quantization_depth_error: 0.05, |
| 683 | + splat_support_overlap_error: 0.2, |
| 684 | + sort_bucket_width: 1.0, |
| 685 | + sampling_discrepancy: 0.01, |
| 686 | + max_total_depth_error: 3.0, |
| 687 | + ..Default::default() |
| 688 | + }; |
| 689 | + let mut want = Vec::new(); |
| 690 | + let mut got = Vec::new(); |
| 691 | + certify_batch_scalar(&proj, &dv, ¶ms, &mut want); |
| 692 | + certify_batch_simd(&proj, &dv, ¶ms, &mut got); |
| 693 | + |
| 694 | + assert_eq!(want.len(), got.len(), "certificate count mismatch"); |
| 695 | + assert_eq!(want.len(), batch.len); |
| 696 | + let mut culled = 0; |
| 697 | + for i in 0..want.len() { |
| 698 | + let (w, g) = (want[i], got[i]); |
| 699 | + assert_eq!(w.passed, g.passed, "i={i} passed"); |
| 700 | + assert!(approx(w.min_depth, g.min_depth, 1e-4), "i={i} min_depth {} vs {}", w.min_depth, g.min_depth); |
| 701 | + assert!(approx(w.max_depth, g.max_depth, 1e-4), "i={i} max_depth"); |
| 702 | + assert!(approx(w.depth_variance, g.depth_variance, 1e-4), "i={i} depth_variance"); |
| 703 | + assert!(approx(w.projected_radius_px, g.projected_radius_px, 1e-4), "i={i} radius"); |
| 704 | + assert!(approx(w.occlusion_confidence, g.occlusion_confidence, 1e-5), "i={i} occlusion"); |
| 705 | + assert!(approx(w.ordering_uncertainty, g.ordering_uncertainty, 1e-5), "i={i} ordering"); |
| 706 | + assert!(approx(w.covariance_depth_error, g.covariance_depth_error, 1e-4), "i={i} cov_err"); |
| 707 | + assert!(approx(w.total_depth_error, g.total_depth_error, 1e-4), "i={i} total"); |
| 708 | + if proj.valid[i] == 0 { |
| 709 | + culled += 1; |
| 710 | + assert_eq!(g, ZERO_CERT, "i={i} culled slot must be ZERO_CERT"); |
| 711 | + } |
| 712 | + } |
| 713 | + assert!(culled > 0, "test should exercise at least one culled slot"); |
| 714 | + } |
408 | 715 | } |
0 commit comments