From 0c1fb05d8178f1619baf4b784085173e5cd30366 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 5 Jun 2026 10:21:09 +0000
Subject: [PATCH 1/2] docs(splat-native): address review feedback on #212 (2
 fixes)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to PR #212 (merged). Addresses two codex review findings.

## Fix 1 — `batched_opacity_blend` needs ray segmentation (codex P1)

The original signature took a single flat `sorted_amplitudes` slice
and emitted a single `out_alpha[ray]` — but a renderer composites N
independent view rays per frame, each with its own front-to-back-
sorted Gaussian sequence. Without per-ray boundaries the
implementation could not know which Gaussians belong to which output
pixel, so it would either composite the same global sequence for
every ray or guess boundaries outside the API.

Adds a CSR-style `ray_offsets: &[u32]` prefix-sum (length `n_rays + 1`)
that segments the flat amplitude buffer into per-ray ranges.
Documented contract:

- `ray_offsets[0] == 0` and `ray_offsets[n_rays] == sorted_amplitudes.len()`
- Empty ray (`ray_offsets[r] == ray_offsets[r+1]`) yields `out_alpha[r] = 0`
- Rays are independent (no cross-ray data dependence) — outer loop
  is trivially parallelizable
- Per-frame amplitude quantization is caller-side; `opacity_lut` is
  a frame-global constant for that pass

Adds three new tests:
- Multi-ray independence (concatenated rays match per-ray calls)
- Empty-ray boundary case (→ α = 0)
- `ray_offsets` invariant debug-asserts

## Fix 2 — `batched_mahalanobis` needs scratch buffer for Cholesky cache (codex P2)

The original implementation note said L was "heap-free via stack or
caller-provided scratch" but the public signature had no scratch
parameter. At the documented `N = 1_000_000` bench size, the
Cholesky cache is `6 * N * size_of::<f32>()` = 24 MB (≈ 22.9 MiB) —
not stack-feasible. The function would either have to allocate
internally (breaking the zero-allocation contract) or recompute
factors per query (breaking the throughput contract).

Adds explicit `cholesky_scratch: &mut [f32]` parameter (length `6*N`)
with documented sizing guidance:

- `N ≤ 8192` MAY use a stack-resident buffer
- `N > 8192` MUST allocate once at engine init and re-use across frames
- The function MUST NOT allocate internally

Matches the `splat-fit` engine and registration-loop pattern where
the scratch is allocated once per `SplatFitActor` mailbox at boot.

## What's NOT in this PR

- Source code: still none. Plan-spec only.
- The W1c primitive-addition contract (all three backends mandatory,
  parity tests gate, VPABSB-correction-style degenerate-input
  documentation) is unchanged — the fix updates the two signatures
  but not the testing or backend invariants.

## Test plan

- [x] Codex P1 (ray segmentation) — added per-ray offset + 3 new tests
      to the contract.
- [x] Codex P2 (Mahalanobis scratch) — added `cholesky_scratch`
      parameter + sizing note + zero-allocation contract.
- [x] Signatures reformatted (one argument per line, with a length
      comment on each sized slice) for readability.
- [ ] Codex re-review on this PR.
---
 ...lat-native-ultrasound-simd-substrate-v1.md | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)
diff --git a/.claude/plans/splat-native-ultrasound-simd-substrate-v1.md b/.claude/plans/splat-native-ultrasound-simd-substrate-v1.md
index 57fd8d48..7661d5fc 100644
--- a/.claude/plans/splat-native-ultrasound-simd-substrate-v1.md
+++ b/.claude/plans/splat-native-ultrasound-simd-substrate-v1.md
@@ -194,11 +194,15 @@ L₂₂ = √(Σ₂₂ - L₂₀² - L₂₁²)
 /// - Degenerate Σ (Cholesky NaN) yields f32::INFINITY (sortable; never NaN).
 /// - SIMD-batched 16×16 on AVX-512, 4×4 on NEON.
 pub fn batched_mahalanobis(
-    query_xyz: &[f32], mu_xyz: &[f32], sigma_packed: &[f32], out_dist_sq: &mut [f32],
+    query_xyz: &[f32],            // length 3*M
+    mu_xyz: &[f32],               // length 3*N
+    sigma_packed: &[f32],         // length 6*N (upper-triangle Σ per Gaussian)
+    cholesky_scratch: &mut [f32], // length 6*N (caller-provided; holds packed L per Gaussian)
+    out_dist_sq: &mut [f32],      // length M*N (row-major)
 );
 ```
 
-**Implementation note:** internally calls `batched_cholesky_3x3` once on `sigma_packed`, caches L (heap-free via stack or caller-provided scratch), then triangular-solve + squared norm per (m, n) pair.
+**Implementation note:** internally calls `batched_cholesky_3x3` on `sigma_packed` once per call, writing packed L into `cholesky_scratch` (caller-provided; zero-allocation contract). The caller sizes the buffer to `6 * N` `f32` elements (`6 * N * size_of::<f32>()` bytes) — for `N = 1_000_000` Gaussians this is **24 MB (≈ 22.9 MiB)**, which is not stack-feasible; callers must allocate it once at engine init and re-use across frames (matches the `splat-fit` / registration loop pattern). For small `N` (e.g. `N ≤ 8192`, i.e. at most 192 KiB) callers MAY pass a stack-resident buffer. The function MUST NOT allocate internally.
 
 **Tests:**
 - Reference comparison against scipy `scipy.spatial.distance.mahalanobis` on random points + Σ.
@@ -226,14 +230,26 @@ pub fn batched_mahalanobis(
 /// - Composition: α_new = α_old + (1 - α_old) · α_i.
 /// - Internal accumulator: u16 with saturation; truncate to u8 at end.
 pub fn batched_opacity_blend(
-    sorted_amplitudes: &[f32], opacity_lut: &[u8; 256], out_alpha: &mut [u8],
+    sorted_amplitudes: &[f32],    // flat; contains all rays' samples concatenated
+    ray_offsets: &[u32],          // length = n_rays + 1 (CSR-style); ray r's range is [ray_offsets[r]..ray_offsets[r+1])
+    opacity_lut: &[u8; 256],
+    out_alpha: &mut [u8],         // length = n_rays
 );
 ```
 
+**Per-ray segmentation contract.** A renderer composites N independent view rays per frame; each ray has its own front-to-back-sorted Gaussian sequence. `ray_offsets` is a CSR-style prefix-sum (length `n_rays + 1`) so ray `r`'s amplitudes are `sorted_amplitudes[ray_offsets[r] as usize..ray_offsets[r+1] as usize]` and `out_alpha[r]` is its composited alpha. Constraints:
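+- `ray_offsets[0] == 0`, `ray_offsets` is non-decreasing, and `ray_offsets[n_rays] as usize == sorted_amplitudes.len()` (assert-on-debug; the offset is widened to `usize` because casting the length `as u32` would silently truncate buffers longer than `u32::MAX`).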
+- A ray with `ray_offsets[r] == ray_offsets[r+1]` (empty) yields `out_alpha[r] = 0`.
+- Per-frame amplitude quantization (the 256-bucket LUT input) is computed by the caller from the per-frame max amplitude; `opacity_lut` is a frame-global constant for that pass.
+
+**Implementation note:** the SIMD inner loop processes one ray's range as a contiguous front-to-back sweep; rays are independent (no cross-ray data dependence) so the outer ray loop is trivially parallelizable.
+
 **Tests:**
-- Reference comparison against scalar reference for known sequences.
+- Reference comparison against scalar reference for known sequences (single-ray + multi-ray).
 - Saturation at full opacity (sequence of high-amplitude Gaussians → α = 255).
-- Empty sequence → α = 0.
+- Empty ray (`ray_offsets[r] == ray_offsets[r+1]`) → α = 0.
+- Multi-ray independence (concatenated rays produce same per-ray output as separate single-ray calls).
+- `ray_offsets` invariant violations (debug assert fires when `ray_offsets[0] != 0` or `ray_offsets[last] as usize != sorted_amplitudes.len()`).
 - SIMD parity.
 
 ### 4.4 `batched_sh_eval_l3`

From 3c7890115aeea4b01b88f7c6c0ce1acaa1624c30 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 5 Jun 2026 10:35:14 +0000
Subject: [PATCH 2/2] =?UTF-8?q?docs(splat-native):=20sync=20module=20skele?=
 =?UTF-8?q?ton=20with=20=C2=A74.2/=C2=A74.3=20(codex=20P2=20on=20#213)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to codex review on PR #213. The per-primitive specs at §4.2
(`batched_mahalanobis`) and §4.3 (`batched_opacity_blend`) were
correctly updated in #213, but the module skeleton at the top of the
same file (the `pub fn` declarations in the `src/simd_splat.rs` sketch
around line 50) still advertised the OLD no-scratch / no-ray-offsets
signatures.

Since this document is the implementation handoff for
`src/simd_splat.rs`, an implementer scanning the skeleton first would
recreate the exact APIs #213 was meant to eliminate, leaving the
zero-allocation contract (Mahalanobis) and per-ray segmentation
contract (opacity blend) unenforceable.

## Fix

Updates the two skeleton signatures to match §4.2 and §4.3 exactly,
with cross-references to the per-primitive sections:

- `batched_mahalanobis` — adds `cholesky_scratch: &mut [f32]` (length
  N × 6); inline comment cites §4.2 sizing guidance and the
  "function MUST NOT allocate" contract.
- `batched_opacity_blend` — adds `ray_offsets: &[u32]` (length
  n_rays + 1, CSR-style); `sorted_amplitudes` re-described as flat
  concatenation; `out_alpha` length now `n_rays` (not "per pixel");
  inline comment cites §4.3 segmentation contract.

The detailed per-primitive specs at §4.2 / §4.3 are unchanged in
this PR — this commit only syncs the up-front skeleton so the two
views of the API agree.

## Test plan

- [x] Skeleton + §4.2 + §4.3 now show identical parameter lists.
- [x] Comments on the new parameters cite the section that owns the
      detailed contract (so an implementer who reads the skeleton
      first gets pointed to the contract section).
- [ ] Codex re-review on this PR.
---
 .../splat-native-ultrasound-simd-substrate-v1.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/.claude/plans/splat-native-ultrasound-simd-substrate-v1.md b/.claude/plans/splat-native-ultrasound-simd-substrate-v1.md
index 7661d5fc..cae438a6 100644
--- a/.claude/plans/splat-native-ultrasound-simd-substrate-v1.md
+++ b/.claude/plans/splat-native-ultrasound-simd-substrate-v1.md
@@ -48,16 +48,18 @@ pub fn batched_cholesky_3x3(
 );
 
 pub fn batched_mahalanobis(
-    query_xyz: &[f32],        // M × 3 query points
-    mu_xyz: &[f32],           // N × 3 Gaussian centroids
-    sigma_packed: &[f32],     // N × 6 packed Σ
-    out_dist_sq: &mut [f32],  // M × N output (squared Mahalanobis)
+    query_xyz: &[f32],            // M × 3 query points
+    mu_xyz: &[f32],               // N × 3 Gaussian centroids
+    sigma_packed: &[f32],         // N × 6 packed Σ
+    cholesky_scratch: &mut [f32], // N × 6 — caller-provided packed-L scratch (24 MB ≈ 22.9 MiB @ N=1M); function MUST NOT allocate (see §4.2)
+    out_dist_sq: &mut [f32],      // M × N output (squared Mahalanobis)
 );
 
 pub fn batched_opacity_blend(
-    sorted_amplitudes: &[f32], // N (front-to-back along view ray)
-    opacity_lut: &[u8; 256],   // amplitude → opacity LUT
-    out_alpha: &mut [u8],      // composited alpha per pixel
+    sorted_amplitudes: &[f32],    // flat; all rays' samples concatenated (front-to-back per ray)
+    ray_offsets: &[u32],          // length = n_rays + 1 (CSR-style); ray r's range is [ray_offsets[r]..ray_offsets[r+1]) (see §4.3)
+    opacity_lut: &[u8; 256],      // amplitude → opacity LUT
+    out_alpha: &mut [u8],         // length = n_rays — composited alpha per ray
 );
 
 pub fn batched_sh_eval_l3(