feat(fusion): use hand-neck anchors as fret priors

Patrick Gilhooley · Patrick Gilhooley · commit a1840d86a14b · 2026-05-07T09:45:03.000-04:00
diff --git a/docs/DECISIONS.md b/docs/DECISIONS.md
@@ -357,3 +357,26 @@ per-cell posterior over (string, fret). Net new code is the projection
 - Fretting-hand identification — start with v0's handedness logic;
   switch to wrist-near-nut (now possible because we have the
   homography) only if the eval shows misidentification.
+
+---
+
+## 2026-05-07 — Phase 5 uses hand-neck anchors as first-class fusion priors
+
+**Phase:** 5 (vision-fusion integration)
+**Decision tree:** Phase 4/5 fusion contract — whether exact per-finger
+posteriors are the primary video signal, or whether coarser neck-region
+evidence should guide candidate selection first.
+**Branch taken:** **Use coarse hand-neck anchors as first-class Phase 5
+priors.** MediaPipe + fretboard homography estimate the fretting hand's
+center fret/span; the pipeline converts each timed anchor into an
+`AudioEvent.fret_prior` before calling Viterbi/chord fusion.
+**Evidence:** Phase 4 manual fingering labels proved too expensive for the
+near-term path, while the detector/fretboard stack is already good at
+identifying the neck coordinate system. Phase 5 fusion already accepts
+`AudioEvent.fret_prior` as emission evidence, so the anchor signal can be
+integrated without changing §8 public function signatures.
+**Reasoning:** Exact fingertip-to-string/fret labels are brittle and costly
+to validate; "the hand is around frets 3-6" is a stronger, more stable
+visual prior for resolving audio's same-pitch string/fret ambiguity. Keeping
+the signal as a prior lets audio and playability override it when the visual
+evidence is weak or wrong.
diff --git a/tabvision/tabvision/fusion/__init__.py b/tabvision/tabvision/fusion/__init__.py
@@ -6,6 +6,16 @@
 (string, fret) sequence respecting playability constraints.
 """
 
+from tabvision.fusion.neck_prior import (
+    TimedNeckAnchor,
+    anchor_position_prior,
+    apply_neck_anchor_priors,
+)
 from tabvision.fusion.viterbi import fuse
 
-__all__ = ["fuse"]
+__all__ = [
+    "TimedNeckAnchor",
+    "anchor_position_prior",
+    "apply_neck_anchor_priors",
+    "fuse",
+]
diff --git a/tabvision/tabvision/fusion/neck_prior.py b/tabvision/tabvision/fusion/neck_prior.py
@@ -0,0 +1,113 @@
+"""Attach coarse hand-neck anchors to audio events as fret priors."""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from dataclasses import replace
+from typing import Protocol
+
+import numpy as np
+
+from tabvision.types import AudioEvent, GuitarConfig
+
+
+class NeckAnchorLike(Protocol):  # structural type: any object exposing these four fields
+    center_fret: float  # mean of the Gaussian built by anchor_position_prior
+    min_fret: float  # span edges; half of (max_fret - min_fret) is the Gaussian sigma,
+    max_fret: float  # floored at 1.0
+    confidence: float  # <= 0 gives a uniform prior; otherwise clamped to [0, 1] as blend weight
+
+
+TimedNeckAnchor = tuple[float, NeckAnchorLike]  # (timestamp, anchor); timestamp is compared to onset_s
+
+
+def apply_neck_anchor_priors(
+    events: Sequence[AudioEvent],
+    anchors: Sequence[TimedNeckAnchor],
+    cfg: GuitarConfig,
+    *,
+    max_time_distance_s: float = 0.15,
+) -> list[AudioEvent]:
+    """Attach the temporally nearest hand-neck anchor to each event as a prior.
+
+    An event is returned unchanged when its nearest anchor is more than
+    ``max_time_distance_s`` from ``onset_s`` or has non-positive confidence.
+    Otherwise its copy gets a ``(cfg.n_strings, cfg.max_fret + 1)``
+    ``fret_prior``; an existing prior is multiplied in, not replaced.
+    """
+    if not anchors:
+        return list(events)
+
+    def _with_prior(event: AudioEvent) -> AudioEvent:
+        closest = min(anchors, key=lambda pair: abs(pair[0] - event.onset_s))
+        gap_s = abs(closest[0] - event.onset_s)
+        if gap_s > max_time_distance_s or closest[1].confidence <= 0.0:
+            return event
+        position_prior = anchor_position_prior(closest[1], cfg)
+        if event.fret_prior is not None:
+            position_prior = _combine_priors(event.fret_prior, position_prior, cfg)
+        return replace(event, fret_prior=position_prior)
+
+    return [_with_prior(event) for event in events]
+
+
+def anchor_position_prior(anchor: NeckAnchorLike, cfg: GuitarConfig) -> np.ndarray:
+    """Return a ``(cfg.n_strings, cfg.max_fret + 1)`` prior that sums to 1.
+
+    Each string gets a Gaussian over frets (mean ``center_fret``, sigma half the
+    span, floored at 1) blended with uniform by ``confidence`` capped at 1.
+    Non-positive/NaN confidence, a non-finite center or a NaN span gives uniform.
+    """
+    n_frets = cfg.max_fret + 1
+    fret_probs = np.full(n_frets, 1.0 / n_frets, dtype=np.float64)
+    weight = min(float(anchor.confidence), 1.0)
+    if weight > 0.0:  # also False for NaN, so a NaN confidence stays uniform
+        sigma = max((float(anchor.max_fret) - float(anchor.min_fret)) / 2.0, 1.0)
+        logits = -0.5 * ((np.arange(n_frets) - float(anchor.center_fret)) / sigma) ** 2
+        if np.isfinite(logits).all():  # degenerate anchors must not leak NaN
+            gaussian = np.exp(logits - logits.max())  # shift avoids all-zero underflow
+            fret_probs = weight * gaussian / gaussian.sum() + (1.0 - weight) * fret_probs
+    prior = np.tile(fret_probs, (cfg.n_strings, 1))
+    return prior / prior.sum()
+
+
+def _combine_priors(
+    existing: np.ndarray, anchor_prior: np.ndarray, cfg: GuitarConfig
+) -> np.ndarray:
+    """Fuse an event's own prior with the anchor prior by renormalized product."""
+    product = _as_position_prior(existing, cfg) * anchor_prior
+    mass = float(product.sum())
+    # No positive mass left after multiplying (e.g. the two priors do not
+    # overlap): keep the anchor prior rather than divide by zero.
+    return anchor_prior if mass <= 0.0 else np.asarray(product / mass, dtype=np.float64)
+
+
+def _as_position_prior(prior: np.ndarray, cfg: GuitarConfig) -> np.ndarray:
+    """Coerce a 2-D position prior or 1-D fret prior to a normalized 2-D grid.
+
+    Any other shape, or a prior without positive total mass, becomes uniform.
+    """
+    grid = np.asarray(prior, dtype=np.float64)
+    n_frets = cfg.max_fret + 1
+    if grid.shape == (n_frets,):
+        grid = np.tile(grid[None, :], (cfg.n_strings, 1))
+    if grid.shape == (cfg.n_strings, n_frets):
+        mass = float(grid.sum())
+        if mass > 0.0:
+            return np.asarray(grid / mass, dtype=np.float64)
+    return anchor_position_prior(_ZeroAnchor(), cfg)
+
+
+class _ZeroAnchor:  # stand-in anchor used to request the uniform prior
+    center_fret = 0.0  # not read by anchor_position_prior while confidence is 0
+    min_fret = 0.0
+    max_fret = 0.0
+    confidence = 0.0  # <= 0 makes anchor_position_prior return the uniform prior
+
+
+__all__ = [
+    "NeckAnchorLike",
+    "TimedNeckAnchor",
+    "anchor_position_prior",
+    "apply_neck_anchor_priors",
+]
diff --git a/tabvision/tabvision/fusion/playability.py b/tabvision/tabvision/fusion/playability.py
@@ -101,8 +101,9 @@ def emission_cost(
 
     - ``-log(event.confidence)`` — per-event constant (does not affect
       ranking within a single event but matters across events).
-    - ``-log(event.fret_prior[s, f])`` — only when the audio backend
-      provides a per-position prior (e.g. Phase 2 ``tabcnn``).
+    - ``-log(event.fret_prior[s, f])`` — only when the audio backend or
+      video neck-anchor path provides a prior. A one-dimensional fret-only
+      prior is also accepted and read as ``event.fret_prior[f]``.
     - ``lambda_vision * -log(P_vision[s, f])`` — vision marginal at
       ``event.onset_s``. Skipped when ``fingering is None``.
     - ``LOW_FRET_BIAS * fret`` — gentle low-fret preference.
@@ -111,7 +112,7 @@ def emission_cost(
     cost = -math.log(max(event.confidence, EPS))
 
     if event.fret_prior is not None:
-        prior = float(event.fret_prior[candidate.string_idx, candidate.fret])
+        prior = _candidate_prior(event.fret_prior, candidate)
         cost += -math.log(max(prior, EPS))
 
     if fingering is not None:
@@ -126,6 +127,20 @@ def emission_cost(
     return cost
 
 
+def _candidate_prior(prior: object, candidate: Candidate) -> float:
+    """Read a candidate's prior from a 2D position prior or a 1D fret prior.
+
+    Returns 0.0 for any other rank, a failed lookup, or a NaN/inf entry.
+    """
+    try:
+        rank = len(getattr(prior, "shape", ()))
+        key = (candidate.string_idx, candidate.fret) if rank == 2 else candidate.fret
+        value = float(prior[key]) if rank in (1, 2) else 0.0  # type: ignore[index]
+    except (IndexError, TypeError, ValueError):
+        return 0.0
+    return value if math.isfinite(value) else 0.0  # NaN/inf would poison the log-cost
+
+
 def transition_cost(prev: Candidate, curr: Candidate, cfg: GuitarConfig) -> float:
     """Transition cost from ``prev`` to ``curr``.
 
diff --git a/tabvision/tabvision/pipeline.py b/tabvision/tabvision/pipeline.py
@@ -23,13 +23,15 @@
 
 import logging
 from collections.abc import Iterable, Iterator
-from dataclasses import replace
+from dataclasses import dataclass, replace
 from pathlib import Path
+from typing import cast
 
 import numpy as np
 
 from tabvision.demux import demux
-from tabvision.fusion import fuse
+from tabvision.fusion import TimedNeckAnchor, apply_neck_anchor_priors, fuse
+from tabvision.fusion.neck_prior import NeckAnchorLike
 from tabvision.types import (
     AudioBackend,
     AudioEvent,
@@ -38,6 +40,7 @@
     GuitarBackend,
     GuitarConfig,
     HandBackend,
+    Homography,
     SessionConfig,
     TabEvent,
 )
@@ -49,6 +52,12 @@ class _VideoImportError(RuntimeError):
     """Internal signal: a soft-optional video dep failed to import."""
 
 
+@dataclass(frozen=True)
+class _VideoStackResult:  # what _run_video_stack returns from one pass over the frames
+    fingerings: list[FrameFingering]  # entries re-stamped with the real frame timestamp
+    neck_anchors: list[TimedNeckAnchor]  # (t, anchor) pairs; only anchors with confidence > 0
+
+
 def run_pipeline(
     video_path: str | Path,
     *,
@@ -80,22 +89,29 @@ def run_pipeline(
     logger.info("audio backend produced %d events", len(audio_events))
 
     fingerings: list[FrameFingering] = []
+    neck_anchors: list[TimedNeckAnchor] = []
     if video_enabled:
         try:
-            fingerings = _run_video_stack(
+            video_result = _run_video_stack(
                 demuxed.frame_iterator,
                 stride=video_stride,
                 cfg=cfg,
                 guitar_backend=guitar_backend,
                 fretboard_backend=fretboard_backend,
                 hand_backend=hand_backend,
             )
+            fingerings = video_result.fingerings
+            neck_anchors = video_result.neck_anchors
         except _VideoImportError as exc:
             logger.warning(
                 "video stack unavailable, falling back to audio-only: %s",
                 exc,
             )
 
+    if lambda_vision > 0.0 and neck_anchors:
+        audio_events = apply_neck_anchor_priors(audio_events, neck_anchors, cfg)
+        logger.info("attached %d hand-neck anchors as audio fret priors", len(neck_anchors))
+
     logger.info(
         "running fuse() with %d audio events, %d fingerings, lambda_vision=%.2f",
         len(audio_events),
@@ -118,7 +134,7 @@ def _run_video_stack(
     guitar_backend: GuitarBackend | None,
     fretboard_backend: FretboardBackend | None,
     hand_backend: HandBackend | None,
-) -> list[FrameFingering]:
+) -> _VideoStackResult:
     """Single-pass walk producing one ``FrameFingering`` per sampled frame.
 
     Skipped-by-stride frames produce nothing; sampled frames produce
@@ -138,6 +154,7 @@ def _run_video_stack(
         hand_backend = _make_hand_backend()
 
     fingerings: list[FrameFingering] = []
+    neck_anchors: list[TimedNeckAnchor] = []
     n_fingers = 4  # fretting fingers; matches Phase 4 convention.
     empty_logits = np.zeros((n_fingers, cfg.n_strings, cfg.max_fret + 1), dtype=np.float64)
 
@@ -167,8 +184,28 @@ def _run_video_stack(
         ff = hand_backend.detect(frame, H, cfg)
         # Backends produce a degenerate t=0.0; stamp the real timestamp here.
         fingerings.append(replace(ff, t=t))
+        anchor = _detect_neck_anchor(hand_backend, frame, H, cfg)
+        if anchor is not None and anchor.confidence > 0.0:
+            neck_anchors.append((t, anchor))
+
+    return _VideoStackResult(fingerings=fingerings, neck_anchors=neck_anchors)
 
-    return fingerings
+
+def _detect_neck_anchor(
+    hand_backend: HandBackend,
+    frame: np.ndarray,
+    H: Homography,  # noqa: N803 — H is the homography passed through to the hook
+    cfg: GuitarConfig,
+) -> NeckAnchorLike | None:
+    """Call the backend's optional ``detect_anchor`` hook; ``None`` if absent or failing."""
+    hook = getattr(hand_backend, "detect_anchor", None)  # outside the §8 protocol
+    if hook is None:
+        return None
+    try:
+        return cast("NeckAnchorLike | None", hook(frame, H, cfg))  # quoted: no runtime eval
+    except Exception as exc:  # noqa: BLE001 — optional evidence must degrade softly
+        logger.debug("hand-neck anchor unavailable on frame: %s", exc)
+        return None
 
 
 # ---------------------------------------------------------------------------
diff --git a/tabvision/tabvision/types.py b/tabvision/tabvision/types.py
@@ -167,7 +167,7 @@ def transcribe(
 class GuitarBackend(Protocol):
     name: str
 
-    def detect(self, frame: np.ndarray) -> GuitarBBox:
+    def detect(self, frame: np.ndarray) -> GuitarBBox | None:
         ...
 
 
diff --git a/tabvision/tabvision/video/hand/__init__.py b/tabvision/tabvision/video/hand/__init__.py
@@ -25,6 +25,7 @@
     HandBackend,
     Homography,
 )
+from tabvision.video.hand.neck_anchor import HandNeckAnchor, NeckAnchorConfig, compute_neck_anchor
 
 
 def track_hand(
@@ -52,4 +53,9 @@ def track_hand(
     return out
 
 
-__all__ = ["track_hand"]
+__all__ = [
+    "HandNeckAnchor",
+    "NeckAnchorConfig",
+    "compute_neck_anchor",
+    "track_hand",
+]
diff --git a/tabvision/tabvision/video/hand/mediapipe_backend.py b/tabvision/tabvision/video/hand/mediapipe_backend.py
@@ -29,6 +29,7 @@
     PosteriorConfig,
     compute_fingering,
 )
+from tabvision.video.hand.neck_anchor import HandNeckAnchor, compute_neck_anchor
 
 logger = logging.getLogger(__name__)
 
@@ -106,6 +107,21 @@ def detect(
             self.config.posterior,
         )
 
+    def detect_anchor(
+        self, frame: np.ndarray, H: Homography, cfg: GuitarConfig  # noqa: N803
+    ) -> HandNeckAnchor:
+        """Estimate the coarse neck region occupied by the fretting hand.
+
+        Kept outside the §8 ``HandBackend`` protocol: ``detect()`` still serves
+        per-finger posteriors. Raises ``BackendError`` for a non-3-channel frame.
+        """
+        has_three_channels = frame.ndim == 3 and frame.shape[-1] == 3
+        if not has_three_channels:
+            raise BackendError(f"expected BGR frame, got shape {frame.shape}")
+        # NOTE(review): callers that also run detect() on this frame presumably
+        # repeat the landmark extraction below — confirm and consider sharing it.
+        return compute_neck_anchor(self._extract_fretting_hand(frame), H, cfg)
+
     def close(self) -> None:
         if self._landmarker is not None:
             self._landmarker.close()
diff --git a/tabvision/tabvision/video/hand/neck_anchor.py b/tabvision/tabvision/video/hand/neck_anchor.py
diff --git a/tabvision/tests/unit/test_neck_anchor.py b/tabvision/tests/unit/test_neck_anchor.py
diff --git a/tabvision/tests/unit/test_neck_prior.py b/tabvision/tests/unit/test_neck_prior.py
diff --git a/tabvision/tests/unit/test_pipeline.py b/tabvision/tests/unit/test_pipeline.py