Partha-dev01
diff --git a/DOCS.md
Lines changed: 25 additions & 0 deletions b/DOCS.md
Lines changed: 25 additions & 0 deletions
diff --git a/app/api/chat/conversation/route.ts
Lines changed: 8 additions & 4 deletions b/app/api/chat/conversation/route.ts
Lines changed: 8 additions & 4 deletions
diff --git a/app/hooks/useActionCamera.ts
Lines changed: 267 additions & 0 deletions b/app/hooks/useActionCamera.ts
Lines changed: 267 additions & 0 deletions
@@ -407,6 +407,8 @@ npx playwright test    # Run all 30 tests
 | R10 | **No consent before cloud sync** | Results auto-synced to cloud without user consent. Fixed: consent checkbox added at Stage 10 completion. Summary page (Stage 11) respects the preference — skips sync if user opts out. |
 | R11 | **Step 7 static instructions — no adaptive assessment** | Step 7 used 5 hardcoded instructions with parent-reported "Did it!" buttons. Replaced with a dynamic AI voice agent: Amazon Nova Lite (Bedrock) generates age-appropriate conversation, Amazon Polly speaks to the child, Web Speech API listens for responses. Collects richer biomarkers (response latency, engagement rate, comprehension). Falls back to pre-defined conversation when Bedrock unavailable. |
 | R12 | **CI Playwright failures — AWS SDK "Region is missing"** | `next.config.ts` inlines env vars at build time with `?? ""` defaults. On CI (no `.env.local`), `BEDROCK_REGION`/`POLLY_REGION` resolved to `""` (empty string). Nullish coalescing (`??`) doesn't catch empty strings, so `process.env.BEDROCK_REGION ?? "us-east-1"` → `""`. AWS SDK threw `Error: Region is missing` outside try/catch → uncaught 500. Fix: changed `??` to `||` in all 4 API routes (summary, clinical, tts, conversation) and moved client creation inside try/catch blocks. TTS error status changed from 500 → 503. |
+| R13 | **Step 7 auto-advances without verifying motor actions** | Voice agent spoke motor instructions ("touch your nose", "wave") but immediately moved on without checking if the child performed the action. Also, agent text wasn't displayed prominently. Fix: added camera-based motor action verification using existing YOLO pose detection pipeline. Motor turns activate camera → YOLO extracts 17 keypoints → rule-based ActionDetector checks keypoint geometry → ActionTracker requires 5 consecutive positive frames → confirmed. Agent text now displayed in large centered speech bubble with domain emoji headers. |
+| R14 | **Stage 10 worker URL parse error** | `Failed to execute 'fetch' on 'WorkerGlobalScope': Failed to parse URL from /models/yolo26n-pose-int8.onnx`. ONNX model paths were relative URLs (`/models/...`) which fail inside Web Workers because relative paths resolve against the worker script URL (blob: or /_next/static/), not the page origin. Fix: prefixed all 4 model paths with `${self.location.origin}` in PipelineOrchestrator.ts and MultimodalOrchestrator.ts. |
 
 ---
 
@@ -513,3 +515,26 @@ npx playwright test    # Run all 30 tests
 - Fixed: `app/api/report/clinical/route.ts` (`??` → `||`, client inside try/catch)
 - Fixed: `app/api/tts/route.ts` (`??` → `||`, client inside try/catch, 500 → 503)
 - Fixed: `app/api/chat/conversation/route.ts` (`??` → `||`, client inside try/catch)
+
+### v1.4.0 — 2026-03-04 (Camera Action Verification + Worker URL Fix)
+
+**Major Change:**
+- **Step 7 motor action verification via YOLO camera**: Motor instruction turns now activate the camera and use the existing YOLO26n-pose model to detect whether the child actually performed the requested action (wave, touch nose, clap, raise arms, touch head, touch ears). Rule-based ActionDetector analyzes 17 COCO keypoints with body-scale-normalized distance thresholds. ActionTracker requires 5 consecutive positive frames to confirm detection, preventing false positives.
+
+**New:**
+- `app/lib/actions/actionDetector.ts` — Pure rule-based action detection from YOLO keypoints: 6 actions with geometry rules, `ActionTracker` class for sustained detection, `ACTION_META` map for UI labels/emoji
+- `app/hooks/useActionCamera.ts` — Camera + YOLO inference + action detection hook: manages getUserMedia, inference worker (body-only mode), requestAnimationFrame loop, skeleton overlay drawing, ActionTracker integration
+- New `"verifying"` phase in Step 7 state machine: camera feed shown with COCO-17 skeleton overlay, detection progress bar, 15-second timeout with skip option
+- Domain emoji headers in agent text display (social, cognitive, language, motor, general)
+- `action` field added to conversation API TurnMetadata — LLM includes action ID for motor turns
+
+**Fixed:**
+- **Stage 10 ONNX worker URL parse error**: Model paths in `PipelineOrchestrator.ts` and `MultimodalOrchestrator.ts` changed from relative (`/models/...`) to absolute (`${self.location.origin}/models/...`) — resolves correctly in Web Worker scope
+
+**Files:**
+- Created: `app/lib/actions/actionDetector.ts`
+- Created: `app/hooks/useActionCamera.ts`
+- Rewritten: `app/intake/preparation/page.tsx` (camera verification integration)
+- Updated: `app/api/chat/conversation/route.ts` (action field in metadata)
+- Fixed: `app/lib/inference/PipelineOrchestrator.ts` (absolute model URLs)
+- Fixed: `app/lib/inference/MultimodalOrchestrator.ts` (absolute model URLs)
@@ -41,6 +41,7 @@ interface TurnMetadata {
   responseRelevance: number;
   shouldEnd: boolean;
   domain: "social" | "cognitive" | "language" | "motor" | "general";
+  action?: string; // For motor instructions: "wave", "touch_nose", "clap", "raise_arms", "touch_head", "touch_ears"
 }
 
 interface ConversationResponse {
@@ -87,10 +88,11 @@ RULES:
 10. For motor instructions, phrase them as fun games — "Let's play a game! Can you..."
 
 You MUST respond with ONLY valid JSON (no markdown, no code blocks) in this exact format:
-{"text":"Your spoken response here","turnType":"greeting|question|instruction|follow_up|farewell","expectsResponse":true,"responseRelevance":0.5,"shouldEnd":false,"domain":"social|cognitive|language|motor|general"}
+{"text":"Your spoken response here","turnType":"greeting|question|instruction|follow_up|farewell","expectsResponse":true,"responseRelevance":0.5,"shouldEnd":false,"domain":"social|cognitive|language|motor|general","action":null}
 
 For responseRelevance: rate how relevant the child's LAST response was to your LAST question (0.0 = no response or completely irrelevant, 0.5 = somewhat relevant, 1.0 = perfect response). Use 0.5 for the first turn.
-For shouldEnd: set to true ONLY on your farewell turn (after 5-8 assistant turns).`;
+For shouldEnd: set to true ONLY on your farewell turn (after 5-8 assistant turns).
+For action: when domain is "motor" and turnType is "instruction", include one of: "wave", "touch_nose", "clap", "raise_arms", "touch_head", "touch_ears". For non-motor turns, set to null.`;
 }
 
 /* ------------------------------------------------------------------ */
@@ -108,7 +110,7 @@ function buildFallbackTurn(
     },
     {
       text: `Awesome! Let's start with something fun. Can you wave hello to me?`,
-      metadata: { turnType: "instruction", expectsResponse: true, responseRelevance: 0.5, shouldEnd: false, domain: "motor" },
+      metadata: { turnType: "instruction", expectsResponse: true, responseRelevance: 0.5, shouldEnd: false, domain: "motor", action: "wave" },
     },
     {
       text: `Great job! Now tell me, what color is the sky?`,
@@ -120,7 +122,7 @@ function buildFallbackTurn(
     },
     {
       text: `That's wonderful! Now let's try something silly. Can you touch your nose?`,
-      metadata: { turnType: "instruction", expectsResponse: true, responseRelevance: 0.5, shouldEnd: false, domain: "motor" },
+      metadata: { turnType: "instruction", expectsResponse: true, responseRelevance: 0.5, shouldEnd: false, domain: "motor", action: "touch_nose" },
     },
     {
       text: `You're a superstar! What's your favorite animal?`,
@@ -153,6 +155,7 @@ function parseAgentResponse(raw: string): Omit<ConversationResponse, "fallback">
           responseRelevance: typeof parsed.responseRelevance === "number" ? parsed.responseRelevance : 0.5,
           shouldEnd: parsed.shouldEnd === true,
           domain: parsed.domain ?? "general",
+          ...(parsed.action ? { action: parsed.action } : {}),
         },
       };
     }
@@ -175,6 +178,7 @@ function parseAgentResponse(raw: string): Omit<ConversationResponse, "fallback">
             responseRelevance: typeof parsed.responseRelevance === "number" ? parsed.responseRelevance : 0.5,
             shouldEnd: parsed.shouldEnd === true,
             domain: parsed.domain ?? "general",
+            ...(parsed.action ? { action: parsed.action } : {}),
           },
         };
       }
 
@@ -0,0 +1,267 @@
+/**
+ * useActionCamera — manages camera + YOLO inference + action detection
+ * for Step 7 motor instruction verification.
+ *
+ * Reuses the existing inference.worker.ts in body-only mode (YOLO + TCN)
+ * but only extracts keypoints for rule-based action detection.
+ */
+
+"use client";
+import { useState, useEffect, useRef, useCallback } from "react";
+import type { PipelineResult, WorkerOutMessage } from "../types/inference";
+import { ActionTracker, type ActionId, type ActionResult } from "../lib/actions/actionDetector";
+
+// Bones of the pose overlay. Each entry is a pair of COCO-17 keypoint indices
+// that get joined by a line (same table as DetectorVideoCanvas). The region
+// labels below assume the standard COCO keypoint order.
+const SKELETON: [number, number][] = [
+  // face
+  [0, 1],
+  [0, 2],
+  [1, 3],
+  [2, 4],
+  // arms
+  [5, 7],
+  [7, 9],
+  [6, 8],
+  [8, 10],
+  // torso
+  [5, 6],
+  [5, 11],
+  [6, 12],
+  [11, 12],
+  // legs
+  [11, 13],
+  [13, 15],
+  [12, 14],
+  [14, 16],
+];
+
+/** Values and controls exposed by the `useActionCamera` hook. */
+export interface UseActionCameraReturn {
+  // ---- DOM refs the caller attaches to its own elements ----
+
+  /** Ref for the `<video>` element that receives the webcam stream and is sampled for inference. */
+  videoRef: React.RefObject<HTMLVideoElement | null>;
+  /** Ref for the `<canvas>` on which the detected skeleton is drawn. */
+  overlayRef: React.RefObject<HTMLCanvasElement | null>;
+
+  // ---- Status ----
+
+  /** True once the inference worker has reported that it is initialised. */
+  isModelLoaded: boolean;
+  /** True after `startCamera` succeeded and until `stopCamera` is called. */
+  isActive: boolean;
+  /** Message from a failed worker creation or camera request; `null` when there is none. */
+  cameraError: string | null;
+
+  // ---- Controls ----
+
+  /** Requests the camera and marks the hook active; failures are reported through `cameraError`. */
+  startCamera: () => Promise<void>;
+  /** Stops all camera tracks, detaches the stream from the video and marks the hook inactive. */
+  stopCamera: () => void;
+  /** Begins checking incoming poses for `action`, discarding any previous detection state. */
+  startDetecting: (action: ActionId) => void;
+  /** Stops checking incoming poses; the last reported result is left untouched. */
+  stopDetecting: () => void;
+
+  // ---- Detection output ----
+
+  /** Most recent tracker output for the target action, or `null` before the first one. */
+  actionResult: ActionResult | null;
+  /** True once the tracker has reported the target action as confirmed. */
+  actionDetected: boolean;
+  /** Latest keypoint coordinates, read by the hook as interleaved x, y pairs; `null` before the first result. */
+  keypoints: Float32Array | null;
+  /** Latest per-keypoint confidence values; `null` before the first result. */
+  confidence: Float32Array | null;
+}
+
+/** Width of the frame sent to the worker; keypoints are drawn assuming this space. */
+const CAPTURE_WIDTH = 320;
+/** Height of the frame sent to the worker; keypoints are drawn assuming this space. */
+const CAPTURE_HEIGHT = 240;
+/** Number of keypoints expected per pose (COCO-17). */
+const KEYPOINT_COUNT = 17;
+/** Keypoints below this confidence are not drawn. */
+const MIN_DRAW_CONFIDENCE = 0.3;
+
+/**
+ * Owns the webcam stream, the pose-inference worker and the action tracker
+ * used to verify a requested motor action.
+ *
+ * Lifecycle:
+ *  - On mount a worker is created and sent `init`; once it answers
+ *    `initialized` it is switched to the `"body"` modality.
+ *  - `startCamera` acquires a 320×240 front-facing stream and sets `isActive`.
+ *  - While active and loaded, a requestAnimationFrame loop copies the video
+ *    into an off-screen canvas and posts one frame at a time to the worker.
+ *  - Every result updates `keypoints`/`confidence`, redraws the overlay and,
+ *    while detecting, feeds the tracker until it reports `confirmed`.
+ *
+ * @returns Refs to attach to the `<video>`/overlay `<canvas>`, status flags,
+ *          camera/detection controls and the latest detection output.
+ */
+export function useActionCamera(): UseActionCameraReturn {
+  const videoRef = useRef<HTMLVideoElement | null>(null);
+  const overlayRef = useRef<HTMLCanvasElement | null>(null);
+  const captureCanvasRef = useRef<HTMLCanvasElement | null>(null);
+
+  const [isModelLoaded, setIsModelLoaded] = useState(false);
+  const [isActive, setIsActive] = useState(false);
+  const [cameraError, setCameraError] = useState<string | null>(null);
+  const [actionResult, setActionResult] = useState<ActionResult | null>(null);
+  const [actionDetected, setActionDetected] = useState(false);
+  const [keypoints, setKeypoints] = useState<Float32Array | null>(null);
+  const [confidence, setConfidence] = useState<Float32Array | null>(null);
+
+  const workerRef = useRef<Worker | null>(null);
+  // True while a frame is in flight, so at most one frame is queued in the worker.
+  const busyRef = useRef(false);
+  const rafRef = useRef(0);
+  const streamRef = useRef<MediaStream | null>(null);
+  const trackerRef = useRef(new ActionTracker());
+  const targetActionRef = useRef<ActionId | null>(null);
+  const detectingRef = useRef(false);
+  // Lets async work (the camera permission prompt) notice an unmount.
+  const mountedRef = useRef(true);
+
+  // Draw skeleton overlay
+  const drawSkeleton = useCallback((kps: Float32Array, conf: Float32Array) => {
+    const canvas = overlayRef.current;
+    if (!canvas) return;
+    const ctx = canvas.getContext("2d");
+    if (!ctx) return;
+
+    const w = canvas.width;
+    const h = canvas.height;
+    ctx.clearRect(0, 0, w, h);
+
+    // Need an x and a y for every keypoint; otherwise leave the overlay blank.
+    if (kps.length < KEYPOINT_COUNT * 2) return;
+
+    // Scale keypoints from the 320×240 capture frame to the canvas size
+    const scaleX = w / CAPTURE_WIDTH;
+    const scaleY = h / CAPTURE_HEIGHT;
+
+    // Draw bones
+    ctx.strokeStyle = "rgba(104, 159, 56, 0.8)";
+    ctx.lineWidth = 2.5;
+    for (const [a, b] of SKELETON) {
+      if (conf[a] < MIN_DRAW_CONFIDENCE || conf[b] < MIN_DRAW_CONFIDENCE) continue;
+      ctx.beginPath();
+      ctx.moveTo(kps[a * 2] * scaleX, kps[a * 2 + 1] * scaleY);
+      ctx.lineTo(kps[b * 2] * scaleX, kps[b * 2 + 1] * scaleY);
+      ctx.stroke();
+    }
+
+    // Draw keypoints
+    ctx.fillStyle = "rgba(104, 159, 56, 0.9)";
+    for (let i = 0; i < KEYPOINT_COUNT; i++) {
+      if (conf[i] < MIN_DRAW_CONFIDENCE) continue;
+      ctx.beginPath();
+      ctx.arc(kps[i * 2] * scaleX, kps[i * 2 + 1] * scaleY, 4, 0, Math.PI * 2);
+      ctx.fill();
+    }
+  }, []);
+
+  // Handle inference result
+  const handleResult = useCallback((result: PipelineResult) => {
+    const kps = result.keypoints;
+    const conf = result.confidence;
+    if (!kps || !conf) return;
+
+    setKeypoints(kps);
+    setConfidence(conf);
+    drawSkeleton(kps, conf);
+
+    const target = targetActionRef.current;
+    if (!detectingRef.current || !target) return;
+
+    const tracked = trackerRef.current.update(kps, conf, target);
+    setActionResult(tracked);
+    if (tracked.confirmed) {
+      setActionDetected(true);
+      // Stop feeding the tracker once the action is confirmed.
+      detectingRef.current = false;
+    }
+  }, [drawSkeleton]);
+
+  // Create & initialise worker on mount
+  useEffect(() => {
+    let worker: Worker;
+    try {
+      worker = new Worker(
+        new URL("../../workers/inference.worker.ts", import.meta.url),
+        { type: "module" },
+      );
+    } catch (err) {
+      setCameraError(`Failed to create inference worker: ${err instanceof Error ? err.message : String(err)}`);
+      return;
+    }
+    workerRef.current = worker;
+
+    worker.onmessage = (e: MessageEvent<WorkerOutMessage>) => {
+      const msg = e.data;
+      switch (msg.type) {
+        case "initialized":
+          setIsModelLoaded(true);
+          // Set body-only mode
+          worker.postMessage({ type: "setModality", modality: "body" });
+          break;
+        case "result":
+          // Release the slot first so an exception while handling the result
+          // cannot leave the frame loop blocked.
+          busyRef.current = false;
+          handleResult(msg.data);
+          break;
+        case "error":
+          busyRef.current = false;
+          break;
+      }
+    };
+
+    // An uncaught exception inside the worker never produces a "result" or
+    // "error" message, so release the in-flight slot here as well.
+    worker.onerror = () => {
+      busyRef.current = false;
+    };
+
+    worker.postMessage({ type: "init" });
+
+    return () => {
+      worker.terminate();
+      workerRef.current = null;
+      // A terminated worker will never answer a pending frame.
+      busyRef.current = false;
+    };
+  }, [handleResult]);
+
+  // Frame capture loop
+  const sendFrame = useCallback(() => {
+    const worker = workerRef.current;
+    const video = videoRef.current;
+
+    if (!worker || !video || !isActive || !isModelLoaded || busyRef.current || video.paused) {
+      // Not ready for a frame yet; keep polling only while the camera is on.
+      if (isActive) rafRef.current = requestAnimationFrame(sendFrame);
+      return;
+    }
+
+    try {
+      if (!captureCanvasRef.current) {
+        captureCanvasRef.current = document.createElement("canvas");
+        captureCanvasRef.current.width = CAPTURE_WIDTH;
+        captureCanvasRef.current.height = CAPTURE_HEIGHT;
+      }
+      const ctx = captureCanvasRef.current.getContext("2d", { willReadFrequently: true });
+      if (ctx) {
+        ctx.drawImage(video, 0, 0, CAPTURE_WIDTH, CAPTURE_HEIGHT);
+        const imageData = ctx.getImageData(0, 0, CAPTURE_WIDTH, CAPTURE_HEIGHT);
+
+        busyRef.current = true;
+        // The pixel buffer is transferred, not copied.
+        worker.postMessage({ type: "processFrame", imageData }, [imageData.data.buffer]);
+      }
+    } catch {
+      // Frame capture error — skip this frame, but release the in-flight slot:
+      // if posting failed no reply will ever arrive to clear it.
+      busyRef.current = false;
+    }
+
+    rafRef.current = requestAnimationFrame(sendFrame);
+  }, [isActive, isModelLoaded]);
+
+  // Start/stop frame loop when active changes
+  useEffect(() => {
+    if (isActive && isModelLoaded) {
+      rafRef.current = requestAnimationFrame(sendFrame);
+    }
+    return () => cancelAnimationFrame(rafRef.current);
+  }, [isActive, isModelLoaded, sendFrame]);
+
+  // Attach the stream late when the <video> element was not mounted yet at
+  // the moment startCamera resolved (e.g. it is rendered only once active).
+  useEffect(() => {
+    const video = videoRef.current;
+    const stream = streamRef.current;
+    if (!isActive || !video || !stream || video.srcObject === stream) return;
+    video.srcObject = stream;
+    void video.play().catch(() => {});
+  }, [isActive]);
+
+  const startCamera = useCallback(async () => {
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({
+        video: { width: CAPTURE_WIDTH, height: CAPTURE_HEIGHT, facingMode: "user" },
+      });
+
+      // The component went away while the permission prompt was open:
+      // release the camera immediately instead of leaking it.
+      if (!mountedRef.current) {
+        stream.getTracks().forEach((t) => t.stop());
+        return;
+      }
+
+      // A repeated call must not orphan the tracks of the previous stream.
+      const previous = streamRef.current;
+      if (previous && previous !== stream) {
+        previous.getTracks().forEach((t) => t.stop());
+      }
+
+      streamRef.current = stream;
+      if (videoRef.current) {
+        videoRef.current.srcObject = stream;
+        await videoRef.current.play().catch(() => {});
+      }
+      setIsActive(true);
+      setCameraError(null);
+    } catch (err) {
+      setCameraError(
+        err instanceof Error ? err.message : "Camera access denied",
+      );
+    }
+  }, []);
+
+  const stopCamera = useCallback(() => {
+    setIsActive(false);
+    cancelAnimationFrame(rafRef.current);
+    if (streamRef.current) {
+      streamRef.current.getTracks().forEach((t) => t.stop());
+      streamRef.current = null;
+    }
+    if (videoRef.current) {
+      videoRef.current.srcObject = null;
+    }
+  }, []);
+
+  const startDetecting = useCallback((action: ActionId) => {
+    targetActionRef.current = action;
+    detectingRef.current = true;
+    trackerRef.current.reset();
+    setActionDetected(false);
+    setActionResult(null);
+  }, []);
+
+  const stopDetecting = useCallback(() => {
+    targetActionRef.current = null;
+    detectingRef.current = false;
+  }, []);
+
+  // Track mount state and clean up on unmount
+  useEffect(() => {
+    // Set here (not only at ref creation) so a re-run of the effect after a
+    // cleanup marks the hook as mounted again.
+    mountedRef.current = true;
+    return () => {
+      mountedRef.current = false;
+      cancelAnimationFrame(rafRef.current);
+      if (streamRef.current) {
+        streamRef.current.getTracks().forEach((t) => t.stop());
+        streamRef.current = null;
+      }
+    };
+  }, []);
+
+  return {
+    videoRef,
+    overlayRef,
+    isModelLoaded,
+    isActive,
+    cameraError,
+    startCamera,
+    stopCamera,
+    startDetecting,
+    stopDetecting,
+    actionResult,
+    actionDetected,
+    keypoints,
+    confidence,
+  };
+}