Commit 772473d

feat(depth-estimation): refactor to TransformSkillBase + privacy-first defaults
Refactors the depth-estimation skill to subclass TransformSkillBase, reducing transform.py from ~160 lines of boilerplate to ~100 lines of pure skill logic. Key changes:

- Default blend_mode changed to 'depth_only' for privacy anonymization
- Version bumped to 1.1.0, category set to 'privacy'
- SKILL.md documents the TransformSkillBase interface for new skill authors
- Protocol updated: frame_id tracking, config-update command, base64 output
- Adds on_config_update() for live parameter changes from Aegis
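For context on the boilerplate being deleted below, here is a minimal sketch of the stdin/stdout dispatch loop a base class like `TransformSkillBase` presumably provides, inferred from the protocol documented in SKILL.md. `transform_base.py` itself is not part of this commit, so apart from the `load_model`, `transform_frame`, and `on_config_update` hooks named in the diff, every detail here is an assumption:

```python
# Hypothetical sketch only: transform_base.py is not in this diff, and the
# real base class may differ. Inferred from the SKILL.md protocol below.
import base64
import json
import sys

import cv2


class TransformSkillSketch:
    device = "cpu"  # the real base class presumably auto-selects cuda/mps/cpu

    def load_model(self, config: dict) -> dict:
        raise NotImplementedError  # subclass loads its model, returns ready-info

    def transform_frame(self, image, metadata: dict):
        raise NotImplementedError  # subclass returns a transformed BGR image

    def on_config_update(self, config: dict):
        pass  # optional hook for live parameter changes

    def run(self, config: dict | None = None) -> None:
        info = self.load_model(config or {})
        print(json.dumps({"event": "ready", **info}), flush=True)

        for line in sys.stdin:
            if not line.strip():
                continue
            msg = json.loads(line)
            if msg.get("command") == "stop":
                break
            if msg.get("command") == "config-update":
                self.on_config_update(msg.get("config", {}))
            elif msg.get("event") == "frame":
                image = cv2.imread(msg["frame_path"])  # BGR frame from Aegis
                output = self.transform_frame(image, msg)
                _, jpeg = cv2.imencode(".jpg", output)
                print(json.dumps({
                    "event": "transform",
                    "frame_id": msg.get("frame_id"),
                    "camera_id": msg.get("camera_id"),
                    "transform_data": base64.b64encode(jpeg.tobytes()).decode("ascii"),
                }), flush=True)
```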
1 parent e98df9d commit 772473d

2 files changed: 133 additions & 133 deletions

skills/transformation/depth-estimation/SKILL.md

Lines changed: 31 additions & 7 deletions
````diff
@@ -1,7 +1,8 @@
 ---
 name: depth-estimation
-description: "Real-time depth map estimation using Depth Anything v2"
-version: 1.0.0
+description: "Real-time depth map estimation for privacy transforms using Depth Anything v2"
+version: 1.1.0
+category: privacy
 
 parameters:
   - name: model
@@ -14,8 +15,8 @@ parameters:
   - name: blend_mode
     label: "Display Mode"
     type: select
-    options: ["overlay", "side_by_side", "depth_only"]
-    default: "overlay"
+    options: ["depth_only", "overlay", "side_by_side"]
+    default: "depth_only"
     group: Display
 
   - name: opacity
@@ -46,27 +47,50 @@ capabilities:
   description: "Real-time depth estimation overlay on live feed"
 ---
 
-# Depth Estimation
+# Depth Estimation (Privacy)
 
 Real-time monocular depth estimation using Depth Anything v2. Transforms camera feeds with colorized depth maps — near objects appear warm, far objects appear cool.
 
+When used for **privacy mode**, the `depth_only` blend mode fully anonymizes the scene while preserving spatial layout and activity, enabling security monitoring without revealing identities.
+
 ## What You Get
 
+- **Privacy anonymization** — depth-only mode hides all visual identity
 - **Depth overlays** on live camera feeds
 - **Distance estimation** — approximate distance to detected objects
 - **3D scene understanding** — spatial layout of the scene
 
+## Interface: TransformSkillBase
+
+This skill implements the `TransformSkillBase` interface. Any new privacy skill can be created by subclassing `TransformSkillBase` and implementing two methods:
+
+```python
+from transform_base import TransformSkillBase
+
+class MyPrivacySkill(TransformSkillBase):
+    def load_model(self, config):
+        # Load your model, return {"model": "...", "device": "..."}
+        ...
+
+    def transform_frame(self, image, metadata):
+        # Transform BGR image, return BGR image
+        ...
+```
+
 ## Protocol
 
 ### Aegis → Skill (stdin)
 ```jsonl
-{"event": "frame", "camera_id": "front_door", "frame_path": "/tmp/frame.jpg", "timestamp": "..."}
+{"event": "frame", "frame_id": "cam1_1710001", "camera_id": "front_door", "frame_path": "/tmp/frame.jpg", "timestamp": "..."}
+{"command": "config-update", "config": {"opacity": 0.8, "blend_mode": "overlay"}}
+{"command": "stop"}
 ```
 
 ### Skill → Aegis (stdout)
 ```jsonl
 {"event": "ready", "model": "depth-anything-v2-small", "device": "mps"}
-{"event": "transformed_frame", "camera_id": "front_door", "frame_path": "/tmp/depth_001.jpg", "metadata": {"min_depth": 0.2, "max_depth": 15.0}}
+{"event": "transform", "frame_id": "cam1_1710001", "camera_id": "front_door", "transform_data": "<base64 JPEG>"}
+{"event": "perf_stats", "total_frames": 50, "timings_ms": {"transform": {"avg": 45.2, ...}}}
 ```
 
 ## Setup
````
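Since transformed frames now arrive as inline base64 rather than temp-file paths, the consuming side reduces to a decode step. A minimal sketch of what an Aegis-side handler for the new `transform` event might look like (the function name and surrounding plumbing are hypothetical, not part of this commit):

```python
# Hypothetical consumer of the new protocol; only the decode step is
# prescribed by the commit, the rest is illustrative scaffolding.
import base64

import cv2
import numpy as np


def handle_transform_event(msg: dict):
    """Decode a {"event": "transform", ...} message into a BGR frame."""
    jpeg = base64.b64decode(msg["transform_data"])
    frame = cv2.imdecode(np.frombuffer(jpeg, np.uint8), cv2.IMREAD_COLOR)
    # frame_id lets Aegis pair this output with the exact frame it sent.
    return msg["frame_id"], msg["camera_id"], frame
```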
skills/transformation/depth-estimation/transform.py

Lines changed: 102 additions & 126 deletions
````diff
@@ -1,56 +1,25 @@
 #!/usr/bin/env python3
 """
-Depth Estimation Skill — Real-time monocular depth maps.
+Depth Estimation Privacy Skill — Monocular depth maps via Depth Anything v2.
 
-Transforms camera frames with Depth Anything v2 colorized depth overlays.
+Implements the TransformSkillBase interface to provide real-time depth map
+overlays on camera feeds. When used as a privacy skill, the depth-only mode
+anonymizes the scene while preserving spatial layout and activity recognition.
+
+Usage:
+    python transform.py --model depth-anything-v2-small --device auto
+    python transform.py --config config.json
 """
 
 import sys
-import json
 import argparse
-import signal
-import tempfile
 from pathlib import Path
 
+# Import the base class from the same directory
+_script_dir = Path(__file__).resolve().parent
+sys.path.insert(0, str(_script_dir))
 
-def parse_args():
-    parser = argparse.ArgumentParser(description="Depth Estimation Skill")
-    parser.add_argument("--config", type=str)
-    parser.add_argument("--model", type=str, default="depth-anything-v2-small")
-    parser.add_argument("--colormap", type=str, default="inferno")
-    parser.add_argument("--blend-mode", type=str, default="overlay")
-    parser.add_argument("--opacity", type=float, default=0.5)
-    parser.add_argument("--device", type=str, default="auto")
-    return parser.parse_args()
-
-
-def load_config(args):
-    if args.config and Path(args.config).exists():
-        with open(args.config) as f:
-            return json.load(f)
-    return {
-        "model": args.model,
-        "colormap": args.colormap,
-        "blend_mode": args.blend_mode,
-        "opacity": args.opacity,
-        "device": args.device,
-    }
-
-
-def select_device(pref):
-    if pref != "auto":
-        return pref
-    try:
-        import torch
-        if torch.cuda.is_available(): return "cuda"
-        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): return "mps"
-    except ImportError:
-        pass
-    return "cpu"
-
-
-def emit(event):
-    print(json.dumps(event), flush=True)
+from transform_base import TransformSkillBase, _log  # noqa: E402
 
 
 COLORMAP_MAP = {
@@ -62,94 +31,101 @@ def emit(event):
 }
 
 
-def main():
-    args = parse_args()
-    config = load_config(args)
-    device = select_device(config.get("device", "auto"))
+class DepthEstimationSkill(TransformSkillBase):
+    """
+    Depth estimation using Depth Anything v2.
+
+    Produces colorized depth maps that can be blended with the original frame
+    (overlay mode), shown side-by-side, or displayed as depth-only anonymized view.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._tag = "DepthEstimation"
+        self.model = None
+        self.colormap_id = 1
+        self.opacity = 0.5
+        self.blend_mode = "depth_only"  # Default for privacy: depth_only anonymizes
+
+    def parse_extra_args(self, parser: argparse.ArgumentParser):
+        parser.add_argument("--model", type=str, default="depth-anything-v2-small",
+                            choices=["depth-anything-v2-small", "depth-anything-v2-base",
+                                     "depth-anything-v2-large", "midas-small"])
+        parser.add_argument("--colormap", type=str, default="inferno",
+                            choices=list(COLORMAP_MAP.keys()))
+        parser.add_argument("--blend-mode", type=str, default="depth_only",
+                            choices=["overlay", "side_by_side", "depth_only"])
+        parser.add_argument("--opacity", type=float, default=0.5)
+
+    def load_model(self, config: dict) -> dict:
+        import torch
 
-    try:
+        model_name = config.get("model", "depth-anything-v2-small")
+        self.colormap_id = COLORMAP_MAP.get(config.get("colormap", "inferno"), 1)
+        self.opacity = config.get("opacity", 0.5)
+        self.blend_mode = config.get("blend_mode", "depth_only")
+
+        _log(f"Loading {model_name} on {self.device}", self._tag)
+
+        # Load model via torch hub
+        hub_name = model_name.replace("-", "_")
+        self.model = torch.hub.load(
+            "LiheYoung/Depth-Anything-V2",
+            hub_name,
+            trust_repo=True,
+        )
+        self.model.to(self.device)
+        self.model.eval()
+
+        _log(f"Model loaded: {model_name} on {self.device}", self._tag)
+
+        return {
+            "model": model_name,
+            "device": self.device,
+            "blend_mode": self.blend_mode,
+            "colormap": config.get("colormap", "inferno"),
+        }
+
+    def transform_frame(self, image, metadata: dict):
         import torch
         import cv2
         import numpy as np
 
-        model_name = config.get("model", "depth-anything-v2-small")
-        model = torch.hub.load("LiheYoung/Depth-Anything-V2", model_name.replace("-", "_"), trust_repo=True)
-        model.to(device)
-        model.eval()
-
-        emit({"event": "ready", "model": model_name, "device": device})
-    except Exception as e:
-        emit({"event": "error", "message": f"Failed to load model: {e}", "retriable": False})
-        sys.exit(1)
-
-    running = True
-    def handle_signal(s, f):
-        nonlocal running
-        running = False
-    signal.signal(signal.SIGTERM, handle_signal)
-    signal.signal(signal.SIGINT, handle_signal)
-
-    colormap_id = COLORMAP_MAP.get(config.get("colormap", "inferno"), 1)
-    opacity = config.get("opacity", 0.5)
-    blend_mode = config.get("blend_mode", "overlay")
-
-    for line in sys.stdin:
-        if not running:
-            break
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            msg = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-
-        if msg.get("command") == "stop":
-            break
-
-        if msg.get("event") == "frame":
-            frame_path = msg.get("frame_path")
-            if not frame_path or not Path(frame_path).exists():
-                continue
-
-            try:
-                import torch
-                import cv2
-                import numpy as np
-
-                image = cv2.imread(frame_path)
-                rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-
-                with torch.no_grad():
-                    depth = model.infer_image(rgb)
-
-                # Normalize depth to 0-255
-                depth_norm = ((depth - depth.min()) / (depth.max() - depth.min() + 1e-8) * 255).astype(np.uint8)
-                depth_colored = cv2.applyColorMap(depth_norm, colormap_id)
-
-                if blend_mode == "overlay":
-                    output = cv2.addWeighted(image, 1 - opacity, depth_colored, opacity, 0)
-                elif blend_mode == "side_by_side":
-                    output = np.hstack([image, depth_colored])
-                else:  # depth_only
-                    output = depth_colored
-
-                out_path = tempfile.mktemp(suffix=".jpg", dir="/tmp")
-                cv2.imwrite(out_path, output, [cv2.IMWRITE_JPEG_QUALITY, 90])
-
-                emit({
-                    "event": "transformed_frame",
-                    "camera_id": msg.get("camera_id", "unknown"),
-                    "timestamp": msg.get("timestamp", ""),
-                    "frame_path": out_path,
-                    "metadata": {
-                        "min_depth": float(depth.min()),
-                        "max_depth": float(depth.max()),
-                    },
-                })
-            except Exception as e:
-                emit({"event": "error", "message": f"Depth error: {e}", "retriable": True})
+        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        with torch.no_grad():
+            depth = self.model.infer_image(rgb)
+
+        # Normalize depth to 0-255
+        d_min, d_max = depth.min(), depth.max()
+        depth_norm = ((depth - d_min) / (d_max - d_min + 1e-8) * 255).astype(np.uint8)
+        depth_colored = cv2.applyColorMap(depth_norm, self.colormap_id)
+
+        if self.blend_mode == "overlay":
+            output = cv2.addWeighted(image, 1 - self.opacity, depth_colored, self.opacity, 0)
+        elif self.blend_mode == "side_by_side":
+            output = np.hstack([image, depth_colored])
+        else:  # depth_only — full anonymization
+            output = depth_colored
+
+        return output
+
+    def on_config_update(self, config: dict):
+        """Handle live config updates from Aegis."""
+        if "colormap" in config:
+            self.colormap_id = COLORMAP_MAP.get(config["colormap"], self.colormap_id)
+            _log(f"Colormap updated: {config['colormap']}", self._tag)
+        if "opacity" in config:
+            self.opacity = float(config["opacity"])
+            _log(f"Opacity updated: {self.opacity}", self._tag)
+        if "blend_mode" in config:
+            self.blend_mode = config["blend_mode"]
+            _log(f"Blend mode updated: {self.blend_mode}", self._tag)
+
+    def get_output_mode(self) -> str:
+        """Use base64 for privacy transforms — avoids temp file cleanup issues."""
+        return "base64"
 
 
 if __name__ == "__main__":
-    main()
+    DepthEstimationSkill().run()
````
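The refactored skill can be smoke-tested end to end over stdin/stdout. A hedged sketch, assuming the script lives at `skills/transformation/depth-estimation/transform.py` and a test image exists at `/tmp/frame.jpg`; note that `perf_stats` events may interleave with `transform` events, so the reader scans rather than assumes ordering:

```python
# Hypothetical smoke test; paths and the frame_id value are placeholders.
import base64
import json
import subprocess

proc = subprocess.Popen(
    ["python", "transform.py", "--blend-mode", "depth_only"],
    cwd="skills/transformation/depth-estimation",
    stdin=subprocess.PIPE, stdout=subprocess.PIPE, text=True,
)

ready = json.loads(proc.stdout.readline())  # first line should be the ready event
assert ready["event"] == "ready", ready

for msg in (
    {"event": "frame", "frame_id": "cam1_1", "camera_id": "front_door",
     "frame_path": "/tmp/frame.jpg", "timestamp": "2024-01-01T00:00:00Z"},
    {"command": "config-update", "config": {"opacity": 0.8}},
    {"command": "stop"},
):
    proc.stdin.write(json.dumps(msg) + "\n")
proc.stdin.flush()

# Scan until the transform for our frame arrives; ignore anything else.
for line in proc.stdout:
    event = json.loads(line)
    if event.get("event") == "transform" and event.get("frame_id") == "cam1_1":
        jpeg = base64.b64decode(event["transform_data"])
        print(f"got {len(jpeg)} JPEG bytes for frame {event['frame_id']}")
        break
proc.wait()
```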
