SharpAI
diff --git a/‎skills.json‎
Lines changed: 0 additions & 70 deletions b/‎skills.json‎
Lines changed: 0 additions & 70 deletions
diff --git a/‎skills/analysis/sam2-segmentation/requirements.txt‎
Lines changed: 7 additions & 0 deletions b/‎skills/analysis/sam2-segmentation/requirements.txt‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎skills/analysis/sam2-segmentation/scripts/segment.py‎
Lines changed: 149 additions & 0 deletions b/‎skills/analysis/sam2-segmentation/scripts/segment.py‎
Lines changed: 149 additions & 0 deletions
diff --git a/‎skills/analysis/vlm-scene-analysis/requirements.txt‎
Lines changed: 5 additions & 0 deletions b/‎skills/analysis/vlm-scene-analysis/requirements.txt‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎skills/analysis/vlm-scene-analysis/scripts/analyze.py‎
Lines changed: 149 additions & 0 deletions b/‎skills/analysis/vlm-scene-analysis/scripts/analyze.py‎
Lines changed: 149 additions & 0 deletions
@@ -242,41 +242,6 @@
         "training_pipeline"
       ]
     },
-    {
-      "id": "camera-provider-blink",
-      "name": "Blink Cameras",
-      "description": "Amazon Blink camera integration — motion clips, snapshots, arm/disarm.",
-      "version": "1.0.0",
-      "category": "camera-providers",
-      "path": "skills/camera-providers/blink",
-      "tags": [
-        "blink",
-        "amazon",
-        "camera",
-        "clips"
-      ],
-      "platforms": [
-        "linux-x64",
-        "linux-arm64",
-        "darwin-arm64",
-        "darwin-x64",
-        "win-x64"
-      ],
-      "requirements": {
-        "python": ">=3.9",
-        "ram_gb": 1
-      },
-      "capabilities": [
-        "clip_feed",
-        "discover_cameras",
-        "snapshot",
-        "arm_disarm"
-      ],
-      "ui_unlocks": [
-        "camera_timeline",
-        "clip_feed"
-      ]
-    },
     {
       "id": "camera-provider-eufy",
       "name": "Eufy Cameras",
@@ -386,41 +351,6 @@
         "live_view"
       ]
     },
-    {
-      "id": "camera-provider-ring",
-      "name": "Ring Cameras",
-      "description": "Ring camera integration — event clips and live view.",
-      "version": "1.0.0",
-      "category": "camera-providers",
-      "path": "skills/camera-providers/ring",
-      "tags": [
-        "ring",
-        "amazon",
-        "camera",
-        "doorbell"
-      ],
-      "platforms": [
-        "linux-x64",
-        "linux-arm64",
-        "darwin-arm64",
-        "darwin-x64",
-        "win-x64"
-      ],
-      "requirements": {
-        "python": ">=3.9",
-        "ram_gb": 1
-      },
-      "capabilities": [
-        "clip_feed",
-        "discover_cameras",
-        "live_stream"
-      ],
-      "ui_unlocks": [
-        "camera_timeline",
-        "clip_feed",
-        "live_view"
-      ]
-    },
     {
       "id": "go2rtc-cameras",
       "name": "go2rtc Multi-Camera Streaming",
 
@@ -0,0 +1,7 @@
+# SAM2 Segmentation
+segment-anything-2>=0.1.0
+torch>=2.0.0
+torchvision>=0.15.0
+numpy>=1.24.0
+opencv-python-headless>=4.8.0
+Pillow>=10.0.0
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+SAM2 Segmentation Skill — Interactive click-to-segment.
+
+Generates pixel-perfect masks from point/box prompts using Segment Anything 2.
+"""
+
+import sys
+import json
+import argparse
+import signal
+import tempfile
+from pathlib import Path
+
+
+def parse_args():
+    """Parse the command-line options of the SAM2 segmentation skill.
+
+    Returns:
+        argparse.Namespace with ``config`` (``None`` when not given),
+        ``model`` (default ``"sam2-small"``) and ``device`` (default ``"auto"``).
+    """
+    cli = argparse.ArgumentParser(description="SAM2 Segmentation Skill")
+    # Every option is a plain string; only the defaults differ.
+    option_defaults = (
+        ("--config", None),
+        ("--model", "sam2-small"),
+        ("--device", "auto"),
+    )
+    for flag, fallback in option_defaults:
+        cli.add_argument(flag, type=str, default=fallback)
+    return cli.parse_args()
+
+
+def load_config(args):
+    """Build the runtime configuration from CLI arguments and an optional JSON file.
+
+    The CLI values for ``model`` and ``device`` form the base configuration.
+    When ``args.config`` names an existing file, the keys of its JSON object
+    override that base, so a config file that only sets some keys still
+    yields a complete configuration.
+
+    Args:
+        args: Namespace with ``config``, ``model`` and ``device`` attributes.
+
+    Returns:
+        dict containing at least ``"model"`` and ``"device"``.
+
+    Raises:
+        ValueError: If the config file does not contain a JSON object.
+        json.JSONDecodeError: If the config file is not valid JSON.
+    """
+    config = {"model": args.model, "device": args.device}
+    if args.config and Path(args.config).exists():
+        with open(args.config, encoding="utf-8") as f:
+            loaded = json.load(f)
+        if not isinstance(loaded, dict):
+            # Anything else would only fail later, on the first config.get().
+            raise ValueError(f"Config file {args.config} must contain a JSON object")
+        config.update(loaded)
+    return config
+
+
+def select_device(pref):
+    """Resolve the compute device to use.
+
+    Any preference other than ``"auto"`` is returned unchanged. For ``"auto"``
+    the function probes torch and returns ``"cuda"`` when CUDA is available,
+    otherwise ``"mps"`` when the MPS backend exists and is available, and
+    ``"cpu"`` in every other case (including torch not being importable).
+    """
+    if pref == "auto":
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                return "cuda"
+            backends = torch.backends
+            if hasattr(backends, "mps") and backends.mps.is_available():
+                return "mps"
+        except ImportError:
+            # torch is optional at this point; fall through to the CPU default.
+            pass
+        return "cpu"
+    return pref
+
+
+def emit(event):
+    """Serialize ``event`` to JSON and print it as one flushed stdout line."""
+    payload = json.dumps(event)
+    # Flush on every event so the reader of stdout sees it immediately.
+    print(payload, flush=True)
+
+
+def _load_frame(predictor, frame_path):
+    """Read ``frame_path`` as RGB and set it on ``predictor``; return the image or None on failure."""
+    import cv2
+
+    if not frame_path or not Path(frame_path).exists():
+        emit({"event": "error", "message": f"Frame not found: {frame_path}", "retriable": True})
+        return None
+
+    bgr = cv2.imread(frame_path)
+    if bgr is None:
+        # cv2.imread reports an unreadable or corrupt file by returning None
+        # instead of raising, which would otherwise crash cvtColor below.
+        emit({"event": "error", "message": f"Failed to read frame: {frame_path}", "retriable": True})
+        return None
+
+    try:
+        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+        predictor.set_image(rgb)
+    except Exception as e:
+        emit({"event": "error", "message": f"Failed to load frame: {e}", "retriable": True})
+        return None
+    return rgb
+
+
+def _segment_click(predictor, msg):
+    """Run one point-prompt prediction for ``msg`` and return the event dict to emit."""
+    import os
+
+    import cv2
+    import numpy as np
+
+    x, y = msg.get("x", 0), msg.get("y", 0)
+    label = msg.get("label", 1)  # 1=foreground, 0=background
+
+    try:
+        masks, scores, _ = predictor.predict(
+            point_coords=np.array([[x, y]]),
+            point_labels=np.array([label]),
+            multimask_output=True,
+        )
+
+        # Use the highest-scoring of the candidate masks.
+        best_idx = int(np.argmax(scores))
+        mask = masks[best_idx]
+        score = float(scores[best_idx])
+
+        # Compute the bbox before writing anything, so an empty mask does not
+        # leave an orphaned file behind.
+        ys, xs = np.where(mask)
+        if xs.size == 0:
+            return {"event": "error", "message": "Segmentation error: empty mask", "retriable": True}
+        bbox = [int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())]
+
+        # mkstemp creates the file atomically (mktemp is race-prone) and uses
+        # the platform's temp directory rather than a hard-coded /tmp.
+        fd, mask_path = tempfile.mkstemp(suffix=".png")
+        os.close(fd)
+        if not cv2.imwrite(mask_path, (mask * 255).astype(np.uint8)):
+            os.unlink(mask_path)
+            raise OSError(f"could not write mask to {mask_path}")
+
+        return {
+            "event": "segmentation",
+            "frame_number": msg.get("frame_number", 0),
+            "mask_path": mask_path,
+            "score": round(score, 3),
+            "bbox": bbox,
+        }
+    except Exception as e:
+        return {"event": "error", "message": f"Segmentation error: {e}", "retriable": True}
+
+
+def main():
+    """Run the SAM2 segmentation skill as a JSON-lines stdin/stdout service.
+
+    Start-up: loads the configured SAM2 model on the selected device and emits
+    a ``ready`` event, or emits a non-retriable ``error`` event and exits with
+    status 1 when loading fails.
+
+    Message loop (one JSON object per stdin line):
+      * ``{"command": "stop"}`` ends the loop.
+      * ``{"event": "frame", "frame_path": ...}`` loads the image and makes it
+        the current frame; an ``error`` event is emitted if it cannot be loaded.
+      * ``{"event": "click", "x": ..., "y": ..., "label": ...}`` segments the
+        current frame at that point and emits a ``segmentation`` event (or an
+        ``error`` event). Clicks received before any frame are ignored.
+    Blank lines, invalid JSON and non-object JSON values are skipped.
+
+    SIGTERM/SIGINT stop the loop: immediately when idle, or after the message
+    currently being processed has been handled.
+    """
+    args = parse_args()
+    config = load_config(args)
+    device = select_device(config.get("device", "auto"))
+
+    try:
+        # Imported here so a missing dependency is reported at start-up as a
+        # structured error event rather than on the first message.
+        import torch  # noqa: F401
+        import numpy  # noqa: F401
+        import cv2  # noqa: F401
+        from sam2.build_sam import build_sam2
+        from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+        model_cfg = {
+            "sam2-tiny": "sam2_hiera_t.yaml",
+            "sam2-small": "sam2_hiera_s.yaml",
+            "sam2-base": "sam2_hiera_b+.yaml",
+            "sam2-large": "sam2_hiera_l.yaml",
+        }
+
+        model_name = config.get("model", "sam2-small")
+        checkpoint = f"models/{model_name}.pt"
+
+        # build_sam2 defaults to device="cuda"; pass the selected device so
+        # loading also works on CPU-only and MPS machines.
+        sam2 = build_sam2(
+            model_cfg.get(model_name, "sam2_hiera_s.yaml"), checkpoint, device=device
+        )
+        predictor = SAM2ImagePredictor(sam2)
+
+        emit({"event": "ready", "model": model_name, "device": device})
+    except Exception as e:
+        emit({"event": "error", "message": f"Failed to load SAM2: {e}", "retriable": False})
+        sys.exit(1)
+
+    running = True
+    busy = False  # True while a message is being processed
+    current_image = None
+
+    def handle_signal(signum, frame):
+        nonlocal running
+        running = False
+        if not busy:
+            # A handler that merely returns lets Python retry the blocked
+            # stdin read (PEP 475), so the signal would be ignored until the
+            # next line arrives. Raising unblocks the read.
+            raise KeyboardInterrupt
+
+    signal.signal(signal.SIGTERM, handle_signal)
+    signal.signal(signal.SIGINT, handle_signal)
+
+    try:
+        for line in sys.stdin:
+            busy = True
+            try:
+                if not running:
+                    break
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    msg = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                if not isinstance(msg, dict):
+                    # Valid JSON that is not an object carries no command/event.
+                    continue
+
+                if msg.get("command") == "stop":
+                    break
+
+                event = msg.get("event")
+                if event == "frame":
+                    image = _load_frame(predictor, msg.get("frame_path"))
+                    if image is not None:
+                        current_image = image
+                elif event == "click" and current_image is not None:
+                    emit(_segment_click(predictor, msg))
+            finally:
+                busy = False
+            if not running:
+                # A signal arrived while the message was being processed.
+                break
+    except KeyboardInterrupt:
+        pass
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,5 @@
+# VLM Scene Analysis
+llama-cpp-python>=0.3.0
+numpy>=1.24.0
+opencv-python-headless>=4.8.0
+Pillow>=10.0.0
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+VLM Scene Analysis Skill — Offline clip understanding via vision language models.
+
+Analyzes recorded video clips and generates natural language descriptions.
+"""
+
+import sys
+import json
+import argparse
+import signal
+from pathlib import Path
+
+
+def parse_args():
+    """Parse the command-line options of the VLM scene analysis skill.
+
+    Returns:
+        argparse.Namespace with ``config``, ``model``, ``prompt``,
+        ``max_frames`` and ``device`` attributes.
+    """
+    default_prompt = "Describe what is happening in this security camera footage. Focus on people, vehicles, and any unusual activity."
+
+    cli = argparse.ArgumentParser(description="VLM Scene Analysis Skill")
+    # String-valued options share one registration path.
+    string_options = (
+        ("--config", None),
+        ("--model", "smolvlm2-500m"),
+        ("--prompt", default_prompt),
+    )
+    for flag, fallback in string_options:
+        cli.add_argument(flag, type=str, default=fallback)
+    cli.add_argument("--max-frames", type=int, default=4)
+    cli.add_argument("--device", type=str, default="auto")
+    return cli.parse_args()
+
+
+def load_config(args):
+    """Build the runtime configuration from CLI arguments and an optional JSON file.
+
+    The CLI values for ``model``, ``prompt``, ``max_frames`` and ``device``
+    form the base configuration. When ``args.config`` names an existing file,
+    the keys of its JSON object override that base, so a config file that
+    only sets some keys still yields every key the skill reads.
+
+    Args:
+        args: Namespace with ``config``, ``model``, ``prompt``,
+            ``max_frames`` and ``device`` attributes.
+
+    Returns:
+        dict containing at least ``"model"``, ``"prompt"``, ``"max_frames"``
+        and ``"device"``.
+
+    Raises:
+        ValueError: If the config file does not contain a JSON object.
+        json.JSONDecodeError: If the config file is not valid JSON.
+    """
+    config = {
+        "model": args.model,
+        "prompt": args.prompt,
+        "max_frames": args.max_frames,
+        "device": args.device,
+    }
+    if args.config and Path(args.config).exists():
+        with open(args.config, encoding="utf-8") as f:
+            loaded = json.load(f)
+        if not isinstance(loaded, dict):
+            raise ValueError(f"Config file {args.config} must contain a JSON object")
+        config.update(loaded)
+    return config
+
+
+def emit(event):
+    """Serialize ``event`` to JSON and print it as one flushed stdout line."""
+    payload = json.dumps(event)
+    # Flush on every event so the reader of stdout sees it immediately.
+    print(payload, flush=True)
+
+
+def extract_frames(video_path, max_frames=4):
+    """Extract up to ``max_frames`` evenly spaced frames from a video clip.
+
+    Args:
+        video_path: Path of the clip, passed to ``cv2.VideoCapture``.
+        max_frames: Upper bound on the number of frames to return.
+
+    Returns:
+        List of frames as returned by ``cap.read()``. The list is empty when
+        ``max_frames`` is not positive or the capture reports no frames, and
+        may be shorter than requested when the clip has fewer frames or
+        individual reads fail.
+    """
+    max_frames = int(max_frames)
+    if max_frames <= 0:
+        # Nothing requested; also avoids dividing by zero below.
+        return []
+
+    import cv2
+
+    cap = cv2.VideoCapture(video_path)
+    try:
+        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if total <= 0:
+            return []
+
+        # Never ask for more positions than the clip has frames, otherwise the
+        # same frame index is sampled (and returned) more than once.
+        count = min(max_frames, total)
+        frames = []
+        for i in range(count):
+            cap.set(cv2.CAP_PROP_POS_FRAMES, i * total // count)
+            ret, frame = cap.read()
+            if ret:
+                frames.append(frame)
+        return frames
+    finally:
+        # Release the capture even if seeking or reading raises.
+        cap.release()
+
+
+def _encode_frames(frames):
+    """Encode image arrays as base64 JPEG data URIs, skipping frames that fail to encode."""
+    import base64
+
+    import cv2
+
+    uris = []
+    for frame in frames:
+        ok, buf = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
+        if not ok:
+            continue
+        uris.append(f"data:image/jpeg;base64,{base64.b64encode(buf.tobytes()).decode()}")
+    return uris
+
+
+def main():
+    """Run the VLM scene analysis skill as a JSON-lines stdin/stdout service.
+
+    Start-up: loads ``models/<model>.gguf`` together with its ``.mmproj``
+    companion and emits a ``ready`` event, or emits a non-retriable ``error``
+    event and exits with status 1 when the model is missing or fails to load.
+
+    Message loop (one JSON object per stdin line):
+      * ``{"command": "stop"}`` ends the loop.
+      * ``{"event": "clip_ready", "video_path": ..., "clip_id": ...,
+        "camera_id": ...}`` samples frames from the clip, sends them with the
+        configured prompt to the model and emits an ``analysis_result`` event
+        (or a retriable ``error`` event).
+    Blank lines, invalid JSON and non-object JSON values are skipped.
+
+    SIGTERM/SIGINT stop the loop: immediately when idle, or after the message
+    currently being processed has been handled.
+    """
+    args = parse_args()
+    config = load_config(args)
+
+    # A config file may omit keys; fall back to the CLI values instead of
+    # raising KeyError on direct indexing.
+    model_name = config.get("model", args.model)
+    prompt = config.get("prompt", args.prompt)
+    max_frames = config.get("max_frames", args.max_frames)
+
+    try:
+        from llama_cpp import Llama
+        from llama_cpp.llama_chat_format import MiniCPMv26ChatHandler
+        import cv2  # noqa: F401  (fail fast if OpenCV is missing)
+
+        model_path = Path(f"models/{model_name}.gguf")
+        if not model_path.exists():
+            emit({"event": "error", "message": f"Model not found: {model_path}. Run: python scripts/download_model.py --model {model_name}", "retriable": False})
+            sys.exit(1)
+
+        chat_handler = MiniCPMv26ChatHandler(clip_model_path=str(model_path.with_suffix(".mmproj")))
+        llm = Llama(model_path=str(model_path), chat_handler=chat_handler, n_ctx=4096)
+
+        # NOTE(review): "device" is only reported here; it is not passed to Llama.
+        emit({"event": "ready", "model": model_name, "device": config.get("device", "cpu")})
+    except Exception as e:
+        emit({"event": "error", "message": f"Failed to load model: {e}", "retriable": False})
+        sys.exit(1)
+
+    running = True
+    busy = False  # True while a message is being processed
+
+    def handle_signal(signum, frame):
+        nonlocal running
+        running = False
+        if not busy:
+            # A handler that merely returns lets Python retry the blocked
+            # stdin read (PEP 475), so the signal would be ignored until the
+            # next line arrives. Raising unblocks the read.
+            raise KeyboardInterrupt
+
+    signal.signal(signal.SIGTERM, handle_signal)
+    signal.signal(signal.SIGINT, handle_signal)
+
+    try:
+        for line in sys.stdin:
+            busy = True
+            try:
+                if not running:
+                    break
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    msg = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                if not isinstance(msg, dict):
+                    # Valid JSON that is not an object carries no command/event.
+                    continue
+
+                if msg.get("command") == "stop":
+                    break
+
+                if msg.get("event") != "clip_ready":
+                    continue
+
+                video_path = msg.get("video_path")
+                clip_id = msg.get("clip_id", "unknown")
+                camera_id = msg.get("camera_id", "unknown")
+
+                if not video_path or not Path(video_path).exists():
+                    emit({"event": "error", "message": f"Video not found: {video_path}", "retriable": True})
+                    continue
+
+                try:
+                    frames = extract_frames(video_path, max_frames)
+                    if not frames:
+                        emit({"event": "error", "message": "No frames extracted", "retriable": True})
+                        continue
+
+                    # Encode frames as base64 data URIs for the VLM.
+                    images = _encode_frames(frames)
+                    if not images:
+                        emit({"event": "error", "message": "Failed to encode frames", "retriable": True})
+                        continue
+
+                    content = [{"type": "text", "text": prompt}]
+                    content.extend(
+                        {"type": "image_url", "image_url": {"url": img}} for img in images
+                    )
+
+                    result = llm.create_chat_completion(messages=[
+                        {"role": "user", "content": content}
+                    ])
+
+                    description = result["choices"][0]["message"]["content"]
+                    emit({
+                        "event": "analysis_result",
+                        "clip_id": clip_id,
+                        "camera_id": camera_id,
+                        "description": description,
+                        "objects": [],  # Could be extracted from description
+                        "confidence": 0.9,  # fixed placeholder, not computed from the model output
+                    })
+                except Exception as e:
+                    emit({"event": "error", "message": f"Analysis error: {e}", "retriable": True})
+            finally:
+                busy = False
+            if not running:
+                # A signal arrived while the message was being processed.
+                break
+    except KeyboardInterrupt:
+        pass
+
+
+if __name__ == "__main__":
+    main()