fix: fix sam3 infer + add convert/infer scripts

vietanhdev · vietanhdev · commit b83a7b710b31 · 2026-02-21T20:37:04.000+07:00
diff --git a/convert_sam3.sh b/convert_sam3.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Export SAM3 ViT-H to ONNX.
+#
+# Requirements:
+#   - sam3 submodule initialised: git submodule update --init sam3
+#   - osam installed for CLIP tokenisation: pip install osam
+#
+# Optional: set the SIMPLIFY environment variable to any non-empty value (e.g. SIMPLIFY=1)
+# to run onnxsim after export (reduces some redundant ops; vision_pos_enc_0/1 may be removed from the decoder).
+
+set -euo pipefail
+
+OUTPUT_DIR="${1:-output_models/sam3}"
+SIMPLIFY="${SIMPLIFY:-}"
+
+echo "Exporting SAM3 ViT-H to ONNX → $OUTPUT_DIR"
+
+if [ -n "$SIMPLIFY" ]; then
+    python -m samexporter.export_sam3 \
+        --output_dir "$OUTPUT_DIR" \
+        --opset 18 \
+        --simplify
+else
+    python -m samexporter.export_sam3 \
+        --output_dir "$OUTPUT_DIR" \
+        --opset 18
+fi
+
+echo "Done – models written to $OUTPUT_DIR/"
+echo "  sam3_image_encoder.onnx"
+echo "  sam3_language_encoder.onnx"
+echo "  sam3_decoder.onnx"
diff --git a/infer_sam3.sh b/infer_sam3.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# SAM3 inference examples.
+#
+# SAM3 supports three prompt modes:
+#   1. Text only        – open-vocabulary detection (no geometric hint needed)
+#   2. Text + point     – refine detection around a clicked pixel
+#   3. Text + rectangle – constrain detection to a bounding box
+#
+# The --text_prompt flag drives the language encoder; always supply it for
+# best results.  If omitted, inference uses a text entry from the prompt JSON
+# or, failing that, "visual" (no language guidance), which may return fewer or no detections.
+
+set -euo pipefail
+
+ENC="output_models/sam3/sam3_image_encoder.onnx"
+DEC="output_models/sam3/sam3_decoder.onnx"
+LANG="output_models/sam3/sam3_language_encoder.onnx"
+IMG="images/truck.jpg"
+
+echo "--- SAM3: text-only prompt ('truck') ---"
+python -m samexporter.inference \
+    --encoder_model "$ENC" \
+    --decoder_model "$DEC" \
+    --language_encoder_model "$LANG" \
+    --image "$IMG" \
+    --prompt images/truck_sam3.json \
+    --text_prompt "truck" \
+    --output output_images/sam3_truck_text.png \
+    --sam_variant sam3
+
+echo "--- SAM3: text + rectangle prompt ---"
+python -m samexporter.inference \
+    --encoder_model "$ENC" \
+    --decoder_model "$DEC" \
+    --language_encoder_model "$LANG" \
+    --image "$IMG" \
+    --prompt images/truck_sam3_box.json \
+    --text_prompt "truck" \
+    --output output_images/sam3_truck_box.png \
+    --sam_variant sam3
+
+echo "--- SAM3: text + point prompt ---"
+python -m samexporter.inference \
+    --encoder_model "$ENC" \
+    --decoder_model "$DEC" \
+    --language_encoder_model "$LANG" \
+    --image "$IMG" \
+    --prompt images/truck_sam3_point.json \
+    --text_prompt "truck" \
+    --output output_images/sam3_truck_point.png \
+    --sam_variant sam3
+
+echo "Done – outputs saved to output_images/"
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,6 +31,7 @@ dependencies = [
     "onnxsim==0.5.0",
     "numpy==1.26.4",
     "onnxscript==0.6.2",
+    "osam",  # CLIP tokeniser for SAM3 text prompts
 ]
 
 [project.urls]
diff --git a/samexporter/inference.py b/samexporter/inference.py
@@ -36,6 +36,12 @@ def str2bool(v):
     default=None,
     help="Path to the ONNX language encoder model (for SAM3)",
 )
+argparser.add_argument(
+    "--text_prompt",
+    type=str,
+    default=None,
+    help="Text prompt for SAM3 (e.g. 'truck'). Overrides any text entry in the prompt JSON.",
+)
 argparser.add_argument(
     "--image",
     type=str,
@@ -90,11 +96,14 @@ def str2bool(v):
 
 text_prompt = None
 if args.sam_variant == "sam3":
-    # Extract text prompt from JSON if available, otherwise default to "visual"
-    for p in prompt:
-        if p["type"] == "text":
-            text_prompt = p["data"]
-            break
+    # --text_prompt takes priority; fall back to any text entry in the JSON.
+    if args.text_prompt:
+        text_prompt = args.text_prompt
+    else:
+        for p in prompt:
+            if p["type"] == "text":
+                text_prompt = p["data"]
+                break
     if text_prompt is None:
         text_prompt = "visual"
 
@@ -109,14 +118,14 @@ def str2bool(v):
 # Merge masks
 mask = np.zeros((masks.shape[2], masks.shape[3], 3), dtype=np.uint8)
 if args.sam_variant == "sam3":
-    # SAM3 returns (N, 1, H, W) – render all N detected instances.
+    # SAM3 returns bool (N, 1, H, W) – render all N detected instances.
     for i in range(masks.shape[0]):
-        m = masks[i, 0]  # (H, W)
-        mask[m > 0.5] = [255, 0, 0]
+        m = masks[i, 0]  # (H, W) bool
+        mask[m] = [255, 0, 0]
 else:
-    # SAM1/SAM2 returns (1, 3, H, W) – merge all quality levels.
+    # SAM1/SAM2 return raw logits (1, 3, H, W) – threshold at 0 (= sigmoid 0.5).
     for m in masks[0, :, :, :]:
-        mask[m > 0.5] = [255, 0, 0]
+        mask[m > 0.0] = [255, 0, 0]
 
 # Binding image and mask
 visualized = cv2.addWeighted(image, 0.5, mask, 0.5, 0)
diff --git a/samexporter/sam3_onnx.py b/samexporter/sam3_onnx.py
@@ -68,7 +68,12 @@ def encode(self, cv_image: np.ndarray, text_prompt=None) -> dict[str, Any]:
 
         return embedding
 
-    def predict_masks(self, embedding: dict[str, Any], prompt) -> np.ndarray:
+    def predict_masks(
+        self,
+        embedding: dict[str, Any],
+        prompt,
+        confidence_threshold: float = 0.5,
+    ) -> np.ndarray:
         """Run the decoder for the given geometric prompt.
 
         Parameters
@@ -78,6 +83,9 @@ def predict_masks(self, embedding: dict[str, Any], prompt) -> np.ndarray:
         prompt:
             List of mark dicts, each with keys ``"type"`` (``"rectangle"``
             or ``"point"``) and ``"data"``.
+        confidence_threshold:
+            Score threshold for keeping a detection.  Only detections whose score
+            is strictly greater than this value are kept.  Defaults to ``0.5``.
 
         Returns
         -------
@@ -114,7 +122,7 @@ def predict_masks(self, embedding: dict[str, Any], prompt) -> np.ndarray:
         box_labels_np = np.array([box_labels], dtype=np.int64)
         box_masks_np = np.array([box_masks], dtype=np.bool_)
 
-        masks, _scores, _boxes = self.decoder(
+        masks, scores, _boxes = self.decoder(
             original_size,
             embedding["vision_pos_enc_0"],
             embedding["vision_pos_enc_1"],
@@ -130,6 +138,15 @@ def predict_masks(self, embedding: dict[str, Any], prompt) -> np.ndarray:
             box_masks_np,
         )
 
+        # Filter detections by confidence score.
+        if len(scores) > 0:
+            keep = np.where(scores > confidence_threshold)[0]
+            masks = (
+                masks[keep]
+                if len(keep) > 0
+                else np.zeros((0,) + masks.shape[1:], dtype=masks.dtype)
+            )
+
         return masks
 
     def transform_masks(self, masks, original_size, transform_matrix):

Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,7 @@ dependencies = [`
`31`	`31`	`"onnxsim==0.5.0",`
`32`	`32`	`"numpy==1.26.4",`
`33`	`33`	`"onnxscript==0.6.2",`
	`34`	`+ "osam", # CLIP tokeniser for SAM3 text prompts`
`34`	`35`	`]`
`35`	`36`
`36`	`37`	`[project.urls]`