This code runs Segment Anything Model 3 ONNX models in C++ and is used in the macOS app RectLabel.
We recommend working through this blog post side-by-side with the Google Colab notebook.
sam3_polygon.mp4
Install CUDA, cuDNN, PyTorch, and ONNX Runtime.
Install Segment Anything Model 3 CPP Wrapper.
git clone https://github.com/ryouchinsa/sam3-cpp-macos.git
pip install git+https://github.com/huggingface/transformers@2a61590a479d3b1f77059f75caee7cc22760019d
Install tokenizers-cpp.
For macOS, download tokenizers-cpp from Hugging Face.
For Ubuntu GPU.
git clone --recursive https://github.com/mlc-ai/tokenizers-cpp.git
cp /content/sam3-cpp-macos/tokenizers-cpp/CMakeLists.txt .
cp /content/sam3-cpp-macos/tokenizers-cpp/msgpack/CMakeLists.txt msgpack
cp /content/sam3-cpp-macos/tokenizers-cpp/sentencepiece/CMakeLists.txt sentencepiece
apt update
apt install -y curl gcc make
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
import os
os.environ['CARGO_HOME'] = '/root/.cargo'
os.environ['PATH'] = f"{os.environ['CARGO_HOME']}/bin:{os.environ['PATH']}"
rustc --version
cargo --version
cd tokenizers-cpp/example
./build_and_run.sh
cd ..
mkdir lib
cp ./example/build/tokenizers/sentencepiece/src/libsentencepiece.a lib/
cp ./example/build/tokenizers/libtokenizers_c.a lib/
cp ./example/build/tokenizers/libtokenizers_cpp.a lib/
cd ..
Download the SAM 3 model from Hugging Face.
hf auth login
hf download facebook/sam3 model.safetensors tokenizer.json
Export ONNX models. This script originated from sam3-image.
Set --model-path to the directory where the Hugging Face model was downloaded.
cd sam3-cpp-macos
# macOS
python export_v2.py --all --model-path /Users/ryo/Downloads/sam3-model --output-dir sam3 --device cpu --image-height 1008 --image-width 1008 --quantize
# Ubuntu GPU
python export_v2.py --all --model-path /root/.cache/huggingface/hub/models--facebook--sam3/snapshots/3c879f39826c281e95690f02c7821c4de09afae7 --output-dir sam3 --image-height 1008 --image-width 1008
If you skip exporting, download the exported SAM 3 ONNX models from Hugging Face.
Build and run.
# macOS
cmake -S . -B build -DONNXRUNTIME_ROOT_DIR=/Users/ryo/Downloads/onnxruntime-osx-universal2-1.23.2 -DTOKENIZERS_ROOT_DIR=/Users/ryo/Downloads/tokenizers-cpp
# Ubuntu GPU
cmake -S . -B build -DONNXRUNTIME_ROOT_DIR=/content/onnxruntime-linux-x64-gpu-1.23.2 -DTOKENIZERS_ROOT_DIR=/content/tokenizers-cpp
cmake --build build
# macOS
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cpu" -text="zebra" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cpu" -boxes="pos:124,113,183,329" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cpu" -boxes="pos:124,113,183,329;neg:379,454,329,297" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cpu" -text="zebra" -boxes="pos:379,454,329,297" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cpu" -text="zebra,water" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cpu" -boxes="pos:0,0,364,187-pos:379,454,329,297" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cpu" -text="tree,zebra" -boxes="pos:0,0,364,187-pos:379,454,329,297" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cpu" -text="zebra,water,tree" -threshold=0.25
# Ubuntu GPU
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cuda:0" -text="zebra" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cuda:0" -boxes="pos:124,113,183,329" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cuda:0" -boxes="pos:124,113,183,329;neg:379,454,329,297" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cuda:0" -text="zebra" -boxes="pos:379,454,329,297" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cuda:0" -text="zebra,water" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cuda:0" -boxes="pos:0,0,364,187-pos:379,454,329,297" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cuda:0" -text="tree,zebra" -boxes="pos:0,0,364,187-pos:379,454,329,297" -threshold=0.5
./build/sam3_cpp_test -vision_encoder="sam3/vision-encoder.onnx" -text_encoder="sam3/text-encoder.onnx" -decoder="sam3/decoder.onnx" -tokenizer="sam3/tokenizer.json" -image="david-tomaseti-Vw2HZQ1FGjU-unsplash.jpg" -device="cuda:0" -text="zebra,water,tree" -threshold=0.25