
Commit ff8f78f

Add files via upload
1 parent 19dcc90 commit ff8f78f

10 files changed

Lines changed: 804 additions & 13 deletions


README.md

Lines changed: 5 additions & 2 deletions
@@ -1,4 +1,4 @@
-# SAMLabeler Pro: An image annotation tool assisted by the [Segment Anything Model](https://github.com/facebookresearch/segment-anything), with support for simultaneous remote multi-user annotation
+# SAMLabeler Pro: An image annotation tool assisted by [SAM](https://github.com/facebookresearch/segment-anything) / [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), with support for simultaneous remote multi-user annotation
 
 ![image](https://user-images.githubusercontent.com/69880398/235317010-2ec560cf-1de9-436d-81a4-79654e533de1.png)
 
@@ -10,9 +10,12 @@
 - To avoid import conflicts when using this tool, do not install the SAM source package in the runtime environment; the segment_anything folder in this project is the SAM source with some modifications.
 - If QT errors occur, opencv is the most likely cause; replace opencv_python with opencv_python_headless in requirements.txt.
 
+## 0 Recent updates
+[2023/6/28] The lightweight [MobileSAM](https://github.com/ChaoningZhang/MobileSAM) model is now supported, reaching nearly 100x the speed of SAM with comparable accuracy and lower GPU memory usage. [Download the model here](https://github.com/LSH9832/SAMLabelerPro/releases/download/v0.2.0/mobile_sam.pt). Unless configured otherwise, this model is loaded first.
+
 ## 1 Upcoming updates
 
-- The CV field is brutally competitive; luckily I have graduated, because I really cannot keep up anymore. When time allows, the FastSAM and MobileSAM models will be added (MobileSAM first).
+- Nothing for now; if you find a bug, please leave a message in Issues.
 
 ## 2 New features compared to the original

demo/box2segment.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def draw_mask(self, mask, image=None, color=(0, 255, 0), ratio=0.5):
 
 def main(args):
-    assert args.size.lower() in ["b", "l", "h"]
+    assert args.size.lower() in ["mobile", "b", "l", "h"]
     boxSegmenter = SegBox(force_size=args.size, half=args.half)
 
     if not (args.cfg.startswith("/") or args.cfg[1] == ":"):
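With the assert relaxed, the demo accepts the new size. A minimal usage sketch, assuming SegBox is importable from demo/box2segment.py as the diff context suggests (only the force_size and half keyword arguments are confirmed above):

```python
# Sketch under assumptions: SegBox lives in demo/box2segment.py and takes the
# force_size/half kwargs shown in the diff; anything else is hypothetical.
from demo.box2segment import SegBox

# "mobile" is now accepted alongside "b", "l", and "h". MobileSAM does not run
# in half precision (see segment_any.py below), so half is left off here.
segmenter = SegBox(force_size="mobile", half=False)
```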

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ torchvision
 pycocotools
 matplotlib
 requests
+timm
 flask # only server need this.
 # onnxruntime
 # onnx
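The new timm dependency is presumably needed because MobileSAM's TinyViT encoder builds on timm layers, as in upstream MobileSAM; the exact import site is not shown in this commit. A defensive check one might add before loading the mobile model:

```python
# Assumption: tiny_vit_sam.py imports timm at module load time, as in upstream
# MobileSAM. Fail early with an actionable message if the dependency is absent.
import importlib.util

if importlib.util.find_spec("timm") is None:
    raise ImportError("the TinyViT encoder needs timm: pip install timm")
```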

segment_any/segment_any.py

Lines changed: 6 additions & 2 deletions
@@ -11,13 +11,16 @@ def __init__(self, checkpoint, half=True, force_model_type=None):
 
         success = False
 
-        all_model_type = ["h", "l", "b"]
+        all_model_type = ["mobile", "h", "l", "b"]
         if force_model_type is not None and force_model_type in all_model_type:
             all_model_type = [force_model_type]
 
         for self.model_type in [f"vit_{t}" for t in all_model_type]:
             try:
                 half = half and torch.cuda.is_available() and not self.model_type.endswith("h")
+                if (self.model_type.endswith("h") or self.model_type.endswith("mobile")) and half:
+                    print(f"{self.model_type} can not run with half precision, using full precision.")
+                    half = False
                 print(f"try load weights '{checkpoint}' with model size '{self.model_type}'")
                 sam = sam_model_registry[self.model_type](checkpoint=checkpoint)
                 sam.to(device=self.device, dtype=torch.float16 if half else torch.float32)
@@ -35,14 +38,15 @@ def __init__(self, checkpoint, half=True, force_model_type=None):
         self.success = success
 
     def set_image(self, image):
+        # print("set image")
         self.predictor.set_image(image)
+        # print("done")
 
     def reset_image(self):
         self.predictor.reset_image()
         self.image = None
         torch.cuda.empty_cache()
 
-
     def predict_box(self, box, xyxy=True, expand=0):
 
         def modify_box(bbox: (list, np.ndarray), xy2=True):
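Because "mobile" now heads all_model_type, the loader tries vit_mobile first, which is what the README means by the model being loaded preferentially. A minimal sketch of pinning one model type, assuming the class holding this __init__ is exported as SegAny (the class name is an assumption; only the constructor signature and the success flag appear in the diff):

```python
# Sketch under assumptions: the class is named SegAny and is importable from
# segment_any.segment_any; __init__(checkpoint, half=True, force_model_type=None)
# and the self.success flag are confirmed by the diff above.
from segment_any.segment_any import SegAny

# force_model_type narrows the trial list ["mobile", "h", "l", "b"] to one entry.
# half is ignored for "mobile" and "h", which fall back to full precision.
seg = SegAny(checkpoint="mobile_sam.pt", half=True, force_model_type="mobile")
assert seg.success, "checkpoint did not load for any attempted model type"
```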

segment_anything/build_sam.py

Lines changed: 50 additions & 1 deletion
@@ -8,7 +8,7 @@
 
 from functools import partial
 
-from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer
+from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer, TinyViT
 
 
 def build_sam_vit_h(checkpoint=None):
@@ -44,11 +44,60 @@ def build_sam_vit_b(checkpoint=None):
     )
 
 
+def build_sam_vit_mobile(checkpoint=None):
+    prompt_embed_dim = 256
+    image_size = 1024
+    vit_patch_size = 16
+    image_embedding_size = image_size // vit_patch_size
+    mobile_sam = Sam(
+        image_encoder=TinyViT(img_size=1024, in_chans=3, num_classes=1000,
+                              embed_dims=[64, 128, 160, 320],
+                              depths=[2, 2, 6, 2],
+                              num_heads=[2, 4, 5, 10],
+                              window_sizes=[7, 7, 14, 7],
+                              mlp_ratio=4.,
+                              drop_rate=0.,
+                              drop_path_rate=0.0,
+                              use_checkpoint=False,
+                              mbconv_expand_ratio=4.0,
+                              local_conv_size=3,
+                              layer_lr_decay=0.8
+                              ),
+        prompt_encoder=PromptEncoder(
+            embed_dim=prompt_embed_dim,
+            image_embedding_size=(image_embedding_size, image_embedding_size),
+            input_image_size=(image_size, image_size),
+            mask_in_chans=16,
+        ),
+        mask_decoder=MaskDecoder(
+            num_multimask_outputs=3,
+            transformer=TwoWayTransformer(
+                depth=2,
+                embedding_dim=prompt_embed_dim,
+                mlp_dim=2048,
+                num_heads=8,
+            ),
+            transformer_dim=prompt_embed_dim,
+            iou_head_depth=3,
+            iou_head_hidden_dim=256,
+        ),
+        pixel_mean=[123.675, 116.28, 103.53],
+        pixel_std=[58.395, 57.12, 57.375],
+    )
+    mobile_sam.eval()
+    if checkpoint is not None:
+        with open(checkpoint, "rb") as f:
+            state_dict = torch.load(f)
+        mobile_sam.load_state_dict(state_dict)
+    return mobile_sam
+
+
 sam_model_registry = {
     "default": build_sam_vit_h,
     "vit_h": build_sam_vit_h,
     "vit_l": build_sam_vit_l,
     "vit_b": build_sam_vit_b,
+    "vit_mobile": build_sam_vit_mobile
 }
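With the registry entry in place, MobileSAM builds like any other SAM variant. A minimal end-to-end sketch, assuming this repo's modified segment_anything package re-exports SamPredictor the way upstream SAM does (only sam_model_registry is confirmed by the diff):

```python
import cv2
import numpy as np
import torch

# Assumption: SamPredictor is re-exported by the bundled package, as upstream.
from segment_anything import sam_model_registry, SamPredictor

# "vit_mobile" maps to build_sam_vit_mobile, added in this commit; the
# checkpoint is the mobile_sam.pt file linked from the README.
sam = sam_model_registry["vit_mobile"](checkpoint="mobile_sam.pt")
sam.to("cuda" if torch.cuda.is_available() else "cpu")

predictor = SamPredictor(sam)
image = cv2.cvtColor(cv2.imread("example.jpg"), cv2.COLOR_BGR2RGB)
predictor.set_image(image)

# Prompt with a box in xyxy pixel coordinates; returns masks and IoU scores.
masks, scores, _ = predictor.predict(box=np.array([100, 100, 400, 400]))
```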

segment_anything/modeling/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@
 from .mask_decoder import MaskDecoder
 from .prompt_encoder import PromptEncoder
 from .transformer import TwoWayTransformer
+from .tiny_vit_sam import TinyViT

segment_anything/modeling/sam.py

Lines changed: 3 additions & 2 deletions
@@ -11,6 +11,7 @@
 from typing import Any, Dict, List, Tuple
 
 from .image_encoder import ImageEncoderViT
+from .tiny_vit_sam import TinyViT
 from .mask_decoder import MaskDecoder
 from .prompt_encoder import PromptEncoder
 
@@ -22,7 +23,7 @@ class Sam(nn.Module):
 
     def __init__(
         self,
-        image_encoder: ImageEncoderViT,
+        image_encoder: [ImageEncoderViT, TinyViT],
         prompt_encoder: PromptEncoder,
         mask_decoder: MaskDecoder,
         pixel_mean: List[float] = [123.675, 116.28, 103.53],
@@ -32,7 +33,7 @@ def __init__(
         SAM predicts object masks from an image and input prompts.
 
         Arguments:
-          image_encoder (ImageEncoderViT): The backbone used to encode the
+          image_encoder (ImageEncoderViT, TinyViT): The backbone used to encode the
             image into image embeddings that allow for efficient mask prediction.
           prompt_encoder (PromptEncoder): Encodes various types of input prompts.
           mask_decoder (MaskDecoder): Predicts masks from the image embeddings
