feat: add ViT classification models to model converter (#564)

mgumowsk · web-flow · commit 10e1df52cdb1 · 2026-05-05T07:37:46.000Z
* add ViT models

* add ViT models to model converter
diff --git a/tools/model_converter/config.json b/tools/model_converter/config.json
@@ -579,6 +579,45 @@
       "license": "apache-2.0",
       "license_link": "https://spdx.org/licenses/Apache-2.0.html",
       "labels": "IMAGENET1K_V1"
+    },
+    {
+      "model_short_name": "vit_tiny_patch16_224_augreg_in21k",
+      "huggingface_repo": "timm/vit_tiny_patch16_224.augreg_in21k",
+      "huggingface_revision": "3d5f75e2fe58abe541d5651356278a1df3fd3ab3",
+      "model_library": "timm",
+      "model_full_name": "ViT-Tiny Patch16 224 AugReg ImageNet-21k",
+      "description": "Vision Transformer Tiny with 16x16 patches trained on ImageNet-21k with augmentation and regularization",
+      "docs": "https://huggingface.co/timm/vit_tiny_patch16_224.augreg_in21k",
+      "input_shape": [1, 3, 224, 224],
+      "input_names": ["image"],
+      "output_names": ["logits"],
+      "model_params": null,
+      "model_type": "Classification",
+      "reverse_input_channels": true,
+      "mean_values": "123.675 116.28 103.53",
+      "scale_values": "58.395 57.12 57.375",
+      "license": "apache-2.0",
+      "license_link": "https://spdx.org/licenses/Apache-2.0.html",
+      "labels": "IMAGENET21K"
+    },
+    {
+      "model_short_name": "vit_small_patch14_dinov2.lvd142m",
+      "huggingface_repo": "timm/vit_small_patch14_dinov2.lvd142m",
+      "huggingface_revision": "4610ca143709d58a633b6397a74412c2c3842454",
+      "model_library": "timm",
+      "model_full_name": "DINOv2-Small Patch14 518 LVD-142M",
+      "description": "DINOv2 Small ViT backbone for image feature extraction with 384-dimensional features",
+      "docs": "https://huggingface.co/timm/vit_small_patch14_dinov2.lvd142m",
+      "input_shape": [1, 3, 518, 518],
+      "input_names": ["image"],
+      "output_names": ["output"],
+      "model_params": null,
+      "model_type": "Classification",
+      "reverse_input_channels": true,
+      "mean_values": "123.675 116.28 103.53",
+      "scale_values": "58.395 57.12 57.375",
+      "license": "apache-2.0",
+      "license_link": "https://spdx.org/licenses/Apache-2.0.html"
     }
   ]
 }
diff --git a/tools/model_converter/model_converter.py b/tools/model_converter/model_converter.py
@@ -79,6 +79,14 @@ def get_labels(self, label_set: str) -> str | None:
             categories = [label.replace(" ", "_") for label in categories]
             return " ".join(categories)
 
+        if label_set == "IMAGENET21K":
+            from timm.data import ImageNetInfo
+
+            info = ImageNetInfo("imagenet21k")
+            categories = info.label_descriptions()
+            categories = [desc.split(",")[0].strip().replace(" ", "_") for desc in categories]
+            return " ".join(categories)
+
         return None
 
     def download_from_huggingface(
@@ -459,7 +467,7 @@ def create_calibration_dataset(
             return_labels: Whether to return labels along with images
 
         Returns:
-            List of preprocessed image arrays, or tuple of (images, labels) if return_labels=True
+            List of preprocessed image arrays, or tuple of (images, labels)
         """
         if not self.dataset_path or not self.dataset_path.exists():
             self.logger.warning("Dataset path not provided or doesn't exist. Skipping quantization.")
@@ -476,12 +484,12 @@ def create_calibration_dataset(
         image_dir = self.dataset_path
         if not image_dir.exists():
             self.logger.error(f"Image directory not found: {image_dir}")
-            return ([], []) if return_labels else []
+            return ([], [])
 
         image_entries = self._collect_dataset_entries(image_dir)
         if not image_entries:
             self.logger.error("No images found in dataset")
-            return ([], []) if return_labels else []
+            return ([], [])
 
         self.logger.info(f"Found {len(image_entries)} images in dataset")
         self.logger.info(f"Using {min(subset_size, len(image_entries))} images for calibration")
@@ -537,7 +545,7 @@ def create_calibration_dataset(
                 continue
 
         self.logger.info(f"✓ Created calibration dataset with {len(calibration_data)} images")
-        return calibration_data
+        return calibration_data, []
 
     def validate_model(
         self,
@@ -938,6 +946,7 @@ def process_model_config(self, config: dict[str, Any]) -> bool:
             # Quantize the model if dataset is available
             if self.dataset_path:
                 self.logger.info("Creating calibration dataset for INT8 quantization")
+                has_labels = bool(config.get("labels"))
 
                 self.logger.info("Creating validation dataset for accuracy measurement")
                 validation_data, validation_labels = self.create_calibration_dataset(
@@ -946,7 +955,7 @@ def process_model_config(self, config: dict[str, Any]) -> bool:
                     scale_values=scale_values,
                     reverse_input_channels=reverse_input_channels,
                     subset_size=300,
-                    return_labels=True,
+                    return_labels=has_labels,
                 )
 
                 if validation_data:
@@ -957,7 +966,7 @@ def process_model_config(self, config: dict[str, Any]) -> bool:
                         model_config=config,
                         preset="mixed",
                         validation_data=validation_data if validation_labels else None,
-                        validation_labels=validation_labels,
+                        validation_labels=validation_labels or None,
                     )
 
                 # Clean up temporary FP32 model after quantization