Merge pull request #1232 from llmware-ai/update-010426-onnx-vision

doberst · web-flow · commit 5dbc93e66263 · 2026-01-04T07:37:05.000-05:00
update onnx vision model support
diff --git a/examples/Models/using-onnx-vision-model.py b/examples/Models/using-onnx-vision-model.py
@@ -0,0 +1,19 @@
+""" This example shows how to use a multimedia vision-to-text model with onnxruntime -
+
+    to run, pip install onnxruntime_genai
+"""
+
+from llmware.models import ModelCatalog
+
+model = ModelCatalog().load_model("phi-3-vision-onnx")
+
+# supported image types: jpg, png
+img_path = "/path/to/local/image"
+
+# to run a streaming response
+for token in model.stream("Describe this image",img_path):
+    print(token, end="")
+
+# to get the full response in a single string, returned once generation has finished
+response = model.inference("Describe this image", img_path)
+print("--vision response - ", response)
diff --git a/llmware/model_configs.py b/llmware/model_configs.py
@@ -3565,8 +3565,20 @@
          "model_family": "ONNXEmbeddingModel", "model_category": "embedding",
          "model_location": "llmware_repo", "use_case": "classifier",
          "embedding_dims": 768, "context_window": 512, "link": "https://none",
-         "custom_model_repo": "", "hf_repo": "llmware/unitary-unbiased-toxic-roberta-onnx"}
+         "custom_model_repo": "", "hf_repo": "llmware/unitary-unbiased-toxic-roberta-onnx"},
 
+        {"model_name": "phi-3-vision-onnx", "display_name": "phi-3-vision-3b",
+         "model_family": "ONNXVisionGenerativeModel", "model_category": "generative_local",
+         "model_location": "llmware_repo", "context_window": 4096,  "instruction_following": False,
+         "prompt_wrapper": "phi_3_vision", "temperature": 0.0, "trailing_space": "",
+         "hf_repo": "llmware/phi-3-vision-onnx",
+         "link": "https://huggingface.co/llmware/phi-3-vision-onnx",
+         "tokenizer_local": "tokenizer_phi3.json",
+         "fetch": {"module": "llmware.models", "method": "pull_snapshot_from_hf"},
+         "validation_files": ["phi-3-v-128k-instruct-text.onnx.data",
+                              "phi-3-v-128k-instruct-vision.onnx.data",
+                              "phi-3-v-128k-instruct-embedding.onnx.data"],
+         "custom_model_files": [], "custom_model_repo": "", "parameters": 3.8}
 
 ]
 
@@ -3630,6 +3642,12 @@
         "phi_3": {"system_start": "<|system|>\n", "system_stop": "<|end|>\n",
                   "main_start": "<|user|>\n", "main_stop": "<|end|>\n", "start_llm_response": "<|assistant|>"},
 
+        # currently intended for embedding a single image only
+        "phi_3_vision": {"system_start": "", "system_stop": "",
+                         "main_start": "<|user|>\n<|image_1|>\n",
+                         "main_stop": "<|end|>\n",
+                         "start_llm_response": "<|assistant|>\n"},
+
         "phi_4": {"system_start": "<|im_start|>system<|im_sep|>\n",
                   "system_stop": "<|im_end|>\n",
                   "main_start": "<|im_start|>user<|im_sep|>\n",
diff --git a/llmware/models.py b/llmware/models.py