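INFERENCE_MODEL: read the model name from the INFERENCE_MODEL environment variable, falling back to the previously hard-coded model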

makelinux · makelinux · commit fa627c384c58 · 2026-01-25T20:25:02.000+02:00
diff --git a/llm/client-openai-vector_stores-responses.py b/llm/client-openai-vector_stores-responses.py
@@ -2,7 +2,7 @@
 
 # ./llama-stack/docs/docs/getting_started/demo_script.py
 
-import io, requests
+import io, os, requests
 from openai import OpenAI
 
 c = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
@@ -14,7 +14,7 @@
 c.vector_stores.files.create(vs.id, file_id=fid)
 
 resp = c.responses.create(
-    model="vertexai/google/gemini-2.5-flash",
+    model=os.getenv("INFERENCE_MODEL", "vertexai/google/gemini-2.5-flash"),
     input="What is the color of abc?",
     tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
 #    include=["file_search_call.results"],
diff --git a/llm/llama-stack-client-completions-min.py b/llm/llama-stack-client-completions-min.py
@@ -6,12 +6,13 @@
 ## start llama-stack:
 # uv run --with llama-stack llama stack build --template ollama --image-type venv --image-name ~/my-ollama-llama-stack --run
 
+import os
 from llama_stack_client import LlamaStackClient
 
 c = LlamaStackClient(base_url="http://localhost:8321")
 
 r = c.chat.completions.create(
-    model="gemini/models/gemini-2.5-pro",
+    model=os.getenv("INFERENCE_MODEL", "gemini/models/gemini-2.5-pro"),
     messages=[
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Write a haiku about coding"},
diff --git a/llm/llama-stack-client-vector_stores-agent.py b/llm/llama-stack-client-vector_stores-agent.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python3
 
+import os
 from llama_stack_client import Agent, AgentEventLogger, LlamaStackClient
 import requests
 from io import BytesIO
@@ -16,7 +17,7 @@
 
 agent = Agent(
     c,
-    model="vertexai/google/gemini-2.5-flash",
+    model=os.getenv("INFERENCE_MODEL", "vertexai/google/gemini-2.5-flash"),
     instructions="You are a helpful assistant",
     tools=[
         { "type": "file_search", "vector_store_ids": [vs.id], }