mellea/docs/examples/image_text_models/vision_openai_examples.py at main · leothomas/mellea · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# pytest: ollama, e2e

"""Examples using vision models with OpenAI backend."""

import pathlib

from PIL import Image

from mellea import MelleaSession
from mellea.backends.openai import OpenAIBackend
from mellea.core import ImageBlock
from mellea.stdlib.context import ChatContext

# # using anthropic AI model ...
# anth_key = os.environ.get("ANTHROPIC_API_KEY")
# m = MelleaSession(OpenAIBackend(model_id="claude-3-haiku-20240307",
#                                 api_key=anth_key,  # Your Anthropic API key
#                                 base_url="https://api.anthropic.com/v1/"  # Anthropic's API endpoint
#                                 ),
#                 ctx=ChatContext())

# using LM Studio model locally
# m = MelleaSession(
#     OpenAIBackend(model_id="qwen/qwen2.5-vl-7b", base_url="http://127.0.0.1:1234/v1"), ctx=ChatContext()
# )

m = MelleaSession(
    OpenAIBackend(
        model_id="qwen2.5vl:7b", base_url="http://localhost:11434/v1", api_key="ollama"
    ),
    ctx=ChatContext(),
)

# load PIL image and convert to mellea ImageBlock
image_path = pathlib.Path(__file__).parent.joinpath("pointing_up.jpg")
test_pil = Image.open(image_path)
test_img = ImageBlock.from_pil_image(test_pil)

# check if model is able to do text chat
ch = m.chat("What's 1+1?")
print(str(ch.content))

# now test with MELLEA image
res = m.instruct(
    "Is there a person on the image? Is the subject in the image smiling?",
    images=[test_img],
)
print(str(res))
# print(m.last_prompt())

# and now with PIL image and using m.chat
chat_res = m.chat(
    "How many eyes can you identify in the image? Explain.",
    images=[test_pil],  # type: ignore[arg-type]
)
print(str(chat_res.content))

# and now without images again...
res = m.instruct("How many eyes can you identify in the image?", images=[])
print(str(res))