Add reference image support for style-consistent generation

anbeckham · claude · anbeckham · commit e299a42e905b · 2026-04-09T20:41:03.000-05:00
generate_image now accepts an optional reference_image path so the
model can match an existing image's art style, color palette, and
visual mood when creating new images. This addresses cross-session
consistency problems (e.g. game characters generated in separate
sessions that should all share the same look) by sending the
reference pixels alongside the prompt via Gemini's multimodal
contents API.

Key changes:
- gemini_client: generate_image_gemini builds multi-part contents
  when reference image bytes are provided
- image_gen: threads reference_image through generate_with_gemini
  and auto_generate with style-reference prompt framing; auto-loads
  from style profile when no explicit reference is passed
- server: adds reference_image to generate_image and
  init_style_profile MCP tool schemas
- style_profile: adds reference_image field to DEFAULT_PROFILE and
  create_profile for cross-session persistence

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/src/gemini_visual_mcp/gemini_client.py b/src/gemini_visual_mcp/gemini_client.py
@@ -106,18 +106,32 @@ async def generate_image_gemini(
         self,
         prompt: str,
         aspect_ratio: str = "16:9",
+        reference_image_data: bytes | None = None,
+        reference_mime_type: str | None = None,
     ) -> list[dict]:
         """Generate image(s) using Gemini native image generation.
 
         Uses responseModalities: ["TEXT", "IMAGE"] to get inline image data.
+        When reference_image_data is provided, the image is sent alongside the
+        prompt so the model can match its style.
 
         Returns list of dicts with keys: 'data' (bytes), 'mime_type' (str), 'text' (str|None)
         """
 
         def _call():
+            if reference_image_data and reference_mime_type:
+                contents = [
+                    types.Part.from_bytes(
+                        data=reference_image_data, mime_type=reference_mime_type
+                    ),
+                    prompt,
+                ]
+            else:
+                contents = prompt
+
             response = self._client.models.generate_content(
                 model=GEMINI_FLASH_IMAGE,
-                contents=prompt,
+                contents=contents,
                 config=types.GenerateContentConfig(
                     response_modalities=["TEXT", "IMAGE"],
                 ),
diff --git a/src/gemini_visual_mcp/image_gen.py b/src/gemini_visual_mcp/image_gen.py
@@ -5,11 +5,13 @@
 """
 
 import logging
+from pathlib import Path
 from typing import Optional
 
 from .asset_manager import save_generated
 from .config import DEFAULT_ASPECT_RATIO, DEFAULT_IMAGE_COUNT
 from .gemini_client import GeminiClient
+from .image_utils import read_image
 from .prompt_engine import enhance
 from .style_profile import load_profile
 
@@ -23,21 +25,45 @@ async def generate_with_gemini(
     cwd: str = ".",
     use_profile: bool = True,
     template: Optional[str] = None,
+    reference_image: Optional[str] = None,
 ) -> list[dict]:
     """Generate image(s) using Gemini 2.5 Flash (fast, iterative drafts).
 
+    When reference_image is provided, the model receives the image alongside
+    the prompt and is instructed to match its art style in the new generation.
+
     Returns list of dicts with: path, enhanced_prompt, warnings, model, metadata
     """
     # Load profile
     profile = load_profile(cwd) if use_profile else None
 
+    # Auto-load reference image from profile if none provided explicitly
+    if not reference_image and profile and profile.get("reference_image"):
+        ref_path = profile["reference_image"]
+        if Path(ref_path).is_file():
+            reference_image = ref_path
+
     # Enhance prompt
     enhanced_prompt, warnings = enhance(prompt, profile=profile, template=template)
 
+    # Build style-reference prompt and read image bytes when a reference is provided
+    ref_data = None
+    ref_mime = None
+    if reference_image:
+        ref_data, ref_mime = read_image(reference_image)
+        enhanced_prompt = (
+            "Use the provided image ONLY as a style and aesthetic reference. "
+            "Do NOT reproduce or edit the reference image. Generate a completely "
+            "new image matching its art style, color palette, rendering technique, "
+            "and visual mood. The new image should depict: " + enhanced_prompt
+        )
+
     # Generate
     results = await client.generate_image_gemini(
         prompt=enhanced_prompt,
         aspect_ratio=aspect_ratio,
+        reference_image_data=ref_data,
+        reference_mime_type=ref_mime,
     )
 
     # Save results
@@ -49,6 +75,7 @@ async def generate_with_gemini(
             "model": "gemini-2.5-flash-image",
             "aspect_ratio": aspect_ratio,
             "template": template or "",
+            "reference_image": reference_image or "",
             "warnings": [w.to_dict() for w in warnings],
         }
 
@@ -141,17 +168,32 @@ async def auto_generate(
     cwd: str = ".",
     use_profile: bool = True,
     template: Optional[str] = None,
+    reference_image: Optional[str] = None,
 ) -> list[dict]:
     """Generate with automatic model selection.
 
     - "gemini": Use Gemini Flash (fast drafts, iterative editing)
     - "imagen": Use Imagen 4 (high quality finals)
     - "auto": Use Gemini for drafts, Imagen for production-quality assets
 
+    When reference_image is provided, the Gemini path is always used
+    (Imagen's text-to-image API does not accept reference images).
+
     Auto logic: Use Gemini by default. Use Imagen when:
     - User explicitly says "final", "production", "high quality", "polished"
     - Template recommends Imagen
     """
+    # Reference images require Gemini — Imagen doesn't support image input for generation
+    if reference_image:
+        if model == "imagen":
+            logger.warning(
+                "Reference image provided with model='imagen'. "
+                "Falling back to Gemini (Imagen does not support reference images)."
+            )
+        return await generate_with_gemini(
+            client, prompt, aspect_ratio, cwd, use_profile, template, reference_image
+        )
+
     if model == "imagen":
         return await generate_with_imagen(
             client, prompt, count, aspect_ratio, cwd, use_profile, template
diff --git a/src/gemini_visual_mcp/server.py b/src/gemini_visual_mcp/server.py
@@ -122,6 +122,17 @@ async def list_tools() -> list[Tool]:
                                 "default": True,
                                 "description": "Apply project style profile to the prompt",
                             },
+                            "reference_image": {
+                                "type": "string",
+                                "minLength": 1,
+                                "description": (
+                                    "Path to an existing image to use as a style reference. "
+                                    "The new image will match the reference's art style, colors, "
+                                    "and visual feel while depicting what the prompt describes. "
+                                    "Use this for visual consistency (e.g., game characters in the same style). "
+                                    "Auto-selects Gemini model when provided."
+                                ),
+                            },
                         },
                         "required": ["prompt"],
                     },
@@ -322,6 +333,14 @@ async def list_tools() -> list[Tool]:
                                 "type": "string",
                                 "description": "Design system (e.g., 'Material Design 3', 'custom')",
                             },
+                            "reference_image": {
+                                "type": "string",
+                                "description": (
+                                    "Path to a default reference image for style consistency. "
+                                    "When set, all image generations will match this image's style "
+                                    "unless overridden by an explicit reference_image in generate_image."
+                                ),
+                            },
                         },
                         "required": ["project_type"],
                     },
@@ -382,6 +401,7 @@ async def _handle_tool(self, name: str, args: dict) -> Any:
                 cwd=self._cwd(),
                 use_profile=args.get("use_profile", True),
                 template=args.get("template"),
+                reference_image=args.get("reference_image"),
             )
             # Clean up old previews on generation
             cleanup_old()
@@ -480,6 +500,7 @@ async def _handle_tool(self, name: str, args: dict) -> Any:
                 visual_style=args.get("visual_style"),
                 framework=args.get("framework"),
                 design_system=args.get("design_system"),
+                reference_image=args.get("reference_image"),
             )
 
         elif name == "get_prompt_templates":
@@ -586,6 +607,7 @@ def _init_style_profile(
         visual_style: str | None = None,
         framework: str | None = None,
         design_system: str | None = None,
+        reference_image: str | None = None,
     ) -> dict:
         """Create or update the project style profile."""
         cwd = self._cwd()
@@ -610,6 +632,8 @@ def _init_style_profile(
             detected["framework"] = framework
         if design_system:
             detected["design_system"] = design_system
+        if reference_image:
+            detected["reference_image"] = reference_image
 
         # Create the profile
         path = create_profile(
@@ -624,6 +648,7 @@ def _init_style_profile(
             image_style=detected.get("image_style", ""),
             aspect_ratio=detected.get("default_aspect_ratio", "16:9"),
             resolution=detected.get("default_resolution", "1K"),
+            reference_image=detected.get("reference_image", ""),
         )
 
         return {
diff --git a/src/gemini_visual_mcp/style_profile.py b/src/gemini_visual_mcp/style_profile.py
@@ -35,6 +35,7 @@
     "image_style": "modern illustrations",
     "default_aspect_ratio": "16:9",
     "default_resolution": "1K",
+    "reference_image": "",
 }
 
 
@@ -80,6 +81,7 @@ def create_profile(
     image_style: str = "",
     aspect_ratio: str = "16:9",
     resolution: str = "1K",
+    reference_image: str = "",
 ) -> Path:
     """Create a new style profile in the target directory."""
     profile = dict(DEFAULT_PROFILE)
@@ -88,6 +90,7 @@ def create_profile(
     profile["design_system"] = design_system
     profile["default_aspect_ratio"] = aspect_ratio
     profile["default_resolution"] = resolution
+    profile["reference_image"] = reference_image
 
     if colors:
         profile["colors"] = {**profile["colors"], **colors}
diff --git a/tests/test_gemini_client.py b/tests/test_gemini_client.py
@@ -65,6 +65,73 @@ def test_sync_call_success_after_retry(self):
                 assert func.call_count == 2
 
 
+class TestGenerateImageWithReference:
+    """Tests for reference image support in generate_image_gemini."""
+
+    @pytest.mark.asyncio
+    async def test_reference_image_sent_as_multipart_contents(self):
+        """When reference image is provided, contents should be a list with Part + prompt."""
+        with patch("gemini_visual_mcp.gemini_client.genai") as mock_genai:
+            mock_models = MagicMock()
+            mock_genai.Client.return_value.models = mock_models
+
+            # Build a fake response with an image part
+            mock_part = MagicMock()
+            mock_part.inline_data = MagicMock()
+            mock_part.inline_data.data = b"generated-image"
+            mock_part.inline_data.mime_type = "image/png"
+            mock_part.text = None
+
+            mock_candidate = MagicMock()
+            mock_candidate.content.parts = [mock_part]
+            mock_response = MagicMock()
+            mock_response.candidates = [mock_candidate]
+            mock_models.generate_content = MagicMock(return_value=mock_response)
+
+            client = GeminiClient(api_key="test-key")
+            await client.generate_image_gemini(
+                prompt="A warrior in matching style",
+                reference_image_data=b"ref-image-bytes",
+                reference_mime_type="image/png",
+            )
+
+            call_args = mock_models.generate_content.call_args
+            contents = call_args.kwargs["contents"]
+            # Should be a list with image Part and text prompt
+            assert isinstance(contents, list)
+            assert len(contents) == 2
+            assert contents[1] == "A warrior in matching style"
+
+    @pytest.mark.asyncio
+    async def test_no_reference_sends_plain_string(self):
+        """Without reference image, contents should be a plain string."""
+        with patch("gemini_visual_mcp.gemini_client.genai") as mock_genai:
+            mock_models = MagicMock()
+            mock_genai.Client.return_value.models = mock_models
+
+            mock_part = MagicMock()
+            mock_part.inline_data = MagicMock()
+            mock_part.inline_data.data = b"generated-image"
+            mock_part.inline_data.mime_type = "image/png"
+            mock_part.text = None
+
+            mock_candidate = MagicMock()
+            mock_candidate.content.parts = [mock_part]
+            mock_response = MagicMock()
+            mock_response.candidates = [mock_candidate]
+            mock_models.generate_content = MagicMock(return_value=mock_response)
+
+            client = GeminiClient(api_key="test-key")
+            await client.generate_image_gemini(
+                prompt="A simple landscape",
+            )
+
+            call_args = mock_models.generate_content.call_args
+            contents = call_args.kwargs["contents"]
+            assert isinstance(contents, str)
+            assert contents == "A simple landscape"
+
+
 class TestVideoModelMap:
     """Tests for video model name mapping."""
 
diff --git a/tests/test_image_gen.py b/tests/test_image_gen.py
diff --git a/tests/test_server.py b/tests/test_server.py