huggingface
diff --git a/‎docs/source/en/api/pipelines/ideogram4.md‎
Lines changed: 66 additions & 0 deletions b/‎docs/source/en/api/pipelines/ideogram4.md‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎src/diffusers/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎src/diffusers/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/diffusers/modular_pipelines/ideogram4/encoders.py‎
Lines changed: 141 additions & 1 deletion b/‎src/diffusers/modular_pipelines/ideogram4/encoders.py‎
Lines changed: 141 additions & 1 deletion
diff --git a/‎src/diffusers/modular_pipelines/ideogram4/modular_blocks_ideogram4.py‎
Lines changed: 14 additions & 5 deletions b/‎src/diffusers/modular_pipelines/ideogram4/modular_blocks_ideogram4.py‎
Lines changed: 14 additions & 5 deletions
diff --git a/‎src/diffusers/pipelines/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎src/diffusers/pipelines/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/diffusers/pipelines/ideogram4/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎src/diffusers/pipelines/ideogram4/__init__.py‎
Lines changed: 3 additions & 0 deletions
@@ -40,12 +40,78 @@ image = pipe(prompt, height=1024, width=1024, generator=torch.Generator("cuda").
 image.save("ideogram4.png")
 ```
 
+## Prompt upsampling
+
+Ideogram 4 is trained on structured JSON captions rather than free-form prompts, so a short prompt is best
+expanded into that native schema before generation. There are two ways to produce the caption.
+
+### Remote (Ideogram API)
+
+For the best results, expand the prompt with Ideogram's hosted magic-prompt API and pass the returned caption
+straight to the pipeline (get a key at [developer.ideogram.ai](https://developer.ideogram.ai/)):
+
+```python
+import json
+import requests
+import torch
+from diffusers import Ideogram4Pipeline
+
+pipe = Ideogram4Pipeline.from_pretrained("ideogram-ai/ideogram-4-nf4", torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
+# Expand the prompt into a structured JSON caption with Ideogram's hosted magic-prompt API.
+response = requests.post(
+    "https://api.ideogram.ai/v1/ideogram-v4/magic-prompt",
+    headers={"Api-Key": "your_ideogram_api_key"},
+    json={"text_prompt": "A photo of a cat holding a sign that says hello world", "aspect_ratio": "1x1"},
+).json()
+caption = json.dumps(response["json_prompt"])
+
+# The caption is already upsampled, so pass it directly (no prompt_upsampling).
+image = pipe(caption, height=1024, width=1024, generator=torch.Generator("cuda").manual_seed(0)).images[0]
+image.save("ideogram4_upsampled.png")
+```
+
+### Local (on-device)
+
+For a fully local pipeline, load a small [`Ideogram4PromptEnhancerHead`] (the Qwen3-VL LM head) as the optional
+`prompt_enhancer_head` component and pass `prompt_upsampling=True`. The head is grafted onto the shared
+`text_encoder`, so no second text encoder is loaded. Install `outlines` for schema-constrained captions; without it,
+upsampling runs unconstrained and may not return schema-valid JSON. The nf4 checkpoint also needs `bitsandbytes`:
+
+```python
+import torch
+from diffusers import Ideogram4Pipeline, Ideogram4PromptEnhancerHead
+
+# Load the small LM head that is grafted onto the pipeline's shared text encoder for prompt upsampling.
+prompt_enhancer_head = Ideogram4PromptEnhancerHead.from_pretrained(
+    "diffusers/qwen3-vl-8b-instruct-lm-head", torch_dtype=torch.bfloat16
+)
+# Pass the head as the optional `prompt_enhancer_head` component; no second text encoder is loaded.
+pipe = Ideogram4Pipeline.from_pretrained(
+    "ideogram-ai/ideogram-4-nf4", prompt_enhancer_head=prompt_enhancer_head, torch_dtype=torch.bfloat16
+)
+pipe.to("cuda")
+
+prompt = "A photo of a cat holding a sign that says hello world"
+# `prompt_upsampling=True` expands the short prompt into the structured JSON caption before generation.
+image = pipe(
+    prompt,
+    height=1024,
+    width=1024,
+    prompt_upsampling=True,
+    generator=torch.Generator("cuda").manual_seed(0),
+).images[0]
+image.save("ideogram4_upsampled.png")
+```
+
 ## Ideogram4Pipeline
 
 [[autodoc]] Ideogram4Pipeline
 	- all
 	- __call__
 
+## Ideogram4PromptEnhancerHead
+
+[[autodoc]] Ideogram4PromptEnhancerHead
+
 ## Ideogram4PipelineOutput
 
 [[autodoc]] pipelines.ideogram4.pipeline_output.Ideogram4PipelineOutput
@@ -594,6 +594,7 @@
             "HunyuanVideoPipeline",
             "I2VGenXLPipeline",
             "Ideogram4Pipeline",
+            "Ideogram4PromptEnhancerHead",
             "IFImg2ImgPipeline",
             "IFImg2ImgSuperResolutionPipeline",
             "IFInpaintingPipeline",
@@ -1413,6 +1414,7 @@
             HunyuanVideoPipeline,
             I2VGenXLPipeline,
             Ideogram4Pipeline,
+            Ideogram4PromptEnhancerHead,
             IFImg2ImgPipeline,
             IFImg2ImgSuperResolutionPipeline,
             IFInpaintingPipeline,
 
@@ -17,7 +17,14 @@
 from transformers import Qwen2Tokenizer, Qwen3VLModel
 from transformers.masking_utils import create_causal_mask
 
-from ...utils import logging
+from ...pipelines.ideogram4.prompt_enhancer import (
+    PROMPT_UPSAMPLE_TEMPERATURE,
+    Ideogram4PromptEnhancerHead,
+    build_caption_logits_processor,
+    build_prompt_enhancer,
+    generate_captions,
+)
+from ...utils import is_outlines_available, logging
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
 from .modular_pipeline import Ideogram4ModularPipeline
@@ -31,6 +38,139 @@
 QWEN3_VL_ACTIVATION_LAYERS = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 35)
 
 
+# auto_docstring
+class Ideogram4PromptUpsampleStep(ModularPipelineBlocks):
+    """
+    Optional step that rewrites the prompt(s) into Ideogram4's native structured JSON caption (the format the model is
+    trained on) when ``prompt_upsampling=True``. Requires the optional ``prompt_enhancer_head`` component, which is
+    grafted onto the shared ``text_encoder`` body to make it generative; install ``outlines`` for schema-constrained
+    captions.
+
+      Components:
+          text_encoder (`Qwen3VLModel`): The Qwen3-VL text encoder. tokenizer (`Qwen2Tokenizer`): The tokenizer paired
+          with the text encoder. prompt_enhancer_head (`Ideogram4PromptEnhancerHead`): The LM head grafted onto the
+          text encoder for upsampling.
+
+      Inputs:
+          prompt (`str`):
+              The prompt or prompts to guide image generation.
+          prompt_upsampling (`bool`, *optional*, defaults to False):
+              If True, rewrite the prompt into the native JSON caption before encoding.
+          prompt_upsampling_temperature (`float`, *optional*, defaults to 1.0):
+              Sampling temperature for prompt upsampling.
+          height (`int`, *optional*):
+              Together with width, sets the caption's target aspect ratio.
+          width (`int`, *optional*):
+              Together with height, sets the caption's target aspect ratio.
+          generator (`Generator`, *optional*):
+              Reused to make the upsampling reproducible.
+
+      Outputs:
+          prompt (`str`):
+              The (possibly upsampled) prompt forwarded to the text encoder.
+    """
+
+    model_name = "ideogram4"
+
+    def __init__(self):
+        # Built lazily on first upsample: the head-less encoder body + `prompt_enhancer_head`, combined.
+        self._prompt_enhancer = None
+        # Outlines logits processor for schema-constrained captions; built lazily on first upsample.
+        self._caption_logits_processor = None
+        super().__init__()
+
+    @property
+    def description(self) -> str:
+        return (
+            "Optional step that rewrites the prompt(s) into Ideogram4's native structured JSON caption when "
+            "`prompt_upsampling=True` (the format the model is trained on). Requires a generative `text_encoder` "
+            "(a `Qwen3VLForConditionalGeneration`); install `outlines` for schema-constrained captions."
+        )
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("text_encoder", Qwen3VLModel, description="The Qwen3-VL text encoder."),
+            ComponentSpec("tokenizer", Qwen2Tokenizer, description="The tokenizer paired with the text encoder."),
+            ComponentSpec(
+                "prompt_enhancer_head",
+                Ideogram4PromptEnhancerHead,
+                description="LM head grafted onto the text encoder for prompt upsampling.",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("prompt", required=True),
+            InputParam(
+                name="prompt_upsampling",
+                type_hint=bool,
+                default=False,
+                description="If True, rewrite the prompt into Ideogram4's native JSON caption before encoding.",
+            ),
+            InputParam(
+                name="prompt_upsampling_temperature",
+                type_hint=float,
+                default=PROMPT_UPSAMPLE_TEMPERATURE,
+                description="Sampling temperature for prompt upsampling.",
+            ),
+            InputParam.template("height"),
+            InputParam.template("width"),
+            InputParam.template("max_sequence_length", default=2048),
+            InputParam.template("generator"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                name="prompt",
+                type_hint=list,
+                description="The (possibly upsampled) prompt forwarded to the text encoder.",
+            ),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        if block_state.prompt_upsampling:
+            if components.prompt_enhancer_head is None:
+                raise ValueError(
+                    "Prompt upsampling requires the `prompt_enhancer_head` component, which is not loaded. Load an "
+                    "`Ideogram4PromptEnhancerHead` and add it to the pipeline."
+                )
+            if self._prompt_enhancer is None:
+                self._prompt_enhancer = build_prompt_enhancer(components.text_encoder, components.prompt_enhancer_head)
+            if self._caption_logits_processor is None and is_outlines_available():
+                self._caption_logits_processor = build_caption_logits_processor(
+                    self._prompt_enhancer, components.tokenizer
+                )
+            if self._caption_logits_processor is None:
+                logger.warning_once(
+                    "`outlines` is not installed; prompt upsampling runs unconstrained and may not return "
+                    "schema-valid JSON. Install with `pip install outlines` for structured captions."
+                )
+            height = block_state.height or components.default_height
+            width = block_state.width or components.default_width
+            block_state.prompt = generate_captions(
+                self._prompt_enhancer,
+                components.tokenizer,
+                self._caption_logits_processor,
+                block_state.prompt,
+                height,
+                width,
+                temperature=block_state.prompt_upsampling_temperature,
+                max_new_tokens=block_state.max_sequence_length,
+                generator=block_state.generator,
+                device=components._execution_device,
+            )
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
 # auto_docstring
 class Ideogram4TextEncoderStep(ModularPipelineBlocks):
     """
 
@@ -24,7 +24,7 @@
 )
 from .decoders import Ideogram4DecodeStep
 from .denoise import Ideogram4AfterDenoiseStep, Ideogram4DenoiseStep
-from .encoders import Ideogram4TextEncoderStep
+from .encoders import Ideogram4PromptUpsampleStep, Ideogram4TextEncoderStep
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -123,6 +123,10 @@ class Ideogram4AutoBlocks(SequentialPipelineBlocks):
       Inputs:
           prompt (`str`):
               The prompt or prompts to guide image generation.
+          prompt_upsampling (`bool`, *optional*, defaults to False):
+              Rewrite the prompt into Ideogram4's native structured JSON caption before encoding.
+          prompt_upsampling_temperature (`float`, *optional*, defaults to 1.0):
+              Sampling temperature for prompt upsampling.
           max_sequence_length (`int`, *optional*, defaults to 2048):
               Maximum sequence length for prompt encoding.
           num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -154,8 +158,13 @@ class Ideogram4AutoBlocks(SequentialPipelineBlocks):
     """
 
     model_name = "ideogram4"
-    block_classes = [Ideogram4TextEncoderStep(), Ideogram4CoreDenoiseStep(), Ideogram4DecodeStep()]
-    block_names = ["text_encoder", "denoise", "decode"]
+    block_classes = [
+        Ideogram4PromptUpsampleStep(),
+        Ideogram4TextEncoderStep(),
+        Ideogram4CoreDenoiseStep(),
+        Ideogram4DecodeStep(),
+    ]
+    block_names = ["prompt_upsample", "text_encoder", "denoise", "decode"]
 
     # Workflow map declaring the trigger conditions for each supported workflow.
     # `True` means the workflow triggers when the input is not None.
@@ -166,8 +175,8 @@ class Ideogram4AutoBlocks(SequentialPipelineBlocks):
     @property
     def description(self) -> str:
         return (
-            "Auto Modular pipeline for text-to-image generation using Ideogram4: encode text -> core denoise "
-            "(asymmetric CFG over two transformers) -> decode."
+            "Auto Modular pipeline for text-to-image generation using Ideogram4: (optional) prompt upsampling -> "
+            "encode text -> core denoise (asymmetric CFG over two transformers) -> decode."
         )
 
     @property
 
@@ -288,7 +288,7 @@
     ]
     _import_structure["hunyuan_video1_5"] = ["HunyuanVideo15Pipeline", "HunyuanVideo15ImageToVideoPipeline"]
     _import_structure["hunyuan_image"] = ["HunyuanImagePipeline", "HunyuanImageRefinerPipeline"]
-    _import_structure["ideogram4"] = ["Ideogram4Pipeline"]
+    _import_structure["ideogram4"] = ["Ideogram4Pipeline", "Ideogram4PromptEnhancerHead"]
     _import_structure["kandinsky"] = [
         "KandinskyCombinedPipeline",
         "KandinskyImg2ImgCombinedPipeline",
@@ -748,7 +748,7 @@
         )
         from .hunyuan_video1_5 import HunyuanVideo15ImageToVideoPipeline, HunyuanVideo15Pipeline
         from .hunyuandit import HunyuanDiTPipeline
-        from .ideogram4 import Ideogram4Pipeline
+        from .ideogram4 import Ideogram4Pipeline, Ideogram4PromptEnhancerHead
         from .joyimage import JoyImageEditPipeline, JoyImageEditPipelineOutput
         from .kandinsky import (
             KandinskyCombinedPipeline,
 
@@ -25,6 +25,8 @@
 
     _import_structure["pipeline_output"] = ["Ideogram4PipelineOutput"]
 
+    _import_structure["prompt_enhancer"] = ["Ideogram4PromptEnhancerHead"]
+
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
         if not (is_transformers_available() and is_torch_available()):
@@ -34,6 +36,7 @@
     else:
         from .pipeline_ideogram4 import Ideogram4Pipeline
         from .pipeline_output import Ideogram4PipelineOutput
+        from .prompt_enhancer import Ideogram4PromptEnhancerHead
 else:
     import sys